diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 000000000..785b383e1 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,27 @@ +version: 2.1 +jobs: + test-local-gcc: + machine: + image: default + working_directory: ~/criu + steps: + - checkout + - run: + name: "Test local with GCC" + command: sudo -E make -C scripts/ci local + test-local-clang: + machine: + image: default + working_directory: ~/criu + steps: + - checkout + - run: + name: "Test local with CLANG" + command: sudo -E make -C scripts/ci local CLANG=1 + +workflows: + version: 2 + builds: + jobs: + - test-local-gcc + - test-local-clang diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 000000000..72dbb3898 --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,101 @@ +task: + name: Vagrant Fedora based test (no VDSO) + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: cirrus-images + image: family/docker-kvm + platform: linux + cpu: 4 + memory: 16G + nested_virtualization: true + + setup_script: | + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + sudo kvm-ok + build_script: | + make -C scripts/ci vagrant-fedora-no-vdso + +task: + name: CentOS Stream 9 based test + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: centos-cloud + image: family/centos-stream-9 + platform: linux + cpu: 4 + memory: 8G + + setup_script: | + dnf config-manager --set-enabled crb # Same as CentOS 8 powertools + dnf -y install epel-release epel-next-release + contrib/dependencies/dnf-packages.sh + # The image has a too old version of nettle which does not work with gnutls. + # Just upgrade to the latest to make the error go away. + dnf -y upgrade nettle nettle-devel + systemctl stop sssd + # Even with selinux in permissive mode the selinux tests will be executed. 
+ # The Cirrus CI user runs as a service from selinux point of view and is + # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0). + # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode. + setenforce 0 + + build_script: | + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" + +task: + name: Vagrant Fedora Rawhide based test + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: cirrus-images + image: family/docker-kvm + platform: linux + cpu: 4 + memory: 16G + nested_virtualization: true + + setup_script: | + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + sudo kvm-ok + build_script: | + make -C scripts/ci vagrant-fedora-rawhide + +task: + name: Vagrant Fedora based test (non-root) + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: cirrus-images + image: family/docker-kvm + platform: linux + cpu: 4 + memory: 16G + nested_virtualization: true + + setup_script: | + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + sudo kvm-ok + build_script: | + make -C scripts/ci vagrant-fedora-non-root + +task: + name: aarch64 Fedora Rawhide + arm_container: + image: registry.fedoraproject.org/fedora:rawhide + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/prepare-for-fedora-rawhide.sh + make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 + make -C test/zdtm -j 4 diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..fb40bc613 --- /dev/null +++ b/.clang-format @@ -0,0 +1,565 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 11. 
+# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left # Unknown to clang-format-4.0 +AlignOperands: true +AlignTrailingComments: true +AlignConsecutiveMacros: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false # Unknown to clang-format-5.0 + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true # Unknown to clang-format-4.0 + SplitEmptyRecord: true # Unknown to clang-format-4.0 + SplitEmptyNamespace: true # Unknown to clang-format-4.0 +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 0 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false # Unknown to clang-format-4.0 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 
+Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false # Unknown to clang-format-4.0 + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | sort | uniq +ForEachMacros: + - 'for_each_pstree_item' + - 'for_each_bit' + - 'apei_estatus_for_each_section' + - 'ata_for_each_dev' + - 'ata_for_each_link' + - '__ata_qc_for_each' + - 'ata_qc_for_each' + - 'ata_qc_for_each_raw' + - 'ata_qc_for_each_with_internal' + - 'ax25_for_each' + - 'ax25_uid_for_each' + - '__bio_for_each_bvec' + - 'bio_for_each_bvec' + - 'bio_for_each_bvec_all' + - 'bio_for_each_integrity_vec' + - '__bio_for_each_segment' + - 'bio_for_each_segment' + - 'bio_for_each_segment_all' + - 'bio_list_for_each' + - 'bip_for_each_vec' + - 'bitmap_for_each_clear_region' + - 'bitmap_for_each_set_region' + - 'blkg_for_each_descendant_post' + - 'blkg_for_each_descendant_pre' + - 'blk_queue_for_each_rl' + - 'bond_for_each_slave' + - 'bond_for_each_slave_rcu' + - 'bpf_for_each_spilled_reg' + - 'btree_for_each_safe128' + - 'btree_for_each_safe32' + - 'btree_for_each_safe64' + - 'btree_for_each_safel' + - 'card_for_each_dev' + - 'cgroup_taskset_for_each' + - 'cgroup_taskset_for_each_leader' + - 'cpufreq_for_each_entry' + - 'cpufreq_for_each_entry_idx' + - 'cpufreq_for_each_valid_entry' + - 'cpufreq_for_each_valid_entry_idx' + - 'css_for_each_child' + - 'css_for_each_descendant_post' + - 'css_for_each_descendant_pre' + - 'device_for_each_child_node' + - 'displayid_iter_for_each' + - 'dma_fence_chain_for_each' + - 'do_for_each_ftrace_op' + - 'drm_atomic_crtc_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane_state' + - 'drm_atomic_for_each_plane_damage' + - 'drm_client_for_each_connector_iter' + - 'drm_client_for_each_modeset' + - 
'drm_connector_for_each_possible_encoder' + - 'drm_for_each_bridge_in_chain' + - 'drm_for_each_connector_iter' + - 'drm_for_each_crtc' + - 'drm_for_each_crtc_reverse' + - 'drm_for_each_encoder' + - 'drm_for_each_encoder_mask' + - 'drm_for_each_fb' + - 'drm_for_each_legacy_plane' + - 'drm_for_each_plane' + - 'drm_for_each_plane_mask' + - 'drm_for_each_privobj' + - 'drm_mm_for_each_hole' + - 'drm_mm_for_each_node' + - 'drm_mm_for_each_node_in_range' + - 'drm_mm_for_each_node_safe' + - 'flow_action_for_each' + - 'for_each_acpi_dev_match' + - 'for_each_active_dev_scope' + - 'for_each_active_drhd_unit' + - 'for_each_active_iommu' + - 'for_each_aggr_pgid' + - 'for_each_available_child_of_node' + - 'for_each_bio' + - 'for_each_board_func_rsrc' + - 'for_each_bvec' + - 'for_each_card_auxs' + - 'for_each_card_auxs_safe' + - 'for_each_card_components' + - 'for_each_card_dapms' + - 'for_each_card_pre_auxs' + - 'for_each_card_prelinks' + - 'for_each_card_rtds' + - 'for_each_card_rtds_safe' + - 'for_each_card_widgets' + - 'for_each_card_widgets_safe' + - 'for_each_cgroup_storage_type' + - 'for_each_child_of_node' + - 'for_each_clear_bit' + - 'for_each_clear_bit_from' + - 'for_each_cmsghdr' + - 'for_each_compatible_node' + - 'for_each_component_dais' + - 'for_each_component_dais_safe' + - 'for_each_comp_order' + - 'for_each_console' + - 'for_each_cpu' + - 'for_each_cpu_and' + - 'for_each_cpu_not' + - 'for_each_cpu_wrap' + - 'for_each_dapm_widgets' + - 'for_each_dev_addr' + - 'for_each_dev_scope' + - 'for_each_dma_cap_mask' + - 'for_each_dpcm_be' + - 'for_each_dpcm_be_rollback' + - 'for_each_dpcm_be_safe' + - 'for_each_dpcm_fe' + - 'for_each_drhd_unit' + - 'for_each_dss_dev' + - 'for_each_dtpm_table' + - 'for_each_efi_memory_desc' + - 'for_each_efi_memory_desc_in_map' + - 'for_each_element' + - 'for_each_element_extid' + - 'for_each_element_id' + - 'for_each_endpoint_of_node' + - 'for_each_evictable_lru' + - 'for_each_fib6_node_rt_rcu' + - 'for_each_fib6_walker_rt' + - 
'for_each_free_mem_pfn_range_in_zone' + - 'for_each_free_mem_pfn_range_in_zone_from' + - 'for_each_free_mem_range' + - 'for_each_free_mem_range_reverse' + - 'for_each_func_rsrc' + - 'for_each_hstate' + - 'for_each_if' + - 'for_each_iommu' + - 'for_each_ip_tunnel_rcu' + - 'for_each_irq_nr' + - 'for_each_link_codecs' + - 'for_each_link_cpus' + - 'for_each_link_platforms' + - 'for_each_lru' + - 'for_each_matching_node' + - 'for_each_matching_node_and_match' + - 'for_each_member' + - 'for_each_memcg_cache_index' + - 'for_each_mem_pfn_range' + - '__for_each_mem_range' + - 'for_each_mem_range' + - '__for_each_mem_range_rev' + - 'for_each_mem_range_rev' + - 'for_each_mem_region' + - 'for_each_migratetype_order' + - 'for_each_msi_entry' + - 'for_each_msi_entry_safe' + - 'for_each_msi_vector' + - 'for_each_net' + - 'for_each_net_continue_reverse' + - 'for_each_netdev' + - 'for_each_netdev_continue' + - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_continue_reverse' + - 'for_each_netdev_feature' + - 'for_each_netdev_in_bond_rcu' + - 'for_each_netdev_rcu' + - 'for_each_netdev_reverse' + - 'for_each_netdev_safe' + - 'for_each_net_rcu' + - 'for_each_new_connector_in_state' + - 'for_each_new_crtc_in_state' + - 'for_each_new_mst_mgr_in_state' + - 'for_each_new_plane_in_state' + - 'for_each_new_private_obj_in_state' + - 'for_each_node' + - 'for_each_node_by_name' + - 'for_each_node_by_type' + - 'for_each_node_mask' + - 'for_each_node_state' + - 'for_each_node_with_cpus' + - 'for_each_node_with_property' + - 'for_each_nonreserved_multicast_dest_pgid' + - 'for_each_of_allnodes' + - 'for_each_of_allnodes_from' + - 'for_each_of_cpu_node' + - 'for_each_of_pci_range' + - 'for_each_old_connector_in_state' + - 'for_each_old_crtc_in_state' + - 'for_each_old_mst_mgr_in_state' + - 'for_each_oldnew_connector_in_state' + - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_mst_mgr_in_state' + - 'for_each_oldnew_plane_in_state' + - 'for_each_oldnew_plane_in_state_reverse' + - 
'for_each_oldnew_private_obj_in_state' + - 'for_each_old_plane_in_state' + - 'for_each_old_private_obj_in_state' + - 'for_each_online_cpu' + - 'for_each_online_node' + - 'for_each_online_pgdat' + - 'for_each_pci_bridge' + - 'for_each_pci_dev' + - 'for_each_pci_msi_entry' + - 'for_each_pcm_streams' + - 'for_each_physmem_range' + - 'for_each_populated_zone' + - 'for_each_possible_cpu' + - 'for_each_present_cpu' + - 'for_each_prime_number' + - 'for_each_prime_number_from' + - 'for_each_process' + - 'for_each_process_thread' + - 'for_each_prop_codec_conf' + - 'for_each_prop_dai_codec' + - 'for_each_prop_dai_cpu' + - 'for_each_prop_dlc_codecs' + - 'for_each_prop_dlc_cpus' + - 'for_each_prop_dlc_platforms' + - 'for_each_property_of_node' + - 'for_each_registered_fb' + - 'for_each_requested_gpio' + - 'for_each_requested_gpio_in_range' + - 'for_each_reserved_mem_range' + - 'for_each_reserved_mem_region' + - 'for_each_rtd_codec_dais' + - 'for_each_rtd_components' + - 'for_each_rtd_cpu_dais' + - 'for_each_rtd_dais' + - 'for_each_set_bit' + - 'for_each_set_bit_from' + - 'for_each_set_clump8' + - 'for_each_sg' + - 'for_each_sg_dma_page' + - 'for_each_sg_page' + - 'for_each_sgtable_dma_page' + - 'for_each_sgtable_dma_sg' + - 'for_each_sgtable_page' + - 'for_each_sgtable_sg' + - 'for_each_sibling_event' + - 'for_each_subelement' + - 'for_each_subelement_extid' + - 'for_each_subelement_id' + - '__for_each_thread' + - 'for_each_thread' + - 'for_each_unicast_dest_pgid' + - 'for_each_vsi' + - 'for_each_wakeup_source' + - 'for_each_zone' + - 'for_each_zone_zonelist' + - 'for_each_zone_zonelist_nodemask' + - 'fwnode_for_each_available_child_node' + - 'fwnode_for_each_child_node' + - 'fwnode_graph_for_each_endpoint' + - 'gadget_for_each_ep' + - 'genradix_for_each' + - 'genradix_for_each_from' + - 'hash_for_each' + - 'hash_for_each_possible' + - 'hash_for_each_possible_rcu' + - 'hash_for_each_possible_rcu_notrace' + - 'hash_for_each_possible_safe' + - 'hash_for_each_rcu' + - 
'hash_for_each_safe' + - 'hctx_for_each_ctx' + - 'hlist_bl_for_each_entry' + - 'hlist_bl_for_each_entry_rcu' + - 'hlist_bl_for_each_entry_safe' + - 'hlist_for_each' + - 'hlist_for_each_entry' + - 'hlist_for_each_entry_continue' + - 'hlist_for_each_entry_continue_rcu' + - 'hlist_for_each_entry_continue_rcu_bh' + - 'hlist_for_each_entry_from' + - 'hlist_for_each_entry_from_rcu' + - 'hlist_for_each_entry_rcu' + - 'hlist_for_each_entry_rcu_bh' + - 'hlist_for_each_entry_rcu_notrace' + - 'hlist_for_each_entry_safe' + - 'hlist_for_each_entry_srcu' + - '__hlist_for_each_rcu' + - 'hlist_for_each_safe' + - 'hlist_nulls_for_each_entry' + - 'hlist_nulls_for_each_entry_from' + - 'hlist_nulls_for_each_entry_rcu' + - 'hlist_nulls_for_each_entry_safe' + - 'i3c_bus_for_each_i2cdev' + - 'i3c_bus_for_each_i3cdev' + - 'ide_host_for_each_port' + - 'ide_port_for_each_dev' + - 'ide_port_for_each_present_dev' + - 'idr_for_each_entry' + - 'idr_for_each_entry_continue' + - 'idr_for_each_entry_continue_ul' + - 'idr_for_each_entry_ul' + - 'in_dev_for_each_ifa_rcu' + - 'in_dev_for_each_ifa_rtnl' + - 'inet_bind_bucket_for_each' + - 'inet_lhash2_for_each_icsk_rcu' + - 'key_for_each' + - 'key_for_each_safe' + - 'klp_for_each_func' + - 'klp_for_each_func_safe' + - 'klp_for_each_func_static' + - 'klp_for_each_object' + - 'klp_for_each_object_safe' + - 'klp_for_each_object_static' + - 'kunit_suite_for_each_test_case' + - 'kvm_for_each_memslot' + - 'kvm_for_each_vcpu' + - 'list_for_each' + - 'list_for_each_codec' + - 'list_for_each_codec_safe' + - 'list_for_each_continue' + - 'list_for_each_entry' + - 'list_for_each_entry_continue' + - 'list_for_each_entry_continue_rcu' + - 'list_for_each_entry_continue_reverse' + - 'list_for_each_entry_from' + - 'list_for_each_entry_from_rcu' + - 'list_for_each_entry_from_reverse' + - 'list_for_each_entry_lockless' + - 'list_for_each_entry_rcu' + - 'list_for_each_entry_reverse' + - 'list_for_each_entry_safe' + - 'list_for_each_entry_safe_continue' + - 
'list_for_each_entry_safe_from' + - 'list_for_each_entry_safe_reverse' + - 'list_for_each_entry_srcu' + - 'list_for_each_prev' + - 'list_for_each_prev_safe' + - 'list_for_each_safe' + - 'llist_for_each' + - 'llist_for_each_entry' + - 'llist_for_each_entry_safe' + - 'llist_for_each_safe' + - 'mci_for_each_dimm' + - 'media_device_for_each_entity' + - 'media_device_for_each_intf' + - 'media_device_for_each_link' + - 'media_device_for_each_pad' + - 'nanddev_io_for_each_page' + - 'netdev_for_each_lower_dev' + - 'netdev_for_each_lower_private' + - 'netdev_for_each_lower_private_rcu' + - 'netdev_for_each_mc_addr' + - 'netdev_for_each_uc_addr' + - 'netdev_for_each_upper_dev_rcu' + - 'netdev_hw_addr_list_for_each' + - 'nft_rule_for_each_expr' + - 'nla_for_each_attr' + - 'nla_for_each_nested' + - 'nlmsg_for_each_attr' + - 'nlmsg_for_each_msg' + - 'nr_neigh_for_each' + - 'nr_neigh_for_each_safe' + - 'nr_node_for_each' + - 'nr_node_for_each_safe' + - 'of_for_each_phandle' + - 'of_property_for_each_string' + - 'of_property_for_each_u32' + - 'pci_bus_for_each_resource' + - 'pcl_for_each_chunk' + - 'pcl_for_each_segment' + - 'pcm_for_each_format' + - 'ping_portaddr_for_each_entry' + - 'plist_for_each' + - 'plist_for_each_continue' + - 'plist_for_each_entry' + - 'plist_for_each_entry_continue' + - 'plist_for_each_entry_safe' + - 'plist_for_each_safe' + - 'pnp_for_each_card' + - 'pnp_for_each_dev' + - 'protocol_for_each_card' + - 'protocol_for_each_dev' + - 'queue_for_each_hw_ctx' + - 'radix_tree_for_each_slot' + - 'radix_tree_for_each_tagged' + - 'rb_for_each' + - 'rbtree_postorder_for_each_entry_safe' + - 'rdma_for_each_block' + - 'rdma_for_each_port' + - 'rdma_umem_for_each_dma_block' + - 'resource_list_for_each_entry' + - 'resource_list_for_each_entry_safe' + - 'rhl_for_each_entry_rcu' + - 'rhl_for_each_rcu' + - 'rht_for_each' + - 'rht_for_each_entry' + - 'rht_for_each_entry_from' + - 'rht_for_each_entry_rcu' + - 'rht_for_each_entry_rcu_from' + - 'rht_for_each_entry_safe' + - 
'rht_for_each_from' + - 'rht_for_each_rcu' + - 'rht_for_each_rcu_from' + - '__rq_for_each_bio' + - 'rq_for_each_bvec' + - 'rq_for_each_segment' + - 'scsi_for_each_prot_sg' + - 'scsi_for_each_sg' + - 'sctp_for_each_hentry' + - 'sctp_skb_for_each' + - 'shdma_for_each_chan' + - '__shost_for_each_device' + - 'shost_for_each_device' + - 'sk_for_each' + - 'sk_for_each_bound' + - 'sk_for_each_entry_offset_rcu' + - 'sk_for_each_from' + - 'sk_for_each_rcu' + - 'sk_for_each_safe' + - 'sk_nulls_for_each' + - 'sk_nulls_for_each_from' + - 'sk_nulls_for_each_rcu' + - 'snd_array_for_each' + - 'snd_pcm_group_for_each_entry' + - 'snd_soc_dapm_widget_for_each_path' + - 'snd_soc_dapm_widget_for_each_path_safe' + - 'snd_soc_dapm_widget_for_each_sink_path' + - 'snd_soc_dapm_widget_for_each_source_path' + - 'tb_property_for_each' + - 'tcf_exts_for_each_action' + - 'udp_portaddr_for_each_entry' + - 'udp_portaddr_for_each_entry_rcu' + - 'usb_hub_for_each_child' + - 'v4l2_device_for_each_subdev' + - 'v4l2_m2m_for_each_dst_buf' + - 'v4l2_m2m_for_each_dst_buf_safe' + - 'v4l2_m2m_for_each_src_buf' + - 'v4l2_m2m_for_each_src_buf_safe' + - 'virtio_device_for_each_vq' + - 'while_for_each_ftrace_op' + - 'xa_for_each' + - 'xa_for_each_marked' + - 'xa_for_each_range' + - 'xa_for_each_start' + - 'xas_for_each' + - 'xas_for_each_conflict' + - 'xas_for_each_marked' + - 'xbc_array_for_each_value' + - 'xbc_for_each_key_value' + - 'xbc_node_for_each_array_value' + - 'xbc_node_for_each_child' + - 'xbc_node_for_each_key_value' + - 'zorro_for_each_dev' + +IncludeBlocks: Preserve # Unknown to clang-format-5.0 +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentGotoLabels: false +IndentPPDirectives: None # Unknown to clang-format-5.0 +IndentWidth: 8 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 
+NamespaceIndentation: None +ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +SortUsingDeclarations: false # Unknown to clang-format-4.0 +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 +SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 +SpaceBeforeParens: ControlStatementsExceptForEachMacros +SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 000000000..5def594b2 --- /dev/null +++ b/.codespellrc @@ -0,0 +1,3 @@ +[codespell] +skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 000000000..222d66156 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,63 @@ + + +**Description** + + + +**Steps to reproduce the issue:** +1. +2. +3. + +**Describe the results you received:** + + +**Describe the results you expected:** + + +**Additional information you deem important (e.g. 
issue happens only occasionally):** + + +**CRIU logs and information:** + + + +
CRIU full dump/restore logs: +

+ +``` +(paste your output here) +``` + +

+
+ +
Output of `criu --version`: +

+ +``` +(paste your output here) +``` + +

+
+ +
Output of `criu check --all`: +

+ +``` +(paste your output here) +``` + +

+
+ +**Additional environment details:** diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..62365b191 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,18 @@ + diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml new file mode 100644 index 000000000..ebbecadb3 --- /dev/null +++ b/.github/workflows/aarch64-test.yaml @@ -0,0 +1,34 @@ +name: aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: aarch64-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + strategy: + matrix: + os: [ubuntu-24.04-arm, ubuntu-22.04-arm] + target: [GCC=1, CLANG=1] + + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} + # The following tests are failing on the VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' the following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml new file mode 100644 index 000000000..0f5c20f48 --- /dev/null +++ b/.github/workflows/alpine-test.yml @@ -0,0 +1,21 @@ +name: Alpine Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: alpine-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] + target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + - name: Run Alpine ${{ matrix.target }} Test + run: sudo -E make -C scripts/ci alpine ${{ matrix.target }} diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml new file mode 100644 index 000000000..425f0662b --- /dev/null +++ b/.github/workflows/archlinux-test.yml @@ -0,0 +1,16 @@ +name: Arch Linux Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: archlinux-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Run Arch Linux Test + run: sudo -E make -C scripts/ci archlinux diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml new file mode 100644 index 000000000..bf7d06697 --- /dev/null +++ b/.github/workflows/check-commits.yml @@ -0,0 +1,30 @@ +name: Verify self-contained commits + +on: pull_request + +# Cancel any preceding run on the pull request +concurrency: + group: commit-test-${{ github.event.pull_request.number }} + +jobs: + build: + runs-on: ubuntu-latest + # Check if pull request does not have label "not-selfcontained-ok" + if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" + steps: + - uses: actions/checkout@v4 + with: + # Needed to rebase against the base branch + fetch-depth: 0 + # Checkout pull request HEAD commit instead of merge commit + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler 
protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev + - name: Configure git user details + run: | + git config --global user.email "checkpoint-restore@users.noreply.github.com" + git config --global user.name "checkpoint-restore" + - name: Configure base branch without switching current branch + run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + - name: Build each commit + run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 000000000..9c9e46c1b --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,50 @@ +name: "CodeQL" + +on: + push: + branches: [ "criu-dev", "master" ] + pull_request: + branches: [ "criu-dev" ] + schedule: + - cron: "11 6 * * 3" + +# Cancel any preceding run on the pull request. +concurrency: + group: codeql-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ python, cpp ] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Packages (cpp) + if: ${{ matrix.language == 'cpp' }} + run: | + sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + 
category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml new file mode 100644 index 000000000..8a64ce185 --- /dev/null +++ b/.github/workflows/compat-test.yml @@ -0,0 +1,21 @@ +name: Compat Tests + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: compat-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + strategy: + matrix: + target: [GCC, CLANG] + + + steps: + - uses: actions/checkout@v4 + - name: Run Compat Tests (${{ matrix.target }}) + run: sudo -E make -C scripts/ci local COMPAT_TEST=y ${{ matrix.target }}=1 diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml new file mode 100644 index 000000000..c709cca00 --- /dev/null +++ b/.github/workflows/cross-compile-daily.yml @@ -0,0 +1,22 @@ +name: Daily Cross Compile Tests + +on: + schedule: + - cron: '30 12 * * *' + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, riscv64-stable-cross] + branches: [criu-dev, master] + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ matrix.branches }} + - name: Run Cross Compilation Targets + run: > + sudo make -C scripts/ci ${{ matrix.target }} diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml new file mode 100644 index 000000000..96672b294 --- /dev/null +++ b/.github/workflows/cross-compile.yml @@ -0,0 +1,40 @@ +name: Cross Compile Tests + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: cross-compile-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + + runs-on: ubuntu-latest + continue-on-error: ${{ matrix.experimental }} + strategy: + fail-fast: false + matrix: + experimental: [false] + target: [ + armv7-stable-cross, + aarch64-stable-cross, + ppc64-stable-cross, + mips64el-stable-cross, + riscv64-stable-cross, + ] + include: + - experimental: true + target: armv7-unstable-cross + - experimental: true + target: aarch64-unstable-cross + - experimental: true + target: ppc64-unstable-cross + - experimental: true + target: mips64el-unstable-cross + + steps: + - uses: actions/checkout@v4 + - name: Run Cross Compilation Targets + run: > + sudo make -C scripts/ci ${{ matrix.target }} diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml new file mode 100644 index 000000000..23696905a --- /dev/null +++ b/.github/workflows/docker-test.yml @@ -0,0 +1,19 @@ +name: Docker Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: docker-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-22.04] + steps: + - uses: actions/checkout@v4 + - name: Run Docker Test (${{ matrix.os }}) + run: sudo make -C scripts/ci docker-test diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml new file mode 100644 index 000000000..02dc9a1b3 --- /dev/null +++ b/.github/workflows/fedora-asan-test.yml @@ -0,0 +1,17 @@ +name: Fedora ASAN Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: fedora-asan-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + - name: Run Fedora ASAN Test + run: sudo -E make -C scripts/ci fedora-asan diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml new file mode 100644 index 000000000..83e2ead82 --- /dev/null +++ b/.github/workflows/fedora-rawhide-test.yml @@ -0,0 +1,21 @@ +name: Fedora Rawhide Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: fedora-rawhide-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + - name: Run Fedora Rawhide Test + # We need to pass environment variables from the CI environment to + # distinguish between CI environments. However, because of a bug in Podman, + # we must make sure that the XDG_RUNTIME_DIR environment variable is not set. + # FIXME: https://github.com/containers/podman/issues/14920 + run: sudo -E XDG_RUNTIME_DIR= make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml new file mode 100644 index 000000000..cc4e1d44a --- /dev/null +++ b/.github/workflows/gcov-test.yml @@ -0,0 +1,21 @@ +name: Coverage Tests + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: gcov-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + - name: Run Coverage Tests + run: sudo -E make -C scripts/ci local GCOV=1 + - name: Run gcov + run: sudo -E find . -name '*gcda' -type f -print0 | sudo -E xargs --null --max-args 128 --max-procs 4 gcov + - name: Run Coverage Analysis + run: sudo -E make codecov diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml new file mode 100644 index 000000000..cbd3c1f23 --- /dev/null +++ b/.github/workflows/java-test.yml @@ -0,0 +1,16 @@ +name: Java Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: java-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Run Java Test + run: sudo make -C scripts/ci java-test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..f7da4f6f6 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,40 @@ +name: Run code linter + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: lint-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-latest + container: + image: registry.fedoraproject.org/fedora:latest + steps: + - name: Install tools + run: sudo dnf -y install git make ruff xz clang-tools-extra codespell git-clang-format ShellCheck + + - uses: actions/checkout@v4 + + - name: Set git safe directory + # https://github.com/actions/checkout/issues/760 + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Run make lint + run: make lint + + - name: Run make indent + continue-on-error: true + run: | + if [ -z "${{github.base_ref}}" ]; then + git fetch --deepen=1 + make indent + else + git fetch origin ${{github.base_ref}} + make indent BASE=origin/${{github.base_ref}} + fi + - name: Raise in-line make indent warnings + run: | + git diff | ./scripts/github-indent-warnings.py diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml new file mode 100644 index 000000000..d7c554c87 --- /dev/null +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -0,0 +1,15 @@ +name: LoongArch64 Qemu Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/.github/workflows/manage-labels.yml b/.github/workflows/manage-labels.yml new file mode 100644 index 000000000..a2bcd8860 --- /dev/null +++ b/.github/workflows/manage-labels.yml @@ -0,0 +1,14 @@ +name: Remove labels +on: [issue_comment, pull_request_review_comment] +jobs: + remove-labels-on-comments: + name: Remove labels on comments + if: github.event_name == 'issue_comment' + runs-on: ubuntu-latest + steps: + - uses: mondeja/remove-labels-gh-action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + labels: | + changes requested + awaiting reply diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml new file mode 100644 index 000000000..7a7d8bd30 --- /dev/null +++ b/.github/workflows/nftables-test.yml @@ -0,0 +1,24 @@ +name: Nftables bases testing + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: nftables-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Remove iptables + run: sudo apt remove -y iptables + - name: Install libnftables-dev + run: sudo contrib/apt-install libnftables-dev + - name: chmod 755 /home/runner + # CRIU's tests are sometimes running as some random user and need + # to be able to access the test files. 
+ run: sudo chmod 755 /home/runner + - name: Build with nftables network locking backend + run: sudo make -C scripts/ci local COMPILE_FLAGS="NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES" diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml new file mode 100644 index 000000000..a07edbe5b --- /dev/null +++ b/.github/workflows/podman-test.yml @@ -0,0 +1,16 @@ +name: Podman Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: podman-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Run Podman Test + run: sudo make -C scripts/ci podman-test diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 000000000..76d55d4c9 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,27 @@ +name: Mark stale issues and pull requests + +# Please refer to https://github.com/actions/stale/blob/master/action.yml +# to see all config knobs of the stale action. + +on: + schedule: + - cron: "0 0 * * *" + +jobs: + stale: + + runs-on: ubuntu-latest + + steps: + - uses: actions/stale@v5 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-issue-message: 'A friendly reminder that this issue had no activity for 30 days.' + stale-pr-message: 'A friendly reminder that this PR had no activity for 30 days.' 
+ stale-issue-label: 'stale-issue' + stale-pr-label: 'stale-pr' + days-before-stale: 30 + days-before-close: 365 + remove-stale-when-updated: true + exempt-pr-labels: 'no-auto-close' + exempt-issue-labels: 'no-auto-close,new feature,enhancement' diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml new file mode 100644 index 000000000..76bd96edf --- /dev/null +++ b/.github/workflows/stream-test.yml @@ -0,0 +1,17 @@ +name: CRIU Image Streamer Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: stream-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v4 + - name: Run CRIU Image Streamer Test + run: sudo -E make -C scripts/ci local STREAM_TEST=1 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml new file mode 100644 index 000000000..1f0a469bd --- /dev/null +++ b/.github/workflows/x86-64-clang-test.yml @@ -0,0 +1,16 @@ +name: X86_64 CLANG Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: clang-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Run X86_64 CLANG Test + run: sudo make -C scripts/ci x86_64 CLANG=1 diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml new file mode 100644 index 000000000..15e84a0df --- /dev/null +++ b/.github/workflows/x86-64-gcc-test.yml @@ -0,0 +1,16 @@ +name: X86_64 GCC Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: gcc-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Run X86_64 GCC Test + run: sudo make -C scripts/ci x86_64 diff --git a/.gitignore b/.gitignore index c231104af..94daa13ea 100644 --- a/.gitignore +++ b/.gitignore @@ -20,25 +20,16 @@ compel/compel compel/compel-host-bin images/*.c images/*.h -images/google/protobuf/*.c -images/google/protobuf/*.h .gitid criu/criu -crit/crit -criu/arch/*/sys-exec-tbl*.c -# x86 syscalls-table is not generated -!criu/arch/x86/sys-exec-tbl.c -criu/arch/*/syscalls*.S -criu/include/syscall-codes*.h -criu/include/syscall*.h +criu/unittest/unittest criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h criu/protobuf-desc-gen.h lib/build/ lib/c/criu.pc -scripts/build/qemu-user-static/* -lib/.crit-setup.files compel/include/asm include/common/asm include/common/config.h +build/** diff --git a/.lgtm.yml b/.lgtm.yml new file mode 100644 index 000000000..4beadcc63 --- /dev/null +++ b/.lgtm.yml @@ -0,0 +1,25 @@ +extraction: + cpp: + prepare: + packages: + - "protobuf-c-compiler" + - "libprotobuf-c-dev" + - "libprotobuf-dev" + - "build-essential" + - "libprotobuf-dev" + - "libprotobuf-c-dev" + - "protobuf-c-compiler" + - "protobuf-compiler" + - "python3-protobuf" + - "libnet-dev" + - "pkg-config" + - "libnl-3-dev" + - "libbsd0" + - "libbsd-dev" + - "iproute2" + - "libcap-dev" + - "libaio-dev" + - "libbsd-dev" + - "python3-yaml" + - "libnl-route-3-dev" + - "gnutls-dev" diff --git a/.mailmap b/.mailmap index d8c3f594d..8076f0bc9 100644 --- a/.mailmap +++ b/.mailmap @@ -1,6 +1,10 @@ Stanislav Kinsbursky Pavel Emelyanov -Andrey Vagin -Andrey Vagin -Andrey Vagin Andrew Vagin +Andrei Vagin +Andrei Vagin +Andrei Vagin +Andrei Vagin +Andrei Vagin Cyrill Gorcunov +Alexander Mikhalitsyn +Alexander Mikhalitsyn diff --git a/.travis.yml b/.travis.yml 
deleted file mode 100644 index 82ba9fbc8..000000000 --- a/.travis.yml +++ /dev/null @@ -1,43 +0,0 @@ -language: c -sudo: required -dist: xenial -cache: ccache -services: - - docker -env: - - TR_ARCH=local - - TR_ARCH=local CLANG=1 - - TR_ARCH=local COMPAT_TEST=y - - TR_ARCH=local CLANG=1 COMPAT_TEST=y - - TR_ARCH=alpine - - TR_ARCH=fedora-asan - - TR_ARCH=x86_64 - - TR_ARCH=x86_64 CLANG=1 - - TR_ARCH=armv7hf - - TR_ARCH=aarch64 - - TR_ARCH=ppc64le - - TR_ARCH=s390x - - TR_ARCH=armv7hf CLANG=1 - - TR_ARCH=aarch64 CLANG=1 - - TR_ARCH=ppc64le CLANG=1 - - TR_ARCH=alpine CLANG=1 - - TR_ARCH=docker-test - - TR_ARCH=fedora-rawhide - - TR_ARCH=fedora-rawhide-aarch64 - - TR_ARCH=centos - - TR_ARCH=podman-test -matrix: - allow_failures: - - env: TR_ARCH=docker-test - - env: TR_ARCH=fedora-rawhide - - env: TR_ARCH=fedora-rawhide-aarch64 - - env: TR_ARCH=s390x - - env: TR_ARCH=local GCOV=1 - - env: TR_ARCH=local COMPAT_TEST=y - - env: TR_ARCH=local CLANG=1 COMPAT_TEST=y -script: - - sudo make CCACHE=1 -C scripts/travis $TR_ARCH -after_success: - - ccache -s - - make -C scripts/travis after_success -group: deprecated-2017Q2 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 000000000..e3c5a92d9 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +GEMINI.md \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..03875639d --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,417 @@ +## How to contribute to CRIU + +CRIU project is (almost) the never-ending story, because we have to always keep up with the +Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're +looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. +Here are some useful hints to get involved. 
+ +* We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; +* CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); +* Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). +Below we describe in more detail recommended practices for CRIU development. +* Spread the word about CRIU in [social networks](http://criu.org/Contacts); +* If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); + +### Setting up the development environment + +Although `criu` could be run as non-root (see [Security](https://criu.org/Security)), development is better to be done as root. For example, some tests require root. So, it would be a good idea to set up some recent Linux distro on a virtual machine. + +### Get the source code + +The CRIU sources are tracked by Git. Official CRIU repo is at https://github.com/checkpoint-restore/criu. + +The repository may contain multiple branches. Development happens in the **criu-dev** branch. 
+ +To clone CRIU repo and switch to the proper branch, run: + +``` +git clone https://github.com/checkpoint-restore/criu criu +cd criu +git checkout criu-dev +``` + +### Building from source + +Follow these steps to compile CRIU from source code. + +#### Installing build dependencies + +First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). + +##### On Ubuntu/Debian-based systems: + +``` +./contrib/dependencies/apt-packages.sh +``` + +##### On Fedora/CentOS-based systems: + +``` +./contrib/dependencies/dnf-packages.sh +``` + +##### Using Nix: + +``` +nix develop +``` + +#### Compiling CRIU + +Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: + +``` +make +``` + +This should create the `./criu/criu` executable. + +## Edit the source code + +When you change the source code, please keep in mind the following code conventions: + +* code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches +* we prefer tabs and indentations to be 8 characters width +* we prefer line length of 80 characters or less, more is allowed if it helps with code readability +* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community + +Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. + +## Automatic tools to fix coding-style + +Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. 
+ +The following command can be used to automatically run a code linter for Python files (ruff), Shell scripts (shellcheck), +text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). + +``` +make lint +``` + +In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) +based on the kernel source tree. However, compliance with the clang-format autoformat rules is optional. If the automatic code formatting +results in decreased readability, we may choose to ignore these errors. + +Run the following command to check if your changes are compliant with the clang-format rules: + +``` +make indent +``` + +This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to +specify a range of commits to check for coding style issues. By default, it is set to `HEAD~1`, so that only the last commit is checked. +If you are developing on top of the criu-dev branch and want to check all your commits for compliance with the clang-format rules, you +can use `BASE=origin/criu-dev`. The `OPTS` option can be used to pass additional options to `git-clang-format`. For example, if you want +to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. + +``` +make indent OPTS=--diff BASE=HEAD~N +``` + +Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected +we need to review the suggested changes and decide if they should be fixed before merging. 
+ +Here are some bad examples of clang-format-ing: + +* if clang-format tries to force 120 characters and breaks readability - it is wrong: + +``` +@@ -58,8 +59,7 @@ static int register_membarriers(void) + } + + if (!all_ok) { +- fail("can't register membarrier()s - tried %#x, kernel %#x", +- barriers_registered, barriers_supported); ++ fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); + return -1; + } +``` + +* if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: + +``` +--- a/test/zdtm/static/membarrier.c ++++ b/test/zdtm/static/membarrier.c +@@ -27,9 +27,10 @@ static const struct { + int register_cmd; + int execute_cmd; + } membarrier_cmds[] = { +- { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, +- { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, +- { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, ++ { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, ++ { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, ++ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, ++ { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + }; +``` + +## Test your changes + +CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run + +``` +make test +``` + +The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. + +## Describe your changes + +Describe your problem. Whether your change is a one-line bug fix or +5000 lines of a new feature, there must be an underlying problem that +motivated you to do this work. 
Convince the reviewer that there is a +problem worth fixing and that it makes sense for them to read past the +first paragraph. + +Once the problem is established, describe what you are actually doing +about it in technical detail. It's important to describe the change +in plain English for the reviewer to verify that the code is behaving +as you intend it to. + +Solve only one problem per commit. If your description starts to get +long, that's a sign that you probably need to split up your commit. +See [Separate your changes](#separate-your-changes). + +Describe your changes in imperative mood, e.g. "make xyzzy do frotz" +instead of "[This commit] makes xyzzy do frotz" or "[I] changed xyzzy +to do frotz", as if you are giving orders to the codebase to change +its behaviour. + +If your change fixes a bug in a specific commit, e.g. you found an issue using +`git bisect`, please use the `Fixes:` tag with the abbreviation of +the SHA-1 ID, and the one line summary. For example: + +``` +Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") +``` + +The following `git config` settings can be used to add a pretty format for +outputting the above style in the `git log` or `git show` commands: + +``` +[pretty] + fixes = Fixes: %h (\"%s\") +``` + +If your change addresses an issue listed on GitHub, please use the `Fixes:` tag with the number of the issue. For instance: + +``` +Fixes: #339 +``` + +The `Fixes:` tags should be put at the end of the detailed description. + +Please add a prefix to your commit subject line describing the part of the +project your change is related to. This can be either the name of the file or +directory you changed, or just a general word. If your patch is touching +multiple components you may separate prefixes with "/"-es. 
Here are some good +examples of subject lines from git log: + +``` +criu-ns: Convert to python3 style print() syntax +compel: Calculate sh_addr if not provided by linker +style: Enforce kernel style -Wstrict-prototypes +rpc/libcriu: Add lsm-profile option +``` + +You may refer to [How to Write a Git Commit +Message](https://chris.beams.io/posts/git-commit/) article for +recommendations for good commit message. + +## Separate your changes + +Separate each **logical change** into a separate commit. + +For example, if your changes include both bug fixes and performance +enhancements for a single driver, separate those changes into two +or more commits. If your changes include an API update, and a new +driver which uses that new API, separate those into two commits. + +On the other hand, if you make a single change to numerous files, +group those changes into a single commit. Thus a single logical change +is contained within a single commit. + +The point to remember is that each commit should make an easily understood +change that can be verified by reviewers. Each commit should be justifiable +on its own merits. + +When dividing your change into a series of commits, take special care to +ensure that CRIU builds and runs properly after each commit in the +series. Developers using `git bisect` to track down a problem can end up +splitting your patch series at any point; they will not thank you if you +introduce bugs in the middle. + +## Sign your work + +To improve tracking of who did what, we ask you to sign off the commits in +your fork of CRIU or the patches that are to be emailed. + +The sign-off is a simple line at the end of the explanation for the +patch, which certifies that you wrote it or otherwise have the right to +pass it on as an open-source patch. 
The rules are pretty simple: if you +can certify the below: + +### Developer's Certificate of Origin 1.1 + By making a contribution to this project, I certify that: + + (a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + + (b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + + (c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + + (d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. + +then you just add a line saying + +``` +Signed-off-by: Random J Developer +``` + +using your real name (please, no pseudonyms or anonymous contributions if +it is possible). + +Hint: you can use `git commit -s` to add Signed-off-by line to your +commit message. To append such line to a commit you already made, use +`git commit --amend -s`. + +``` + From: Random J Developer +Subject: [PATCH] component: Short patch description + +Long patch description (could be skipped if patch +is trivial enough) + +Signed-off-by: Random J Developer +--- +Patch body here +``` + +## Submit your work upstream + +We accept GitHub pull requests and this is the preferred way to contribute to CRIU. 
+For that you should push your work to your fork of CRIU at [GitHub](https://github.com) and create a [pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) + +### Pull request guidelines + +Pull request comment should contain description of the problem your changes +solve and a brief outline of the changes included in the pull request. + +Please avoid pushing fixup commits to an existing pull request. Each commit +should be self contained and there should not be fixup commits in a patch +series. Pull requests that contain one commit which breaks something +and another commit which fixes it, will be rejected. + +Please merge the fixup commits into the commits that have introduced the +problem before creating a pull request. + +It may happen that the reviewers were not completely happy with your +changes and requested changes to your patches. After you updated your +changes please close the old pull request and create a new one that +contains the following: + +* Description of the problem your changes solve and a brief outline of the + changes +* Link to the previous version of the pull request +* Brief description of the changes between old and new versions of the pull + request. If there were more than one previous pull request, all the + revisions should be listed. For example: + +``` +v3: rebase on the current criu-dev +v2: add commit to foo() and update bar() coding style +``` + +If there are only minor updates to the commits in a pull request, it is +possible to force-push them into an existing pull request. This only applies +to small changes and should be used with care. If you update an existing +pull request, remember to add the description of the changes from the +previous version. + +### Mailing list submission + +Historically, CRIU worked with mailing lists and patches so if you still prefer this way continue reading till the end of this section. 
+ +### Make a patch + +To create a patch, run + +``` +git format-patch --signoff origin/criu-dev +``` + +You might need to read GIT documentation on how to prepare patches +for mail submission. Take a look at http://book.git-scm.com/ and/or +http://git-scm.com/documentation for details. It should not be hard +at all. + +We recommend to post patches using `git send-email` + +``` +git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@lists.linux.dev criu-dev +``` + +Note that the `git send-email` subcommand may not be in +the main git package and using it may require installation of a +separate package, for example the "git-email" package in Fedora and +Debian. + +If this is your first time using git send-email, you might need to +configure it to point it to your SMTP server with something like: + +``` +git config --global sendemail.smtpServer smtp.example.net +``` + +If you get tired of typing `--to=criu@lists.linux.dev` all the time, +you can configure that to be automatically handled as well: + +``` +git config sendemail.to criu@lists.linux.dev +``` + +If a developer is sending another version of the patch (e.g. to address +review comments), they are advised to note differences to previous versions +after the `---` line in the patch so that it helps reviewers but +doesn't become part of git history. Moreover, such patch needs to be prefixed +correctly with `--subject-prefix=PATCHv2` appended to +`git send-email` (substitute `v2` with the correct +version if needed though). + +### Mail patches + +The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. + +Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). 
+ +> **Note:** When sending a patch set that consists of more than one patch, please, push your changes in your local repo and provide the URL of the branch in the cover-letter + +### Wait for response + +Be patient. Most CRIU developers are pretty busy people so if +there is no immediate response on your patch — don't be surprised, +sometimes a patch may fly around a week before it gets reviewed. + +## Continuous integration + +Wiki article: [Continuous integration](https://criu.org/Continuous_integration) + +CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. diff --git a/Documentation/HOWTO.cross-compile b/Documentation/HOWTO.cross-compile index f1b17842b..44b19dfea 100644 --- a/Documentation/HOWTO.cross-compile +++ b/Documentation/HOWTO.cross-compile @@ -1,4 +1,10 @@ -This HOWTO explains how to cross-compile CRIU on x86 +How to cross-compile CRIU on x86: + +Use the Dockerfile provided: + scripts/build/Dockerfile.armv7-cross + +Historical guide how-to do it without docker container: +[Unsupported, may not work anymore!] 1. Download the protobuf sources. 2. 
Apply the patch http://16918.selcdn.ru/crtools/aarch64/0001-protobuf-added-the-support-for-the-acrchitecture-AAr.patch diff --git a/Documentation/Makefile b/Documentation/Makefile index cbc7ff2c8..de0cc448d 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,7 +12,9 @@ endif FOOTER := footer.txt SRC1 += crit.txt +SRC1 += criu-ns.txt SRC1 += compel.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) @@ -54,7 +56,7 @@ ifneq ($(USE_ASCIIDOCTOR),) $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< else $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.1,%.xml,$@) $< - $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.1,%.xml,$@) 2>/dev/null + $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.1,%.xml,$@) endif %.8: %.txt $(FOOTER) custom.xsl @@ -63,7 +65,7 @@ ifneq ($(USE_ASCIIDOCTOR),) $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< else $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< - $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.8,%.xml,$@) 2>/dev/null + $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.8,%.xml,$@) endif %.ps: %.1 diff --git a/Documentation/compel.txt b/Documentation/compel.txt index 744a3b35d..506228f59 100644 --- a/Documentation/compel.txt +++ b/Documentation/compel.txt @@ -86,18 +86,21 @@ Infecting code ~~~~~~~~~~~~~~ The parasitic code is compiled and converted to a header using *compel*, and included here. 
-*#include * +*#include * *#include "parasite.h"* -Following steps are perfomed to infect the victim process: +Following steps are performed to infect the victim process: - stop the task: *int compel_stop_task(int pid);* - prepare infection handler: *struct parasite_ctl *compel_prepare(int pid);* - execute system call: *int compel_syscall(ctl, int syscall_nr, long *ret, int arg ...);* - infect victim: *int compel_infect(ctl, nr_thread, size_of_args_area);* - cure the victim: *int compel_cure(ctl);* //ctl pointer is freed by this call - - Resume victim: *int compel_resume_task(pid, orig_state, state);* + - Resume victim: *int compel_resume_task(pid, orig_state, state)* or + *int compel_resume_task_sig(pid, orig_state, state, stop_signo).* + //compel_resume_task_sig() could be used in case when victim is in stopped state. + stop_signo could be read by calling compel_parse_stop_signo(). *ctl* must be configured with blob information by calling *PREFIX_setup_c_header()*, with ctl as its argument. *PREFIX* is the argument given to *-p* when calling hgen, else it is deduced from file name. diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt new file mode 100644 index 000000000..fe76fc3bc --- /dev/null +++ b/Documentation/criu-amdgpu-plugin.txt @@ -0,0 +1,114 @@ +ROCM Support(1) +=============== + +NAME +---- +criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in +userspace for AMD GPUs. + + +CURRENT SUPPORT +--------------- +Single and Multi GPU systems (Gfx9) +Checkpoint / Restore on different system +Checkpoint / Restore inside a docker container +Pytorch +Tensorflow +Using CRIU Image Streamer +Parallel Restore + +DESCRIPTION +----------- +Though *criu* is a great tool for checkpointing and restoring running +applications, it has certain limitations such as it cannot handle +applications that have device files open. 
In order to support *ROCm* based +workloads with *criu* we need to augment criu's core functionality with a +plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support +to criu to allow Checkpoint / Restore with ROCm. + + +Dependencies +------------ +*amdkfd support*:: + In order to snapshot the *VRAM* and other *GPU* device states, we require + an updated version of amdkfd(amdgpu) driver. + +OPTIONS +------- +Optional parameters can be passed in as environment variables before +executing criu command. + +*KFD_FW_VER_CHECK*:: + Enable or disable firmware version check. + If enabled, firmware version on restored gpu needs to be greater than or + equal firmware version on checkpointed GPU. Default:Enabled + + E.g: + KFD_FW_VER_CHECK=0 + +*KFD_SDMA_FW_VER_CHECK*:: + Enable or disable SDMA firmware version check. + If enabled, SDMA firmware version on restored gpu needs to be greater than or + equal firmware version on checkpointed GPU. Default:Enabled + + E.g: + KFD_SDMA_FW_VER_CHECK=0 + +*KFD_CACHES_COUNT_CHECK*:: + Enable or disable caches count check. If enabled, the caches count on + restored GPU needs to be greater than or equal caches count on checkpointed + GPU. Default:Enabled + + E.g: + KFD_CACHES_COUNT_CHECK=0 + +*KFD_NUM_GWS_CHECK*:: + Enable or disable num_gws check. If enabled, the num_gws on + restored GPU needs to be greater than or equal num_gws on checkpointed + GPU. Default:Enabled + + E.g: + KFD_NUM_GWS_CHECK=0 + +*KFD_VRAM_SIZE_CHECK*:: + Enable or disable VRAM size check. If enabled, the VRAM size on + restored GPU needs to be greater than or equal VRAM size on checkpointed + GPU. Default:Enabled + + E.g: + KFD_VRAM_SIZE_CHECK=0 + +*KFD_NUMA_CHECK*:: + Enable or disable NUMA CPU region check. If enabled, the plugin will restore + GPUs that belong to one CPU NUMA region to the same CPU NUMA region. + Default:Enabled + + E.g: + KFD_NUMA_CHECK=1 + +*KFD_CAPABILITY_CHECK*:: + Enable or disable capability check. 
If enabled, the capability on + restored GPU needs to be equal to the capability on the checkpointed GPU. + Default:Enabled + + E.g: + KFD_CAPABILITY_CHECK=1 + +*KFD_MAX_BUFFER_SIZE*:: + On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping + and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) + to set a limit on the plugin's memory usage. + Default:0 (Disabled) + + E.g: + KFD_MAX_BUFFER_SIZE="2G" + + +AUTHOR +------ +The AMDKFD team. + + +COPYRIGHT +--------- +Copyright \(C) 2020-2021, Advanced Micro Devices, Inc. (AMD) diff --git a/Documentation/criu-ns.txt b/Documentation/criu-ns.txt new file mode 100644 index 000000000..c6594a9bc --- /dev/null +++ b/Documentation/criu-ns.txt @@ -0,0 +1,32 @@ +CRIU-NS(1) +========== +include::footer.txt[] + +NAME +---- +criu-ns - run criu in different namespaces + +SYNOPSIS +-------- +*criu-ns* 'dump' -t PID [] + +*criu-ns* 'pre-dump' -t PID [] + +*criu-ns* 'restore' [] + +*criu-ns* 'check' [] + +DESCRIPTION +----------- +The *criu-ns* command executes 'criu' in a new PID and mount namespace. +The purpose of this wrapper script is to enable restoring a process tree +that might require a specific PID that is already used on the system; +so called "PID mismatch" problem. + +SEE ALSO +-------- +nsenter(1) namespaces(7) criu(8) + +AUTHOR +------ +The CRIU team diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 94fc5428a..0c9a9e527 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -24,8 +24,8 @@ on a different system, or both. OPTIONS ------- -Most of the true / false long options (the ones without arguments) can be -prefixed with *--no-* to negate the option (example: *--display-stats* +Most of the long flags can be +prefixed with *no-* to negate the option (example: *--display-stats* and *--no-display-stats*). Common options @@ -33,12 +33,11 @@ Common options Common options are applicable to any 'command'. 
*-v*[*v*...], *--verbosity*:: - Increase verbosity up from the default level. Multiple *v* can be used, - each increasing verbosity by one level. Using long option without argument - increases verbosity by one level. + Increase verbosity up from the default level. In case of short option, + multiple *v* can be used, each increasing verbosity by one. -*-v*'num', *--verbosity*='num':: - Set verbosity level to 'num'. The higher the level, the more output +**-v**__num__, **--verbosity=**__num__:: + Set verbosity level to _num_. The higher the level, the more output is produced. + The following levels are available: @@ -57,26 +56,31 @@ The following levels are available: Pass a specific configuration file to criu. *--no-default-config*:: - Forbid parsing of default configuration files. + Disable parsing of default configuration files. *--pidfile* 'file':: Write root task, service or page-server pid into a 'file'. *-o*, *--log-file* 'file':: - Write logging messages to 'file'. + Write logging messages to a 'file'. *--display-stats*:: - During dump as well as during restore *criu* collects information - like the time required to dump or restore the process or the + During dump, as well as during restore, *criu* collects some statistics, + like the time required to dump or restore the process, or the number of pages dumped or restored. This information is always - written to the files 'stats-dump' and 'stats-restore' and can - be easily displayed using *crit*. The option *--display-stats* - additionally prints out this information on the console at the end - of a dump or a restore. + saved to the *stats-dump* and *stats-restore* files, and can + be shown using *crit*(1). The option *--display-stats* + prints out this information on the console at the end + of a dump or restore operation. *-D*, *--images-dir* 'path':: Use 'path' as a base directory where to look for sets of image files. +*--stream*:: + dump/restore images using criu-image-streamer. 
+ See https://github.com/checkpoint-restore/criu-image-streamer for detailed + usage. + *--prev-images-dir* 'path':: Use 'path' as a parent directory where to look for sets of image files. This option makes sense in case of incremental dumps. @@ -91,6 +95,19 @@ The following levels are available: *-L*, *--libdir* 'path':: Path to plugins directory. +*--enable-fs* ['fs'[,'fs'...]]:: + Specify a comma-separated list of filesystem names that should + be auto-detected. The value 'all' enables auto-detection for + all filesystems. ++ +Note: This option is not safe, use at your own risk. +Auto-detecting a filesystem mount assumes that the mountpoint can +be restored with *mount(src, mountpoint, flags, options)*. When used, +*dump* is expected to always succeed if a mountpoint is to be +auto-detected, however *restore* may fail (or do something wrong) +if the assumption for restore logic is incorrect. This option is +not compatible with *--external* *dev*. + *--action-script* 'script':: Add an external action script to be executed at certain stages. The environment variable *CRTOOLS_SCRIPT_ACTION* is available @@ -138,6 +155,17 @@ The following levels are available: notification message contains a file descriptor for the master pty + *query-ext-files*::: + called after the process tree is stopped and network is locked. + This hook is used only in the RPC mode. The notification reply + contains file ids to be added to external file list (may be empty). + +*--unprivileged*:: + This option tells *criu* to accept the limitations when running + as non-root. Running as non-root requires *criu* at least to have + *CAP_SYS_ADMIN* or *CAP_CHECKPOINT_RESTORE*. For details about running + *criu* as non-root please consult the *NON-ROOT* section. + *-V*, *--version*:: Print program version and exit. @@ -156,6 +184,12 @@ In addition, *page-server* options may be specified. Turn on memory changes tracker in the kernel. 
If the option is not passed the memory tracker get turned on implicitly. +*--pre-dump-mode*='mode':: + The pre-dump algorithm can operate in one of two modes. The 'splice' mode + is parasite based, whereas 'read' mode is based on process_vm_readv + syscall. The 'read' mode incurs reduced frozen time and reduced + memory pressure as compared to 'splice' mode. Default is 'splice' mode. + *dump* ~~~~~~ Performs a checkpoint procedure. @@ -179,7 +213,7 @@ In other words, do not use it unless really needed. *-s*, *--leave-stopped*:: Leave tasks in stopped state after checkpoint, instead of killing. -*--external* 'type'*[*'id'*]:*'value':: +*--external* __type__**[**__id__**]:**__value__:: Dump an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). @@ -188,35 +222,48 @@ In other words, do not use it unless really needed. Note to restore external resources, either *--external* or *--inherit-fd* is used, depending on resource type. -*--external mnt[*'mountpoint'*]:*'name':: +*--external* **mnt[**__mountpoint__**]:**__name__:: Dump an external bind mount referenced by 'mountpoint', saving it to image under the identifier 'name'. -*--external mnt[]:*'flags':: +*--external* **mnt[]:**__flags__:: Dump all external bind mounts, autodetecting those. Optional 'flags' can contain *m* to also dump external master mounts, *s* to also dump external shared mounts (default behavior is to abort dumping if such mounts are found). If 'flags' are not provided, colon is optional. -*--external dev[*'major'*/*'minor'*]:*'name':: +*--external* **dev[**__major__**/**__minor__**]:**__name__:: Allow to dump a mount namespace having a real block device mounted. A block device is identified by its 'major' and 'minor' numbers, and *criu* saves its information to image under the identifier 'name'. 
-*--external file[*'mnt_id'*:*'inode'*]*:: +*--external* **file[**__mnt_id__**:**__inode__**]**:: Dump an external file, i.e. an opened file that is can not be resolved from the current mount namespace, which can not be dumped without using this option. The file is identified by 'mnt_id' (a field obtained from - */proc/*'pid'*/fdinfo/*'N') and 'inode' (as returned by *stat*(2)). + **/proc/**__pid__**/fdinfo/**__N__) and 'inode' (as returned by + *stat*(2)). -*--external tty[*'rdev'*:*'dev'*]*:: +*--external* **tty[**__rdev__**:**__dev__**]**:: Dump an external TTY, identified by *st_rdev* and *st_dev* fields returned by *stat*(2). -*--external unix[*'id'*]*:: +*--external* **unix[**__id__**]**:: Tell *criu* that one end of a pair of UNIX sockets (created by - *socketpair*(2)) with 'id' is OK to be disconnected. + *socketpair*(2)) with the given _id_ is OK to be disconnected. + +*--external* **net[**__inode__**]:**__name__:: + Mark a network namespace as external and do not include it in the + checkpoint. The label 'name' can be used with *--inherit-fd* during + restore to specify a file descriptor to a preconfigured network + namespace. + +*--external* **pid[**__inode__**]:**__name__:: + Mark a PID namespace as external. This can be later used to restore + a process into an existing PID namespace. The label 'name' can be + used to assign another PID namespace during restore with the help + of *--inherit-fd*. *--freeze-cgroup*:: Use cgroup freezer to collect processes. @@ -266,14 +313,42 @@ For example, the command line for the above example should look like this: discovered automatically (usually via */proc*). This option is useful when one needs *criu* to skip some controllers. -*--cgroup-props-ignore-default*:: - When combined with *--cgroup-props*, makes *criu* substitute - a predefined controller property with the new one shipped. If the option - is not used, the predefined properties are merged with the provided ones. 
+*--cgroup-yard* 'path':: + Instead of trying to mount cgroups in CRIU, provide a path to a directory + with already created cgroup yard. Useful if you don't want to grant + CAP_SYS_ADMIN to CRIU. For every cgroup mount there should be exactly one + directory. If there is only one controller in this mount, the dir's name + should be just the name of the controller. If there are multiple controllers + comounted, the directory name should list them separated by commas. ++ +For example, if */proc/cgroups* looks like this: ++ +---------- +#subsys_name hierarchy num_cgroups enabled +cpu 1 1 1 +devices 2 2 1 +freezer 2 2 1 +---------- ++ +then you can create the cgroup yard by the following commands: ++ +---------- +mkdir private_yard +cd private_yard +mkdir cpu +mount -t cgroup -o cpu none cpu +mkdir devices,freezer +mount -t cgroup -o devices,freezer none devices,freezer +---------- *--tcp-established*:: Checkpoint established TCP connections. +*--tcp-close*:: + Don't dump the state of, or block, established tcp connections + (including connections that were once established but are now closed). + This is useful when tcp connections are not going to be restored. + *--skip-in-flight*:: This option skips in-flight TCP connections. If any TCP connections that are not yet completely established are found, *criu* ignores @@ -303,6 +378,10 @@ For example, the command line for the above example should look like this: Allows to link unlinked files back, if possible (modifies filesystem during *restore*). +*--timeout* 'number':: + Set a time limit in seconds for collecting tasks during the + dump operation. The timeout is 10 seconds by default. + *--ghost-limit* 'size':: Set the maximum size of deleted file to be carried inside image. By default, up to 1M file is allowed. Using this @@ -310,6 +389,13 @@ For example, the command line for the above example should look like this: 'size' may be postfixed with a *K*, *M* or *G*, which stands for kilo-, mega, and gigabytes, accordingly. 
+*--ghost-fiemap*:: + Enable an optimization based on fiemap ioctl that can reduce the + number of system calls used when checkpointing highly sparse ghost + files. This option is enabled by default, and it can be disabled + with *--no-ghost-fiemap*. An automatic fallback to SEEK_HOLE/SEEK_DATA + is used when fiemap is not supported. + *-j*, *--shell-job*:: Allow one to dump shell jobs. This implies the restored task will inherit session and process group ID from the *criu* itself. @@ -347,22 +433,78 @@ By default the option is set to *fpu* and *ins*. option is intended for post-copy (lazy) migration and should be used in conjunction with *restore* with appropriate options. +*--file-validation* ['mode']:: + Set the method to be used to validate open files. Validation is done + to ensure that the version of the file being restored is the same + version when it was dumped. ++ +The 'mode' may be one of the following: + + *filesize*::: + To explicitly use only the file size check all the time. + This is the fastest and least intensive check. + + *buildid*::: + To validate ELF files with their build-ID. If the + build-ID cannot be obtained, 'chksm-first' method will be + used. This is the default if mode is unspecified. + +*--network-lock* ['mode']:: + Set the method to be used for network locking/unlocking. Locking is done + to ensure that tcp packets are dropped between dump and restore. This is + done to avoid the kernel sending RST when a packet arrives destined for + the dumped process. ++ +The 'mode' may be one of the following: + + *iptables*::: Use iptables rules to drop the packets. + This is the default if 'mode' is not specified. + + *nftables*::: Use nftables rules to drop the packets. + + *skip*::: Don't lock the network. If *--tcp-close* is not used, the network + must be locked externally to allow CRIU to dump TCP connections. + +*--allow-uprobes*:: + Allow dumping when uprobes vma is present. 
When used on dump, this option is + required on restore as well. + + A uprobes vma is automatically created by the kernel once a uprobe is + triggered. This mapping is not removed even once the uprobe is deleted. So, + even if a process once had uprobes attached to it, and they're removed by + the time the process is dumped, this option is still required because criu + has no way of knowing whether there are active uprobes or not. + + When using this option on restore, make sure the uprobes (if any) active on + the dumped processes are still active. Otherwise, when execution reaches + a uprobe'd location in any of the restored processes, that process will be + sent a SIGTRAP. + + As an example, say a uprobe is set at function foo in the executable of the + process p_bar. Whenever execution in p_bar reaches function foo, the uprobe + is triggered. If the uprobe has been triggered at least once, then the kernel + will have created the uprobes vma. To dump p_bar, this option is + necessary. After dumping, say the uprobe is deleted. Now, on restoring with + this option, once execution reaches function foo, SIGTRAP will be sent to + the restored p_bar. Unless it has a signal handler installed for SIGTRAP, + it will be terminated and core dumped. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. -*--inherit-fd* *fd[*'N'*]:*'resource':: +*--inherit-fd* **fd[**__N__**]:**__resource__:: Inherit a file descriptor. This option lets *criu* use an already opened file descriptor 'N' for restoring a file identified by 'resource'. This option can be used to restore an external resource dumped - with the help of *--external* *file*, *tty*, and *unix* options. + with the help of *--external* *file*, *tty*, *pid* and *unix* options. 
+ The 'resource' argument can be one of the following: + - - *tty[*'rdev'*:*'dev'*]* - - *pipe[*'inode'*]* - - *socket[*'inode'*]* - - *file[*'mnt_id'*:*'inode'*]* + - **tty[**__rdev__**:**__dev__**]** + - **pipe:[**__inode__**]** + - **socket:[**__inode__**]** + - **file[**__mnt_id__**:**__inode__**]** - 'path/to/file' + @@ -385,8 +527,10 @@ usually need to be escaped from shell. *-r*, *--root* 'path':: Change the root filesystem to 'path' (when run in a mount namespace). + This option is required to restore a mount namespace. The directory + 'path' must be a mount point and its parent must not be overmounted. -*--external* 'type'*[*'id'*]:*'value':: +*--external* __type__**[**__id__**]:**__value__:: Restore an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). @@ -396,7 +540,7 @@ usually need to be escaped from shell. the help of *--external* *file*, *tty*, and *unix* options), option *--inherit-fd* should be used. -*--external mnt[*'name'*]:*'mountpoint':: +*--external* **mnt[**__name__**]:**__mountpoint__:: Restore an external bind mount referenced in the image by 'name', bind-mounting it from the host 'mountpoint' to a proper mount point. @@ -404,26 +548,36 @@ usually need to be escaped from shell. Restore all external bind mounts (dumped with the help of *--external mnt[]* auto-detection). -*--external dev[*'name'*]:*'/dev/path':: +*--external* **dev[**__name__**]:**__/dev/path__:: Restore an external mount device, identified in the image by 'name', using the existing block device '/dev/path'. -*--external veth[*'inner_dev'*]:*'outer_dev'*@*'bridge':: +*--external* **veth[**__inner_dev__**]:**__outer_dev__**@**__bridge__:: Set the outer VETH device name (corresponding to 'inner_dev' being - restored) to 'outer_dev'. If optional *@*'bridge' is specified, + restored) to 'outer_dev'. 
If optional **@**_bridge_ is specified, 'outer_dev' is added to that bridge. If the option is not used, 'outer_dev' will be autogenerated by the kernel. -*--external macvlan[*'inner_dev'*]:*'outer_dev':: +*--external* **macvlan[**__inner_dev__**]:**__outer_dev__:: When restoring an image that have a MacVLAN device in it, this option must be used to specify to which 'outer_dev' (an existing network device in CRIU namespace) the restored 'inner_dev' should be bound to. +*-J*, *--join-ns* **NS**:{**PID**|**NS_FILE**}[,**EXTRA_OPTS**]:: + Restore process tree inside an existing namespace. The namespace can + be specified in 'PID' or 'NS_FILE' path format (example: + *--join-ns net:12345* or *--join-ns net:/foo/bar*). Currently supported + values for **NS** are: *ipc*, *net*, *time*, *user*, and *uts*. + This option doesn't support joining a PID namespace, however, this is + possible using *--external* and *--inherit-fd*. 'EXTRA_OPTS' is optional + and can be used to specify UID and GID for user namespace (e.g., + *--join-ns user:PID,UID,GID*). + *--manage-cgroups* ['mode']:: Restore cgroups configuration associated with a task from the image. Controllers are always restored in an optimistic way -- if already present in system, *criu* reuses it, otherwise it will be created. - ++ The 'mode' may be one of the following: *none*::: Do not restore cgroup properties but require cgroup to @@ -433,7 +587,7 @@ The 'mode' may be one of the following: *soft*::: Restore cgroup properties if only cgroup has been created by *criu*, otherwise do not restore properties. This is the - default if mode is unspecified. + default if mode is unspecified. *full*::: Always restore all cgroups and their properties. @@ -442,6 +596,11 @@ The 'mode' may be one of the following: *ignore*::: Don't deal with cgroups and pretend that they don't exist. +*--cgroup-yard* 'path':: + Instead of trying to mount cgroups in CRIU, provide a path to a directory + with already created cgroup yard. 
For more information look in the *dump* + section. + *--cgroup-root* ['controller'*:*]/'newroot':: Change the root cgroup the controller will be installed into. No controller means that root is the default for all controllers not specified. @@ -454,16 +613,38 @@ The 'mode' may be one of the following: *--tcp-close*:: Restore connected TCP sockets in closed state. -*--veth-pair* 'IN'*=*'OUT':: +*--veth-pair* __IN__**=**__OUT__:: Correspondence between outside and inside names of veth devices. *-l*, *--file-locks*:: Restore file locks from the image. -*--lsm-profile* 'type'*:*'name':: - Specify an LSM profile to be used during restore. The `type` can be +*--lsm-profile* __type__**:**__name__:: + Specify an LSM profile to be used during restore. The _type_ can be either *apparmor* or *selinux*. +*--lsm-mount-context* 'context':: + Specify a new mount context to be used during restore. ++ +This option will only replace existing mount context information +with the one specified with this option. Mounts without the +'context=' option will not be changed. ++ +If a mountpoint has been checkpointed with an option like + + context="system_u:object_r:container_file_t:s0:c82,c137" ++ +it is possible to change this option using + + --lsm-mount-context "system_u:object_r:container_file_t:s0:c204,c495" ++ +which will result that the mountpoint will be restored +with the new 'context='. ++ +This option is useful if using *selinux* and if the *selinux* +labels need to be changed on restore like if a container is +restored into an existing Pod. + *--auto-dedup*:: As soon as a page is restored it get punched out from image. @@ -516,6 +697,29 @@ are not adequate, but this can be suppressed by using *--cpu-cap=none*. restored process. This option requires running *lazy-pages* daemon. +*--file-validation* ['mode']:: + Set the method to be used to validate open files. Validation is done + to ensure that the version of the file being restored is the same + version when it was dumped. 
++ +The 'mode' may be one of the following: + + *filesize*::: + To explicitly use only the file size check all the time. + This is the fastest and least intensive check. + + *buildid*::: + To validate ELF files with their build-ID. If the + build-ID cannot be obtained, 'chksm-first' method will be + used. This is the default if mode is unspecified. + +*--skip-file-rwx-check*:: + Skip checking file permissions (r/w/x for u/g/o) on restore. + +*--allow-uprobes*:: + Required when dumped with this option. Refer to this option in the section + on dumping for more details. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to @@ -526,17 +730,17 @@ check* always checks Category 1 features unless *--feature* is specified which only checks a specified feature. *Category 1*::: Absolutely required. These are features like support for - */proc/PID/map_files*, *NETLINK_SOCK_DIAG* socket - monitoring, */proc/sys/kernel/ns_last_pid* etc. + */proc/PID/map_files*, *NETLINK_SOCK_DIAG* socket + monitoring, */proc/sys/kernel/ns_last_pid* etc. *Category 2*::: Required only for specific cases. These are features - like AIO remap, */dev/net/tun* and others that are only - required if a process being dumped or restored - is using those. + like AIO remap, */dev/net/tun* and others that are only + required if a process being dumped or restored + is using those. *Category 3*::: Experimental. These are features like *task-diag* that - are used for experimental purposes (mostly - during development). + are used for experimental purposes (mostly + during development). If there are no errors or warnings, *criu* prints "Looks good." and its exit code is 0. @@ -722,6 +926,42 @@ configuration file will overwrite all other configuration file settings or RPC options. 
*This can lead to undesired behavior of criu and should only be used carefully.* +NON-ROOT +-------- +*criu* can be used as non-root with either the *CAP_SYS_ADMIN* capability +or with the *CAP_CHECKPOINT_RESTORE* capability introduced in Linux kernel 5.9. +*CAP_CHECKPOINT_RESTORE* is the minimum that is required. + +*criu* also needs either *CAP_SYS_PTRACE* or a value of 0 in +*/proc/sys/kernel/yama/ptrace_scope* (see *ptrace*(2)) to be able to interrupt +the process for dumping. + +Running *criu* as non-root has many limitations and depending on the process +to checkpoint and restore it may not be possible. + +In addition to *CAP_CHECKPOINT_RESTORE* it is possible to give *criu* additional +capabilities to enable additional features in non-root mode. + +Currently *criu* can benefit from the following additional capabilities: + + - *CAP_NET_ADMIN* + - *CAP_SYS_CHROOT* + - *CAP_SETUID* + - *CAP_SYS_RESOURCE* + +Note that for some operations, having a capability in a namespace other than +the init namespace (i.e. the default/root namespace) is not sufficient. For +example, in order to read symlinks in /proc/[pid]/map_files CRIU requires +CAP_CHECKPOINT_RESTORE in the init namespace; having CAP_CHECKPOINT_RESTORE +while running in another user namespace (e.g. in a container) does not allow +CRIU to read symlinks in /proc/[pid]/map_files. + +Without access to /proc/[pid]/map_files checkpointing/restoring processes +that have mapped deleted files may not be possible. + +Independent of the capabilities it is always necessary to use "*--unprivileged*" to +accept *criu*'s limitation in non-root mode. 
+ EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into diff --git a/Documentation/logo.svg b/Documentation/logo.svg new file mode 100644 index 000000000..f713e72b7 --- /dev/null +++ b/Documentation/logo.svg @@ -0,0 +1,136 @@ + + + + + + + diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 000000000..e56c1de12 --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,136 @@ +# CRIU (Checkpoint/Restore In User-space) + +CRIU is a tool for saving the state of a running application to a set of files +(checkpointing) and restoring it back to a live state. It is primarily used for +live migration of containers, in-place updates, and fast application startup. + +It is implemented as a command-line tool called `criu`. The two primary commands +are `dump` and `restore`. + +- `dump`: Saves a process tree and all its related resources (file + descriptors, IPC, sockets, namespaces, etc.) into a collection of image + files. +- `restore`: Restores processes from image files to the same state they were + in before the dump. + +## Quick Start + +To get a feel for `criu`, you can try checkpointing and restoring a simple +process. + +1. **Run a simple process:** + Open a terminal and run a command that will run for a while. Find its PID. + ```bash + sleep 1000 & + [1] 12345 + ``` + +2. **Dump the process:** + As root, use `criu dump` with the process ID (`-t`) and a directory for the + image files (`-D`). + ```bash + sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will no longer be running. + +3. **Restore the process:** + Use `criu restore` to bring the process back to life from the images. + ```bash + sudo criu restore -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will be running again as if nothing happened. + +# For Developers and Contributors + +This section contains more technical details about CRIU's internals and +development process. 
+ +## Dump Process + +On dump, CRIU uses available kernel interfaces to collect information about +processes. For properties that can only be retrieved from within the process +itself, CRIU injects a binary blob (called a "parasite") into the process's +address space and executes it in the context of one of the process's threads. +This injection is handled by a subproject called **Compel**. + +## Restore Process + +On restore, CRIU reads the image files to reconstruct the processes. The goal is +to restore them to the exact state they were in before the dump. The restore +process is divided into several stages (defined as `CR_STATE_*` in +`./criu/include/restorer.h`). + +The main `criu` process acts as a coordinator. It first restores resources with +inter-process dependencies (file descriptors, sockets, shared memory, +namespaces, etc.). It then forks the process tree and sets up namespaces. +Finally, it restores process-specific resources like file descriptors and memory +mappings. + +A key step involves a small, self-contained binary called the "restorer". All +restored processes switch to executing this code, which unmaps the CRIU-specific +memory and restores the application's original memory mappings. On the final +step, the restorer calls `sigreturn` on a prepared signal frame to resume the +process with the state it had at the moment of the dump. + +## Compel + +Compel is a subproject responsible for generating the binary blobs used for the +parasite code (for dumping) and the restorer code (for restoring). It provides a +library for injecting and executing this code within the target process's +address space. It is a separate project because the logic for generating and +injecting Position-Independent Executable (PIE) code is complex and +self-contained. + +## Coding Style + +The C code in the CRIU project follows the +[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). 
+Here are some of the main points: + +- **Indentation**: Use tabs, which are set to 8 characters. +- **Line Length**: The preferred line limit is 80 characters, but it can be + extended to 120 if it improves code readability. +- **Braces**: + - The opening brace for a function goes on a new line. + - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes + on the same line. +- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, + `=`, etc.). +- **Naming**: Use descriptive names for functions and variables. +- **Comments**: Use C-style comments (`/* ... */`). For multi-line comments, + the preferred format is: + ```c + /* + * This is a multi-line + * comment. + */ + ``` + +## Code Layout + +The code is organized into the following directories: + +- `./compel`: The Compel sub-project. +- `./criu`: The main `criu` tool source code. +- `./images`: Protobuf descriptions for the image files. +- `./test`: All tests. +- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. +- `./test/zdtm.py`: The executor script for ZDTM tests. +- `./scripts`: Helper scripts. +- `./scripts/build`: Docker image files used for CI and cross-compilation + checks. +- `./crit`: A tool to inspect and manipulate CRIU image files. +- `./soccr`: A library for TCP socket checkpoint/restore. + +## Tests + +The main test suite is ZDTM. Here is an example of how to run a single test: + +```bash +sudo ./test/zdtm.py run -t zdtm/static/env00 +``` + +Each ZDTM test has three stages: preparation, C/R, and results checks. During +the test, a process calls `test_daemon()` to signal it is ready for C/R, then +calls `test_waitsig()` to wait for the C/R stage to complete. After being +restored, the test checks that all its resources are still in a valid state. 
diff --git a/INSTALL.md b/INSTALL.md index d786d06eb..af0702518 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,11 +1,31 @@ +## Building CRIU from source code + +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. + +To compile CRIU, run: +``` +make +``` +This should create the `./criu/criu` executable. + +To change the default behaviour of CRIU, the following variables can be passed +to the make command: + + * **NETWORK_LOCK_DEFAULT**, can be set to one of the following + values: `NETWORK_LOCK_IPTABLES`, `NETWORK_LOCK_NFTABLES`, + `NETWORK_LOCK_SKIP`. CRIU defaults to `NETWORK_LOCK_IPTABLES` + if nothing is specified. If another network locking backend is + needed, `make` can be called like this: + `make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES` + ## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing - - make install - +``` +make install +``` this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); @@ -16,17 +36,17 @@ this command accepts the following variables: * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type - - make DESTDIR=/some/new/place install - +``` +make DESTDIR=/some/new/place install +``` and get everything installed under `/some/new/place`. ## Uninstalling CRIU To clean up previously installed CRIU instance one can type - - make uninstall - +``` +make uninstall +``` and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. 
diff --git a/MAINTAINERS b/MAINTAINERS new file mode 100644 index 000000000..8fee8e571 --- /dev/null +++ b/MAINTAINERS @@ -0,0 +1,8 @@ +Pavel Emelyanov (chief) +Andrey Vagin +Mike Rapoport +Dmitry Safonov <0x7f454c46@gmail.com> +Adrian Reber +Pavel Tikhomirov +Radostin Stoyanov +Alexander Mikhalitsyn diff --git a/MAINTAINERS_GUIDE.md b/MAINTAINERS_GUIDE.md new file mode 100644 index 000000000..5de8e6cb6 --- /dev/null +++ b/MAINTAINERS_GUIDE.md @@ -0,0 +1,136 @@ +## Introduction + +Dear maintainer. Thank you for investing the time and energy to help +make CRIU as useful as possible. Maintaining a project is difficult, +sometimes unrewarding work. Sure, you will contribute cool features +to the project, but most of your time will be spent reviewing patches, +cleaning things up, documenting, answering questions, justifying design +decisions - while everyone else will just have fun! But remember -- the +quality of the maintainers' work is what distinguishes the good projects +from the great. So please be proud of your work, even the unglamorous +parts, and encourage a culture of appreciation and respect for *every* +aspect of improving the project -- not just the hot new features. + +Being a maintainer is a time consuming commitment and should not be +taken lightly. This document is a manual for maintainers old and new. +It explains what is expected of maintainers, how they should work, and +what tools are available to them. + +This is a living document - if you see something out of date or missing, +speak up! + +## What are a maintainer's responsibilities? + +Part of a healthy project is to have active maintainers to support the +community in contributions and perform tasks to keep the project running. 
+It is every maintainer's responsibility to: + + * Keep the community a friendly place + * Deliver prompt feedback and decisions on pull requests and mailing + list threads + * Encourage other members to help each other, especially in cases where the + maintainer is overloaded or feels the lack of needed expertise + * Make sure the changes made respect the philosophy, design and + roadmap of the project + +## How are decisions made? + +CRIU is an open-source project with an open design philosophy. This +means that the repository is the source of truth for EVERY aspect of the +project. *If it's part of the project, it's in the repo. It's in the +repo, it's part of the project.* + +All decisions affecting CRIU, big and small, follow the same 3 steps: + + * Submit a change. Anyone can do this + + * Discuss it. Anyone can and is encouraged to do this + + * Accept or decline it. Only maintainers do this + +*I'm a maintainer, should I make pull requests / send patches too?* + +Yes. Nobody should ever push to the repository directly. All changes +should be made through submitting (and accepting) the change. + +### Two-step decision making ### + +Since CRIU is an extremely complex piece of software we try doubly hard +not to make mistakes that would be hard to fix in the future. In order +to facilitate this, the "final" decision is made in two stages: + + * We definitely want to try something out + + * We think that the attempt was successful + +Respectively, new features get accepted first into the *criu-dev* branch and +after they have been validated they are merged into the *master* branch. Yet, +urgent bug fixes may land directly in the master branch. If a change in +the criu-dev branch is considered to be bad (whatever it means), then it +can be reverted without propagation to the master branch. 
Reverting from +the master branch is expected not to happen at all, but if such an +extraordinary case occurs, the impact of this step, especially the question +of backward compatibility, should be considered in the most careful manner. + +## Who decides what? + +All decisions can be expressed as changes to the repository (either in the +form of pull requests, or patches sent to the mailing list), and maintainers +make decisions by merging or rejecting them. Review and approval or +disagreement can be done by anyone and is denoted by adding a respective +comment in the pull request. However, merging the change into either branch +only happens after approvals from maintainers. + +In order for a patch to be merged into the criu-dev branch at least two +maintainers should accept it. In order for a patch to be merged into the +master branch the majority of maintainers should decide that (then prepare +a pull request, submit it, etc.). + +Overall the maintainer system works because of mutual respect across the +maintainers of the project. The maintainers trust one another to make +decisions in the best interests of the project. Sometimes maintainers +can disagree and this is part of a healthy project to represent the points +of view of various people. In the case where maintainers cannot find +agreement on a specific change the role of a Chief Maintainer comes into +play. + +### Chief maintainer + +The chief maintainer for the project is responsible for overall architecture +of the project to maintain conceptual integrity. Large decisions and +architecture changes should be reviewed by the chief maintainer. + +Also the chief maintainer has the veto power on any change submitted +to any branch. Naturally, a change in the criu-dev branch can be reverted +after a chief maintainer veto; a change in the master branch must be +carefully reviewed by the chief maintainer and vetoed in advance. + +### How are maintainers added (and removed)? 
+ +The best maintainers have a vested interest in the project. Maintainers +are first and foremost contributors that have shown they are committed to +the long term success of the project. Contributors wanting to become +maintainers are expected to be deeply involved in contributing code, +patch review, and paying needed attention to the issues in the project. +Just contributing does not make you a maintainer, it is about building trust +with the current maintainers of the project and being a person that they can +rely on and trust to make decisions in the best interest of the project. + +When a contributor wants to become a maintainer or nominate someone as a +maintainer, one can submit a "nomination", which technically is the +respective modification to the `MAINTAINERS` file. When a maintainer feels +they are unable to perform the required duties, or someone else wants to draw +the community attention to this fact, one can submit a "(self-)removing" +change. + +The final vote to add or to remove a maintainer is to be approved by the +majority of current maintainers (with the chief maintainer having veto power +on that too). + +One might have noticed that the chief maintainer (re-)assignment is not +regulated by this document. That's true :) However, this can be done. If +the community decides that the chief maintainer needs to be changed the +respective "decision making rules" are to be prepared, submitted and +accepted into this file first. + +Good luck! 
diff --git a/Makefile b/Makefile index 0140330e1..e26807158 100644 --- a/Makefile +++ b/Makefile @@ -17,34 +17,41 @@ ifeq ($(origin HOSTCFLAGS), undefined) HOSTCFLAGS := $(CFLAGS) $(USERCFLAGS) endif -UNAME-M := $(shell uname -m) - # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64 riscv64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif # The PowerPC 64 bits architecture could be big or little endian. # They are handled in the same way. -ifeq ($(UNAME-M),ppc64) +ifeq ($(SUBARCH),ppc64) error := $(error ppc64 big endian is not yet supported) endif # # Architecture specific options. ifeq ($(ARCH),arm) - ARMV := $(shell echo $(UNAME-M) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') - DEFINES := -DCONFIG_ARMV$(ARMV) -DCONFIG_VDSO_32 + ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a+fp endif + ifeq ($(ARMV),8) + # Running 'setarch linux32 uname -m' returns armv8l on aarch64. + # This tells CRIU to handle armv8l just as armv7hf. Right now this is + # only used for compile testing. No further verification of armv8l exists. + ARCHCFLAGS += -march=armv7-a + ARMV := 7 + endif + + DEFINES := -DCONFIG_ARMV$(ARMV) -DCONFIG_VDSO_32 + PROTOUFIX := y # For simplicity - compile code in Arm mode without interwork. 
# We could choose Thumb mode as default instead - but a dirty @@ -57,6 +64,8 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 + CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") + CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) @@ -69,6 +78,18 @@ ifeq ($(ARCH),x86) DEFINES := -DCONFIG_X86_64 endif +ifeq ($(ARCH),mips) + DEFINES := -DCONFIG_MIPS +endif + +ifeq ($(ARCH),loongarch64) + DEFINES := -DCONFIG_LOONGARCH64 +endif + +ifeq ($(ARCH),riscv64) + DEFINES := -DCONFIG_RISCV64 +endif + # # CFLAGS_PIE: # @@ -77,7 +98,6 @@ endif # commit "S/390: Fix 64 bit sibcall". ifeq ($(ARCH),s390) ARCH := s390 - SRCARCH := s390 DEFINES := -DCONFIG_S390 CFLAGS_PIE := -fno-optimize-sibling-calls endif @@ -85,25 +105,47 @@ endif CFLAGS_PIE += -DCR_NOGLIBC export CFLAGS_PIE -LDARCH ?= $(SRCARCH) +LDARCH ?= $(ARCH) export LDARCH export PROTOUFIX DEFINES # # Independent options for all tools. DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_LARGEFILE64_SOURCE DEFINES += -D_GNU_SOURCE -WARNINGS := -Wall -Wformat-security +WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes + +# -Wdangling-pointer results in false warning when we add a list element to +# local list head variable. It is false positive because before leaving the +# function we always check that local list head variable is empty, thus +# insuring that pointer to it is not dangling anywhere, but gcc can't +# understand it. 
+# Note: There is similar problem with kernel list, where this warning is also +# disabled: https://github.com/torvalds/linux/commit/49beadbd47c2 +WARNINGS += -Wno-dangling-pointer -Wno-unknown-warning-option CFLAGS-GCOV := --coverage -fno-exceptions -fno-inline -fprofile-update=atomic export CFLAGS-GCOV +ifeq ($(ARCH),mips) +WARNINGS := -rdynamic +endif + +ifeq ($(ARCH),loongarch64) +WARNINGS += -Wno-implicit-function-declaration +endif + ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) endif +ifneq ($(NETWORK_LOCK_DEFAULT),) + CFLAGS += -DNETWORK_LOCK_DEFAULT=$(NETWORK_LOCK_DEFAULT) +endif + ifeq ($(ASAN),1) CFLAGS-ASAN := -fsanitize=address export CFLAGS-ASAN @@ -128,12 +170,12 @@ export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ -CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: criu lib crit cuda_plugin .PHONY: all # @@ -188,12 +230,13 @@ criu-deps += include/common/asm # # Configure variables. export CONFIG_HEADER := include/common/config.h -ifeq ($(filter tags etags cscope clean mrproper,$(MAKECMDGOALS)),) +ifeq ($(filter tags etags cscope clean lint indent fetch-clang-format help mrproper,$(MAKECMDGOALS)),) include Makefile.config else # To clean all files, enable make/build options here export CONFIG_COMPAT := y export CONFIG_GNUTLS := y +export CONFIG_HAS_LIBBPF := y endif # @@ -235,22 +278,19 @@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu -crit/Makefile: ; -crit/%: criu .FORCE - $(Q) $(MAKE) $(build)=crit $@ -crit: criu - $(Q) $(MAKE) $(build)=crit all -.PHONY: crit +unittest: $(criu-deps) + $(Q) $(MAKE) $(build)=criu unittest +.PHONY: unittest # -# Libraries next once crit it ready +# Libraries next once criu is ready # (we might generate headers and such # when building criu itself). 
lib/Makefile: ; -lib/%: crit .FORCE +lib/%: criu .FORCE $(Q) $(MAKE) $(build)=lib $@ -lib: crit +lib: criu $(Q) $(MAKE) $(build)=lib all .PHONY: lib @@ -259,21 +299,28 @@ clean mrproper: $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ - $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper +clean-amdgpu_plugin: + $(Q) $(MAKE) -C plugins/amdgpu clean +.PHONY: clean-amdgpu_plugin + +clean-cuda_plugin: + $(Q) $(MAKE) -C plugins/cuda clean +.PHONY: clean-cuda_plugin + clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top +clean: clean-top clean-amdgpu_plugin clean-cuda_plugin -mrproper-top: clean-top +mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin $(Q) $(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) @@ -301,6 +348,18 @@ test: zdtm $(Q) $(MAKE) -C test .PHONY: test +amdgpu_plugin: criu + $(Q) $(MAKE) -C plugins/amdgpu all +.PHONY: amdgpu_plugin + +cuda_plugin: criu + $(Q) $(MAKE) -C plugins/cuda all +.PHONY: cuda_plugin + +crit: lib + $(Q) $(MAKE) -C crit +.PHONY: crit + # # Generating tar requires tag matched CRIU_VERSION. 
# If not found then simply use GIT's describe with @@ -354,17 +413,19 @@ gcov: .PHONY: gcov docker-build: - $(MAKE) -C scripts/build/ x86_64 + $(MAKE) -C scripts/build/ x86_64 .PHONY: docker-build docker-test: - docker run --rm -it --privileged criu-x86_64 ./test/zdtm.py run -a -x tcp6 -x tcpbuf6 -x static/rtc -x cgroup + docker run --rm --privileged -v /lib/modules:/lib/modules --network=host --cgroupns=host criu-x86_64 \ + ./test/zdtm.py run -a --keep-going --ignore-taint .PHONY: docker-test help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' + @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @@ -377,14 +438,76 @@ help: @echo ' cscope - Generate cscope database' @echo ' test - Run zdtm test-suite' @echo ' gcov - Make code coverage report' + @echo ' unittest - Run unit tests' + @echo ' lint - Run code linters' + @echo ' indent - Indent C code' + @echo ' amdgpu_plugin - Make AMD GPU plugin' + @echo ' cuda_plugin - Make NVIDIA CUDA plugin' .PHONY: help -lint: - flake8 --version - flake8 --config=scripts/flake8.cfg test/zdtm.py - flake8 --config=scripts/flake8.cfg test/inhfd/*.py - flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py +ruff: + @ruff --version + ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ + test/zdtm.py \ + test/inhfd/*.py \ + test/others/rpc/config_file.py \ + test/others/action-script/check_actions.py \ + test/others/pycriu/*.py \ + lib/pycriu/criu.py \ + lib/pycriu/__init__.py \ + lib/pycriu/images/pb2dict.py \ + lib/pycriu/images/images.py \ + scripts/criu-ns \ + test/others/criu-ns/run.py \ + crit/*.py \ + crit/crit/*.py \ + scripts/uninstall_module.py \ + coredump/ coredump/coredump \ + scripts/github-indent-warnings.py + +shellcheck: + shellcheck --version + shellcheck scripts/*.sh + shellcheck scripts/ci/*.sh + shellcheck 
contrib/apt-install contrib/dependencies/*.sh + shellcheck -x test/others/crit/*.sh + shellcheck -x test/others/libcriu/*.sh + shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh + shellcheck -x test/others/config-file/*.sh + shellcheck -x test/others/action-script/*.sh + +codespell: + codespell + +lint: ruff shellcheck codespell + # Do not append \n to pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' + # Do not use %m with pr_* or fail + ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' + # Do not use errno with pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' + # End pr_(err|warn|msg|info|debug) with \n + ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' + # No EOL whitespace for C files + ! git --no-pager grep -E '\s+$$' \*.c \*.h +.PHONY: lint ruff shellcheck codespell + +codecov: SHELL := $(shell command -v bash) +codecov: + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov + ./codecov +.PHONY: codecov + +fetch-clang-format: .FORCE + $(E) ".clang-format" + $(Q) scripts/fetch-clang-format.sh + +BASE ?= "HEAD~1" +OPTS ?= "--quiet" +indent: + git clang-format --style file --extensions c,h $(OPTS) $(BASE) +.PHONY: indent include Makefile.install diff --git a/Makefile.compel b/Makefile.compel index 764afadc8..a4209edc5 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series (used on -# Travis) is not, so we have to write them here explicitly. +# match targeting, where GNU make 3.x series is not, +# so we have to write them here explicitly. 
compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/Makefile.config b/Makefile.config index 1e4352b9d..5cf4b8216 100644 --- a/Makefile.config +++ b/Makefile.config @@ -2,12 +2,15 @@ include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak +# This is a kludge for $(info ...) to not eat spaces. +S := + ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() and strlcpy() support.) - $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) + $(info Note: Building without setproctitle() support.) + $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) endif ifeq ($(call pkg-config-check,libselinux),y) @@ -15,45 +18,82 @@ ifeq ($(call pkg-config-check,libselinux),y) FEATURE_DEFINES += -DCONFIG_HAS_SELINUX endif +ifeq ($(call pkg-config-check,libbpf),y) + LIBS_FEATURES += -lbpf + FEATURE_DEFINES += -DCONFIG_HAS_LIBBPF + export CONFIG_HAS_LIBBPF := y +endif + +ifeq ($(call pkg-config-check,libdrm),y) + export CONFIG_AMDGPU := y + $(info Note: Building with amdgpu_plugin.) +else + $(info Note: Building without amdgpu_plugin.) + $(info $S Install libdrm-devel (RPM) or libdrm-dev (DEB) to fix.) +endif + ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) LIBS_FEATURES += -lgnutls export CONFIG_GNUTLS := y FEATURE_DEFINES += -DCONFIG_GNUTLS else - $(info Note: Building without GnuTLS support) + $(info Note: Building without GnuTLS support.) + $(info $S Install gnutls-devel (RPM) or gnutls-dev (DEB) to fix.) 
+endif + +ifeq ($(call pkg-config-check,libnftables),y) + LIB_NFTABLES := $(shell $(PKG_CONFIG) --libs libnftables) + ifeq ($(call try-cc,$(FEATURE_TEST_NFTABLES_LIB_API_0),$(LIB_NFTABLES)),true) + LIBS_FEATURES += $(LIB_NFTABLES) + FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_0 + else ifeq ($(call try-cc,$(FEATURE_TEST_NFTABLES_LIB_API_1),$(LIB_NFTABLES)),true) + LIBS_FEATURES += $(LIB_NFTABLES) + FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 + else + $(info Warn: Building without nftables support (incompatible API version).) + endif +else + $(info Warn: Building without nftables support.) + $(info $S Install nftables-devel (RPM) or libnftables-dev (DEB) to fix.) endif export LIBS += $(LIBS_FEATURES) +ifneq ($(PLUGINDIR),) + FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" +endif + CONFIG_FILE = .config $(CONFIG_FILE): touch $(CONFIG_FILE) -ifeq ($(SRCARCH),x86) +ifeq ($(ARCH),x86) # CONFIG_COMPAT is only for x86 now, no need for compile-test other archs ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else - $(info Note: Building without ia32 C/R, missed ia32 support in gcc) - $(info $(info) That may be related to missing gcc-multilib in your) - $(info $(info) distribution or you may have Debian with buggy toolchain) - $(info $(info) (issue https://github.com/checkpoint-restore/criu/issues/315)) + $(info Note: Building without ia32 C/R, missing ia32 support in gcc.) + $(info $S It may be related to missing gcc-multilib in your) + $(info $S distribution, or you may have Debian with buggy toolchain.) + $(info $S See https://github.com/checkpoint-restore/criu/issues/315.) 
endif endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW +FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ + SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ + OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ - $(Q) echo '' >> $$@ +else + $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ endif endef diff --git a/Makefile.install b/Makefile.install index 3987bcc6f..70c607ec6 100644 --- a/Makefile.install +++ b/Makefile.install @@ -7,6 +7,7 @@ MANDIR ?= $(PREFIX)/share/man INCLUDEDIR ?= $(PREFIX)/include LIBEXECDIR ?= $(PREFIX)/libexec RUNDIR ?= /run +PLUGINDIR ?= $(PREFIX)/lib/criu # # For recent Debian/Ubuntu with multiarch support. @@ -26,7 +27,34 @@ endif LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR -export LIBDIR INCLUDEDIR LIBEXECDIR +export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR + +# Detect externally managed Python environment (PEP 668). +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES ?= 0 + +# If Python environment is externally managed and PIP_BREAK_SYSTEM_PACKAGES is not set, skip pip install. 
+SKIP_PIP_INSTALL := 0 +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + +SKIP_PIP_INSTALL := 1 +$(info Warn: Externally managed python environment) +$(info Consider using PIP_BREAK_SYSTEM_PACKAGES=1) + +endif +endif + +# Default flags for pip install: +# --ignore-installed: Overwrite already installed pycriu/crit packages +# --no-build-isolation: Use current Python environment to build pycriu/crit packages +# --no-deps: Don't install any dependencies +# --no-index: Don't use PyPI index to find packages +# --progress-bar: Cleaner output +# --upgrade: Treat the install as an upgrade when replacing the installed version +PIPFLAGS ?= --ignore-installed --no-build-isolation --no-deps --no-index --progress-bar off --upgrade + +export SKIP_PIP_INSTALL PIPFLAGS install-man: $(Q) $(MAKE) -C Documentation install @@ -36,22 +64,37 @@ install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib +install-crit: lib + $(Q) $(MAKE) $(build)=crit install +.PHONY: install-crit + install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu +install-amdgpu_plugin: amdgpu_plugin + $(Q) $(MAKE) -C plugins/amdgpu install +.PHONY: install-amdgpu_plugin + +install-cuda_plugin: cuda_plugin + $(Q) $(MAKE) -C plugins/cuda install +.PHONY: install-cuda_plugin + install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-criu install-compel ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ + $(Q) $(MAKE) -C plugins/amdgpu $@ + $(Q) $(MAKE) -C plugins/cuda $@ .PHONY: uninstall diff --git a/Makefile.versions b/Makefile.versions 
index f3adcb0a6..3e6c9ed22 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. -CRIU_VERSION_MAJOR := 3 -CRIU_VERSION_MINOR := 13 +CRIU_VERSION_MAJOR := 4 +CRIU_VERSION_MINOR := 2 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := Silicon Willet +CRIU_VERSION_NAME := CRIUTIBILITY CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL diff --git a/README.md b/README.md index 16e8452b5..6e2a0de9e 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,33 @@ -[![master](https://travis-ci.org/checkpoint-restore/criu.svg?branch=master)](https://travis-ci.org/checkpoint-restore/criu) -[![development](https://travis-ci.org/checkpoint-restore/criu.svg?branch=criu-dev)](https://travis-ci.org/checkpoint-restore/criu) -[![Codacy Badge](https://api.codacy.com/project/badge/Grade/55251ec7db28421da4481fc7c1cb0cee)](https://www.codacy.com/app/xemul/criu?utm_source=github.com&utm_medium=referral&utm_content=xemul/criu&utm_campaign=Badge_Grade) -

+[![X86_64 GCC Test](https://github.com/checkpoint-restore/criu/workflows/X86_64%20GCC%20Test/badge.svg)]( + https://github.com/checkpoint-restore/criu/actions/workflows/x86-64-gcc-test.yml) +[![Docker Test](https://github.com/checkpoint-restore/criu/actions/workflows/docker-test.yml/badge.svg)]( + https://github.com/checkpoint-restore/criu/actions/workflows/docker-test.yml) +[![Podman Test](https://github.com/checkpoint-restore/criu/actions/workflows/podman-test.yml/badge.svg)]( + https://github.com/checkpoint-restore/criu/actions/workflows/podman-test.yml) +[![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)]( + https://circleci.com/gh/checkpoint-restore/criu) + +

## CRIU -- A project to implement checkpoint/restore functionality for Linux CRIU (stands for Checkpoint and Restore in Userspace) is a utility to checkpoint/restore Linux tasks. -Using this tool, you can freeze a running application (or part of it) and checkpoint +Using this tool, you can freeze a running application (or part of it) and checkpoint it to a hard drive as a collection of files. You can then use the files to restore and run the application from the point it was frozen at. The distinctive feature of the CRIU project is that it is mainly implemented in user space. There are some more projects -doing C/R for Linux, and so far CRIU [appears to be](https://criu.org/Comparison_to_other_CR_projects) +doing C/R for Linux, and so far CRIU [appears to be](https://criu.org/Comparison_to_other_CR_projects) the most feature-rich and up-to-date with the kernel. +CRIU project is (almost) the never-ending story, because we have to always keep up with the +Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're +looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. +Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) if you would like to get involved. + The project [started](https://criu.org/History) as the way to do live migration for OpenVZ -Linux containers, but later grew to more sophisticated and flexible tool. It is currently -used by (integrated into) OpenVZ, LXC/LXD, Docker, and other software, project gets tremendous +Linux containers, but later grew to more sophisticated and flexible tool. It is currently +used by (integrated into) OpenVZ, LXC/LXD, Docker, and other software, project gets tremendous help from the community, and its packages are included into many Linux distributions. The project home is at http://criu.org. This wiki contains all the knowledge base for CRIU we have. 
@@ -24,15 +35,15 @@ Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced usage](https://criu.org/Category:HOWTO) -- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/index.php?title=FAQ) -### Checkpoint and restore of simple loop process -[

](https://asciinema.org/a/232445) +### Checkpoint and restore of simple loop process +

## Advanced features As main usage for CRIU is live migration, there's a library for it called P.Haul. Also the -project exposes two cool core features as standalone libraries. These are libcompel for parasite code +project exposes two cool core features as standalone libraries. These are libcompel for parasite code injection and libsoccr for TCP connections checkpoint-restore. ### Live migration @@ -56,21 +67,9 @@ One of the CRIU features is the ability to save and restore state of a TCP socke without breaking the connection. This functionality is considered to be useful by itself, and we have it available as the [libsoccr library](https://criu.org/Libsoccr). -## How to contribute - -CRIU project is (almost) the never-ending story, because we have to always keep up with the -Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're -looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. -Here are some useful hints to get involved. 
- -* We have both -- [very simple](https://checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; -* CRIU does need [extensive testing](https://checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); -* Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the github issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* For historical reasons we do not accept PRs, instead [patches are welcome](http://criu.org/How_to_submit_patches); -* Spread the word about CRIU in [social networks](http://criu.org/Contacts); -* If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); - ## Licence The project is licensed under GPLv2 (though files sitting in the lib/ directory are LGPLv2.1). + +All files in the images/ directory are licensed under the Expat license (so-called MIT). +See the images/LICENSE file. 
diff --git a/compel/.gitignore b/compel/.gitignore index eab3337d6..5e770a86c 100644 --- a/compel/.gitignore +++ b/compel/.gitignore @@ -4,6 +4,9 @@ arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S +arch/mips/plugins/std/syscalls/syscalls-64.S +arch/loongarch64/plugins/std/syscalls/syscalls-64.S +arch/riscv64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h diff --git a/compel/Makefile b/compel/Makefile index de9318c42..c0b8a82a0 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -28,8 +28,12 @@ lib-y += src/lib/infect-util.o lib-y += src/lib/infect.o lib-y += src/lib/ptrace.o -# handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64,$(ARCH)),) +ifeq ($(ARCH),x86) +lib-y += arch/$(ARCH)/src/lib/thread_area.o +endif + +# handle_elf() has no support of ELF relocations on ARM and RISCV64 (yet?) 
+ifneq ($(filter arm aarch64 loongarch64 riscv64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/aarch64/plugins/include/asm/syscall-types.h b/compel/arch/aarch64/plugins/include/asm/syscall-types.h index ee0e2185d..45fd57af6 100644 --- a/compel/arch/aarch64/plugins/include/asm/syscall-types.h +++ b/compel/arch/aarch64/plugins/include/asm/syscall-types.h @@ -1,7 +1,7 @@ #ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ -#define SA_RESTORER 0x04000000 +#define SA_RESTORER 0x04000000 typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; @@ -9,20 +9,20 @@ typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; -#define _KNSIG 64 -#define _NSIG_BPW 64 +#define _KNSIG 64 +#define _NSIG_BPW 64 -#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { - rt_sighandler_t rt_sa_handler; - unsigned long rt_sa_flags; - rt_sigrestore_t rt_sa_restorer; - k_rtsigset_t rt_sa_mask; + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/aarch64/plugins/std/parasite-head.S b/compel/arch/aarch64/plugins/std/parasite-head.S index 5e7067f6b..456c2117d 100644 --- a/compel/arch/aarch64/plugins/std/parasite-head.S +++ b/compel/arch/aarch64/plugins/std/parasite-head.S @@ -2,19 +2,6 @@ .section .head.text, "ax" ENTRY(__export_parasite_head_start) - adr x2, __export_parasite_head_start // get the address of this instruction - - ldr x0, __export_parasite_cmd - - ldr x1, parasite_args_ptr - add x1, x1, x2 // fixup __export_parasite_args - bl parasite_service brk #0 // the instruction BRK #0 generates the signal SIGTRAP in Linux - -parasite_args_ptr: - .quad 
__export_parasite_args - -__export_parasite_cmd: - .quad 0 END(__export_parasite_head_start) diff --git a/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h b/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h index 6272bf3a8..3c7124856 100644 --- a/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h +++ b/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h @@ -1,3 +1,3 @@ #ifndef __NR_openat -# define __NR_openat 56 +#define __NR_openat 56 #endif diff --git a/compel/arch/aarch64/scripts/compel-pack.lds.S b/compel/arch/aarch64/scripts/compel-pack.lds.S index eba89cd5f..57895ec9b 100644 --- a/compel/arch/aarch64/scripts/compel-pack.lds.S +++ b/compel/arch/aarch64/scripts/compel-pack.lds.S @@ -29,8 +29,4 @@ SECTIONS *(.eh_frame*) *(*) } - -/* Parasite args should have 4 bytes align, as we have futex inside. */ -. = ALIGN(4); -__export_parasite_args = .; } diff --git a/compel/arch/aarch64/src/lib/cpu.c b/compel/arch/aarch64/src/lib/cpu.c index cfaab1e65..538a29887 100644 --- a/compel/arch/aarch64/src/lib/cpu.c +++ b/compel/arch/aarch64/src/lib/cpu.c @@ -7,7 +7,7 @@ #include "log.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; @@ -22,11 +22,24 @@ static void fetch_rt_cpuinfo(void) } } -void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } -void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } -int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } -int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } -int compel_cpuid(compel_cpuinfo_t *info) { return 0; } +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int 
compel_cpuid(compel_cpuinfo_t *info) +{ + return 0; +} bool compel_cpu_has_feature(unsigned int feature) { diff --git a/compel/arch/aarch64/src/lib/handle-elf.c b/compel/arch/aarch64/src/lib/handle-elf.c index 1c3686c48..206aef4cd 100644 --- a/compel/arch/aarch64/src/lib/handle-elf.c +++ b/compel/arch/aarch64/src/lib/handle-elf.c @@ -1,20 +1,17 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused -elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; -static const unsigned char __maybe_unused -elf_ident_64_be[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, +static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; diff --git a/compel/arch/aarch64/src/lib/include/handle-elf.h b/compel/arch/aarch64/src/lib/include/handle-elf.h index 0f64b34cb..9f1a75081 100644 --- a/compel/arch/aarch64/src/lib/include/handle-elf.h +++ b/compel/arch/aarch64/src/lib/include/handle-elf.h @@ -3,8 +3,8 @@ #include "elf64-types.h" -#define __handle_elf handle_elf_aarch64 -#define arch_is_machine_supported(e_machine) (e_machine == EM_AARCH64) +#define __handle_elf handle_elf_aarch64 +#define arch_is_machine_supported(e_machine) (e_machine == EM_AARCH64) extern int handle_elf_aarch64(void *mem, size_t size); diff --git a/compel/arch/aarch64/src/lib/include/syscall.h b/compel/arch/aarch64/src/lib/include/syscall.h index e2ec1272e..13ee906e1 100644 --- a/compel/arch/aarch64/src/lib/include/syscall.h +++ b/compel/arch/aarch64/src/lib/include/syscall.h @@ -1,4 +1,8 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ 
-#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) #endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 5f090490d..8a61b268f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,14 +2,41 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} +#include +#include -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} +struct hwbp_cap { + char arch; + char bp_count; +}; + +/* copied from `linux/arch/arm64/include/asm/hw_breakpoint.h` */ +/* Lengths */ +#define ARM_BREAKPOINT_LEN_1 0x1 +#define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 +#define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f +#define ARM_BREAKPOINT_LEN_8 0xff + +/* Privilege Levels */ +#define AARCH64_BREAKPOINT_EL1 1 +#define AARCH64_BREAKPOINT_EL0 2 + +/* Breakpoint */ +#define ARM_BREAKPOINT_EXECUTE 0 + +/* Watchpoints */ +#define ARM_BREAKPOINT_LOAD 1 +#define ARM_BREAKPOINT_STORE 2 +#define AARCH64_ESR_ACCESS_MASK (1 << 6) + +#define DISABLE_HBP 0 +#define ENABLE_HBP 1 + +int ptrace_set_breakpoint(pid_t pid, void *addr); +int ptrace_flush_breakpoints(pid_t pid); #endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h index c35460e15..12e749508 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h @@ -1,6 +1,7 @@ #ifndef UAPI_COMPEL_ASM_CPU_H__ #define UAPI_COMPEL_ASM_CPU_H__ -typedef struct { } compel_cpuinfo_t; +typedef struct { +} compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ diff --git 
a/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h new file mode 100644 index 000000000..9f9655e3b --- /dev/null +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h @@ -0,0 +1,47 @@ +#ifndef __UAPI_ASM_GCS_TYPES_H__ +#define __UAPI_ASM_GCS_TYPES_H__ + +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 /* ARM GCS state */ +#endif + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +/* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack */ +#ifndef PR_SHADOW_STACK_ENABLE +#define PR_SHADOW_STACK_ENABLE (1UL << 0) +#endif + +/* Allows explicit GCS stores (eg. using GCSSTR) */ +#ifndef PR_SHADOW_STACK_WRITE +#define PR_SHADOW_STACK_WRITE (1UL << 1) +#endif + +/* Allows explicit GCS pushes (eg. using GCSPUSHM) */ +#ifndef PR_SHADOW_STACK_PUSH +#define PR_SHADOW_STACK_PUSH (1UL << 2) +#endif + +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +#define PR_SHADOW_STACK_ALL_MODES \ + PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH + +/* copied from: arch/arm64/include/asm/sysreg.h */ +#define GCS_CAP_VALID_TOKEN 0x1 +#define GCS_CAP_ADDR_MASK 0xFFFFFFFFFFFFF000ULL +#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | GCS_CAP_VALID_TOKEN) +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +#include + +#ifndef HWCAP_GCS +#define HWCAP_GCS (1UL << 32) +#endif + +#endif /* __UAPI_ASM_GCS_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 4662f7689..606c92ffe 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -2,12 
+2,13 @@ #define UAPI_COMPEL_ASM_TYPES_H__ #include +#include #include #include #include -#define SIGMAX 64 -#define SIGMAX_OLD 31 +#define SIGMAX 64 +#define SIGMAX_OLD 31 /* * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h @@ -15,18 +16,53 @@ * A thread ARM CPU context */ -typedef struct user_pt_regs user_regs_struct_t; -typedef struct user_fpsimd_state user_fpregs_struct_t; +typedef struct user_pt_regs user_regs_struct_t; -#define REG_RES(r) ((uint64_t)(r).regs[0]) -#define REG_IP(r) ((uint64_t)(r).pc) -#define REG_SP(r) ((uint64_t)((r).sp)) -#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) +/* + * GCS (Guarded Control Stack) + * + * This mirrors the kernel definition but renamed to cr_user_gcs + * to avoid conflict with kernel headers (/usr/include/asm/ptrace.h). + */ +struct cr_user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; -#define user_regs_native(pregs) true +struct user_fpregs_struct { + struct user_fpsimd_state fpstate; + struct cr_user_gcs gcs; +}; +typedef struct user_fpregs_struct user_fpregs_struct_t; -#define ARCH_SI_TRAP TRAP_BRKPT +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) -#define __NR(syscall, compat) __NR_##syscall +#define REG_RES(r) ((uint64_t)(r).regs[0]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define SET_REG_IP(r, val) ((r).pc = (val)) +#define REG_SP(r) ((uint64_t)((r).sp)) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +extern bool __compel_host_supports_gcs(void); +#define compel_host_supports_gcs __compel_host_supports_gcs + +struct parasite_ctl; +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define 
parasite_setup_shstk __parasite_setup_shstk #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index bff714cca..7efee528f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,37 +1,48 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include +#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ -#define FPSIMD_MAGIC 0x46508001 +#define FPSIMD_MAGIC 0x46508001 +#define GCS_MAGIC 0x47435300 -typedef struct fpsimd_context fpu_state_t; +typedef struct fpsimd_context fpu_state_t; -struct aux_context { - struct fpsimd_context fpsimd; - /* additional context to be added before "end" */ - struct _aarch64_ctx end; +struct gcs_context { + struct _aarch64_ctx head; + __u64 gcspr; + __u64 features_enabled; + __u64 reserved; }; -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code -#define rt_sigcontext sigcontext +struct aux_context { + struct fpsimd_context fpsimd; + struct gcs_context gcs; + /* additional context to be added before "end" */ + struct _aarch64_ctx end; +}; + +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code +#define rt_sigcontext sigcontext #include /* Copied from the kernel source arch/arm64/kernel/signal.c */ struct rt_sigframe { - siginfo_t info; - ucontext_t uc; - uint64_t fp; - uint64_t lr; + siginfo_t info; + ucontext_t uc; + uint64_t fp; + uint64_t lr; }; +/* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mov sp, %0 \n" \ @@ -40,30 +51,30 @@ struct rt_sigframe { : \ : "r"(new_sp) \ : "x8", "memory") +/* clang-format on */ /* cr_sigcontext is copied from arch/arm64/include/uapi/asm/sigcontext.h */ struct cr_sigcontext { - __u64 fault_address; - /* AArch64 registers */ - __u64 
regs[31]; - __u64 sp; - __u64 pc; - __u64 pstate; - /* 4K reserved for FP/SIMD state and future expansion */ - __u8 __reserved[4096] __attribute__((__aligned__(16))); + __u64 fault_address; + /* AArch64 registers */ + __u64 regs[31]; + __u64 sp; + __u64 pc; + __u64 pstate; + /* 4K reserved for FP/SIMD state and future expansion */ + __u8 __reserved[4096] __attribute__((__aligned__(16))); }; -#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) -#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.pc) -#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) -#define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) -#define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context*)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) -#define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) -#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.pc) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) +#define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) +#define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) +#define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 +#define RT_SIGFRAME_GCS(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->gcs) -#define rt_sigframe_erase_sigset(sigframe) \ - memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) -#define rt_sigframe_copy_sigset(sigframe, from) \ - memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) +#define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, 
sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 4b5939022..42f593c79 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,7 +2,9 @@ #include #include #include -#include +#include +#include + #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" @@ -10,6 +12,9 @@ #include "errno.h" #include "infect.h" #include "infect-priv.h" +#include "asm/breakpoints.h" +#include "asm/gcs-types.h" +#include unsigned __page_size = 0; unsigned __page_shift = 0; @@ -18,12 +23,11 @@ unsigned __page_shift = 0; * Injected syscall instruction */ const char code_syscall[] = { - 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */ - 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ + 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */ + 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ }; -static const int -code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline void __always_unused __check_code_syscall(void) { @@ -31,40 +35,66 @@ static inline void __always_unused __check_code_syscall(void) BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } -int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, - user_regs_struct_t *regs, - user_fpregs_struct_t *fpregs) +bool __compel_host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool __compel_gcs_enabled(struct cr_user_gcs *gcs) +{ + if (!compel_host_supports_gcs()) + return false; + + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs = RT_SIGFRAME_GCS(sigframe); memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, 
sizeof(regs->regs)); - sigframe->uc.uc_mcontext.sp = regs->sp; - sigframe->uc.uc_mcontext.pc = regs->pc; - sigframe->uc.uc_mcontext.pstate = regs->pstate; + pr_debug("sigreturn_prep_regs_plain: sp %lx pc %lx\n", (long)regs->sp, (long)regs->pc); - memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); + sigframe->uc.uc_mcontext.sp = regs->sp; + sigframe->uc.uc_mcontext.pc = regs->pc; + sigframe->uc.uc_mcontext.pstate = regs->pstate; - fpsimd->fpsr = fpregs->fpsr; - fpsimd->fpcr = fpregs->fpcr; + memcpy(fpsimd->vregs, fpregs->fpstate.vregs, 32 * sizeof(__uint128_t)); + + fpsimd->fpsr = fpregs->fpstate.fpsr; + fpsimd->fpcr = fpregs->fpstate.fpcr; fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (__compel_gcs_enabled(&fpregs->gcs)) { + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = fpregs->gcs.gcspr_el0 - 8; + gcs->features_enabled = fpregs->gcs.features_enabled; + + pr_debug("sigframe gcspr=%llx features_enabled=%llx\n", fpregs->gcs.gcspr_el0 - 8, fpregs->gcs.features_enabled); + } else { + pr_debug("sigframe gcspr=[disabled]\n"); + memset(gcs, 0, sizeof(*gcs)); + } + return 0; } -int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } -int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, - void *arg, __maybe_unused unsigned long flags) +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) { struct iovec iov; - user_fpregs_struct_t fpsimd; int ret; pr_info("Dumping GP/FPU registers for %d\n", pid); @@ -76,25 +106,79 @@ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, goto err; } - iov.iov_base = &fpsimd; - iov.iov_len = sizeof(fpsimd); + iov.iov_base = &ext_regs->fpstate; + 
iov.iov_len = sizeof(ext_regs->fpstate); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } - ret = save(arg, regs, &fpsimd); + memset(&ext_regs->gcs, 0, sizeof(ext_regs->gcs)); + + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &iov) == 0) { + pr_info("gcs: GCSPR_EL0 for %d: 0x%llx, features: 0x%llx\n", + pid, ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + if (!__compel_gcs_enabled(&ext_regs->gcs)) + pr_info("gcs: GCS is NOT enabled\n"); + } else { + pr_info("gcs: GCS state not available for %d\n", pid); + } + + ret = save(pid, arg, regs, ext_regs); err: return ret; } -int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4, - unsigned long arg5, - unsigned long arg6) +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + struct cr_user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = &ext_regs->fpstate; + iov.iov_len = sizeof(ext_regs->fpstate); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_warn("gcs: Failed to get GCS for %d\n", pid); + } else { + ext_regs->gcs = gcs; + compel_set_task_gcs_regs(pid, ext_regs); + } + + return 0; +} + +int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("gcs: restoring GCS registers for %d\n", pid); + pr_info("gcs: restoring GCS: gcspr=%llx features=%llx\n", + ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + + if 
(ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &iov)) { + pr_perror("gcs: Failed to set GCS registers for %d", pid); + return -1; + } + + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; @@ -115,15 +199,12 @@ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, return err; } -void *remote_mmap(struct parasite_ctl *ctl, - void *addr, size_t length, int prot, - int flags, int fd, off_t offset) +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; - err = compel_syscall(ctl, __NR_mmap, &map, - (unsigned long)addr, length, prot, flags, fd, offset); + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0 || (long)map < 0) map = 0; @@ -150,9 +231,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) long ret; int err; - err = compel_syscall(ctl, __NR_sigaltstack, - &ret, 0, (unsigned long)&s->uc.uc_stack, - 0, 0, 0, 0); + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } @@ -176,3 +255,175 @@ unsigned long compel_task_size(void) return task_size; } +static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) +{ + static struct hwbp_cap info; + static int available = -1; + + if (available == -1) { + unsigned int val; + struct iovec iovec = { + .iov_base = &val, + .iov_len = sizeof(val), + }; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_HW_BREAK, &iovec) < 0) + available = 0; + else { + info.arch = (char)((val >> 8) & 0xff); + info.bp_count = (char)(val & 0xff); + + available = (info.arch != 0); + } + } + + return available == 1 ? 
&info : NULL; +} + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + k_rtsigset_t block; + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + /* + * The struct is copied from `arch/arm64/include/asm/hw_breakpoint.h` in + * linux kernel: + * struct arch_hw_breakpoint_ctrl { + * __u32 __reserved : 19, + * len : 8, + * type : 2, + * privilege : 2, + * enabled : 1; + * }; + * + * The part of `struct arch_hw_breakpoint_ctrl` bits meaning is defined + * in <<ARM Architecture Reference Manual for A-profile architecture>>, + * D13.3.2 DBGBCR<n>_EL1, Debug Breakpoint Control Registers. + */ + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | ENABLE_HBP; + regs.dbg_regs[0].addr = (__u64)addr; + regs.dbg_regs[0].ctrl = ctrl; + iovec.iov_base = &regs; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. 
+ */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } + + if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { + pr_perror("Unable to restart the stopped tracee process %d", pid); + return -1; + } + + return 1; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | DISABLE_HBP; + regs.dbg_regs[0].addr = 0ul; + regs.dbg_regs[0].ctrl = ctrl; + + iovec.iov_base = &regs; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + return 0; +} + +int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct cr_user_gcs *gcs) +{ + struct iovec gcs_iov = { .iov_base = gcs, .iov_len = sizeof(*gcs) }; + + uint64_t token_addr = gcs->gcspr_el0 - 8; + uint64_t sigtramp_addr = gcs->gcspr_el0 - 16; + + uint64_t cap_token = ALIGN_DOWN(GCS_SIGNAL_CAP(token_addr), 8); + unsigned long restorer_addr; + + pr_info("gcs: (setup) CAP token: 0x%lx at addr: 0x%lx\n", cap_token, token_addr); + + /* Inject capability token at gcspr_el0 - 8 */ + if (ptrace(PTRACE_POKEDATA, pid, (void *)token_addr, cap_token)) { + pr_perror("gcs: (setup) Inject GCS cap token failed"); + return -1; + } + + /* Inject restorer trampoline address (gcspr_el0 - 16) */ + restorer_addr = ctl->parasite_ip; + if (ptrace(PTRACE_POKEDATA, pid, (void *)sigtramp_addr, restorer_addr)) { + pr_perror("gcs: (setup) Inject GCS restorer failed"); + return -1; + } + + /* Update GCSPR_EL0 */ + gcs->gcspr_el0 = token_addr; + if 
(ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &gcs_iov)) { + pr_perror("gcs: PTRACE_SETREGS FAILED"); + return -1; + } + + pr_debug("gcs: parasite_ip=%#lx sp=%#llx gcspr_el0=%#llx\n", + ctl->parasite_ip, ctl->orig.regs.sp, gcs->gcspr_el0); + + return 0; +} + +int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +{ + struct cr_user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + pid_t pid = ctl->rpid; + + if(!__compel_host_supports_gcs()) + return 0; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) != 0) { + pr_perror("GCS state not available for %d", pid); + return -1; + } + + if (!__compel_gcs_enabled(&gcs)) + return 0; + + if (inject_gcs_cap_token(ctl, pid, &gcs)) { + pr_perror("Failed to inject GCS cap token for %d", pid); + return -1; + } + + pr_info("gcs: GCS enabled for %d\n", pid); + + return 0; +} diff --git a/compel/arch/arm/plugins/include/asm/syscall-types.h b/compel/arch/arm/plugins/include/asm/syscall-types.h index cdb03ef4c..acc03de52 100644 --- a/compel/arch/arm/plugins/include/asm/syscall-types.h +++ b/compel/arch/arm/plugins/include/asm/syscall-types.h @@ -1,7 +1,7 @@ #ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ -#define SA_RESTORER 0x04000000 +#define SA_RESTORER 0x04000000 typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; @@ -9,20 +9,20 @@ typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; -#define _KNSIG 64 -#define _NSIG_BPW 32 +#define _KNSIG 64 +#define _NSIG_BPW 32 -#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { - rt_sighandler_t rt_sa_handler; - unsigned long rt_sa_flags; - rt_sigrestore_t rt_sa_restorer; - k_rtsigset_t rt_sa_mask; + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t 
rt_sa_restorer; + k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/arm/plugins/std/parasite-head.S b/compel/arch/arm/plugins/std/parasite-head.S index e72646b50..6e46bed1f 100644 --- a/compel/arch/arm/plugins/std/parasite-head.S +++ b/compel/arch/arm/plugins/std/parasite-head.S @@ -2,21 +2,7 @@ .section .head.text, "ax" ENTRY(__export_parasite_head_start) - sub r2, pc, #8 @ get the address of this instruction - - adr r0, __export_parasite_cmd - ldr r0, [r0] - - adr r1, parasite_args_ptr - ldr r1, [r1] - add r1, r1, r2 @ fixup __export_parasite_args - bl parasite_service .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux -parasite_args_ptr: - .long __export_parasite_args - -__export_parasite_cmd: - .long 0 END(__export_parasite_head_start) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall-aux.h b/compel/arch/arm/plugins/std/syscalls/syscall-aux.h index 3d2056b5a..7418546e1 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall-aux.h +++ b/compel/arch/arm/plugins/std/syscalls/syscall-aux.h @@ -1,27 +1,27 @@ #ifndef __NR_mmap2 -# define __NR_mmap2 192 +#define __NR_mmap2 192 #endif #ifndef __ARM_NR_BASE -# define __ARM_NR_BASE 0x0f0000 +#define __ARM_NR_BASE 0x0f0000 #endif #ifndef __ARM_NR_breakpoint -# define __ARM_NR_breakpoint (__ARM_NR_BASE+1) +#define __ARM_NR_breakpoint (__ARM_NR_BASE + 1) #endif #ifndef __ARM_NR_cacheflush -# define __ARM_NR_cacheflush (__ARM_NR_BASE+2) +#define __ARM_NR_cacheflush (__ARM_NR_BASE + 2) #endif #ifndef __ARM_NR_usr26 -# define __ARM_NR_usr26 (__ARM_NR_BASE+3) +#define __ARM_NR_usr26 (__ARM_NR_BASE + 3) #endif #ifndef __ARM_NR_usr32 -# define __ARM_NR_usr32 (__ARM_NR_BASE+4) +#define __ARM_NR_usr32 (__ARM_NR_BASE + 4) #endif #ifndef __ARM_NR_set_tls -# define __ARM_NR_set_tls (__ARM_NR_BASE+5) +#define __ARM_NR_set_tls (__ARM_NR_BASE + 5) #endif diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def 
b/compel/arch/arm/plugins/std/syscalls/syscall.def index 721ff16dc..f4deb02b2 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -39,7 +39,7 @@ recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, str sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) -bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +bind 200 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) @@ -85,7 +85,7 @@ timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimer timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) @@ -112,3 +112,16 @@ userfaultfd 282 388 (int flags) fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) cacheflush ! 
983042 (void *start, void *end, int flags) ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) +move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +fsopen 430 430 (char *fsname, unsigned int flags) +fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) +fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) +close_range 436 436 (unsigned int fd, unsigned int max_fd, unsigned int flags) +pidfd_open 434 434 (pid_t pid, unsigned int flags) +openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) +pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) +rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) +map_shadow_stack 453 ! (unsigned long addr, unsigned long size, unsigned int flags) \ No newline at end of file diff --git a/compel/arch/arm/scripts/compel-pack.lds.S b/compel/arch/arm/scripts/compel-pack.lds.S index f8a4739f3..3d97bb139 100644 --- a/compel/arch/arm/scripts/compel-pack.lds.S +++ b/compel/arch/arm/scripts/compel-pack.lds.S @@ -29,8 +29,4 @@ SECTIONS *(.eh_frame*) *(*) } - -/* Parasite args should have 4 bytes align, as we have futex inside. */ -. 
= ALIGN(4); -__export_parasite_args = .; } diff --git a/compel/arch/arm/src/lib/handle-elf.c b/compel/arch/arm/src/lib/handle-elf.c index 8abf8dad1..a84524abd 100644 --- a/compel/arch/arm/src/lib/handle-elf.c +++ b/compel/arch/arm/src/lib/handle-elf.c @@ -1,14 +1,12 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused -elf_ident_32[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, +static const unsigned char __maybe_unused elf_ident_32[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; diff --git a/compel/arch/arm/src/lib/include/handle-elf.h b/compel/arch/arm/src/lib/include/handle-elf.h index e5971f37b..4b5e1457a 100644 --- a/compel/arch/arm/src/lib/include/handle-elf.h +++ b/compel/arch/arm/src/lib/include/handle-elf.h @@ -3,8 +3,8 @@ #include "elf32-types.h" -#define __handle_elf handle_elf_arm -#define arch_is_machine_supported(e_machine) (e_machine == EM_ARM) +#define __handle_elf handle_elf_arm +#define arch_is_machine_supported(e_machine) (e_machine == EM_ARM) extern int handle_elf_arm(void *mem, size_t size); diff --git a/compel/arch/arm/src/lib/include/syscall.h b/compel/arch/arm/src/lib/include/syscall.h index e2ec1272e..13ee906e1 100644 --- a/compel/arch/arm/src/lib/include/syscall.h +++ b/compel/arch/arm/src/lib/include/syscall.h @@ -1,4 +1,8 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) #endif diff --git a/compel/arch/arm/src/lib/include/uapi/asm/cpu.h b/compel/arch/arm/src/lib/include/uapi/asm/cpu.h index c35460e15..12e749508 100644 --- a/compel/arch/arm/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/arm/src/lib/include/uapi/asm/cpu.h @@ -1,6 +1,7 @@ #ifndef UAPI_COMPEL_ASM_CPU_H__ #define 
UAPI_COMPEL_ASM_CPU_H__ -typedef struct { } compel_cpuinfo_t; +typedef struct { +} compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h index b8286d404..8d328252e 100644 --- a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h @@ -4,8 +4,8 @@ #include #include -#define SIGMAX 64 -#define SIGMAX_OLD 31 +#define SIGMAX 64 +#define SIGMAX_OLD 31 /* * Copied from the Linux kernel header arch/arm/include/asm/ptrace.h @@ -14,53 +14,62 @@ */ typedef struct { - long uregs[18]; + long uregs[18]; } user_regs_struct_t; -typedef struct user_vfp user_fpregs_struct_t; +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) -#define ARM_cpsr uregs[16] -#define ARM_pc uregs[15] -#define ARM_lr uregs[14] -#define ARM_sp uregs[13] -#define ARM_ip uregs[12] -#define ARM_fp uregs[11] -#define ARM_r10 uregs[10] -#define ARM_r9 uregs[9] -#define ARM_r8 uregs[8] -#define ARM_r7 uregs[7] -#define ARM_r6 uregs[6] -#define ARM_r5 uregs[5] -#define ARM_r4 uregs[4] -#define ARM_r3 uregs[3] -#define ARM_r2 uregs[2] -#define ARM_r1 uregs[1] -#define ARM_r0 uregs[0] -#define ARM_ORIG_r0 uregs[17] +typedef struct user_vfp user_fpregs_struct_t; +#define ARM_cpsr uregs[16] +#define ARM_pc uregs[15] +#define ARM_lr uregs[14] +#define ARM_sp uregs[13] +#define ARM_ip uregs[12] +#define ARM_fp uregs[11] +#define ARM_r10 uregs[10] +#define ARM_r9 uregs[9] +#define ARM_r8 uregs[8] +#define ARM_r7 uregs[7] +#define ARM_r6 uregs[6] +#define ARM_r5 uregs[5] +#define ARM_r4 uregs[4] +#define ARM_r3 uregs[3] +#define ARM_r2 uregs[2] +#define ARM_r1 uregs[1] +#define ARM_r0 uregs[0] +#define ARM_ORIG_r0 uregs[17] /* Copied from arch/arm/include/asm/user.h */ struct user_vfp { - 
unsigned long long fpregs[32]; - unsigned long fpscr; + unsigned long long fpregs[32]; + unsigned long fpscr; }; struct user_vfp_exc { - unsigned long fpexc; - unsigned long fpinst; - unsigned long fpinst2; + unsigned long fpexc; + unsigned long fpinst; + unsigned long fpinst2; }; -#define REG_RES(regs) ((regs).ARM_r0) -#define REG_IP(regs) ((regs).ARM_pc) -#define REG_SP(regs) ((regs).ARM_sp) -#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) +#define REG_RES(regs) ((regs).ARM_r0) +#define REG_IP(regs) ((regs).ARM_pc) +#define SET_REG_IP(regs, val) ((regs).ARM_pc = (val)) +#define REG_SP(regs) ((regs).ARM_sp) +#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) -#define user_regs_native(pregs) true +#define user_regs_native(pregs) true -#define ARCH_SI_TRAP TRAP_BRKPT +#define ARCH_SI_TRAP TRAP_BRKPT -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h index 8745f4459..36edf231a 100644 --- a/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h +++ b/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h @@ -6,37 +6,37 @@ /* * PSR bits */ -#define USR26_MODE 0x00000000 -#define FIQ26_MODE 0x00000001 -#define IRQ26_MODE 0x00000002 -#define SVC26_MODE 0x00000003 -#define USR_MODE 0x00000010 -#define FIQ_MODE 0x00000011 -#define IRQ_MODE 0x00000012 -#define SVC_MODE 0x00000013 -#define ABT_MODE 0x00000017 -#define UND_MODE 0x0000001b -#define SYSTEM_MODE 0x0000001f -#define MODE32_BIT 0x00000010 -#define MODE_MASK 0x0000001f -#define PSR_T_BIT 0x00000020 -#define PSR_F_BIT 0x00000040 -#define PSR_I_BIT 0x00000080 -#define PSR_A_BIT 0x00000100 -#define PSR_E_BIT 0x00000200 -#define PSR_J_BIT 0x01000000 -#define PSR_Q_BIT 0x08000000 -#define PSR_V_BIT 0x10000000 -#define PSR_C_BIT 0x20000000 -#define PSR_Z_BIT 0x40000000 
-#define PSR_N_BIT 0x80000000 +#define USR26_MODE 0x00000000 +#define FIQ26_MODE 0x00000001 +#define IRQ26_MODE 0x00000002 +#define SVC26_MODE 0x00000003 +#define USR_MODE 0x00000010 +#define FIQ_MODE 0x00000011 +#define IRQ_MODE 0x00000012 +#define SVC_MODE 0x00000013 +#define ABT_MODE 0x00000017 +#define UND_MODE 0x0000001b +#define SYSTEM_MODE 0x0000001f +#define MODE32_BIT 0x00000010 +#define MODE_MASK 0x0000001f +#define PSR_T_BIT 0x00000020 +#define PSR_F_BIT 0x00000040 +#define PSR_I_BIT 0x00000080 +#define PSR_A_BIT 0x00000100 +#define PSR_E_BIT 0x00000200 +#define PSR_J_BIT 0x01000000 +#define PSR_Q_BIT 0x08000000 +#define PSR_V_BIT 0x10000000 +#define PSR_C_BIT 0x20000000 +#define PSR_Z_BIT 0x40000000 +#define PSR_N_BIT 0x80000000 /* * Groups of PSR bits */ -#define PSR_f 0xff000000 /* Flags */ -#define PSR_s 0x00ff0000 /* Status */ -#define PSR_x 0x0000ff00 /* Extension */ -#define PSR_c 0x000000ff /* Control */ +#define PSR_f 0xff000000 /* Flags */ +#define PSR_s 0x00ff0000 /* Status */ +#define PSR_x 0x0000ff00 /* Extension */ +#define PSR_c 0x000000ff /* Control */ #endif diff --git a/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h b/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h index 23ada50aa..3db9978d0 100644 --- a/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h @@ -6,42 +6,42 @@ /* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */ struct rt_sigcontext { - unsigned long trap_no; - unsigned long error_code; - unsigned long oldmask; - unsigned long arm_r0; - unsigned long arm_r1; - unsigned long arm_r2; - unsigned long arm_r3; - unsigned long arm_r4; - unsigned long arm_r5; - unsigned long arm_r6; - unsigned long arm_r7; - unsigned long arm_r8; - unsigned long arm_r9; - unsigned long arm_r10; - unsigned long arm_fp; - unsigned long arm_ip; - unsigned long arm_sp; - unsigned long arm_lr; - unsigned long arm_pc; - unsigned long arm_cpsr; - unsigned long 
fault_address; + unsigned long trap_no; + unsigned long error_code; + unsigned long oldmask; + unsigned long arm_r0; + unsigned long arm_r1; + unsigned long arm_r2; + unsigned long arm_r3; + unsigned long arm_r4; + unsigned long arm_r5; + unsigned long arm_r6; + unsigned long arm_r7; + unsigned long arm_r8; + unsigned long arm_r9; + unsigned long arm_r10; + unsigned long arm_fp; + unsigned long arm_ip; + unsigned long arm_sp; + unsigned long arm_lr; + unsigned long arm_pc; + unsigned long arm_cpsr; + unsigned long fault_address; }; /* Copied from the Linux kernel header arch/arm/include/asm/ucontext.h */ -#define VFP_MAGIC 0x56465001 -#define VFP_STORAGE_SIZE sizeof(struct vfp_sigframe) +#define VFP_MAGIC 0x56465001 +#define VFP_STORAGE_SIZE sizeof(struct vfp_sigframe) struct vfp_sigframe { - unsigned long magic; - unsigned long size; - struct user_vfp ufp; - struct user_vfp_exc ufp_exc; + unsigned long magic; + unsigned long size; + struct user_vfp ufp; + struct user_vfp_exc ufp_exc; }; -typedef struct vfp_sigframe fpu_state_t; +typedef struct vfp_sigframe fpu_state_t; struct aux_sigframe { /* @@ -49,23 +49,23 @@ struct aux_sigframe { struct iwmmxt_sigframe iwmmxt; */ - struct vfp_sigframe vfp; - unsigned long end_magic; + struct vfp_sigframe vfp; + unsigned long end_magic; } __attribute__((aligned(8))); #include struct sigframe { - struct rt_ucontext uc; - unsigned long retcode[2]; + struct rt_ucontext uc; + unsigned long retcode[2]; }; struct rt_sigframe { - struct rt_siginfo info; - struct sigframe sig; + struct rt_siginfo info; + struct sigframe sig; }; - +/* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mov sp, %0 \n" \ @@ -74,17 +74,16 @@ struct rt_sigframe { : \ : "r"(new_sp) \ : "memory") +/* clang-format on */ -#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->sig.uc) -#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->sig.uc.uc_mcontext.arm_ip -#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 -#define 
RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe) ((struct aux_sigframe *)&(rt_sigframe)->sig.uc.uc_regspace) -#define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe)->vfp) -#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->sig.uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->sig.uc.uc_mcontext.arm_ip +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 +#define RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe) ((struct aux_sigframe *)&(rt_sigframe)->sig.uc.uc_regspace) +#define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe)->vfp) +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 -#define rt_sigframe_erase_sigset(sigframe) \ - memset(&sigframe->sig.uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) -#define rt_sigframe_copy_sigset(sigframe, from) \ - memcpy(&sigframe->sig.uc.uc_sigmask, from, sizeof(k_rtsigset_t)) +#define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->sig.uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->sig.uc.uc_sigmask, from, sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index c17cb9c9b..a9fb639e2 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -1,8 +1,11 @@ #include #include #include +#include #include #include +#include + #include "common/page.h" #include "uapi/compel/asm/infect-types.h" #include "log.h" @@ -14,12 +17,11 @@ * Injected syscall instruction */ const char code_syscall[] = { - 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ - 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ + 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ + 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ }; -static const int -code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline __always_unused void __check_code_syscall(void) { @@ -27,9 +29,7 @@ 
static inline __always_unused void __check_code_syscall(void) BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } -int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, - user_regs_struct_t *regs, - user_fpregs_struct_t *fpregs) +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct aux_sigframe *aux = (struct aux_sigframe *)(void *)&sigframe->sig.uc.uc_regspace; @@ -59,22 +59,20 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, return 0; } -int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } #define PTRACE_GETVFPREGS 27 -int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, - void *arg, __maybe_unused unsigned long flags) +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *vfp, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t vfp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); - if (ptrace(PTRACE_GETVFPREGS, pid, NULL, &vfp)) { + if (ptrace(PTRACE_GETVFPREGS, pid, NULL, vfp)) { pr_perror("Can't obtain FPU registers for %d", pid); goto err; } @@ -90,24 +88,30 @@ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, regs->ARM_pc -= 4; break; case -ERESTART_RESTARTBLOCK: - regs->ARM_r0 = __NR_restart_syscall; - regs->ARM_pc -= 4; + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->ARM_r0 = -EINTR; break; } } - ret = save(arg, regs, &vfp); + ret = save(pid, arg, regs, vfp); err: return ret; } -int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4, - unsigned long arg5, - unsigned long arg6) +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + 
pr_info("Restoring GP/FPU registers for %d\n", pid); + + if (ptrace(PTRACE_SETVFPREGS, pid, NULL, ext_regs)) { + pr_perror("Can't set FPU registers for %d", pid); + return -1; + } + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; @@ -126,9 +130,7 @@ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, return err; } -void *remote_mmap(struct parasite_ctl *ctl, - void *addr, size_t length, int prot, - int flags, int fd, off_t offset) +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; @@ -136,8 +138,7 @@ void *remote_mmap(struct parasite_ctl *ctl, if (offset & ~PAGE_MASK) return 0; - err = compel_syscall(ctl, __NR_mmap2, &map, - (unsigned long)addr, length, prot, flags, fd, offset >> 12); + err = compel_syscall(ctl, __NR_mmap2, &map, (unsigned long)addr, length, prot, flags, fd, offset >> 12); if (err < 0 || map > ctl->ictx.task_size) map = 0; @@ -167,9 +168,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) long ret; int err; - err = compel_syscall(ctl, __NR_sigaltstack, - &ret, 0, (unsigned long)&s->sig.uc.uc_stack, - 0, 0, 0, 0); + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->sig.uc.uc_stack, 0, 0, 0, 0); return err ? 
err : ret; } @@ -178,9 +177,9 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) * arch/arm/include/asm/memory.h * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section) */ -#define TASK_SIZE_MIN 0x3f000000 -#define TASK_SIZE_MAX 0xbf000000 -#define SZ_1G 0x40000000 +#define TASK_SIZE_MIN 0x3f000000 +#define TASK_SIZE_MAX 0xbf000000 +#define SZ_1G 0x40000000 unsigned long compel_task_size(void) { @@ -192,4 +191,3 @@ unsigned long compel_task_size(void) return task_size; } - diff --git a/compel/arch/loongarch64/plugins/include/asm/prologue.h b/compel/arch/loongarch64/plugins/include/asm/prologue.h new file mode 100644 index 000000000..c19ce54d7 --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME: It should rather be taken from the sigframe header.
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..b883bd8be --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h @@ -0,0 +1,30 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#include +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +/* refer to arch/loongarch/include/uapi/asm/signal.h */ +#define _KNSIG 64 +#define _NSIG_BPW BITS_PER_LONG +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#define SA_RESTORER 0x04000000 + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/features.h b/compel/arch/loongarch64/plugins/include/features.h new file mode 100644 index 000000000..b4a3cded2 --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/loongarch64/plugins/std/parasite-head.S b/compel/arch/loongarch64/plugins/std/parasite-head.S new file mode 100644 index 000000000..3a960490e --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/parasite-head.S @@ -0,0 +1,9 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + bl parasite_service; + break 0; +END(__export_parasite_head_start) + diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..0d08f34e1 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + $(Q) 
cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | 
awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S new file mode 100644 index 000000000..fff894466 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S @@ -0,0 +1,44 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ +ENTRY(name); \ + addi.d $a7, $zero, opcode; \ + syscall 0; \ + jirl $r0, $r1, 0; \ +END(name) + +#ifndef AT_FDCWD +#define AT_FDCWD -100 +#endif + +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif + +ENTRY(sys_open) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_openat +END(sys_open) + +ENTRY(sys_mkdir) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_mkdirat +END(sys_mkdir) + +ENTRY(sys_rmdir) + addi.d $a2, $zero, AT_REMOVEDIR + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_unlinkat +END(sys_rmdir) + +ENTRY(__cr_restore_rt) + addi.d $a7, $zero, __NR_rt_sigreturn + syscall 0 +END(__cr_restore_rt) diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 000000000..83dcdab4a --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,122 @@ +# +# System calls table; please make sure the table consists only of the syscalls +# really used somewhere in the project. +# Numbers follow the generic syscall table (include/uapi/asm-generic/unistd.h), not the MIPS 5000-5999 range. +# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int fd,
struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 
sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid (void) 
+__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) 
+__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) diff --git a/compel/arch/loongarch64/scripts/compel-pack.lds.S b/compel/arch/loongarch64/scripts/compel-pack.lds.S new file mode 100644 index 000000000..cfb7a2fb3 --- /dev/null +++ b/compel/arch/loongarch64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(loongarch) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} diff --git a/compel/arch/loongarch64/src/lib/cpu.c b/compel/arch/loongarch64/src/lib/cpu.c new file mode 100644 index 000000000..172b90e27 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/cpu.c @@ -0,0 +1,41 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf-host.c b/compel/arch/loongarch64/src/lib/handle-elf-host.c new file mode 100644 index 000000000..a605a5a45 --- /dev/null +++ 
b/compel/arch/loongarch64/src/lib/handle-elf-host.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf.c b/compel/arch/loongarch64/src/lib/handle-elf.c new file mode 100644 index 000000000..a605a5a45 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/include/handle-elf.h b/compel/arch/loongarch64/src/lib/include/handle-elf.h new file mode 100644 index 000000000..b0a66ef87 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) ((e_machine) == EM_LOONGARCH) + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/syscall.h b/compel/arch/loongarch64/src/lib/include/syscall.h new file 
mode 100644 index 000000000..ac3e2799a --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif + +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..21eb1309f --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..e568df789 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..7f476d541 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..0b047a5b0 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,67 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h + * + * A 
thread LoongArch CPU context + * + * struct user_fp_state { + * uint64_t fpr[32]; + * uint64_t fcc; + * uint32_t fcsr; + * }; + * + * struct user_pt_regs { + * unsigned long regs[32]; + * unsigned long csr_era; + * unsigned long csr_badv; + * unsigned long reserved[11]; + * }; + */ + +struct user_gp_regs { + uint64_t regs[32]; + uint64_t orig_a0; + uint64_t pc; + uint64_t csr_badv; + uint64_t reserved[10]; +} __attribute__((aligned(8))); + +struct user_fp_regs { + uint64_t regs[32]; + uint64_t fcc; + uint32_t fcsr; +}; + +typedef struct user_gp_regs user_regs_struct_t; +typedef struct user_fp_regs user_fpregs_struct_t; + +#define user_regs_native(regs) true + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(r) ((uint64_t)(r).regs[4]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)(r).regs[3]) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) +#define SET_REG_IP(r, val) ((r).pc = (val)) + +#define GPR_NUM 32 +#define FPR_NUM 32 + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..fcb545a1d --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,86 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include +#include + +#include +#include + +#include + +#define rt_sigcontext sigcontext +/* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ +#include +typedef __u32 u32; + +typedef struct sigcontext_t { + __u64 pc; + __u64 regs[32]; + __u32 flags; + __u64 extcontext[0] __attribute__((__aligned__(16))); +} sigcontext_t; + +typedef struct context_info_t { + __u32 
magic; + __u32 size; + __u64 padding; +} context_info_t; + +#define FPU_CTX_MAGIC 0x46505501 +#define FPU_CTX_ALIGN 8 +typedef struct fpu_context_t { + __u64 regs[32]; + __u64 fcc; + __u64 fcsr; +} fpu_context_t; + +typedef struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + sigset_t uc_sigmask; + __u8 __unused[1024 / 8 - sizeof(sigset_t)]; + sigcontext_t uc_mcontext; +} ucontext; + +/* Copy from the kernel source arch/loongarch/kernel/signal.c */ +struct rt_sigframe { + rt_siginfo_t rs_info; + ucontext rs_uc; +}; + +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) +#define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) +#define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) + +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ({ \ + context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ + ctx->magic = FPU_CTX_MAGIC; \ + ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ + (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ + }) + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "addi.d $sp, %0, 0 \n" \ + "addi.d $a7, $zero, "__stringify(__NR_rt_sigreturn)" \n" \ + "syscall 0" \ + : \ + :"r"(new_sp) \ + : "$a7", "memory") +/* clang-format on */ + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/loongarch64/src/lib/infect.c 
b/compel/arch/loongarch64/src/lib/infect.c new file mode 100644 index 000000000..190c39227 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -0,0 +1,213 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" + +/* + * Injected syscall instruction + * loongarch64 is Little Endian + */ +const char code_syscall[] = { + 0x00, 0x00, 0x2b, 0x00, /* syscall */ + 0x00, 0x00, 0x2a, 0x00 /* break */ +}; + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigcontext_t *sc; + fpu_context_t *fpu; + + sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, regs->regs, sizeof(regs->regs)); + sc->pc = regs->pc; + + fpu = RT_SIGFRAME_FPU(sigframe); + memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +/* + * Fetch the general-purpose and FPU register sets of task @pid with + * PTRACE_GETREGSET (FPU state goes to a local buffer if @ext_regs is + * NULL), apply the syscall-restart fixup below to the GP set and pass + * both sets to the @save callback. + * + * Returns @save's result, or the non-zero ptrace() result if fetching + * either register set fails. + */ +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; + struct iovec iov; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + /* + * Refer to Linux kernel arch/loongarch/kernel/signal.c + */ + if (regs->regs[0]) { + switch (regs->regs[4]) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->regs[4] = regs->orig_a0; + regs->pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs->regs[4] = regs->orig_a0; + regs->regs[11] = __NR_restart_syscall; + regs->pc -= 4; + break; + } + regs->regs[0] = 0; /* Don't deal with this again. */ + } + + iov.iov_base = fpregs; + iov.iov_len = sizeof(user_fpregs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(pid, arg, regs, fpregs); +err: + return ret; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +/* + * Registers $4 ~ $11 represent arguments a0 ~ a7; a7 in particular is + * used as the syscall number. 
+ */ +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + int err; + user_regs_struct_t regs = ctl->orig.regs; + + regs.regs[11] = (unsigned long)nr; + regs.regs[4] = arg1; + regs.regs[5] = arg2; + regs.regs[6] = arg3; + regs.regs[7] = arg4; + regs.regs[8] = arg5; + regs.regs[9] = arg6; + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.regs[4]; + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->regs[4] = (unsigned long)stack; /* NOTE(review): regs[4] is a0, while REG_SP() uses regs[3] - confirm this is intended */ +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); + return err ? 
err : ret; +} + +/* + * TODO: breakpoint support is not implemented yet; both helpers are stubs that return 0. + */ +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +/* + * Refer to Linux kernel arch/loongarch/include/asm/processor.h + */ +#define TASK_SIZE32 (1UL << 31) +#define TASK_SIZE64_MIN (1UL << 40) +#define TASK_SIZE64_MAX (1UL << 48) + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} diff --git a/compel/arch/mips/plugins/include/asm/prologue.h b/compel/arch/mips/plugins/include/asm/prologue.h new file mode 100644 index 000000000..c19ce54d7 --- /dev/null +++ b/compel/arch/mips/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME: this should rather be taken from the sigframe header. 
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/mips/plugins/include/asm/syscall-types.h b/compel/arch/mips/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..bd7e8dfb3 --- /dev/null +++ b/compel/arch/mips/plugins/include/asm/syscall-types.h @@ -0,0 +1,36 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define SA_RESTORER 0x04000000 + +/* Refer to linux-3.10/arch/mips/include/uapi/asm/signal.h */ +#define _KNSIG 128 +#define _NSIG_BPW 64 + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +/* + * Note: as k_rtsigset_t is the same size for 32-bit and 64-bit, + * sig is defined as uint64_t rather than (unsigned long), in case + * we ever support native 32-bit compilation. 
+ */ + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/mips/plugins/include/features.h b/compel/arch/mips/plugins/include/features.h new file mode 100644 index 000000000..0f35725fa --- /dev/null +++ b/compel/arch/mips/plugins/include/features.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#define ARCH_HAS_MEMCPY + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/mips/plugins/std/memcpy.S b/compel/arch/mips/plugins/std/memcpy.S new file mode 100644 index 000000000..5d13a1590 --- /dev/null +++ b/compel/arch/mips/plugins/std/memcpy.S @@ -0,0 +1,22 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(memcpy) + .set noreorder + dadd v0,zero,a0 + daddiu t1,zero,0 +loop: + beq t1,a2,exit + nop + lb t2,0(a1) + sb t2,0(a0) + daddiu t1,t1,1 + daddiu a0,a0,1 + daddiu a1,a1,1 + j loop + nop +exit: + jr ra + nop +END(memcpy) diff --git a/compel/arch/mips/plugins/std/parasite-head.S b/compel/arch/mips/plugins/std/parasite-head.S new file mode 100644 index 000000000..33d04db96 --- /dev/null +++ b/compel/arch/mips/plugins/std/parasite-head.S @@ -0,0 +1,14 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + .set push + .set noreorder + jal parasite_service + nop + .byte 0x0d, 0x00, 0x00, 0x00 //break + .set pop +// .byte 0x40,0x01,0x00,0x00 //pause +END(__export_parasite_head_start) + diff --git a/compel/arch/mips/plugins/std/syscalls/Makefile.syscalls b/compel/arch/mips/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..ef75f9e95 --- /dev/null +++ b/compel/arch/mips/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o 
+sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-mips-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* 
Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): 
$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall-common-mips-64.S b/compel/arch/mips/plugins/std/syscalls/syscall-common-mips-64.S new file mode 100644 index 000000000..3478488da --- /dev/null +++ b/compel/arch/mips/plugins/std/syscalls/syscall-common-mips-64.S @@ -0,0 +1,12 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ + ENTRY(name); \ + li v0, opcode; \ + syscall; \ + jr ra; \ + nop; \ + END(name) + +ENTRY(__cr_restore_rt) +END(__cr_restore_rt) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 000000000..ad3d44634 --- /dev/null +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,123 @@ +# +# System calls table, please make sure the table consist only the syscalls +# really used somewhere in project. +# from kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h Linux 64-bit syscalls are in the range from 5000 to 5999. 
+# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_read 5000 sys_read (int fd, void *buf, unsigned long count) +__NR_write 5001 sys_write (int fd, const void *buf, unsigned long count) +__NR_open 5002 sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_close 5003 sys_close (int fd) +__NR_lseek 5008 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_mmap 5009 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 5010 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_munmap 5011 sys_munmap (void *addr, unsigned long len) +__NR_brk 5012 sys_brk (void *addr) +__NR_rt_sigaction 5013 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 5014 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigreturn 5211 sys_rt_sigreturn (void) +__NR_ioctl 5015 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_pread64 5016 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_mremap 5024 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_mincore 5026 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 5027 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_shmat 5029 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_dup2 5032 sys_dup2 (int oldfd, int newfd) +__NR_nanosleep 5034 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 5035 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 5036 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_getpid 
5038 sys_getpid (void) +__NR_socket 5040 sys_socket (int domain, int type, int protocol) +__NR_connect 5041 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 5043 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 5044 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_sendmsg 5045 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 5046 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_shutdown 5047 sys_shutdown (int sockfd, int how) +__NR_bind 5048 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_setsockopt 5053 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 5054 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_clone 5055 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_exit 5058 sys_exit (unsigned long error_code) +__NR_wait4 5059 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_kill 5060 sys_kill (long pid, int sig) +__NR_fcntl 5070 sys_fcntl (int fd, int type, long arg) +__NR_flock 5071 sys_flock (int fd, unsigned long cmd) +__NR_mkdir 5081 sys_mkdir (const char *name, int mode) +__NR_rmdir 5082 sys_rmdir (const char *name) +__NR_unlink 5085 sys_unlink (char *pathname) +__NR_umask 5093 sys_umask (int mask) +__NR_gettimeofday 5094 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_ptrace 5099 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_getgroups 5113 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 5114 sys_setgroups (int gsize, unsigned int *groups) +__NR_setresuid 5115 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 5116 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 5117 
sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 5118 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 5119 sys_getpgid (pid_t pid) +__NR_setfsuid 5120 sys_setfsuid (int fsuid) +__NR_setfsgid 5121 sys_setfsgid (int fsgid) +__NR_getsid 5122 sys_getsid (void) +__NR_capget 5123 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 5124 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_rt_sigqueueinfo 5127 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_sigaltstack 5129 sys_sigaltstack (const void *uss, void *uoss) +__NR_personality 5132 sys_personality (unsigned int personality) +__NR_setpriority 5138 sys_setpriority (int which, int who, int nice) +__NR_sched_setscheduler 5141 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_prctl 5153 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_setrlimit 5155 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_mount 5160 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_umount2 5161 sys_umount2 (char *name, int flags) +__NR_gettid 5178 sys_gettid (void) +__NR_futex 5194 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_cacheflush 5197 sys_cacheflush (char *addr, int nbytes, int cache) +__NR_io_setup 5200 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_getevents 5202 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_io_submit 5203 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_set_tid_address 5212 sys_set_tid_address (int *tid_addr) +__NR_restart_syscall 5213 sys_restart_syscall (void) +__NR_sys_timer_create 5216 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_settime 5217 sys_timer_settime 
(kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_gettime 5218 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 5219 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_delete 5220 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 5222 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) +__NR_exit_group 5205 sys_exit_group (int error_code) +__NR_set_thread_area 5242 sys_set_thread_area (unsigned long *addr) +__NR_openat 5247 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_waitid 5237 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_readlinkat 5257 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_ppoll 5261 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_set_robust_list 5268 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 5269 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_fallocate 5279 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_seccomp 5312 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_vmsplice 5266 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_timerfd_settime 5282 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_signalfd4 5283 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_preadv 5289 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_rt_tgsigqueueinfo 5291 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_fanotify_init 5295 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 
5296 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 5299 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 5303 sys_setns (int fd, int nstype) +__NR_kcmp 5306 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_memfd_create 5314 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 5317 sys_userfaultfd (int flags) + +##TODO for kernel +__NR_open_tree 5428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 5429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 5430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 5436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) +__NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/mips/scripts/compel-pack-compat.lds.S b/compel/arch/mips/scripts/compel-pack-compat.lds.S new file mode 100644 index 000000000..fe54847aa --- /dev/null +++ b/compel/arch/mips/scripts/compel-pack-compat.lds.S @@ -0,0 +1,3 @@ +OUTPUT_ARCH(mips) +EXTERN(__export_parasite_head_start) +ASSERT(0,"Compatible PIEs are unsupported on mips") diff --git a/compel/arch/mips/scripts/compel-pack.lds.S 
b/compel/arch/mips/scripts/compel-pack.lds.S new file mode 100644 index 000000000..370cac68f --- /dev/null +++ b/compel/arch/mips/scripts/compel-pack.lds.S @@ -0,0 +1,33 @@ +OUTPUT_ARCH(mips) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .text : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + *(.compel.exit) + *(.compel.init) + /* .rodata section*/ + *(.rodata*) + *(.got*) + /* .data section */ + *(.data*) + *(.bss*) + *(.sbss*) + *(.toc*) + } + + /DISCARD/ : { /*segments need to discard */ + *(.debug*) + *(.pdr) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(.MIPS.options) + *(.gnu.attributes) + } +} diff --git a/compel/arch/mips/src/lib/cpu.c b/compel/arch/mips/src/lib/cpu.c new file mode 100644 index 000000000..172b90e27 --- /dev/null +++ b/compel/arch/mips/src/lib/cpu.c @@ -0,0 +1,41 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} diff --git a/compel/arch/mips/src/lib/handle-elf-host.c b/compel/arch/mips/src/lib/handle-elf-host.c new file mode 120000 index 000000000..fe4611886 --- /dev/null +++ b/compel/arch/mips/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c 
new file mode 100644 index 000000000..e086761c2 --- /dev/null +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -0,0 +1,35 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; + + /* check ELF magic */ + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return -EINVAL; + } + + /* check ELF class and data encoding */ + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF class or data encoding\n"); + return -EINVAL; + } + + if (ehdr->e_ident[EI_ABIVERSION] != 0) { + pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); + } + + return __handle_elf(mem, size); +} diff --git a/compel/arch/mips/src/lib/include/handle-elf.h b/compel/arch/mips/src/lib/include/handle-elf.h new file mode 100644 index 000000000..aa650a2f6 --- /dev/null +++ b/compel/arch/mips/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) (e_machine == EM_MIPS) + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/mips/src/lib/include/ldsodefs.h b/compel/arch/mips/src/lib/include/ldsodefs.h new file mode 100644 index 000000000..97e79755d --- /dev/null +++ b/compel/arch/mips/src/lib/include/ldsodefs.h @@ -0,0 +1,130 @@ +/* + * Run-time dynamic linker data structures for loaded ELF shared objects. + * Copyright (C) 2000-2014 Free Software Foundation, Inc. + * This file is part of the GNU C Library. 
+ * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with the GNU C Library. If not, see + * . + */ + +#ifndef _MIPS_LDSODEFS_H +#define _MIPS_LDSODEFS_H 1 + +#include + +struct La_mips_32_regs; +struct La_mips_32_retval; +struct La_mips_64_regs; +struct La_mips_64_retval; + +#define ARCH_PLTENTER_MEMBERS \ + Elf32_Addr (*mips_o32_gnu_pltenter)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ + struct La_mips_32_regs *, unsigned int *, const char *name, \ + long int *framesizep); \ + Elf32_Addr (*mips_n32_gnu_pltenter)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ + struct La_mips_64_regs *, unsigned int *, const char *name, \ + long int *framesizep); \ + Elf64_Addr (*mips_n64_gnu_pltenter)(Elf64_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ + struct La_mips_64_regs *, unsigned int *, const char *name, \ + long int *framesizep); + +#define ARCH_PLTEXIT_MEMBERS \ + unsigned int (*mips_o32_gnu_pltexit)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ + const struct La_mips_32_regs *, struct La_mips_32_retval *, \ + const char *); \ + unsigned int (*mips_n32_gnu_pltexit)(Elf32_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ + const struct La_mips_64_regs *, struct La_mips_64_retval *, \ + const char *); \ + unsigned int (*mips_n64_gnu_pltexit)(Elf64_Sym *, unsigned int, uintptr_t *, uintptr_t *, \ + const struct La_mips_64_regs *, struct La_mips_64_retval *, \ + const char *); 
+ +/* The MIPS ABI specifies that the dynamic section has to be read-only. */ + +/* + * The 64-bit MIPS ELF ABI uses an unusual reloc format. Each + * relocation entry specifies up to three actual relocations, all at + * the same address. The first relocation which required a symbol + * uses the symbol in the r_sym field. The second relocation which + * requires a symbol uses the symbol in the r_ssym field. If all + * three relocations require a symbol, the third one uses a zero + * value. + * + * We define these structures in internal headers because we're not + * sure we want to make them part of the ABI yet. Eventually, some of + * this may move into elf/elf.h. + */ + +/* An entry in a 64 bit SHT_REL section. */ + +typedef struct { + Elf32_Word r_sym; /* Symbol index */ + unsigned char r_ssym; /* Special symbol for 2nd relocation */ + unsigned char r_type3; /* 3rd relocation type */ + unsigned char r_type2; /* 2nd relocation type */ + unsigned char r_type1; /* 1st relocation type */ +} _Elf64_Mips_R_Info; + +typedef union { + Elf64_Xword r_info_number; + _Elf64_Mips_R_Info r_info_fields; +} _Elf64_Mips_R_Info_union; + +typedef struct { + Elf64_Addr r_offset; /* Address */ + _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ +} Elf64_Mips_Rel; + +typedef struct { + Elf64_Addr r_offset; /* Address */ + _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ + Elf64_Sxword r_addend; /* Addend */ +} Elf64_Mips_Rela; + +#define ELF64_MIPS_R_SYM(i) ((__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_sym) + +#define ELF64_MIPS_R_TYPE(i) \ + (((_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_type1 | \ + ((Elf32_Word)(__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_type2 << 8) | \ + ((Elf32_Word)(__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_type3 << 16) | \ + ((Elf32_Word)(__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_ssym << 24)) + +#define ELF64_MIPS_R_INFO(sym, type) \ + 
(__extension__(_Elf64_Mips_R_Info_union)( \ + __extension__(_Elf64_Mips_R_Info){ (sym), ELF64_MIPS_R_SSYM(type), ELF64_MIPS_R_TYPE3(type), \ + ELF64_MIPS_R_TYPE2(type), ELF64_MIPS_R_TYPE1(type) }) \ + .r_info_number) + +/* + * These macros decompose the value returned by ELF64_MIPS_R_TYPE, and + * compose it back into a value that it can be used as an argument to + * ELF64_MIPS_R_INFO. + */ +#define ELF64_MIPS_R_SSYM(i) (((i) >> 24) & 0xff) +#define ELF64_MIPS_R_TYPE3(i) (((i) >> 16) & 0xff) +#define ELF64_MIPS_R_TYPE2(i) (((i) >> 8) & 0xff) +#define ELF64_MIPS_R_TYPE1(i) ((i)&0xff) +#define ELF64_MIPS_R_TYPEENC(type1, type2, type3, ssym) \ + ((type1) | ((Elf32_Word)(type2) << 8) | ((Elf32_Word)(type3) << 16) | ((Elf32_Word)(ssym) << 24)) + +#undef ELF64_R_SYM +#define ELF64_R_SYM(i) ELF64_MIPS_R_SYM(i) +#undef ELF64_R_TYPE + +/*fixme*/ +#define ELF64_R_TYPE(i) (ELF64_MIPS_R_TYPE(i) & 0x00ff) +#undef ELF64_R_INFO +#define ELF64_R_INFO(sym, type) ELF64_MIPS_R_INFO((sym), (type)) + +#endif diff --git a/compel/arch/mips/src/lib/include/syscall.h b/compel/arch/mips/src/lib/include/syscall.h new file mode 100644 index 000000000..6cad7ca73 --- /dev/null +++ b/compel/arch/mips/src/lib/include/syscall.h @@ -0,0 +1,7 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif +#endif diff --git a/compel/arch/mips/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/mips/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..21eb1309f --- /dev/null +++ b/compel/arch/mips/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/mips/src/lib/include/uapi/asm/cpu.h b/compel/arch/mips/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..e568df789 --- 
/dev/null +++ b/compel/arch/mips/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/mips/src/lib/include/uapi/asm/fpu.h b/compel/arch/mips/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..7f476d541 --- /dev/null +++ b/compel/arch/mips/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..481566a12 --- /dev/null +++ b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,68 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include +#include +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/mips/include/asm/ptrace.h + * + * A thread MIPS CPU context + */ +typedef struct { + /* Saved main processor registers. */ + __u64 regs[32]; + + /* Saved special registers. */ + __u64 lo; + __u64 hi; + __u64 cp0_epc; + __u64 cp0_badvaddr; + __u64 cp0_status; + __u64 cp0_cause; +} user_regs_struct_t; + +/* from linux-3.10/arch/mips/kernel/ptrace.c */ +typedef struct { + /* Saved fpu registers. 
*/ + __u64 regs[32]; + + __u32 fpu_fcr31; + __u32 fpu_id; + +} user_fpregs_struct_t; + +#define MIPS_a0 regs[4] //arguments a0-a3 +#define MIPS_t0 regs[8] //temporaries t0-t7 +#define MIPS_v0 regs[2] +#define MIPS_v1 regs[3] +#define MIPS_sp regs[29] +#define MIPS_ra regs[31] + +#define NATIVE_MAGIC 0x0A +#define COMPAT_MAGIC 0x0C +static inline bool user_regs_native(user_regs_struct_t *pregs) +{ + return true; +} + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(regs) ((regs).MIPS_v0) +#define REG_IP(regs) ((regs).cp0_epc) +#define SET_REG_IP(regs, val) ((regs).cp0_epc = (val)) +#define REG_SP(regs) ((regs).MIPS_sp) +#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) + +//#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/mips/src/lib/include/uapi/asm/sigframe.h b/compel/arch/mips/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..5d0a0628e --- /dev/null +++ b/compel/arch/mips/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,58 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include + +#include +#include + +#include +#define u32 __u32 + +/* sigcontext defined in /usr/include/asm/sigcontext.h*/ +#define rt_sigcontext sigcontext + +#include + +/* refer to linux-3.10/include/uapi/asm-generic/ucontext.h */ +struct k_ucontext { + unsigned long uc_flags; + struct k_ucontext *uc_link; + stack_t uc_stack; + struct sigcontext uc_mcontext; + k_rtsigset_t uc_sigmask; +}; + +/* Copy from the kernel source arch/mips/kernel/signal.c */ +struct rt_sigframe { + u32 rs_ass[4]; /* argument save space for o32 */ + u32 rs_pad[2]; /* Was: signal trampoline */ + siginfo_t rs_info; + struct k_ucontext rs_uc; +}; + +#define 
RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->rs_uc) +#define RT_SIGFRAME_UC_SIGMASK(rt_sigframe) ((k_rtsigset_t *)(void *)&rt_sigframe->rs_uc.uc_sigmask) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)0x00) +#define RT_SIGFRAME_FPU(rt_sigframe) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "move $29, %0 \n" \ + "li $2, "__stringify(__NR_rt_sigreturn)" \n" \ + "syscall \n" \ + : \ + : "r"(new_sp) \ + : "$2","memory") +/* clang-format on */ + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->rs_uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->rs_uc.uc_sigmask, from, sizeof(k_rtsigset_t)) +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h b/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h new file mode 100644 index 000000000..6db1ddbd3 --- /dev/null +++ b/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h @@ -0,0 +1,123 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998, 1999, 2001, 2003 Ralf Baechle + * Copyright (C) 2000, 2001 Silicon Graphics, Inc. + */ +#ifndef _UAPI_ASM_SIGINFO_H +#define _UAPI_ASM_SIGINFO_H + +#define __ARCH_SIGEV_PREAMBLE_SIZE (sizeof(long) + 2 * sizeof(int)) +#undef __ARCH_SI_TRAPNO /* exception code needs to fill this ... */ + +#define HAVE_ARCH_SIGINFO_T + +/* + * Careful to keep union _sifields from shifting ... 
+ */ + +#define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) + +#define __ARCH_SIGSYS + +#define SI_MAX_SIZE 128 +#define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) +#define __ARCH_SI_UID_T __kernel_uid32_t + +#ifndef __ARCH_SI_UID_T +#define __ARCH_SI_UID_T __kernel_uid32_t +#endif + +#ifndef __ARCH_SI_BAND_T +#define __ARCH_SI_BAND_T long +#endif + +#ifndef __ARCH_SI_CLOCK_T +#define __ARCH_SI_CLOCK_T __kernel_clock_t +#endif + +#ifndef __ARCH_SI_ATTRIBUTES +#define __ARCH_SI_ATTRIBUTES +#endif + +typedef struct siginfo { + int si_signo; + int si_errno; + int si_code; + + union { + int _pad[SI_PAD_SIZE]; + + /* kill() */ + struct { + __kernel_pid_t _pid; /* sender's pid */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + } _kill; + + /* POSIX.1b timers */ + struct { + __kernel_timer_t _tid; /* timer id */ + int _overrun; /* overrun count */ + char _pad[sizeof(__ARCH_SI_UID_T) - sizeof(int)]; + sigval_t _sigval; /* same as below */ + int _sys_private; /* not to be passed to user */ + } _timer; + + /* POSIX.1b signals */ + struct { + __kernel_pid_t _pid; /* sender's pid */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + sigval_t _sigval; + } _rt; + + /* SIGCHLD */ + struct { + __kernel_pid_t _pid; /* which child */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + int _status; /* exit code */ + __ARCH_SI_CLOCK_T _utime; + __ARCH_SI_CLOCK_T _stime; + } _sigchld; + + /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ + struct { + void *_addr; /* faulting insn/memory ref. 
*/ +#ifdef __ARCH_SI_TRAPNO + int _trapno; /* TRAP # which caused the signal */ +#endif + short _addr_lsb; /* LSB of the reported address */ +#ifndef __GENKSYMS__ + struct { + void *_lower; + void *_upper; + } _addr_bnd; +#endif + } _sigfault; + + /* SIGPOLL */ + struct { + __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + int _fd; + } _sigpoll; + + /* SIGSYS */ + struct { + void *_call_addr; /* calling user insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } _sigsys; + } _sifields; +} __ARCH_SI_ATTRIBUTES siginfo_t; + +/* + * si_code values + * Again these have been chosen to be IRIX compatible. + */ +#undef SI_ASYNCIO +#undef SI_TIMER +#undef SI_MESGQ +#define SI_ASYNCIO -2 /* sent by AIO completion */ + +#endif /* _UAPI_ASM_SIGINFO_H */ diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c new file mode 100644 index 000000000..a1d4865cc --- /dev/null +++ b/compel/arch/mips/src/lib/infect.c @@ -0,0 +1,310 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" + +/* + * Injected syscall instruction + * mips64el is Little Endian + */ +const char code_syscall[] = { + 0x0c, 0x00, 0x00, 0x00, /* syscall */ + 0x0d, 0x00, 0x00, 0x00 /* break */ +}; + +/* 10-byte legacy floating point register */ +struct fpreg { + uint16_t significand[4]; + uint16_t exponent; +}; + +/* 16-byte floating point register */ +struct fpxreg { + uint16_t significand[4]; + uint16_t exponent; + uint16_t padding[3]; +}; + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigframe->rs_uc.uc_mcontext.sc_regs[0] = regs->regs[0]; + sigframe->rs_uc.uc_mcontext.sc_regs[1] = 
regs->regs[1]; + sigframe->rs_uc.uc_mcontext.sc_regs[2] = regs->regs[2]; + sigframe->rs_uc.uc_mcontext.sc_regs[3] = regs->regs[3]; + sigframe->rs_uc.uc_mcontext.sc_regs[4] = regs->regs[4]; + sigframe->rs_uc.uc_mcontext.sc_regs[5] = regs->regs[5]; + sigframe->rs_uc.uc_mcontext.sc_regs[6] = regs->regs[6]; + sigframe->rs_uc.uc_mcontext.sc_regs[7] = regs->regs[7]; + sigframe->rs_uc.uc_mcontext.sc_regs[8] = regs->regs[8]; + sigframe->rs_uc.uc_mcontext.sc_regs[9] = regs->regs[9]; + sigframe->rs_uc.uc_mcontext.sc_regs[10] = regs->regs[10]; + sigframe->rs_uc.uc_mcontext.sc_regs[11] = regs->regs[11]; + sigframe->rs_uc.uc_mcontext.sc_regs[12] = regs->regs[12]; + sigframe->rs_uc.uc_mcontext.sc_regs[13] = regs->regs[13]; + sigframe->rs_uc.uc_mcontext.sc_regs[14] = regs->regs[14]; + sigframe->rs_uc.uc_mcontext.sc_regs[15] = regs->regs[15]; + sigframe->rs_uc.uc_mcontext.sc_regs[16] = regs->regs[16]; + sigframe->rs_uc.uc_mcontext.sc_regs[17] = regs->regs[17]; + sigframe->rs_uc.uc_mcontext.sc_regs[18] = regs->regs[18]; + sigframe->rs_uc.uc_mcontext.sc_regs[19] = regs->regs[19]; + sigframe->rs_uc.uc_mcontext.sc_regs[20] = regs->regs[20]; + sigframe->rs_uc.uc_mcontext.sc_regs[21] = regs->regs[21]; + sigframe->rs_uc.uc_mcontext.sc_regs[22] = regs->regs[22]; + sigframe->rs_uc.uc_mcontext.sc_regs[23] = regs->regs[23]; + sigframe->rs_uc.uc_mcontext.sc_regs[24] = regs->regs[24]; + sigframe->rs_uc.uc_mcontext.sc_regs[25] = regs->regs[25]; + sigframe->rs_uc.uc_mcontext.sc_regs[26] = regs->regs[26]; + sigframe->rs_uc.uc_mcontext.sc_regs[27] = regs->regs[27]; + sigframe->rs_uc.uc_mcontext.sc_regs[28] = regs->regs[28]; + sigframe->rs_uc.uc_mcontext.sc_regs[29] = regs->regs[29]; + sigframe->rs_uc.uc_mcontext.sc_regs[30] = regs->regs[30]; + sigframe->rs_uc.uc_mcontext.sc_regs[31] = regs->regs[31]; + sigframe->rs_uc.uc_mcontext.sc_mdlo = regs->lo; + sigframe->rs_uc.uc_mcontext.sc_mdhi = regs->hi; + sigframe->rs_uc.uc_mcontext.sc_pc = regs->cp0_epc; + + sigframe->rs_uc.uc_mcontext.sc_fpregs[0] = 
fpregs->regs[0]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[1] = fpregs->regs[1]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[2] = fpregs->regs[2]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[3] = fpregs->regs[3]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[4] = fpregs->regs[4]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[5] = fpregs->regs[5]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[6] = fpregs->regs[6]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[7] = fpregs->regs[7]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[8] = fpregs->regs[8]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[9] = fpregs->regs[9]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[10] = fpregs->regs[10]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[11] = fpregs->regs[11]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[12] = fpregs->regs[12]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[13] = fpregs->regs[13]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[14] = fpregs->regs[14]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[15] = fpregs->regs[15]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[16] = fpregs->regs[16]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[17] = fpregs->regs[17]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[18] = fpregs->regs[18]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[19] = fpregs->regs[19]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[20] = fpregs->regs[20]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[21] = fpregs->regs[21]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[22] = fpregs->regs[22]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[23] = fpregs->regs[23]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[24] = fpregs->regs[24]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[25] = fpregs->regs[25]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[26] = fpregs->regs[26]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[27] = fpregs->regs[27]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[28] = fpregs->regs[28]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[29] = fpregs->regs[29]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[30] = fpregs->regs[30]; + sigframe->rs_uc.uc_mcontext.sc_fpregs[31] = fpregs->regs[31]; + + return 0; +} 
+ +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + int ret = -1; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xs)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return ret; + } + + /*Restart the system call*/ + if (regs->regs[0]) { + switch ((long)(int)regs->regs[2]) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->regs[2] = regs->regs[0]; + regs->regs[7] = regs->regs[26]; + regs->cp0_epc -= 4; + break; + case ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->regs[2] = -EINTR; + break; + } + regs->regs[0] = 0; + } + + ret = save(pid, arg, regs, xs); + return ret; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + pr_info("Restoring GP/FPU registers for %d\n", pid); + + if (ptrace(PTRACE_SETFPREGS, pid, NULL, ext_regs)) { + pr_perror("Can't set FPU registers for %d", pid); + return -1; + } + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + /*refer to glibc-2.20/sysdeps/unix/sysv/linux/mips/mips64/syscall.S*/ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.regs[2] = (unsigned long)nr; //syscall_number will be in v0 + regs.regs[4] = arg1; + regs.regs[5] = arg2; + regs.regs[6] = arg3; + regs.regs[7] = arg4; + regs.regs[8] = arg5; + regs.regs[9] = arg6; + + err = compel_execute_syscall(ctl, ®s, code_syscall); + *ret = regs.regs[2]; + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = 
compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->cp0_epc = new_ip; + if (stack) { + /* regs[29] is sp */ + regs->regs[29] = (unsigned long)stack; + } +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); + return err ? err : ret; +} + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +/*refer to kernel linux-3.10/arch/mips/include/asm/processor.h*/ +#define TASK_SIZE32 0x7fff8000UL +#define TASK_SIZE64 0x10000000000UL +#define TASK_SIZE TASK_SIZE64 + +unsigned long compel_task_size(void) +{ + return TASK_SIZE; +} + +/* + * Get task registers (overwrites weak function) + * + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + return ptrace(PTRACE_GETREGS, pid, NULL, regs); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + return ptrace(PTRACE_SETREGS, pid, NULL, regs); +} + +void compel_relocs_apply_mips(void *mem, void *vbase, struct parasite_blob_desc *pbd) +{ + compel_reloc_t *elf_relocs = pbd->hdr.relocs; + size_t nr_relocs = pbd->hdr.nr_relocs; + size_t i, j; + + /* + * mips rebasing :load time relocation + * parasite.built-in.o and restorer.built-in.o is ELF 64-bit LSB relocatable for mips. 
+ * so we have to relocate some type for R_MIPS_26 R_MIPS_HIGHEST R_MIPS_HIGHER R_MIPS_HI16 and R_MIPS_LO16 in there. + * for mips64el .if toload/store data or jump instruct ,need to relocation R_TYPE + */ + for (i = 0, j = 0; i < nr_relocs; i++) { + if (elf_relocs[i].type & COMPEL_TYPE_MIPS_26) { + int *where = (mem + elf_relocs[i].offset); + *where = *where | + ((elf_relocs[i].addend + ((unsigned long)vbase & 0x00fffffff) /*low 28 bit*/) >> 2); + } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_64) { + unsigned long *where = (mem + elf_relocs[i].offset); + *where = elf_relocs[i].addend + (unsigned long)vbase; + } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_HI16) { + /* refer to binutils mips.cc */ + int *where = (mem + elf_relocs[i].offset); + int v_lo16 = (unsigned long)vbase & 0x00ffff; + + if ((v_lo16 + elf_relocs[i].value + elf_relocs[i].addend) >= 0x8000) { + *where = *where | ((((unsigned long)vbase >> 16) & 0xffff) + 0x1); + } else { + *where = *where | ((((unsigned long)vbase >> 16) & 0xffff)); + } + } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_LO16) { + int *where = (mem + elf_relocs[i].offset); + int v_lo16 = (unsigned long)vbase & 0x00ffff; + *where = *where | ((v_lo16 + elf_relocs[i].addend) & 0xffff); + } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_HIGHER) { + int *where = (mem + elf_relocs[i].offset); + *where = *where | ((((unsigned long)vbase + (uint64_t)0x80008000) >> 32) & 0xffff); + } else if (elf_relocs[i].type & COMPEL_TYPE_MIPS_HIGHEST) { + int *where = (mem + elf_relocs[i].offset); + *where = *where | ((((unsigned long)vbase + (uint64_t)0x800080008000llu) >> 48) & 0xffff); + } else { + BUG(); + } + } +} diff --git a/compel/arch/ppc64/plugins/include/asm/syscall-types.h b/compel/arch/ppc64/plugins/include/asm/syscall-types.h index 7754721e2..1bea8496b 100644 --- a/compel/arch/ppc64/plugins/include/asm/syscall-types.h +++ b/compel/arch/ppc64/plugins/include/asm/syscall-types.h @@ -1,7 +1,7 @@ #ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ 
#define COMPEL_ARCH_SYSCALL_TYPES_H__ -#define SA_RESTORER 0x04000000U +#define SA_RESTORER 0x04000000U typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; @@ -9,20 +9,20 @@ typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; -#define _KNSIG 64 -#define _NSIG_BPW 64 +#define _KNSIG 64 +#define _NSIG_BPW 64 -#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { - unsigned long sig[_KNSIG_WORDS]; + unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { - rt_sighandler_t rt_sa_handler; - unsigned long rt_sa_flags; - rt_sigrestore_t rt_sa_restorer; - k_rtsigset_t rt_sa_mask; + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; } rt_sigaction_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/ppc64/plugins/std/parasite-head.S b/compel/arch/ppc64/plugins/std/parasite-head.S index c870efdc2..c675ab508 100644 --- a/compel/arch/ppc64/plugins/std/parasite-head.S +++ b/compel/arch/ppc64/plugins/std/parasite-head.S @@ -4,10 +4,6 @@ .align 8 ENTRY(__export_parasite_head_start) - - // int __used parasite_service(unsigned int cmd, void *args) - // cmd = r3 = *__export_parasite_cmd (u32 ?) - // args = r4 = @parasite_args_ptr + @pc bl 0f 0: mflr r2 @@ -15,12 +11,6 @@ ENTRY(__export_parasite_head_start) addis reg,r2,(name - 0b)@ha; \ addi reg,r2,(name - 0b)@l; - LOAD_REG_ADDR(r3,__export_parasite_cmd) - lwz r3,0(r3) - - LOAD_REG_ADDR(r4,parasite_args_ptr) - ld r4,0(r4) - LOAD_REG_ADDR(r12,parasite_service_ptr) ld r12,0(r12) mtctr r12 @@ -28,9 +18,6 @@ ENTRY(__export_parasite_head_start) bctrl // call parasite_service twi 31,0,0 // Should generate SIGTRAP -parasite_args_ptr: - .quad __export_parasite_args - parasite_service_ptr: // We want to run the function prototype to set r2. 
// Since the relocation will prefer the local entry @@ -39,7 +26,4 @@ parasite_service_ptr: // FIXME: There should be a way to specify the global entry here. .quad parasite_service - 8 -__export_parasite_cmd: - .long 0 - END(__export_parasite_head_start) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index 3b3079040..3deb41cf7 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 246 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 234 sys_exit_group (int error_code) __NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) @@ -108,3 +108,15 @@ __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_preadv 320 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_userfaultfd 364 sys_userfaultfd (int flags) __NR_ppoll 281 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) 
+__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/ppc64/scripts/compel-pack.lds.S b/compel/arch/ppc64/scripts/compel-pack.lds.S index e0f826d7d..f197fb999 100644 --- a/compel/arch/ppc64/scripts/compel-pack.lds.S +++ b/compel/arch/ppc64/scripts/compel-pack.lds.S @@ -12,7 +12,7 @@ SECTIONS *(.compel.init) } - .data : { + .data : ALIGN(0x10000) { *(.data*) *(.bss*) } @@ -33,8 +33,4 @@ SECTIONS *(.group*) *(.eh_frame*) } - -/* Parasite args should have 4 bytes align, as we have futex inside. */ -. 
= ALIGN(4); -__export_parasite_args = .; } diff --git a/compel/arch/ppc64/src/lib/cpu.c b/compel/arch/ppc64/src/lib/cpu.c index 338ab4891..f7a128ca3 100644 --- a/compel/arch/ppc64/src/lib/cpu.c +++ b/compel/arch/ppc64/src/lib/cpu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "compel-cpu.h" @@ -9,7 +10,7 @@ #include "log.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; @@ -24,10 +25,20 @@ static void fetch_rt_cpuinfo(void) } } -void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } -void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } -int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } -int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} int compel_cpuid(compel_cpuinfo_t *info) { diff --git a/compel/arch/ppc64/src/lib/handle-elf.c b/compel/arch/ppc64/src/lib/handle-elf.c index 3d4020f59..84a360c43 100644 --- a/compel/arch/ppc64/src/lib/handle-elf.c +++ b/compel/arch/ppc64/src/lib/handle-elf.c @@ -1,20 +1,17 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused -elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; -static const unsigned char __maybe_unused -elf_ident_64_be[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, +static const 
unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; diff --git a/compel/arch/ppc64/src/lib/include/handle-elf.h b/compel/arch/ppc64/src/lib/include/handle-elf.h index 1a8217e6b..ae20186a2 100644 --- a/compel/arch/ppc64/src/lib/include/handle-elf.h +++ b/compel/arch/ppc64/src/lib/include/handle-elf.h @@ -5,8 +5,8 @@ #define ELF_PPC64 -#define __handle_elf handle_elf_ppc64 -#define arch_is_machine_supported(e_machine) (e_machine == EM_PPC64) +#define __handle_elf handle_elf_ppc64 +#define arch_is_machine_supported(e_machine) (e_machine == EM_PPC64) extern int handle_elf_ppc64(void *mem, size_t size); diff --git a/compel/arch/ppc64/src/lib/include/syscall.h b/compel/arch/ppc64/src/lib/include/syscall.h index e2ec1272e..13ee906e1 100644 --- a/compel/arch/ppc64/src/lib/include/syscall.h +++ b/compel/arch/ppc64/src/lib/include/syscall.h @@ -1,4 +1,8 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) #endif diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h b/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h index 59925868c..475e2bd59 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h @@ -4,7 +4,7 @@ #include typedef struct { - uint64_t hwcap[2]; + uint64_t hwcap[2]; } compel_cpuinfo_t; #endif /* UAPI_COMPEL_ASM_CPU_H__ */ diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h index 89fc4aa3c..25fc747e2 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h @@ -5,8 +5,8 @@ #include #include -#define SIGMAX_OLD 31 -#define SIGMAX 64 +#define SIGMAX_OLD 31 +#define SIGMAX 64 /* * 
Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h @@ -15,44 +15,44 @@ typedef struct { unsigned long gpr[32]; unsigned long nip; unsigned long msr; - unsigned long orig_gpr3; /* Used for restarting system calls */ + unsigned long orig_gpr3; /* Used for restarting system calls */ unsigned long ctr; unsigned long link; unsigned long xer; unsigned long ccr; - unsigned long softe; /* Soft enabled/disabled */ - unsigned long trap; /* Reason for being here */ + unsigned long softe; /* Soft enabled/disabled */ + unsigned long trap; /* Reason for being here */ /* * N.B. for critical exceptions on 4xx, the dar and dsisr * fields are overloaded to hold srr0 and srr1. */ - unsigned long dar; /* Fault registers */ - unsigned long dsisr; /* on 4xx/Book-E used for ESR */ - unsigned long result; /* Result of a system call */ + unsigned long dar; /* Fault registers */ + unsigned long dsisr; /* on 4xx/Book-E used for ESR */ + unsigned long result; /* Result of a system call */ } user_regs_struct_t; -#define NVSXREG 32 +#define NVSXREG 32 -#define USER_FPREGS_FL_FP 0x00001 -#define USER_FPREGS_FL_ALTIVEC 0x00002 -#define USER_FPREGS_FL_VSX 0x00004 -#define USER_FPREGS_FL_TM 0x00010 +#define USER_FPREGS_FL_FP 0x00001 +#define USER_FPREGS_FL_ALTIVEC 0x00002 +#define USER_FPREGS_FL_VSX 0x00004 +#define USER_FPREGS_FL_TM 0x00010 #ifndef NT_PPC_TM_SPR -# define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ -# define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ -# define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ -# define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ -# define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM 
Special Purpose Registers */ #endif -#define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */ -#define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */ -#define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */ -#define MSR_VEC (1UL<<25) -#define MSR_VSX (1UL<<23) +#define MSR_TMA (1UL << 34) /* bit 29 Trans Mem state: Transactional */ +#define MSR_TMS (1UL << 33) /* bit 30 Trans Mem state: Suspended */ +#define MSR_TM (1UL << 32) /* bit 31 Trans Mem Available */ +#define MSR_VEC (1UL << 25) +#define MSR_VSX (1UL << 23) -#define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && ((x)&(MSR_TMA|MSR_TMS))) != 0) +#define MSR_TM_ACTIVE(x) ((((x)&MSR_TM) && ((x) & (MSR_TMA | MSR_TMS))) != 0) typedef struct { uint64_t fpregs[NFPREG]; @@ -72,15 +72,25 @@ typedef struct { } tm; } user_fpregs_struct_t; -#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) -#define REG_IP(regs) ((uint64_t)(regs).nip) -#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) -#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) +#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) +#define REG_IP(regs) ((uint64_t)(regs).nip) +#define SET_REG_IP(regs, val) ((regs).nip = (val)) +#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) +#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) -#define user_regs_native(pregs) true +#define user_regs_native(pregs) true #define ARCH_SI_TRAP TRAP_BRKPT -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index 9467a1b99..0c4ccb648 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ 
b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -14,35 +14,41 @@ */ #include -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include -#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ -#define USER_REDZONE_SIZE 512 +#define USER_REDZONE_SIZE 512 +#if _CALL_ELF != 2 +#error Only supporting ABIv2. +#else +#define STACK_FRAME_MIN_SIZE 32 +#endif /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ -#define TRAMP_SIZE 6 +#define TRAMP_SIZE 6 /* * ucontext_t defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h */ struct rt_sigframe { - /* sys_rt_sigreturn requires the ucontext be the first field */ - ucontext_t uc; - ucontext_t uc_transact; /* Transactional state */ - unsigned long _unused[2]; - unsigned int tramp[TRAMP_SIZE]; - struct rt_siginfo *pinfo; - void *puc; - struct rt_siginfo info; - /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */ - char abigap[USER_REDZONE_SIZE]; + /* sys_rt_sigreturn requires the ucontext be the first field */ + ucontext_t uc; + ucontext_t uc_transact; /* Transactional state */ + unsigned long _unused[2]; + unsigned int tramp[TRAMP_SIZE]; + struct rt_siginfo *pinfo; + void *puc; + struct rt_siginfo info; + /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */ + char abigap[USER_REDZONE_SIZE]; } __attribute__((aligned(16))); +/* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "mr 1, %0 \n" \ @@ -50,30 +56,29 @@ struct rt_sigframe { "sc \n" \ : \ : "r"(new_sp) \ - : "1", "memory") + : "memory") +/* clang-format on */ #if _CALL_ELF != 2 -# error Only supporting ABIv2. +#error Only supporting ABIv2. 
#else -# define FRAME_MIN_SIZE_PARM 96 +#define FRAME_MIN_SIZE_PARM 96 #endif -#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe)->uc) -#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP]) -#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) -#define RT_SIGFRAME_FPU(rt_sigframe) (&(rt_sigframe)->uc.uc_mcontext) +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe)->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) +#define RT_SIGFRAME_FPU(rt_sigframe) (&(rt_sigframe)->uc.uc_mcontext) -#define rt_sigframe_erase_sigset(sigframe) \ - memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) -#define rt_sigframe_copy_sigset(sigframe, from) \ - memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) +#define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) -#define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */ -#define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */ -#define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */ -#define MSR_VEC (1UL<<25) -#define MSR_VSX (1UL<<23) +#define MSR_TMA (1UL << 34) /* bit 29 Trans Mem state: Transactional */ +#define MSR_TMS (1UL << 33) /* bit 30 Trans Mem state: Suspended */ +#define MSR_TM (1UL << 32) /* bit 31 Trans Mem Available */ +#define MSR_VEC (1UL << 25) +#define MSR_VSX (1UL << 23) -#define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && ((x)&(MSR_TMA|MSR_TMS))) != 0) +#define MSR_TM_ACTIVE(x) ((((x)&MSR_TM) && ((x) & (MSR_TMA | MSR_TMS))) != 0) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index defed3d85..54abd48a4 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -11,15 
+11,16 @@ #include "log.h" #include "common/bug.h" #include "common/page.h" +#include "common/err.h" #include "infect.h" #include "infect-priv.h" #ifndef NT_PPC_TM_SPR -#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ -#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ -#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ -#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ -#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ #endif unsigned __page_size = 0; @@ -29,8 +30,8 @@ unsigned __page_shift = 0; * Injected syscall instruction */ const uint32_t code_syscall[] = { - 0x44000002, /* sc */ - 0x0fe00000 /* twi 31,0,0 */ + 0x44000002, /* sc */ + 0x0fe00000 /* twi 31,0,0 */ }; static inline __always_unused void __check_code_syscall(void) @@ -43,14 +44,14 @@ static void prep_gp_regs(mcontext_t *dst, user_regs_struct_t *regs) { memcpy(dst->gp_regs, regs->gpr, sizeof(regs->gpr)); - dst->gp_regs[PT_NIP] = regs->nip; - dst->gp_regs[PT_MSR] = regs->msr; - dst->gp_regs[PT_ORIG_R3] = regs->orig_gpr3; - dst->gp_regs[PT_CTR] = regs->ctr; - dst->gp_regs[PT_LNK] = regs->link; - dst->gp_regs[PT_XER] = regs->xer; - dst->gp_regs[PT_CCR] = regs->ccr; - dst->gp_regs[PT_TRAP] = regs->trap; + dst->gp_regs[PT_NIP] = regs->nip; + dst->gp_regs[PT_MSR] = regs->msr; + dst->gp_regs[PT_ORIG_R3] = regs->orig_gpr3; + dst->gp_regs[PT_CTR] = regs->ctr; + dst->gp_regs[PT_LNK] = regs->link; + dst->gp_regs[PT_XER] = regs->xer; + dst->gp_regs[PT_CCR] = regs->ccr; + dst->gp_regs[PT_TRAP] = regs->trap; } static void put_fpu_regs(mcontext_t *mc, uint64_t *fpregs) @@ -74,9 +75,7 @@ static void put_vsx_regs(mcontext_t 
*mc, uint64_t *vsxregs) memcpy((uint64_t *)(mc->v_regs + 1), vsxregs, sizeof(*vsxregs) * NVSXREG); } -int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, - user_regs_struct_t *regs, - user_fpregs_struct_t *fpregs) +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { mcontext_t *dst_tc = &sigframe->uc_transact.uc_mcontext; mcontext_t *dst = &sigframe->uc.uc_mcontext; @@ -134,14 +133,12 @@ static void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext) uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext; lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset); - pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", - (unsigned long long)lcontext->v_regs, + pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", (unsigned long long)lcontext->v_regs, (unsigned long long)rcontext); } } -int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *frame, - struct rt_sigframe *rframe) +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *frame, struct rt_sigframe *rframe) { uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR]; @@ -155,9 +152,8 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *frame, /* Updating the transactional state address if any */ if (frame->uc.uc_link) { - update_vregs(&frame->uc_transact.uc_mcontext, - &rframe->uc_transact.uc_mcontext); - frame->uc.uc_link = &rframe->uc_transact; + update_vregs(&frame->uc_transact.uc_mcontext, &rframe->uc_transact.uc_mcontext); + frame->uc.uc_link = &rframe->uc_transact; } return 0; @@ -214,7 +210,7 @@ static int get_fpu_regs(pid_t pid, user_fpregs_struct_t *fp) static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp) { - if (ptrace(PTRACE_GETVRREGS, pid, 0, (void*)&fp->vrregs) < 0) { + if (ptrace(PTRACE_GETVRREGS, pid, 0, (void *)&fp->vrregs) < 0) { /* PTRACE_GETVRREGS returns EIO if Altivec is not supported. * This should not happen if msr_vec is set. 
*/ if (errno != EIO) { @@ -222,8 +218,7 @@ static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp) return -1; } pr_debug("Altivec not supported\n"); - } - else { + } else { pr_debug("Dumping Altivec registers\n"); fp->flags |= USER_FPREGS_FL_ALTIVEC; } @@ -241,7 +236,7 @@ static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp) */ static int get_vsx_regs(pid_t pid, user_fpregs_struct_t *fp) { - if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void*)fp->vsxregs) < 0) { + if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void *)fp->vsxregs) < 0) { /* * EIO is returned in the case PTRACE_GETVRREGS is not * supported. @@ -251,8 +246,7 @@ static int get_vsx_regs(pid_t pid, user_fpregs_struct_t *fp) return -1; } pr_debug("VSX register's dump not supported.\n"); - } - else { + } else { pr_debug("Dumping VSX registers\n"); fp->flags |= USER_FPREGS_FL_VSX; } @@ -265,22 +259,23 @@ static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) pr_debug("Dumping TM registers\n"); -#define TM_REQUIRED 0 -#define TM_OPTIONAL 1 -#define PTRACE_GET_TM(s,n,c,u) do { \ - iov.iov_base = &s; \ - iov.iov_len = sizeof(s); \ - if (ptrace(PTRACE_GETREGSET, pid, c, &iov)) { \ - if (!u || errno != EIO) { \ - pr_perror("Couldn't get TM "n); \ - pr_err("Your kernel seems to not support the " \ - "new TM ptrace API (>= 4.8)\n"); \ - goto out_free; \ - } \ - pr_debug("TM "n" not supported.\n"); \ - iov.iov_base = NULL; \ - } \ -} while(0) +#define TM_REQUIRED 0 +#define TM_OPTIONAL 1 +#define PTRACE_GET_TM(s, n, c, u) \ + do { \ + iov.iov_base = &s; \ + iov.iov_len = sizeof(s); \ + if (ptrace(PTRACE_GETREGSET, pid, c, &iov)) { \ + if (!u || errno != EIO) { \ + pr_perror("Couldn't get TM " n); \ + pr_err("Your kernel seems to not support the " \ + "new TM ptrace API (>= 4.8)\n"); \ + goto out_free; \ + } \ + pr_debug("TM " n " not supported.\n"); \ + iov.iov_base = NULL; \ + } \ + } while (0) /* Get special registers */ PTRACE_GET_TM(fpregs->tm.tm_spr_regs, "SPR", NT_PPC_TM_SPR, TM_REQUIRED); @@ 
-306,37 +301,61 @@ static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) return 0; out_free: - return -1; /* still failing the checkpoint */ + return -1; /* still failing the checkpoint */ } -static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, - user_fpregs_struct_t *fpregs) +/* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ + +#ifndef TRAP +#define TRAP(r) ((r).trap & ~0xF) +#endif + +static bool trap_is_scv(user_regs_struct_t *regs) +{ + return TRAP(*regs) == 0x3000; +} + +static bool trap_is_syscall(user_regs_struct_t *regs) +{ + return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; +} + +static void handle_syscall(pid_t pid, user_regs_struct_t *regs) +{ + unsigned long ret = regs->gpr[3]; + + if (trap_is_scv(regs)) { + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { + return; + } + + /* Restart or interrupt the system call */ + switch (ret) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->gpr[3] = trap_is_scv(regs) ? 
-EINTR : EINTR; + break; + } +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { pr_info("Dumping GP/FPU registers for %d\n", pid); - /* - * This is inspired by kernel function check_syscall_restart in - * arch/powerpc/kernel/signal.c - */ -#ifndef TRAP -#define TRAP(r) ((r).trap & ~0xF) -#endif - - if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { - /* Restart the system call */ - switch (regs->gpr[3]) { - case ERESTARTNOHAND: - case ERESTARTSYS: - case ERESTARTNOINTR: - regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; - break; - case ERESTART_RESTARTBLOCK: - regs->gpr[0] = __NR_restart_syscall; - regs->nip -= 4; - break; - } - } + if (trap_is_syscall(regs)) + handle_syscall(pid, regs); /* Resetting trap since we are now coming from user space. */ regs->trap = 0; @@ -349,10 +368,8 @@ static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, * impossible) or suspended (easy to get). */ if (MSR_TM_ACTIVE(regs->msr)) { - pr_debug("Task %d has %s TM operation at 0x%lx\n", - pid, - (regs->msr & MSR_TMS) ? "a suspended" : "an active", - regs->nip); + pr_debug("Task %d has %s TM operation at 0x%lx\n", pid, + (regs->msr & MSR_TMS) ? 
"a suspended" : "an active", regs->nip); if (get_tm_regs(pid, fpregs)) return -1; fpregs->flags = USER_FPREGS_FL_TM; @@ -374,26 +391,48 @@ static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, return 0; } -int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, - void *arg, __maybe_unused unsigned long flags) +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t fpregs; int ret; - ret = __get_task_regs(pid, regs, &fpregs); + ret = __get_task_regs(pid, regs, fpregs); if (ret) return ret; - return save(arg, regs, &fpregs); + return save(pid, arg, regs, fpregs); } -int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4, - unsigned long arg5, - unsigned long arg6) +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + int ret = 0; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + /* XXX: should restore TM registers somehow? 
*/ + if (ext_regs->flags & USER_FPREGS_FL_FP) { + if (ptrace(PTRACE_SETFPREGS, pid, 0, (void *)&ext_regs->fpregs) < 0) { + pr_perror("Couldn't set floating-point registers"); + ret = -1; + } + } + + if (ext_regs->flags & USER_FPREGS_FL_ALTIVEC) { + if (ptrace(PTRACE_SETVRREGS, pid, 0, (void *)&ext_regs->vrregs) < 0) { + pr_perror("Couldn't set Altivec registers"); + ret = -1; + } + if (ptrace(PTRACE_SETVSRREGS, pid, 0, (void *)ext_regs->vsxregs) < 0) { + pr_perror("Couldn't set VSX registers"); + ret = -1; + } + } + + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; @@ -406,21 +445,18 @@ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, regs.gpr[7] = arg5; regs.gpr[8] = arg6; - err = compel_execute_syscall(ctl, &regs, (char*)code_syscall); + err = compel_execute_syscall(ctl, &regs, (char *)code_syscall); *ret = regs.gpr[3]; return err; } -void *remote_mmap(struct parasite_ctl *ctl, - void *addr, size_t length, int prot, - int flags, int fd, off_t offset) +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map = 0; int err; - err = compel_syscall(ctl, __NR_mmap, &map, - (unsigned long)addr, length, prot, flags, fd, offset); + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0 || (long)map < 0) map = 0; @@ -430,13 +466,13 @@ void *remote_mmap(struct parasite_ctl *ctl, void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* - * OpenPOWER ABI requires that r12 is set to the calling function addressi + * OpenPOWER ABI requires that r12 is set to the calling function address * to compute the TOC pointer. 
*/ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) - regs->gpr[1] = (unsigned long) stack; + regs->gpr[1] = (unsigned long)stack - STACK_FRAME_MIN_SIZE; regs->trap = 0; } @@ -453,9 +489,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) long ret; int err; - err = compel_syscall(ctl, __NR_sigaltstack, - &ret, 0, (unsigned long)&s->uc.uc_stack, - 0, 0, 0, 0); + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? err : ret; } @@ -464,7 +498,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) * * NOTE: 32bit tasks are not supported. */ -#define TASK_SIZE_64TB (0x0000400000000000UL) +#define TASK_SIZE_64TB (0x0000400000000000UL) #define TASK_SIZE_512TB (0x0002000000000000UL) #define TASK_SIZE_MIN TASK_SIZE_64TB diff --git a/compel/arch/riscv64/plugins/include/asm/prologue.h b/compel/arch/riscv64/plugins/include/asm/prologue.h new file mode 100644 index 000000000..5c22b7b06 --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME It is rather should be taken from sigframe header. 
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/asm/syscall-types.h b/compel/arch/riscv64/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..b9740a9ee --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 // number of signals +#define _NSIG_BPW 64 // number of signals per word + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/features.h b/compel/arch/riscv64/plugins/include/features.h new file mode 100644 index 000000000..274cee52a --- /dev/null +++ b/compel/arch/riscv64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/parasite-head.S b/compel/arch/riscv64/plugins/std/parasite-head.S new file mode 100644 index 000000000..3e9d272e3 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/parasite-head.S @@ -0,0 +1,7 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + jal parasite_service + ebreak +END(__export_parasite_head_start) \ No newline at end of file diff --git 
a/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..5af35bcb4 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,59 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def +sys-asm-common-name := std/syscalls/syscall-common.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl +sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl + +sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o + +ifeq ($(ARCH),arm) +arch_bits := 32 +else +arch_bits := 64 +endif + +sys-exec-tbl := sys-exec-tbl.c + +$(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen) \ + $(sys-def) \ + $(sys-codes) \ + $(sys-proto) \ + $(sys-asm) \ + $(sys-asm-common-name) \ + $(sys-types) \ + $(arch_bits) + +$(sys-asm:.S=).o: $(sys-asm) + +$(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen-tbl) \ + $(sys-def) \ + $(sys-exec-tbl) \ + $(arch_bits) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h 
$(obj)/include/uapi/std/syscall-aux.h + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) $(sys-codes) +mrproper-y += $(std-headers-deps) +mrproper-y += $(obj)/include/uapi/std/syscall-aux.S +mrproper-y += $(obj)/include/uapi/std/syscall-aux.h \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl new file mode 100755 index 000000000..61a807eb6 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $tblout = $ARGV[1]; +my $bits = $ARGV[2]; + +my $code = "code$bits"; + +open TBLOUT, ">", $tblout or die $!; +open IN, "<", $in or die $!; + +print TBLOUT "/* Autogenerated, don't edit */\n"; +print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; + +for (<IN>) { + if ($_ =~ /\#/) { + next; + } + + my $sys_name; + my $sys_num; + + if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $sys_name = $+{alias}; + } elsif (/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $sys_name = $+{name}; + } else { + unlink $tblout; + die "Invalid syscall definition file: invalid entry $_\n"; + } + + $sys_num = $+{$code}; + + if ($sys_num ne "!") { + print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; + } +} + +print TBLOUT " { }, /* terminator */"; +print TBLOUT "};" \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl new file mode 100755 index 000000000..a53f1962f --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $codesout = $ARGV[1]; +my $codes = $ARGV[1]; +$codes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $protosout = $ARGV[2]; +my $protos = $ARGV[2]; +$protos =~ 
s/.*include\/uapi\//compel\/plugins\//g; +my $asmout = $ARGV[3]; +my $asmcommon = $ARGV[4]; +my $prototypes = $ARGV[5]; +$prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $bits = $ARGV[6]; + +my $codesdef = $codes; +$codesdef =~ tr/.\-\//_/; +my $protosdef = $protos; +$protosdef =~ tr/.\-\//_/; +my $code = "code$bits"; +my $need_aux = 0; + +unlink $codesout; +unlink $protosout; +unlink $asmout; + +open CODESOUT, ">", $codesout or die $!; +open PROTOSOUT, ">", $protosout or die $!; +open ASMOUT, ">", $asmout or die $!; +open IN, "<", $in or die $!; + +print CODESOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $codesdef +#define $codesdef +END + +print PROTOSOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $protosdef +#define $protosdef +#include <$prototypes> +#include <$codes> +END + +print ASMOUT <<"END"; +/* Autogenerated, don't edit */ +#include <$codes> +#include "$asmcommon" +END + + +for (<IN>) { + if ($_ =~ /\#/) { + next; + } + + my $code_macro; + my $sys_macro; + my $sys_name; + + if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{alias}"; + } elsif (/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{name}"; + } else { + unlink $codesout; + unlink $protosout; + unlink $asmout; + + die "Invalid syscall definition file: invalid entry $_\n"; + } + + if ($+{$code} ne "!") { + print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; + print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; + print ASMOUT "syscall $sys_name, $code_macro\n"; + + } else { + $need_aux = 1; + } + + print PROTOSOUT "extern long $sys_name($+{args});\n"; +} + +if ($need_aux == 1) { + print ASMOUT "#include <compel/plugins/std/syscall-aux.S>\n"; + print CODESOUT "#include <compel/plugins/std/syscall-aux.h>\n"; +} + +print CODESOUT "#endif /* $codesdef */"; +print PROTOSOUT "#endif /* $protosdef */"; \ No 
newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S new file mode 100644 index 000000000..04160b7ac --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S @@ -0,0 +1,37 @@ +/** + * This source contains emulation of syscalls + * that are not implemented in the riscv64 Linux kernel + */ + +ENTRY(sys_open) + add a3, x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_openat +END(sys_open) + + +ENTRY(sys_mkdir) + add a3,x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_mkdirat +END(sys_mkdir) + + +ENTRY(sys_rmdir) + addi a2, x0, 0x200 // flags = AT_REMOVEDIR + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_rmdir) + + +ENTRY(sys_unlink) + addi a2, x0, 0 // flags = 0 + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_unlink) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h new file mode 100644 index 000000000..881765bbb --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h @@ -0,0 +1,3 @@ +#ifndef __NR_openat +#define __NR_openat 56 +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S new file mode 100644 index 000000000..fdef3b47a --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S @@ -0,0 +1,17 @@ +#include "common/asm/linkage.h" + +syscall_common: + ecall + ret + +.macro syscall name, nr + ENTRY(\name) + li a7, \nr + j syscall_common + END(\name) +.endm + +ENTRY(__cr_restore_rt) + li a7, __NR_rt_sigreturn + ecall +END(__cr_restore_rt) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def new file mode 100644 index 000000000..967f097f9 --- 
/dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall.def @@ -0,0 +1,125 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. +# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). +# +# name/alias code64 code32 arguments +# ----------------------------------------------------------------------- +# +read 63 3 (int fd, void *buf, unsigned long count) +write 64 4 (int fd, const void *buf, unsigned long count) +open ! 5 (const char *filename, unsigned long flags, unsigned long mode) +close 57 6 (int fd) +lseek 62 19 (int fd, unsigned long offset, unsigned long origin) +mmap 222 ! (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) +munmap 215 91 (void *addr, unsigned long len) +brk 214 45 (void *addr) +rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +rt_sigreturn 139 173 (void) +ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) +pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) +ptrace 117 26 (long request, pid_t pid, void *addr, void *data) +mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) +mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) +madvise 233 220 (unsigned long start, size_t len, int behavior) +shmat 196 305 (int shmid, void *shmaddr, int shmflag) +pause 1061 29 (void) +nanosleep 101 162 (struct timespec *req, struct timespec *rem) +getitimer 102 105 (int which, const struct itimerval *val) +setitimer 103 104 (int which, const struct itimerval *val, struct itimerval 
*old) +getpid 172 20 (void) +socket 198 281 (int domain, int type, int protocol) +connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) +sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) +recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) +shutdown 210 293 (int sockfd, int how) +bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +exit 93 1 (unsigned long error_code) +wait4 260 114 (int pid, int *status, int options, struct rusage *ru) +waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +kill 129 37 (long pid, int sig) +fcntl 25 55 (int fd, int type, long arg) +flock 32 143 (int fd, unsigned long cmd) +mkdir ! 39 (const char *name, int mode) +rmdir ! 40 (const char *name) +unlink ! 
10 (char *pathname) +readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) +umask 166 60 (int mask) +getgroups 158 205 (int gsize, unsigned int *groups) +setgroups 159 206 (int gsize, unsigned int *groups) +setresuid 147 164 (int uid, int euid, int suid) +getresuid 148 165 (int *uid, int *euid, int *suid) +setresgid 149 170 (int gid, int egid, int sgid) +getresgid 150 171 (int *gid, int *egid, int *sgid) +getpgid 155 132 (pid_t pid) +setfsuid 151 138 (int fsuid) +setfsgid 152 139 (int fsgid) +getsid 156 147 (void) +capget 90 184 (struct cap_header *h, struct cap_data *d) +capset 91 185 (struct cap_header *h, struct cap_data *d) +rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) +setpriority 140 97 (int which, int who, int nice) +sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) +sigaltstack 132 186 (const void *uss, void *uoss) +personality 92 136 (unsigned int personality) +prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +arch_prctl ! 
17 (int option, unsigned long addr) +setrlimit 164 75 (int resource, struct krlimit *rlim) +mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +umount2 39 52 (char *name, int flags) +gettid 178 224 (void) +futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +set_tid_address 96 256 (int *tid_addr) +restart_syscall 128 0 (void) +timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) +timer_getoverrun 109 260 (int timer_id) +timer_delete 111 261 (kernel_timer_t timer_id) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) +exit_group 94 248 (int error_code) +set_robust_list 99 338 (struct robust_list_head *head, size_t len) +get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) +fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) +setns 268 375 (int fd, int nstype) +kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) +mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) +unlinkat 35 328 (int dirfd, const 
char *pathname, int flags) +memfd_create 279 385 (const char *name, unsigned int flags) +io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) +io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) +gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) +preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +userfaultfd 282 388 (int flags) +fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) +cacheflush ! 983042 (void *start, void *end, int flags) +ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +fsopen 430 430 (char *fsname, unsigned int flags) +fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) +fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) +pidfd_open 434 434 (pid_t pid, unsigned int flags) +pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) +rseq 293 293 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) +openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) +membarrier 283 283 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/riscv64/scripts/compel-pack.lds.S b/compel/arch/riscv64/scripts/compel-pack.lds.S new file mode 100644 index 000000000..a61235b44 --- /dev/null +++ b/compel/arch/riscv64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(riscv) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + 
ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/cpu.c b/compel/arch/riscv64/src/lib/cpu.c new file mode 100644 index 000000000..9a0291f70 --- /dev/null +++ b/compel/arch/riscv64/src/lib/cpu.c @@ -0,0 +1,78 @@ +#include +#include + +#include "compel-cpu.h" + +#include "common/bitops.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_cpuid(compel_cpuinfo_t *info) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_clear_cpu_cap(&rt_info, feature); +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) 
+{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf-host.c b/compel/arch/riscv64/src/lib/handle-elf-host.c new file mode 120000 index 000000000..fe4611886 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf.c b/compel/arch/riscv64/src/lib/handle-elf.c new file mode 100644 index 000000000..22420bc78 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf.c @@ -0,0 +1,32 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + const unsigned char *elf_ident = +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + elf_ident_64_le; +#else + elf_ident_64_be; +#endif + + if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) + return handle_elf_riscv64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} \ No newline at end of file diff --git a/test/zdtm/static/mntns-deleted-dst b/compel/arch/riscv64/src/lib/include/cpu.h similarity index 100% rename from test/zdtm/static/mntns-deleted-dst rename to compel/arch/riscv64/src/lib/include/cpu.h diff --git a/compel/arch/riscv64/src/lib/include/handle-elf.h b/compel/arch/riscv64/src/lib/include/handle-elf.h new file mode 100644 index 000000000..582770583 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/handle-elf.h @@ -0,0 +1,12 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include 
"elf64-types.h" + +#define __handle_elf handle_elf_riscv64 +#define ELF_RISCV +#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) + +extern int handle_elf_riscv64(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/syscall.h b/compel/arch/riscv64/src/lib/include/syscall.h new file mode 100644 index 000000000..53f10525d --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..f2ba799cb --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..ac58567e3 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,7 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..a74decc23 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + 
+#endif /* __CR_ASM_FPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..192810cac --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,52 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/riscv/include/uapi/asm/ptrace.h + * + * A thread RISC-V CPU context + */ +typedef struct user_regs_struct user_regs_struct_t; +typedef struct __riscv_d_ext_state user_fpregs_struct_t; + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(registers) ((uint64_t)(registers).a0) +#define REG_IP(registers) ((uint64_t)(registers).pc) +#define SET_REG_IP(registers, val) ((registers).pc = (val)) + +/* + * REG_SP is also defined in riscv64-linux-gnu/include/sys/ucontext.h + * with a different meaning, and it's not used in CRIU. So we have to + * undefine it here. 
+ */ +#ifdef REG_SP +#undef REG_SP +#endif + +#define REG_SP(registers) ((uint64_t)((registers).sp)) + +#define REG_SYSCALL_NR(registers) ((uint64_t)(registers).a7) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h new file mode 100644 index 000000000..e231d0465 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h @@ -0,0 +1,26 @@ +#ifndef COMPEL_RELOCATIONS_H__ +#define COMPEL_RELOCATIONS_H__ + +#include + +static inline uint32_t riscv_b_imm(uint32_t val) +{ + return (val & 0x00001000) << 19 | (val & 0x000007e0) << 20 | (val & 0x0000001e) << 7 | (val & 0x00000800) >> 4; +} + +static inline uint32_t riscv_i_imm(uint32_t val) +{ + return val << 20; +} + +static inline uint32_t riscv_u_imm(uint32_t val) +{ + return val & 0xfffff000; +} + +static inline uint32_t riscv_j_imm(uint32_t val) +{ + return (val & 0x00100000) << 11 | (val & 0x000007fe) << 20 | (val & 0x00000800) << 9 | (val & 0x000ff000); +} + +#endif /* COMPEL_RELOCATIONS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 000000000..e40fb6fce --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..761a08f62 --- /dev/null +++ 
b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,68 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include + +#include + +#include + +/* Copied from the kernel header arch/riscv/include/uapi/asm/sigcontext.h */ +/* + * Signal context structure + * + * This contains the context saved before a signal handler is invoked; + * it is restored by sys_sigreturn / sys_rt_sigreturn. + */ +// struct sigcontext { +// struct user_regs_struct sc_regs; +// union __riscv_fp_state sc_fpregs; +// /* +// * 4K + 128 reserved for vector state and future expansion. +// * This space is enough to store the vector context whose VLENB +// * is less or equal to 128. +// * (The size of the vector context is 4144 byte as VLENB is 128) +// */ +// __u8 __reserved[4224] __attribute__((__aligned__(16))); +// }; + +#define rt_sigcontext sigcontext + +#include + +/* Copied from the kernel source arch/riscv/kernel/signal.c */ +struct rt_sigframe { + siginfo_t info; + ucontext_t uc; //ucontext_t structure holds the user context, e.g., the signal mask, GP regs +}; + +/* + generates inline assembly code for triggering the rt_sigreturn system call. + used to return from a signal handler back to the normal execution flow of the process. 
+*/ +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mv sp, %0\n" \ + "li a7, "__stringify(__NR_rt_sigreturn)" \n" \ + "ecall\n" \ + : \ + : "r"(new_sp) \ + : "a7", "memory") +/* clang-format on */ + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.__gregs[REG_PC]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +// #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) +// #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct sigcontext *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) +// #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) // erase the signal mask +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) // copy the signal mask + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c new file mode 100644 index 000000000..3f3a4b7ec --- /dev/null +++ b/compel/arch/riscv64/src/lib/infect.c @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include +#include "common/page.h" +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +unsigned __page_size = 0; +unsigned __page_shift = 0; + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x73, 0x00, 0x00, 0x00, /* ecall */ + 0x73, 0x00, 0x10, 0x00 /* ebreak */ +}; + +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline void __always_unused __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != 
BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigframe->uc.uc_mcontext.__gregs[0] = regs->pc; + sigframe->uc.uc_mcontext.__gregs[1] = regs->ra; + sigframe->uc.uc_mcontext.__gregs[2] = regs->sp; + sigframe->uc.uc_mcontext.__gregs[3] = regs->gp; + sigframe->uc.uc_mcontext.__gregs[4] = regs->tp; + sigframe->uc.uc_mcontext.__gregs[5] = regs->t0; + sigframe->uc.uc_mcontext.__gregs[6] = regs->t1; + sigframe->uc.uc_mcontext.__gregs[7] = regs->t2; + sigframe->uc.uc_mcontext.__gregs[8] = regs->s0; + sigframe->uc.uc_mcontext.__gregs[9] = regs->s1; + sigframe->uc.uc_mcontext.__gregs[10] = regs->a0; + sigframe->uc.uc_mcontext.__gregs[11] = regs->a1; + sigframe->uc.uc_mcontext.__gregs[12] = regs->a2; + sigframe->uc.uc_mcontext.__gregs[13] = regs->a3; + sigframe->uc.uc_mcontext.__gregs[14] = regs->a4; + sigframe->uc.uc_mcontext.__gregs[15] = regs->a5; + sigframe->uc.uc_mcontext.__gregs[16] = regs->a6; + sigframe->uc.uc_mcontext.__gregs[17] = regs->a7; + sigframe->uc.uc_mcontext.__gregs[18] = regs->s2; + sigframe->uc.uc_mcontext.__gregs[19] = regs->s3; + sigframe->uc.uc_mcontext.__gregs[20] = regs->s4; + sigframe->uc.uc_mcontext.__gregs[21] = regs->s5; + sigframe->uc.uc_mcontext.__gregs[22] = regs->s6; + sigframe->uc.uc_mcontext.__gregs[23] = regs->s7; + sigframe->uc.uc_mcontext.__gregs[24] = regs->s8; + sigframe->uc.uc_mcontext.__gregs[25] = regs->s9; + sigframe->uc.uc_mcontext.__gregs[26] = regs->s10; + sigframe->uc.uc_mcontext.__gregs[27] = regs->s11; + sigframe->uc.uc_mcontext.__gregs[28] = regs->t3; + sigframe->uc.uc_mcontext.__gregs[29] = regs->t4; + sigframe->uc.uc_mcontext.__gregs[30] = regs->t5; + sigframe->uc.uc_mcontext.__gregs[31] = regs->t6; + + memcpy(sigframe->uc.uc_mcontext.__fpregs.__d.__f, fpregs->f, sizeof(fpregs->f)); + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpregs->fcsr; + + return 0; +} + +int 
sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; + struct iovec iov; + int ret = -1; + + pr_info("Dumping FPU registers for %d\n", pid); + + iov.iov_base = fpsimd; + iov.iov_len = sizeof(*fpsimd); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + return -1; + } + + ret = save(pid, arg, regs, fpsimd); + return ret; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.a7 = (unsigned long)nr; + regs.a0 = arg1; + regs.a1 = arg2; + regs.a2 = arg3; + regs.a3 = arg4; + regs.a4 = arg5; + regs.a5 = arg6; + regs.a6 = 0; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.a0; + return err; +} + +/* + * Calling the mmap system call in the context of the target (victim) process using the compel_syscall function. + * Used during the infection process to allocate memory for the parasite code. 
+*/ +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->sp = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here. + */ + return true; +} + +/* + * Fetch the signal alternate stack (sigaltstack), + * sas is a separate memory area for the signal handler to run on, + * avoiding potential issues with the main process stack +*/ +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Task size is the maximum virtual address space size that a process can occupy in the memory + * Refer to linux kernel arch/riscv/include/asm/pgtable.h, + * task size is: + * - 0x9fc00000 (~2.5GB) for RV32. 
+ * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * - 0x100000000000000 ( 64PB) for RV64 using SV57 mmu + */ +#define TASK_SIZE_MIN (1UL << 38) +#define TASK_SIZE_MAX (1UL << 56) + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} + +/* + * Get task registers (overwrites weak function) + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} diff --git a/compel/arch/s390/plugins/include/asm/syscall-types.h b/compel/arch/s390/plugins/include/asm/syscall-types.h index 55d7ddb61..dd635399d 100644 --- a/compel/arch/s390/plugins/include/asm/syscall-types.h +++ b/compel/arch/s390/plugins/include/asm/syscall-types.h @@ -1,7 +1,7 @@ #ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ #define COMPEL_ARCH_SYSCALL_TYPES_H__ -#define SA_RESTORER 0x04000000U +#define SA_RESTORER 0x04000000U typedef void rt_signalfn_t(int, siginfo_t *, void *); typedef rt_signalfn_t *rt_sighandler_t; @@ -9,13 +9,13 @@ typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; -#define _KNSIG 64 -#define _NSIG_BPW 64 +#define _KNSIG 64 +#define _NSIG_BPW 64 -#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) typedef struct { - unsigned long sig[_KNSIG_WORDS]; + unsigned long sig[_KNSIG_WORDS]; } k_rtsigset_t; /* @@ -23,10 +23,10 @@ typedef struct { * 
include/linux/signal.h. */ typedef struct { - rt_sighandler_t rt_sa_handler; - unsigned long rt_sa_flags; - rt_sigrestore_t rt_sa_restorer; - k_rtsigset_t rt_sa_mask; + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; } rt_sigaction_t; struct mmap_arg_struct; diff --git a/compel/arch/s390/plugins/std/parasite-head.S b/compel/arch/s390/plugins/std/parasite-head.S index f4cb37276..1e276a2f5 100644 --- a/compel/arch/s390/plugins/std/parasite-head.S +++ b/compel/arch/s390/plugins/std/parasite-head.S @@ -2,25 +2,7 @@ .section .head.text, "ax" -/* - * Entry point for parasite_service() - * - * Addresses of symbols are exported in auto-generated criu/pie/parasite-blob.h - * - * Function is called via parasite_run(). The command for parasite_service() - * is stored in global variable __export_parasite_cmd. - * - * Load parameters for parasite_service(unsigned int cmd, void *args): - * - * - Parameter 1 (cmd) : %r2 = *(uint32 *)(__export_parasite_cmd + pc) - * - Parameter 2 (args): %r3 = __export_parasite_args + pc - */ ENTRY(__export_parasite_head_start) - larl %r14,__export_parasite_cmd - llgf %r2,0(%r14) - larl %r3,__export_parasite_args brasl %r14,parasite_service .long 0x00010001 /* S390_BREAKPOINT_U16: Generates SIGTRAP */ -__export_parasite_cmd: - .long 0 END(__export_parasite_head_start) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index cc13a63dd..ff2f33006 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) 
-__NR_clock_gettime 260 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 260 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 248 sys_exit_group (int error_code) __NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, size_t len) @@ -108,3 +108,15 @@ __NR_userfaultfd 355 sys_userfaultfd (int flags) __NR_preadv 328 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_ppoll 302 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c b/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c 
index 2b35cca4a..11c3284ab 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c +++ b/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c @@ -3,14 +3,13 @@ /* * Define prototype because of compile error if we include uapi/std/syscall.h */ -long sys_old_mmap (struct mmap_arg_struct *); +long sys_old_mmap(struct mmap_arg_struct *); /* * On s390 we have defined __ARCH_WANT_SYS_OLD_MMAP - Therefore implement * system call with one parameter "mmap_arg_struct". */ -unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, +unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) { struct mmap_arg_struct arg_struct; diff --git a/compel/arch/s390/scripts/compel-pack.lds.S b/compel/arch/s390/scripts/compel-pack.lds.S index 91ffbda3e..a82118983 100644 --- a/compel/arch/s390/scripts/compel-pack.lds.S +++ b/compel/arch/s390/scripts/compel-pack.lds.S @@ -12,7 +12,7 @@ SECTIONS *(.compel.init) } - .data : { + .data : ALIGN(0x1000) { *(.data*) *(.bss*) } @@ -33,8 +33,4 @@ SECTIONS *(.group*) *(.eh_frame*) } - -/* Parasite args should have 4 bytes align, as we have futex inside. */ -. 
= ALIGN(4); -__export_parasite_args = .; } diff --git a/compel/arch/s390/src/lib/cpu.c b/compel/arch/s390/src/lib/cpu.c index 5d86bf239..c98607e16 100644 --- a/compel/arch/s390/src/lib/cpu.c +++ b/compel/arch/s390/src/lib/cpu.c @@ -9,7 +9,7 @@ #include "log.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; @@ -23,10 +23,20 @@ static void fetch_rt_cpuinfo(void) } } -void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } -void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } -int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } -int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { return 0; } +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} int compel_cpuid(compel_cpuinfo_t *info) { diff --git a/compel/arch/s390/src/lib/handle-elf.c b/compel/arch/s390/src/lib/handle-elf.c index 01a8bf4c8..8e766dc1b 100644 --- a/compel/arch/s390/src/lib/handle-elf.c +++ b/compel/arch/s390/src/lib/handle-elf.c @@ -1,14 +1,12 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused -elf_ident_64[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, +static const unsigned char __maybe_unused elf_ident_64[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; diff --git a/compel/arch/s390/src/lib/include/handle-elf.h b/compel/arch/s390/src/lib/include/handle-elf.h index cd1357401..597d8059f 100644 --- a/compel/arch/s390/src/lib/include/handle-elf.h +++ b/compel/arch/s390/src/lib/include/handle-elf.h @@ 
-5,8 +5,8 @@ #define ELF_S390 -#define __handle_elf handle_elf_s390 -#define arch_is_machine_supported(e_machine) (e_machine == EM_S390) +#define __handle_elf handle_elf_s390 +#define arch_is_machine_supported(e_machine) (e_machine == EM_S390) int handle_elf_s390(void *mem, size_t size); diff --git a/compel/arch/s390/src/lib/include/syscall.h b/compel/arch/s390/src/lib/include/syscall.h index 57d49121f..828f29e4b 100644 --- a/compel/arch/s390/src/lib/include/syscall.h +++ b/compel/arch/s390/src/lib/include/syscall.h @@ -1,8 +1,7 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ -unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, +unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset); #endif diff --git a/compel/arch/s390/src/lib/include/uapi/asm/cpu.h b/compel/arch/s390/src/lib/include/uapi/asm/cpu.h index b01db511d..c2652b2f4 100644 --- a/compel/arch/s390/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/s390/src/lib/include/uapi/asm/cpu.h @@ -4,7 +4,7 @@ #include typedef struct { - uint64_t hwcap[2]; + uint64_t hwcap[2]; } compel_cpuinfo_t; #endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h index fddf65d3b..87283bc6b 100644 --- a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h @@ -7,8 +7,8 @@ #include #include "common/page.h" -#define SIGMAX 64 -#define SIGMAX_OLD 31 +#define SIGMAX 64 +#define SIGMAX_OLD 31 /* * Definitions from /usr/include/asm/ptrace.h: @@ -33,28 +33,28 @@ typedef struct { } vector128_t; struct prfpreg { - uint32_t fpc; - uint64_t fprs[16]; + uint32_t fpc; + uint64_t fprs[16]; }; -#define USER_FPREGS_VXRS 0x000000001 +#define USER_FPREGS_VXRS 0x000000001 /* Guarded-storage control block */ -#define USER_GS_CB 
0x000000002 +#define USER_GS_CB 0x000000002 /* Guarded-storage broadcast control block */ -#define USER_GS_BC 0x000000004 +#define USER_GS_BC 0x000000004 /* Runtime-instrumentation control block */ -#define USER_RI_CB 0x000000008 +#define USER_RI_CB 0x000000008 /* Runtime-instrumentation bit set */ -#define USER_RI_ON 0x000000010 +#define USER_RI_ON 0x000000010 typedef struct { - uint32_t flags; - struct prfpreg prfpreg; - uint64_t vxrs_low[16]; - vector128_t vxrs_high[16]; - uint64_t gs_cb[4]; - uint64_t gs_bc[4]; - uint64_t ri_cb[8]; + uint32_t flags; + struct prfpreg prfpreg; + uint64_t vxrs_low[16]; + vector128_t vxrs_high[16]; + uint64_t gs_cb[4]; + uint64_t gs_bc[4]; + uint64_t ri_cb[8]; } user_fpregs_struct_t; typedef struct { @@ -62,18 +62,23 @@ typedef struct { uint32_t system_call; } user_regs_struct_t; -#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) -#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) -#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) +#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) +#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) +#define SET_REG_IP(r, val) ((r).prstatus.psw.addr = (val)) +#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) /* * We assume that REG_SYSCALL_NR() is only used for pie code where we * always use svc 0 with opcode in %r1. 
*/ -#define REG_SYSCALL_NR(r) ((uint64_t)(r).prstatus.gprs[1]) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).prstatus.gprs[1]) -#define user_regs_native(pregs) true +#define user_regs_native(pregs) true -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) struct mmap_arg_struct { unsigned long addr; @@ -84,4 +89,9 @@ struct mmap_arg_struct { unsigned long offset; }; +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h b/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h index b6b894473..965fef102 100644 --- a/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h @@ -36,14 +36,14 @@ typedef struct { * From /usr/include/uapi/asm/ucontext.h */ struct ucontext_extended { - unsigned long uc_flags; - ucontext_t *uc_link; - stack_t uc_stack; - _sigregs uc_mcontext; - sigset_t uc_sigmask; + unsigned long uc_flags; + ucontext_t *uc_link; + stack_t uc_stack; + _sigregs uc_mcontext; + sigset_t uc_sigmask; /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. 
*/ - unsigned char __unused[128 - sizeof(sigset_t)]; - _sigregs_ext uc_mcontext_ext; + unsigned char __unused[128 - sizeof(sigset_t)]; + _sigregs_ext uc_mcontext_ext; }; /* @@ -59,6 +59,7 @@ struct rt_sigframe { /* * Do rt_sigreturn SVC */ +/* clang-format off */ #define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ asm volatile( \ "lgr %%r15,%0\n" \ @@ -66,15 +67,14 @@ struct rt_sigframe { "svc 0\n" \ : \ : "d" (new_sp) \ - : "15", "memory") + : "memory") +/* clang-format on */ -#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) -#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.regs.psw.addr -#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.regs.psw.addr +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) -#define rt_sigframe_erase_sigset(sigframe) \ - memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) -#define rt_sigframe_copy_sigset(sigframe, from) \ - memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) +#define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) #endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 00e9c36d2..a77b38917 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -16,25 +17,23 @@ #include "ptrace.h" #include "infect-priv.h" -#define NT_PRFPREG 2 -#define NT_S390_VXRS_LOW 0x309 -#define NT_S390_VXRS_HIGH 0x30a -#define NT_S390_GS_CB 0x30b -#define NT_S390_GS_BC 0x30c -#define NT_S390_RI_CB 0x30d +#define NT_PRFPREG 2 +#define NT_S390_VXRS_LOW 0x309 +#define NT_S390_VXRS_HIGH 0x30a +#define NT_S390_GS_CB 0x30b +#define NT_S390_GS_BC 0x30c +#define 
NT_S390_RI_CB 0x30d /* * Print general purpose and access registers */ -static void print_user_regs_struct(const char *msg, int pid, - user_regs_struct_t *regs) +static void print_user_regs_struct(const char *msg, int pid, user_regs_struct_t *regs) { int i; pr_debug("%s: Registers for pid=%d\n", msg, pid); - pr_debug("system_call %08lx\n", (unsigned long) regs->system_call); - pr_debug(" psw %016lx %016lx\n", regs->prstatus.psw.mask, - regs->prstatus.psw.addr); + pr_debug("system_call %08lx\n", (unsigned long)regs->system_call); + pr_debug(" psw %016lx %016lx\n", regs->prstatus.psw.mask, regs->prstatus.psw.addr); pr_debug(" orig_gpr2 %016lx\n", regs->prstatus.orig_gpr2); for (i = 0; i < 16; i++) pr_debug(" g%02d %016lx\n", i, regs->prstatus.gprs[i]); @@ -56,9 +55,7 @@ static void print_vxrs(user_fpregs_struct_t *fpregs) for (i = 0; i < 16; i++) pr_debug(" vx_low%02d %016lx\n", i, fpregs->vxrs_low[i]); for (i = 0; i < 16; i++) - pr_debug(" vx_high%02d %016lx %016lx\n", i, - fpregs->vxrs_high[i].part1, - fpregs->vxrs_high[i].part2); + pr_debug(" vx_high%02d %016lx %016lx\n", i, fpregs->vxrs_high[i].part1, fpregs->vxrs_high[i].part2); } /* @@ -110,8 +107,7 @@ static void print_ri_cb(user_fpregs_struct_t *fpregs) * Print FP registers, VX registers, guarded-storage, and * runtime-instrumentation */ -static void print_user_fpregs_struct(const char *msg, int pid, - user_fpregs_struct_t *fpregs) +static void print_user_fpregs_struct(const char *msg, int pid, user_fpregs_struct_t *fpregs) { int i; @@ -125,28 +121,19 @@ static void print_user_fpregs_struct(const char *msg, int pid, print_ri_cb(fpregs); } -int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, - user_regs_struct_t *regs, - user_fpregs_struct_t *fpregs) +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { _sigregs_ext *dst_ext = &sigframe->uc.uc_mcontext_ext; _sigregs *dst = &sigframe->uc.uc_mcontext; - memcpy(dst->regs.gprs, 
regs->prstatus.gprs, - sizeof(regs->prstatus.gprs)); - memcpy(dst->regs.acrs, regs->prstatus.acrs, - sizeof(regs->prstatus.acrs)); - memcpy(&dst->regs.psw, &regs->prstatus.psw, - sizeof(regs->prstatus.psw)); - memcpy(&dst->fpregs.fpc, &fpregs->prfpreg.fpc, - sizeof(fpregs->prfpreg.fpc)); - memcpy(&dst->fpregs.fprs, &fpregs->prfpreg.fprs, - sizeof(fpregs->prfpreg.fprs)); + memcpy(dst->regs.gprs, regs->prstatus.gprs, sizeof(regs->prstatus.gprs)); + memcpy(dst->regs.acrs, regs->prstatus.acrs, sizeof(regs->prstatus.acrs)); + memcpy(&dst->regs.psw, &regs->prstatus.psw, sizeof(regs->prstatus.psw)); + memcpy(&dst->fpregs.fpc, &fpregs->prfpreg.fpc, sizeof(fpregs->prfpreg.fpc)); + memcpy(&dst->fpregs.fprs, &fpregs->prfpreg.fprs, sizeof(fpregs->prfpreg.fprs)); if (fpregs->flags & USER_FPREGS_VXRS) { - memcpy(&dst_ext->vxrs_low, &fpregs->vxrs_low, - sizeof(fpregs->vxrs_low)); - memcpy(&dst_ext->vxrs_high, &fpregs->vxrs_high, - sizeof(fpregs->vxrs_high)); + memcpy(&dst_ext->vxrs_low, &fpregs->vxrs_low, sizeof(fpregs->vxrs_low)); + memcpy(&dst_ext->vxrs_high, &fpregs->vxrs_high, sizeof(fpregs->vxrs_high)); } else { memset(&dst_ext->vxrs_low, 0, sizeof(dst_ext->vxrs_low)); memset(&dst_ext->vxrs_high, 0, sizeof(dst_ext->vxrs_high)); @@ -154,8 +141,7 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, return 0; } -int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } @@ -168,9 +154,7 @@ static inline void rewind_psw(psw_t *psw, unsigned long bytes) unsigned long mask; pr_debug("Rewind psw: %016lx bytes=%lu\n", psw->addr, bytes); - mask = (psw->mask & PSW_MASK_EA) ? -1UL : - (psw->mask & PSW_MASK_BA) ? (1UL << 31) - 1 : - (1UL << 24) - 1; + mask = (psw->mask & PSW_MASK_EA) ? -1UL : (psw->mask & PSW_MASK_BA) ? 
(1UL << 31) - 1 : (1UL << 24) - 1; psw->addr = (psw->addr - bytes) & mask; } @@ -195,13 +179,13 @@ int get_vx_regs(pid_t pid, user_fpregs_struct_t *fpregs) pr_debug("VXRS registers not supported\n"); return 0; } - pr_perror("Couldn't get VXRS_LOW\n"); + pr_perror("Couldn't get VXRS_LOW"); return -1; } iov.iov_base = &fpregs->vxrs_high; iov.iov_len = sizeof(fpregs->vxrs_high); if (ptrace(PTRACE_GETREGSET, pid, NT_S390_VXRS_HIGH, &iov) < 0) { - pr_perror("Couldn't get VXRS_HIGH\n"); + pr_perror("Couldn't get VXRS_HIGH"); return -1; } fpregs->flags |= USER_FPREGS_VXRS; @@ -242,7 +226,7 @@ int get_gs_cb(pid_t pid, user_fpregs_struct_t *fpregs) pr_debug("GS_BC not set\n"); return 0; } - pr_perror("Couldn't get GS_BC\n"); + pr_perror("Couldn't get GS_BC"); return -1; } fpregs->flags |= USER_GS_BC; @@ -273,7 +257,7 @@ int get_ri_cb(pid_t pid, user_fpregs_struct_t *fpregs) pr_debug("RI_CB not set\n"); return 0; default: - pr_perror("Couldn't get RI_CB\n"); + pr_perror("Couldn't get RI_CB"); return -1; } } @@ -309,31 +293,30 @@ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) /* * Prepare task registers for restart */ -int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, - void *arg, __maybe_unused unsigned long flags) +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t fpregs; struct iovec iov; int rewind; - print_user_regs_struct("get_task_regs", pid, regs); + print_user_regs_struct("compel_get_task_regs", pid, regs); - memset(&fpregs, 0, sizeof(fpregs)); - iov.iov_base = &fpregs.prfpreg; - iov.iov_len = sizeof(fpregs.prfpreg); + memset(fpregs, 0, sizeof(*fpregs)); + iov.iov_base = &fpregs->prfpreg; + iov.iov_len = sizeof(fpregs->prfpreg); if (ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov) < 0) { pr_perror("Couldn't get floating-point registers"); return -1; } - if (get_vx_regs(pid, &fpregs)) { + if 
(get_vx_regs(pid, fpregs)) { pr_perror("Couldn't get vector registers"); return -1; } - if (get_gs_cb(pid, &fpregs)) { + if (get_gs_cb(pid, fpregs)) { pr_perror("Couldn't get guarded-storage"); return -1; } - if (get_ri_cb(pid, &fpregs)) { + if (get_ri_cb(pid, fpregs)) { pr_perror("Couldn't get runtime-instrumentation"); return -1; } @@ -342,10 +325,10 @@ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, * before we execute parasite code. Otherwise parasite operations * would be recorded. */ - if (fpregs.flags & USER_RI_ON) + if (fpregs->flags & USER_RI_ON) s390_disable_ri_bit(pid, regs); - print_user_fpregs_struct("get_task_regs", pid, &fpregs); + print_user_fpregs_struct("compel_get_task_regs", pid, fpregs); /* Check for system call restarting. */ if (regs->system_call) { rewind = regs->system_call >> 16; @@ -365,17 +348,72 @@ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, } } /* Call save_task_regs() */ - return save(arg, regs, &fpregs); + return save(pid, arg, regs, fpregs); +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + int ret = 0; + + iov.iov_base = &ext_regs->prfpreg; + iov.iov_len = sizeof(ext_regs->prfpreg); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov) < 0) { + pr_perror("Couldn't set floating-point registers"); + ret = -1; + } + + if (ext_regs->flags & USER_FPREGS_VXRS) { + iov.iov_base = &ext_regs->vxrs_low; + iov.iov_len = sizeof(ext_regs->vxrs_low); + if (ptrace(PTRACE_SETREGSET, pid, NT_S390_VXRS_LOW, &iov) < 0) { + pr_perror("Couldn't set VXRS_LOW"); + ret = -1; + } + + iov.iov_base = &ext_regs->vxrs_high; + iov.iov_len = sizeof(ext_regs->vxrs_high); + if (ptrace(PTRACE_SETREGSET, pid, NT_S390_VXRS_HIGH, &iov) < 0) { + pr_perror("Couldn't set VXRS_HIGH"); + ret = -1; + } + } + + if (ext_regs->flags & USER_GS_CB) { + iov.iov_base = &ext_regs->gs_cb; + iov.iov_len = sizeof(ext_regs->gs_cb); + if (ptrace(PTRACE_SETREGSET, pid, 
NT_S390_GS_CB, &iov) < 0) { + pr_perror("Couldn't set GS_CB"); + ret = -1; + } + iov.iov_base = &ext_regs->gs_bc; + iov.iov_len = sizeof(ext_regs->gs_bc); + if (ptrace(PTRACE_SETREGSET, pid, NT_S390_GS_BC, &iov) < 0) { + pr_perror("Couldn't set GS_BC"); + ret = -1; + } + } + + if (ext_regs->flags & USER_RI_CB) { + iov.iov_base = &ext_regs->ri_cb; + iov.iov_len = sizeof(ext_regs->ri_cb); + if (ptrace(PTRACE_SETREGSET, pid, NT_S390_RI_CB, &iov) < 0) { + pr_perror("Couldn't set RI_CB"); + ret = -1; + } + } + + return ret; } /* * Injected syscall instruction */ const char code_syscall[] = { - 0x0a, 0x00, /* sc 0 */ - 0x00, 0x01, /* S390_BREAKPOINT_U16 */ - 0x00, 0x01, /* S390_BREAKPOINT_U16 */ - 0x00, 0x01, /* S390_BREAKPOINT_U16 */ + 0x0a, 0x00, /* sc 0 */ + 0x00, 0x01, /* S390_BREAKPOINT_U16 */ + 0x00, 0x01, /* S390_BREAKPOINT_U16 */ + 0x00, 0x01, /* S390_BREAKPOINT_U16 */ }; static inline void __check_code_syscall(void) @@ -387,19 +425,14 @@ static inline void __check_code_syscall(void) /* * Issue s390 system call */ -int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4, - unsigned long arg5, - unsigned long arg6) +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; int err; /* Load syscall number into %r1 */ - regs.prstatus.gprs[1] = (unsigned long) nr; + regs.prstatus.gprs[1] = (unsigned long)nr; /* Load parameter registers %r2-%r7 */ regs.prstatus.gprs[2] = arg1; regs.prstatus.gprs[3] = arg2; @@ -408,7 +441,7 @@ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, regs.prstatus.gprs[6] = arg5; regs.prstatus.gprs[7] = arg6; - err = compel_execute_syscall(ctl, &regs, (char *) code_syscall); + err = compel_execute_syscall(ctl, &regs, (char *)code_syscall); /* Return code from system is in 
%r2 */ if (ret) @@ -419,9 +452,7 @@ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, /* * Issue s390 mmap call */ -void *remote_mmap(struct parasite_ctl *ctl, - void *addr, size_t length, int prot, - int flags, int fd, off_t offset) +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; struct mmap_arg_struct arg_struct; @@ -444,8 +475,7 @@ void *remote_mmap(struct parasite_ctl *ctl, } /* Do syscall */ - err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long) where, - 0, 0, 0, 0, 0); + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)where, 0, 0, 0, 0, 0); if (err < 0 || (long)map < 0) map = 0; @@ -453,8 +483,9 @@ void *remote_mmap(struct parasite_ctl *ctl, if (ptrace_poke_area(pid, &arg_struct, where, sizeof(arg_struct))) { pr_err("Can't restore mmap args (pid: %d)\n", pid); if (map != 0) { - compel_syscall(ctl, __NR_munmap, NULL, map, - length, 0, 0, 0, 0); + err = compel_syscall(ctl, __NR_munmap, NULL, map, length, 0, 0, 0, 0); + if (err) + pr_err("Can't munmap %d\n", err); map = 0; } } @@ -465,14 +496,12 @@ void *remote_mmap(struct parasite_ctl *ctl, /* * Setup registers for parasite call */ -void parasite_setup_regs(unsigned long new_ip, void *stack, - user_regs_struct_t *regs) +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { regs->prstatus.psw.addr = new_ip; if (!stack) return; - regs->prstatus.gprs[15] = ((unsigned long) stack) - - STACK_FRAME_OVERHEAD; + regs->prstatus.gprs[15] = ((unsigned long)stack) - STACK_FRAME_OVERHEAD; } /* @@ -520,9 +549,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) long ret; int err; - err = compel_syscall(ctl, __NR_sigaltstack, - &ret, 0, (unsigned long)&s->uc.uc_stack, - 0, 0, 0, 0); + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); return err ? 
err : ret; } @@ -596,9 +623,9 @@ enum kernel_ts_level { }; /* See arch/s390/include/asm/processor.h */ -#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ -#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ -#define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ +#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ +#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ +#define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ /* * Return detected kernel version regarding task size level @@ -612,12 +639,12 @@ static enum kernel_ts_level get_kernel_ts_level(void) /* Check for 5 levels */ if (criu_end_addr >= TASK_SIZE_LEVEL_4) return KERNEL_TS_LEVEL_5; - else if (munmap((void *) TASK_SIZE_LEVEL_4, 0x1000) == 0) + else if (munmap((void *)TASK_SIZE_LEVEL_4, 0x1000) == 0) return KERNEL_TS_LEVEL_5; if (criu_end_addr < TASK_SIZE_LEVEL_3) { /* Check for 4 level kernel with fix */ - if (munmap((void *) TASK_SIZE_LEVEL_3, 0x1000) == 0) + if (munmap((void *)TASK_SIZE_LEVEL_3, 0x1000) == 0) return KERNEL_TS_LEVEL_4_FIX_YES; else return KERNEL_TS_LEVEL_4_FIX_NO; @@ -658,14 +685,6 @@ unsigned long compel_task_size(void) /* * Get task registers (overwrites weak function) - * - * We don't store floating point and vector registers here because we - * assue that compel/pie code does not change them. 
- * - * For verification issue: - * - * $ objdump -S criu/pie/parasite.built-in.bin.o | grep "%f" - * $ objdump -S criu/pie/restorer.built-in.bin.o | grep "%f" */ int ptrace_get_regs(int pid, user_regs_struct_t *regs) { diff --git a/compel/arch/x86/plugins/include/asm/prologue.h b/compel/arch/x86/plugins/include/asm/prologue.h index 9d812eec9..c19ce54d7 100644 --- a/compel/arch/x86/plugins/include/asm/prologue.h +++ b/compel/arch/x86/plugins/include/asm/prologue.h @@ -9,17 +9,16 @@ #include -#define sys_recv(sockfd, ubuf, size, flags) \ - sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) typedef struct prologue_init_args { - struct sockaddr_un ctl_sock_addr; - unsigned int ctl_sock_addr_len; + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; - unsigned int arg_s; - void *arg_p; + unsigned int arg_s; + void *arg_p; - void *sigframe; + void *sigframe; } prologue_init_args_t; #endif /* __ASSEMBLY__ */ @@ -29,8 +28,8 @@ typedef struct prologue_init_args { * * FIXME It is rather should be taken from sigframe header. 
*/ -#define PROLOGUE_SGFRAME_SIZE 4096 +#define PROLOGUE_SGFRAME_SIZE 4096 -#define PROLOGUE_INIT_ARGS_SIZE 1024 +#define PROLOGUE_INIT_ARGS_SIZE 1024 #endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/x86/plugins/include/asm/syscall-types.h b/compel/arch/x86/plugins/include/asm/syscall-types.h index 9874fd0be..6987aad16 100644 --- a/compel/arch/x86/plugins/include/asm/syscall-types.h +++ b/compel/arch/x86/plugins/include/asm/syscall-types.h @@ -8,12 +8,12 @@ typedef rt_signalfn_t *rt_sighandler_t; typedef void rt_restorefn_t(void); typedef rt_restorefn_t *rt_sigrestore_t; -#define SA_RESTORER 0x04000000 +#define SA_RESTORER 0x04000000 -#define _KNSIG 64 -#define _NSIG_BPW 64 +#define _KNSIG 64 +#define _NSIG_BPW 64 -#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) /* * Note: as k_rtsigset_t is the same size for 32-bit and 64-bit, @@ -21,14 +21,14 @@ typedef rt_restorefn_t *rt_sigrestore_t; * purpose if we ever going to support native 32-bit compilation. */ typedef struct { - uint64_t sig[_KNSIG_WORDS]; + uint64_t sig[_KNSIG_WORDS]; } k_rtsigset_t; typedef struct { - rt_sighandler_t rt_sa_handler; - unsigned long rt_sa_flags; - rt_sigrestore_t rt_sa_restorer; - k_rtsigset_t rt_sa_mask; + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; } rt_sigaction_t; /* @@ -37,24 +37,24 @@ typedef struct { * with unaligned rt_sa_mask. 
*/ typedef struct __attribute__((packed)) { - unsigned int rt_sa_handler; - unsigned int rt_sa_flags; - unsigned int rt_sa_restorer; - k_rtsigset_t rt_sa_mask; + unsigned int rt_sa_handler; + unsigned int rt_sa_flags; + unsigned int rt_sa_restorer; + k_rtsigset_t rt_sa_mask; } rt_sigaction_t_compat; /* Types for set_thread_area, get_thread_area syscalls */ typedef struct { - unsigned int entry_number; - unsigned int base_addr; - unsigned int limit; - unsigned int seg_32bit:1; - unsigned int contents:2; - unsigned int read_exec_only:1; - unsigned int limit_in_pages:1; - unsigned int seg_not_present:1; - unsigned int useable:1; - unsigned int lm:1; + unsigned int entry_number; + unsigned int base_addr; + unsigned int limit; + unsigned int seg_32bit : 1; + unsigned int contents : 2; + unsigned int read_exec_only : 1; + unsigned int limit_in_pages : 1; + unsigned int seg_not_present : 1; + unsigned int usable : 1; + unsigned int lm : 1; } user_desc_t; #endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/x86/plugins/std/parasite-head.S b/compel/arch/x86/plugins/std/parasite-head.S index a988de9d4..42cad4808 100644 --- a/compel/arch/x86/plugins/std/parasite-head.S +++ b/compel/arch/x86/plugins/std/parasite-head.S @@ -6,16 +6,6 @@ # error 64-bit parasite should compile with CONFIG_X86_64 #endif -.macro PARASITE_ENTRY num - subq $16, %rsp - andq $~15, %rsp - pushq $\num - movq %rsp, %rbp - movl __export_parasite_cmd(%rip), %edi - leaq __export_parasite_args(%rip), %rsi - call parasite_service -.endm - #ifdef CONFIG_COMPAT .code32 ENTRY(__export_parasite_head_start_compat) @@ -23,9 +13,11 @@ ENTRY(__export_parasite_head_start_compat) jmp $__USER_CS,$1f 1: .code64 - PARASITE_ENTRY 0 + call parasite_service pushq $__USER32_CS - pushq $2f + xor %r11, %r11 + movl $2f, %r11d + pushq %r11 lretq 2: .code32 @@ -42,11 +34,21 @@ END(__export_parasite_head_start_compat) .code64 #endif +/* + * When parasite_service() runs in the daemon mode it will return the stack + * 
pointer for the sigreturn frame in %rax and we call sigreturn directly + * from here. + * Since a valid stack pointer is positive, it is safe to presume that + * return value <= 0 means that parasite_service() called parasite_trap_cmd() + * in non-daemon mode, and the parasite should stop at int3. + */ ENTRY(__export_parasite_head_start) - PARASITE_ENTRY 0 + call parasite_service + cmp $0, %rax + jle 1f + movq %rax, %rsp + movq $15, %rax + syscall +1: int $0x03 END(__export_parasite_head_start) - -.align 8 -GLOBAL(__export_parasite_cmd) - .long 0 diff --git a/compel/arch/x86/plugins/std/prologue.S b/compel/arch/x86/plugins/std/prologue.S deleted file mode 100644 index 79ad1f6f2..000000000 --- a/compel/arch/x86/plugins/std/prologue.S +++ /dev/null @@ -1,33 +0,0 @@ -#include "common/asm/linkage.h" -#include "asm/prologue.h" - -#include "uapi/std/syscall-codes.h" - - .section .compel.prologue.text, "ax" -ENTRY(__export_std_prologue_start) - push %rsp - - leaq __export_std_prologue_init_args(%rip), %rdi - movq __export_std_plugin_begin(%rip), %rsi - movq __export_std_plugin_size(%rip), %rdx - call __export_std_compel_start - -do_rt_sigreturn: - leaq __export_std_prologue_sigframe(%rip), %rax - addq $8, %rax - movq %rax, %rsp # we can't use sys_rt_sigreturn here - mov $__NR_rt_sigreturn, %eax # because we're adjusting stack - syscall - -GLOBAL(__export_std_prologue_init_args) - .space PROLOGUE_INIT_ARGS_SIZE, 0 - -GLOBAL(__export_std_plugin_begin) - .space 8, 0 -GLOBAL(__export_std_plugin_size) - .space 8, 0 - - .align 64 -GLOBAL(__export_std_prologue_sigframe) - .space PROLOGUE_SGFRAME_SIZE, 0 -END(__export_std_prologue_start) diff --git a/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls b/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls index 4ba4b56c8..62c25f3e0 100644 --- a/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls +++ b/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls @@ -39,6 +39,10 @@ $(sys-proto): $(sys-def) $(sys-proto-types) $(Q) 
echo "/* Autogenerated, don't edit */" > $$@ $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "/* musl defines loff_t as off_t */" >> $$@ + $(Q) echo '#ifndef loff_t' >> $$@ + $(Q) echo '#define loff_t off_t' >> $$@ + $(Q) echo '#endif' >> $$@ $(Q) echo '#include ' >> $$@ $(Q) echo '#include ' >> $$@ ifeq ($(1),32) @@ -71,6 +75,10 @@ $(sys-codes-generic): $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_32.tbl $(sys-proto $(Q) echo "/* Autogenerated, don't edit */" > $@ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "/* musl defines loff_t as off_t */" >> $@ + $(Q) echo '#ifndef loff_t' >> $@ + $(Q) echo '#define loff_t off_t' >> $@ + $(Q) echo '#endif' >> $@ $(Q) echo '#include ' >> $@ $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ sub("^__NR", "__NR32", NR32); \ diff --git a/compel/arch/x86/plugins/std/syscalls/syscall32.c b/compel/arch/x86/plugins/std/syscalls/syscall32.c index e172cacff..d09fd38c7 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall32.c +++ b/compel/arch/x86/plugins/std/syscalls/syscall32.c @@ -1,16 +1,16 @@ #include "asm/types.h" #include "syscall-32.h" -#define SYS_SOCKET 1 /* sys_socket(2) */ -#define SYS_BIND 2 /* sys_bind(2) */ -#define SYS_CONNECT 3 /* sys_connect(2) */ -#define SYS_SENDTO 11 /* sys_sendto(2) */ -#define SYS_RECVFROM 12 /* sys_recvfrom(2) */ -#define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ -#define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */ -#define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ -#define SYS_SENDMSG 16 /* sys_sendmsg(2) */ -#define SYS_RECVMSG 17 /* sys_recvmsg(2) */ +#define SYS_SOCKET 1 /* sys_socket(2) */ +#define SYS_BIND 2 /* sys_bind(2) */ +#define SYS_CONNECT 3 /* sys_connect(2) */ +#define SYS_SENDTO 11 /* sys_sendto(2) */ +#define SYS_RECVFROM 12 /* sys_recvfrom(2) */ +#define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ +#define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */ +#define 
SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ +#define SYS_SENDMSG 16 /* sys_sendmsg(2) */ +#define SYS_RECVMSG 17 /* sys_recvmsg(2) */ long sys_socket(int domain, int type, int protocol) { @@ -20,59 +20,61 @@ long sys_socket(int domain, int type, int protocol) long sys_connect(int sockfd, struct sockaddr *addr, int addrlen) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen }; return sys_socketcall(SYS_CONNECT, (unsigned long *)a); } long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)buff, (uint32_t)len, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)buff, (uint32_t)len, + (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len }; return sys_socketcall(SYS_SENDTO, (unsigned long *)a); } long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)ubuf, (uint32_t)size, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)ubuf, (uint32_t)size, + (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len }; return sys_socketcall(SYS_RECVFROM, (unsigned long *)a); } long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags }; return sys_socketcall(SYS_SENDMSG, (unsigned long *)a); } long sys_recvmsg(int sockfd, struct msghdr *msg, int flags) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags }; return sys_socketcall(SYS_RECVMSG, (unsigned long *)a); } long sys_shutdown(int sockfd, int how) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)how}; + uint32_t a[] 
= { (uint32_t)sockfd, (uint32_t)how }; return sys_socketcall(SYS_SHUTDOWN, (unsigned long *)a); } long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen }; return sys_socketcall(SYS_BIND, (unsigned long *)a); } long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen }; return sys_socketcall(SYS_SETSOCKOPT, (unsigned long *)a); } long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen) { - uint32_t a[] = {(uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen}; + uint32_t a[] = { (uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen }; return sys_socketcall(SYS_GETSOCKOPT, (unsigned long *)a); } -#define SHMAT 21 +#define SHMAT 21 long sys_shmat(int shmid, void *shmaddr, int shmflag) { diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index 7903ab150..cc23dc3f3 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -96,3 +96,15 @@ __NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char __NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 374 sys_userfaultfd (int flags) __NR_ppoll 309 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const 
char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 4ac9164ea..8c3620c2a 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -85,7 +85,7 @@ __NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int fla __NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 228 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 231 sys_exit_group (int error_code) __NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) @@ -107,3 +107,16 @@ __NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long 
idx1 __NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 323 sys_userfaultfd (int flags) __NR_ppoll 271 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_map_shadow_stack 453 sys_map_shadow_stack (unsigned long addr, unsigned long size, unsigned int flags) diff --git a/compel/arch/x86/scripts/compel-pack-compat.lds.S b/compel/arch/x86/scripts/compel-pack-compat.lds.S index ff9c2c6b2..2d907a4a7 100644 --- a/compel/arch/x86/scripts/compel-pack-compat.lds.S +++ b/compel/arch/x86/scripts/compel-pack-compat.lds.S @@ -34,8 +34,4 @@ SECTIONS *(.group*) *(.eh_frame*) } - -/* Parasite args should have 4 bytes align, as we have futex inside. */ -. 
= ALIGN(4); -__export_parasite_args = .; } diff --git a/compel/arch/x86/scripts/compel-pack.lds.S b/compel/arch/x86/scripts/compel-pack.lds.S index 0c936f84d..44e705e29 100644 --- a/compel/arch/x86/scripts/compel-pack.lds.S +++ b/compel/arch/x86/scripts/compel-pack.lds.S @@ -13,7 +13,7 @@ SECTIONS *(.compel.init) } - .data : { + .data : ALIGN(0x1000) { *(.data*) *(.bss*) } @@ -34,8 +34,4 @@ SECTIONS *(.group*) *(.eh_frame*) } - -/* Parasite args should have 4 bytes align, as we have futex inside. */ -. = ALIGN(4); -__export_parasite_args = .; } diff --git a/compel/arch/x86/src/lib/cpu.c b/compel/arch/x86/src/lib/cpu.c index 617512167..f57fb3152 100644 --- a/compel/arch/x86/src/lib/cpu.c +++ b/compel/arch/x86/src/lib/cpu.c @@ -8,7 +8,7 @@ #include "log.h" #include "common/bug.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_info; @@ -29,32 +29,24 @@ static void fetch_rt_cpuinfo(void) * to save/restore PT state in Linux. */ -static const char * const xfeature_names[] = { - "x87 floating point registers" , - "SSE registers" , - "AVX registers" , - "MPX bounds registers" , - "MPX CSR" , - "AVX-512 opmask" , - "AVX-512 Hi256" , - "AVX-512 ZMM_Hi256" , - "Processor Trace" , +static const char *const xfeature_names[] = { + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace", "Protection Keys User registers", - "Hardware Duty Cycling" , + "Hardware Duty Cycling", }; static short xsave_cpuid_features[] = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, - X86_FEATURE_HDC, + X86_FEATURE_FPU, X86_FEATURE_XMM, X86_FEATURE_AVX, X86_FEATURE_MPX, + X86_FEATURE_MPX, X86_FEATURE_AVX512F, X86_FEATURE_AVX512F, X86_FEATURE_AVX512F, + 
X86_FEATURE_INTEL_PT, X86_FEATURE_PKU, X86_FEATURE_HDC, }; void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) @@ -89,8 +81,7 @@ static int compel_fpuid(compel_cpuinfo_t *c) uint32_t eax, ebx, ecx, edx; size_t i; - BUILD_BUG_ON(ARRAY_SIZE(xsave_cpuid_features) != - ARRAY_SIZE(xfeature_names)); + BUILD_BUG_ON(ARRAY_SIZE(xsave_cpuid_features) != ARRAY_SIZE(xfeature_names)); if (!compel_test_cpu_cap(c, X86_FEATURE_FPU)) { pr_err("fpu: No FPU detected\n"); @@ -98,9 +89,7 @@ static int compel_fpuid(compel_cpuinfo_t *c) } if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVE)) { - pr_info("fpu: x87 FPU will use %s\n", - compel_test_cpu_cap(c, X86_FEATURE_FXSR) ? - "FXSAVE" : "FSAVE"); + pr_info("fpu: x87 FPU will use %s\n", compel_test_cpu_cap(c, X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return 0; } @@ -125,7 +114,7 @@ static int compel_fpuid(compel_cpuinfo_t *c) c->xfeatures_mask &= ~(1 << i); } - c->xfeatures_mask &= XCNTXT_MASK; + c->xfeatures_mask &= XFEATURE_MASK_USER; c->xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; /* @@ -140,19 +129,18 @@ static int compel_fpuid(compel_cpuinfo_t *c) c->xsaves_size = ebx; pr_debug("fpu: xfeatures_mask 0x%llx xsave_size %u xsave_size_max %u xsaves_size %u\n", - (unsigned long long)c->xfeatures_mask, - c->xsave_size, c->xsave_size_max, c->xsaves_size); + (unsigned long long)c->xfeatures_mask, c->xsave_size, c->xsave_size_max, c->xsaves_size); if (c->xsave_size_max > sizeof(struct xsave_struct)) - pr_warn_once("fpu: max xsave frame exceed xsave_struct (%u %u)\n", - c->xsave_size_max, (unsigned)sizeof(struct xsave_struct)); + pr_warn_once("fpu: max xsave frame exceed xsave_struct (%u %u)\n", c->xsave_size_max, + (unsigned)sizeof(struct xsave_struct)); memset(c->xstate_offsets, 0xff, sizeof(c->xstate_offsets)); memset(c->xstate_sizes, 0xff, sizeof(c->xstate_sizes)); memset(c->xstate_comp_offsets, 0xff, sizeof(c->xstate_comp_offsets)); memset(c->xstate_comp_sizes, 0xff, sizeof(c->xstate_comp_sizes)); - /* start at the beginnning of 
the "extended state" */ + /* start at the beginning of the "extended state" */ last_good_offset = offsetof(struct xsave_struct, extended_state_area); /* @@ -160,10 +148,10 @@ static int compel_fpuid(compel_cpuinfo_t *c) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - c->xstate_offsets[0] = 0; - c->xstate_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); - c->xstate_offsets[1] = c->xstate_sizes[0]; - c->xstate_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); + c->xstate_offsets[0] = 0; + c->xstate_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); + c->xstate_offsets[1] = c->xstate_sizes[0]; + c->xstate_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!(c->xfeatures_mask & (1UL << i))) @@ -189,8 +177,7 @@ static int compel_fpuid(compel_cpuinfo_t *c) * highest offset in the buffer. Ensure it does. */ if (last_good_offset > c->xstate_offsets[i]) - pr_warn_once("fpu: misordered xstate %d %d\n", - last_good_offset, c->xstate_offsets[i]); + pr_warn_once("fpu: misordered xstate %d %d\n", last_good_offset, c->xstate_offsets[i]); last_good_offset = c->xstate_offsets[i]; } @@ -198,10 +185,10 @@ static int compel_fpuid(compel_cpuinfo_t *c) BUILD_BUG_ON(sizeof(c->xstate_offsets) != sizeof(c->xstate_sizes)); BUILD_BUG_ON(sizeof(c->xstate_comp_offsets) != sizeof(c->xstate_comp_sizes)); - c->xstate_comp_offsets[0] = 0; - c->xstate_comp_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); - c->xstate_comp_offsets[1] = c->xstate_comp_sizes[0]; - c->xstate_comp_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); + c->xstate_comp_offsets[0] = 0; + c->xstate_comp_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); + c->xstate_comp_offsets[1] = c->xstate_comp_sizes[0]; + c->xstate_comp_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVES)) { for (i = 
FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { @@ -211,8 +198,7 @@ static int compel_fpuid(compel_cpuinfo_t *c) } } } else { - c->xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = - FXSAVE_SIZE + XSAVE_HDR_SIZE; + c->xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = FXSAVE_SIZE + XSAVE_HDR_SIZE; for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if ((c->xfeatures_mask & (1UL << i))) @@ -221,8 +207,7 @@ static int compel_fpuid(compel_cpuinfo_t *c) c->xstate_comp_sizes[i] = 0; if (i > FIRST_EXTENDED_XFEATURE) { - c->xstate_comp_offsets[i] = c->xstate_comp_offsets[i-1] - + c->xstate_comp_sizes[i-1]; + c->xstate_comp_offsets[i] = c->xstate_comp_offsets[i - 1] + c->xstate_comp_sizes[i - 1]; /* * The value returned by ECX[1] indicates the alignment @@ -240,9 +225,9 @@ static int compel_fpuid(compel_cpuinfo_t *c) for (i = 0; i < ARRAY_SIZE(c->xstate_offsets); i++) { if (!(c->xfeatures_mask & (1UL << i))) continue; - pr_debug("fpu: %-32s xstate_offsets %6d / %-6d xstate_sizes %6d / %-6d\n", - xfeature_names[i], c->xstate_offsets[i], c->xstate_comp_offsets[i], - c->xstate_sizes[i], c->xstate_comp_sizes[i]); + pr_debug("fpu: %-32s xstate_offsets %6d / %-6d xstate_sizes %6d / %-6d\n", xfeature_names[i], + c->xstate_offsets[i], c->xstate_comp_offsets[i], c->xstate_sizes[i], + c->xstate_comp_sizes[i]); } } @@ -261,20 +246,15 @@ int compel_cpuid(compel_cpuinfo_t *c) */ /* Get vendor name */ - cpuid(0x00000000, - (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); if (!strcmp(c->x86_vendor_id, "GenuineIntel")) { c->x86_vendor = X86_VENDOR_INTEL; - } else if (!strcmp(c->x86_vendor_id, "AuthenticAMD") || - !strcmp(c->x86_vendor_id, "HygonGenuine")) { + } else if (!strcmp(c->x86_vendor_id, "AuthenticAMD") || 
!strcmp(c->x86_vendor_id, "HygonGenuine")) { c->x86_vendor = X86_VENDOR_AMD; } else { - pr_err("Unsupported CPU vendor %s\n", - c->x86_vendor_id); + pr_err("Unsupported CPU vendor %s\n", c->x86_vendor_id); return -1; } @@ -369,7 +349,7 @@ int compel_cpuid(compel_cpuinfo_t *c) while (*p) *q++ = *p++; while (q <= &c->x86_model_id[48]) - *q++ = '\0'; /* Zero-pad the rest */ + *q++ = '\0'; /* Zero-pad the rest */ } } @@ -440,8 +420,7 @@ int compel_cpuid(compel_cpuinfo_t *c) break; } - pr_debug("x86_family %u x86_vendor_id %s x86_model_id %s\n", - c->x86_family, c->x86_vendor_id, c->x86_model_id); + pr_debug("x86_family %u x86_vendor_id %s x86_model_id %s\n", c->x86_family, c->x86_vendor_id, c->x86_model_id); return compel_fpuid(c); } @@ -461,8 +440,7 @@ bool compel_fpu_has_feature(unsigned int feature) uint32_t compel_fpu_feature_size(unsigned int feature) { fetch_rt_cpuinfo(); - if (feature >= FIRST_EXTENDED_XFEATURE && - feature < XFEATURE_MAX) + if (feature >= FIRST_EXTENDED_XFEATURE && feature < XFEATURE_MAX) return rt_info.xstate_sizes[feature]; return 0; } @@ -470,8 +448,7 @@ uint32_t compel_fpu_feature_size(unsigned int feature) uint32_t compel_fpu_feature_offset(unsigned int feature) { fetch_rt_cpuinfo(); - if (feature >= FIRST_EXTENDED_XFEATURE && - feature < XFEATURE_MAX) + if (feature >= FIRST_EXTENDED_XFEATURE && feature < XFEATURE_MAX) return rt_info.xstate_offsets[feature]; return 0; } diff --git a/compel/arch/x86/src/lib/handle-elf.c b/compel/arch/x86/src/lib/handle-elf.c index 62fb28f49..78b23f28a 100644 --- a/compel/arch/x86/src/lib/handle-elf.c +++ b/compel/arch/x86/src/lib/handle-elf.c @@ -1,14 +1,12 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused -elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 
0x00, /* clang-format */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; diff --git a/compel/arch/x86/src/lib/include/cpu.h b/compel/arch/x86/src/lib/include/cpu.h index 60b7d24d4..de3b0a0ff 100644 --- a/compel/arch/x86/src/lib/include/cpu.h +++ b/compel/arch/x86/src/lib/include/cpu.h @@ -1,31 +1,21 @@ #ifndef __COMPEL_ASM_CPU_H__ #define __COMPEL_ASM_CPU_H__ -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - asm volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx) - : "memory"); + asm volatile("cpuid" : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) : "0"(*eax), "2"(*ecx) : "memory"); } -static inline void cpuid(unsigned int op, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = 0; native_cpuid(eax, ebx, ecx, edx); } -static inline void cpuid_count(unsigned int op, int count, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static inline void cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx) { *eax = op; *ecx = count; diff --git a/compel/arch/x86/src/lib/include/handle-elf.h b/compel/arch/x86/src/lib/include/handle-elf.h index e68fe3bcf..f3150f3f1 100644 --- a/compel/arch/x86/src/lib/include/handle-elf.h +++ b/compel/arch/x86/src/lib/include/handle-elf.h @@ -6,15 +6,15 @@ #define ELF_X86_64 #ifndef R_X86_64_GOTPCRELX -# define R_X86_64_GOTPCRELX 41 +#define R_X86_64_GOTPCRELX 41 #endif #ifndef R_X86_64_REX_GOTPCRELX -# define R_X86_64_REX_GOTPCRELX 42 +#define R_X86_64_REX_GOTPCRELX 42 #endif -#define __handle_elf 
handle_elf_x86_64 -#define arch_is_machine_supported(e_machine) (e_machine == EM_X86_64) +#define __handle_elf handle_elf_x86_64 +#define arch_is_machine_supported(e_machine) (e_machine == EM_X86_64) extern int handle_elf_x86_32(void *mem, size_t size); extern int handle_elf_x86_64(void *mem, size_t size); diff --git a/compel/arch/x86/src/lib/include/syscall.h b/compel/arch/x86/src/lib/include/syscall.h index 9af1b1f99..a1b742b11 100644 --- a/compel/arch/x86/src/lib/include/syscall.h +++ b/compel/arch/x86/src/lib/include/syscall.h @@ -1,6 +1,6 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ -#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) +#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) /* * For x86_32 __NR_mmap inside the kernel represents old_mmap system diff --git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h index bb1914da4..11c50e0e5 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -15,334 +15,335 @@ * to keep it here, since it's an ABI now. 
*/ enum cpuid_leafs { - CPUID_1_EDX = 0, - CPUID_8000_0001_EDX = 1, - CPUID_8086_0001_EDX = 2, - CPUID_LNX_1 = 3, - CPUID_1_ECX = 4, - CPUID_C000_0001_EDX = 5, - CPUID_8000_0001_ECX = 6, - CPUID_LNX_2 = 7, - CPUID_LNX_3 = 8, - CPUID_7_0_EBX = 9, - CPUID_D_1_EAX = 10, - CPUID_7_0_ECX = 11, - CPUID_F_1_EDX = 12, - CPUID_8000_0008_EBX = 13, - CPUID_6_EAX = 14, - CPUID_8000_000A_EDX = 15, - CPUID_F_0_EDX = 16, - CPUID_8000_0007_EBX = 17, - CPUID_7_0_EDX = 18, + CPUID_1_EDX = 0, + CPUID_8000_0001_EDX = 1, + CPUID_8086_0001_EDX = 2, + CPUID_LNX_1 = 3, + CPUID_1_ECX = 4, + CPUID_C000_0001_EDX = 5, + CPUID_8000_0001_ECX = 6, + CPUID_LNX_2 = 7, + CPUID_LNX_3 = 8, + CPUID_7_0_EBX = 9, + CPUID_D_1_EAX = 10, + CPUID_7_0_ECX = 11, + CPUID_F_1_EDX = 12, + CPUID_8000_0008_EBX = 13, + CPUID_6_EAX = 14, + CPUID_8000_000A_EDX = 15, + CPUID_F_0_EDX = 16, + CPUID_8000_0007_EBX = 17, + CPUID_7_0_EDX = 18, }; -#define NCAPINTS_V1 12 -#define NCAPINTS_V2 19 +#define NCAPINTS_V1 12 +#define NCAPINTS_V2 19 -#define NCAPINTS (NCAPINTS_V2) /* N 32-bit words worth of info */ -#define NCAPINTS_BITS (NCAPINTS * 32) +#define NCAPINTS (NCAPINTS_V2) /* N 32-bit words worth of info */ +#define NCAPINTS_BITS (NCAPINTS * 32) /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR (0*32+12) /* 
Memory Type Range Registers */ -#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN (0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM (0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_FPU (0 * 32 + 0) /* Onboard FPU */ +#define X86_FEATURE_VME (0 * 32 + 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE (0 * 32 + 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE (0 * 32 + 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC (0 * 32 + 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR (0 * 32 + 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE (0 * 32 + 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE (0 * 32 + 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 (0 * 32 + 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC (0 * 32 + 9) /* Onboard APIC */ +#define X86_FEATURE_SEP (0 * 32 + 11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR (0 * 32 + 12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE (0 * 32 + 13) /* Page Global Enable */ 
+#define X86_FEATURE_MCA (0 * 32 + 14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV (0 * 32 + 15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT (0 * 32 + 16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 (0 * 32 + 17) /* 36-bit PSEs */ +#define X86_FEATURE_PN (0 * 32 + 18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH (0 * 32 + 19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS (0 * 32 + 21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI (0 * 32 + 22) /* ACPI via MSR */ +#define X86_FEATURE_MMX (0 * 32 + 23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR (0 * 32 + 24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM (0 * 32 + 25) /* "sse" */ +#define X86_FEATURE_XMM2 (0 * 32 + 26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP (0 * 32 + 27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT (0 * 32 + 28) /* Hyper-Threading */ +#define X86_FEATURE_ACC (0 * 32 + 29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 (0 * 32 + 30) /* IA-64 processor */ +#define X86_FEATURE_PBE (0 * 32 + 31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! 
*/ -#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP (1*32+19) /* MP Capable */ -#define X86_FEATURE_NX (1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64, 64-bit support) */ -#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow extensions */ -#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow */ +#define X86_FEATURE_SYSCALL (1 * 32 + 11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP (1 * 32 + 19) /* MP Capable */ +#define X86_FEATURE_NX (1 * 32 + 20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT (1 * 32 + 22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT (1 * 32 + 25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES (1 * 32 + 26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP (1 * 32 + 27) /* RDTSCP */ +#define X86_FEATURE_LM (1 * 32 + 29) /* Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT (1 * 32 + 30) /* AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW (1 * 32 + 31) /* 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY (2 * 32 + 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN (2 * 32 + 1) /* Longrun power control */ +#define X86_FEATURE_LRTI (2 * 32 + 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard 
MTRRs */ -#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_CXMMX (3 * 32 + 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR (3 * 32 + 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR (3 * 32 + 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR (3 * 32 + 3) /* Centaur MCRs (= MTRRs) */ /* CPU types for specific tunings: */ -#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP (3*32+ 9) /* SMP kernel running on UP */ -#define X86_FEATURE_ART (3*32+10) /* Always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in IA32 userspace */ -#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in IA32 userspace */ -#define X86_FEATURE_REP_GOOD (3*32+16) /* REP microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" MFENCE synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" LFENCE synchronizes RDTSC */ -#define X86_FEATURE_ACC_POWER (3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS (3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY (3*32+22) /* CPU topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID (3*32+25) /* CPU has CPUID instruction 
itself */ -#define X86_FEATURE_EXTD_APICID (3*32+26) /* Extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM (3*32+27) /* AMD multi-node processor */ -#define X86_FEATURE_APERFMPERF (3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ (3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_K8 (3 * 32 + 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 (3 * 32 + 5) /* "" Athlon */ +#define X86_FEATURE_P3 (3 * 32 + 6) /* "" P3 */ +#define X86_FEATURE_P4 (3 * 32 + 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC (3 * 32 + 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP (3 * 32 + 9) /* SMP kernel running on UP */ +#define X86_FEATURE_ART (3 * 32 + 10) /* Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON (3 * 32 + 11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS (3 * 32 + 12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS (3 * 32 + 13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 (3 * 32 + 14) /* "" syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 (3 * 32 + 15) /* "" sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD (3 * 32 + 16) /* REP microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC (3 * 32 + 17) /* "" MFENCE synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC (3 * 32 + 18) /* "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_ACC_POWER (3 * 32 + 19) /* AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL (3 * 32 + 20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS (3 * 32 + 21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY (3 * 32 + 22) /* CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE (3 * 32 + 23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC (3 * 32 + 24) /* TSC does not stop in C states */ +#define X86_FEATURE_CPUID (3 * 
32 + 25) /* CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID (3 * 32 + 26) /* Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM (3 * 32 + 27) /* AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF (3 * 32 + 28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_NONSTOP_TSC_S3 (3 * 32 + 30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ (3 * 32 + 31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ -#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" MONITOR/MWAIT support */ -#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX (4*32+ 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID (4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG (4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B instruction */ -#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM (4*32+15) /* Perf/Debug Capabilities MSR */ -#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC (4*32+21) /* X2APIC */ -#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT (4*32+23) /* 
POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* TSC deadline timer */ -#define X86_FEATURE_AES (4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ -#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE instruction enabled in the OS */ -#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C (4*32+29) /* 16-bit FP conversions */ -#define X86_FEATURE_RDRAND (4*32+30) /* RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_XMM3 (4 * 32 + 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 (4 * 32 + 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT (4 * 32 + 3) /* "monitor" MONITOR/MWAIT support */ +#define X86_FEATURE_DSCPL (4 * 32 + 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ +#define X86_FEATURE_VMX (4 * 32 + 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX (4 * 32 + 6) /* Safer Mode eXtensions */ +#define X86_FEATURE_EST (4 * 32 + 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 (4 * 32 + 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 (4 * 32 + 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID (4 * 32 + 10) /* Context ID */ +#define X86_FEATURE_SDBG (4 * 32 + 11) /* Silicon Debug */ +#define X86_FEATURE_FMA (4 * 32 + 12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 (4 * 32 + 13) /* CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR (4 * 32 + 14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM (4 * 32 + 15) /* Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID (4 * 32 + 17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA (4 * 32 + 18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 (4 * 32 + 19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 (4 * 32 + 20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC (4 * 32 + 21) /* X2APIC */ 
+#define X86_FEATURE_MOVBE (4 * 32 + 22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT (4 * 32 + 23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER (4 * 32 + 24) /* TSC deadline timer */ +#define X86_FEATURE_AES (4 * 32 + 25) /* AES instructions */ +#define X86_FEATURE_XSAVE (4 * 32 + 26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE (4 * 32 + 27) /* "" XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX (4 * 32 + 28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C (4 * 32 + 29) /* 16-bit FP conversions */ +#define X86_FEATURE_RDRAND (4 * 32 + 30) /* RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR (4 * 32 + 31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */ +#define X86_FEATURE_XSTORE (5 * 32 + 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN (5 * 32 + 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT (5 * 32 + 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN (5 * 32 + 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 (5 * 32 + 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN (5 * 32 + 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE (5 * 32 + 10) /* PadLock Hash Engine */ +#define 
X86_FEATURE_PHE_EN (5 * 32 + 11) /* PHE enabled */ +#define X86_FEATURE_PMM (5 * 32 + 12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN (5 * 32 + 13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM (6*32+ 2) /* Secure Virtual Machine */ -#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE (6*32+17) /* Translation Cache Extension */ -#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM (6*32+21) /* Trailing Bit Manipulations */ -#define X86_FEATURE_TOPOEXT (6*32+22) /* Topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* Core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* Data breakpoint extension */ -#define X86_FEATURE_PTSC (6*32+27) /* Performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC (6*32+28) /* Last Level Cache performance counter extensions */ 
-#define X86_FEATURE_MWAITX (6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ +#define X86_FEATURE_LAHF_LM (6 * 32 + 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY (6 * 32 + 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM (6 * 32 + 2) /* Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC (6 * 32 + 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY (6 * 32 + 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM (6 * 32 + 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A (6 * 32 + 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE (6 * 32 + 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH (6 * 32 + 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW (6 * 32 + 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS (6 * 32 + 10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP (6 * 32 + 11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT (6 * 32 + 12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT (6 * 32 + 13) /* Watchdog timer */ +#define X86_FEATURE_LWP (6 * 32 + 15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 (6 * 32 + 16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE (6 * 32 + 17) /* Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR (6 * 32 + 19) /* NodeId MSR */ +#define X86_FEATURE_TBM (6 * 32 + 21) /* Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT (6 * 32 + 22) /* Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE (6 * 32 + 23) /* Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB (6 * 32 + 24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6 * 32 + 26) /* Data breakpoint extension */ +#define X86_FEATURE_PTSC (6 * 32 + 27) /* Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC (6 * 32 + 28) /* Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX (6 * 32 + 29) /* MWAIT extension 
(MONITORX/MWAITX instructions) */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define X86_FEATURE_FSGSBASE (9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ -#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM (9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A (9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED (9*32+18) /* RDSEED instruction */ -#define X86_FEATURE_ADX (9*32+19) /* ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA (9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB (9*32+24) /* CLWB instruction */ -#define X86_FEATURE_INTEL_PT (9*32+25) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI (9*32+29) /* 
SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +#define X86_FEATURE_FSGSBASE (9 * 32 + 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST (9 * 32 + 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_BMI1 (9 * 32 + 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE (9 * 32 + 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 (9 * 32 + 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP (9 * 32 + 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 (9 * 32 + 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS (9 * 32 + 9) /* Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID (9 * 32 + 10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM (9 * 32 + 11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM (9 * 32 + 12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX (9 * 32 + 14) /* Memory Protection Extension */ +#define X86_FEATURE_RDT_A (9 * 32 + 15) /* Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F (9 * 32 + 16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ (9 * 32 + 17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED (9 * 32 + 18) /* RDSEED instruction */ +#define X86_FEATURE_ADX (9 * 32 + 19) /* ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP (9 * 32 + 20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA (9 * 32 + 21) /* AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT (9 * 32 + 23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB (9 * 32 + 24) /* CLWB instruction */ +#define X86_FEATURE_INTEL_PT (9 * 32 + 25) /* Intel Processor Trace */ +#define X86_FEATURE_AVX512PF (9 * 32 + 26) /* AVX-512 Prefetch 
*/ +#define X86_FEATURE_AVX512ER (9 * 32 + 27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD (9 * 32 + 28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI (9 * 32 + 29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW (9 * 32 + 30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL (9 * 32 + 31) /* AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XSAVEOPT (10 * 32 + 0) /* XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10 * 32 + 1) /* XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10 * 32 + 2) /* XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10 * 32 + 3) /* XSAVES/XRSTORS instructions */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 11 */ -#define X86_FEATURE_PREFETCHWT1 (11*32+ 0) /* PREFETCHWT1 Intel® Xeon PhiTM only */ -#define X86_FEATURE_AVX512VBMI (11*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (11*32+ 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (11*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (11*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_AVX512_VBMI2 (11*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_GFNI (11*32+ 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (11*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (11*32+10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (11*32+11) /* Vector Neural Network Instructions */ -#define 
X86_FEATURE_AVX512_BITALG (11*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ -#define X86_FEATURE_TME (11*32+13) /* Intel Total Memory Encryption */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (11*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (11*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (11*32+22) /* RDPID instruction */ -#define X86_FEATURE_CLDEMOTE (11*32+25) /* CLDEMOTE instruction */ +#define X86_FEATURE_PREFETCHWT1 (11 * 32 + 0) /* PREFETCHWT1 Intel® Xeon PhiTM only */ +#define X86_FEATURE_AVX512VBMI (11 * 32 + 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (11 * 32 + 2) /* User Mode Instruction Protection */ +#define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (11 * 32 + 7) /* Shadow Stack */ +#define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (11 * 32 + 11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (11 * 32 + 12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (11 * 32 + 13) /* Intel Total Memory Encryption */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (11 * 32 + 14) /* POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (11 * 32 + 16) /* 5-level page tables */ +#define X86_FEATURE_RDPID (11 * 32 + 22) /* RDPID instruction */ +#define X86_FEATURE_CLDEMOTE (11 * 32 + 25) /* CLDEMOTE instruction */ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ -#define 
X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +#define X86_FEATURE_CQM_OCCUP_LLC (12 * 32 + 0) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (12 * 32 + 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12 * 32 + 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_CLZERO (13 * 32 + 0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13 * 32 + 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13 * 32 + 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_IBPB (13 * 32 + 12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_IBRS (13 * 32 + 14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_STIBP (13 * 32 + 15) /* Single Thread Indirect Branch Predictors */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define 
X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -#define X86_FEATURE_HDC (14*32+13) /* HDC base registers present */ +#define X86_FEATURE_DTHERM (14 * 32 + 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14 * 32 + 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14 * 32 + 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14 * 32 + 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14 * 32 + 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14 * 32 + 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14 * 32 + 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14 * 32 + 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14 * 32 + 10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14 * 32 + 11) /* HWP Package Level Request */ +#define X86_FEATURE_HDC (14 * 32 + 13) /* HDC base registers present */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define 
X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_NPT (15 * 32 + 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15 * 32 + 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15 * 32 + 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15 * 32 + 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15 * 32 + 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15 * 32 + 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15 * 32 + 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15 * 32 + 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15 * 32 + 10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15 * 32 + 12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15 * 32 + 13) /* Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15 * 32 + 15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15 * 32 + 16) /* Virtual GIF */ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 16 */ -#define X86_FEATURE_CQM_LLC (16*32+ 1) /* LLC QoS if 1 */ +#define X86_FEATURE_CQM_LLC (16 * 32 + 1) /* LLC QoS if 1 */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ +#define X86_FEATURE_OVERFLOW_RECOV (17 * 32 + 0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17 * 32 + 1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17 * 32 + 3) /* Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network 
Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ -#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +#define X86_FEATURE_AVX512_4VNNIW (18 * 32 + 2) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18 * 32 + 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_PCONFIG (18 * 32 + 18) /* Intel PCONFIG */ +#define X86_FEATURE_SPEC_CTRL (18 * 32 + 26) /* "" Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18 * 32 + 27) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ARCH_CAPABILITIES (18 * 32 + 29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18 * 32 + 31) /* "" Speculative Store Bypass Disable */ enum { - X86_VENDOR_INTEL = 0, - X86_VENDOR_AMD = 1, + X86_VENDOR_INTEL = 0, + X86_VENDOR_AMD = 1, X86_VENDOR_MAX }; struct cpuinfo_x86 { /* cpu context */ - uint8_t x86_family; - uint8_t x86_vendor; - uint8_t x86_model; - uint8_t x86_mask; - uint32_t x86_capability[NCAPINTS]; - uint32_t x86_power; - uint32_t extended_cpuid_level; - int cpuid_level; - char x86_vendor_id[16]; - char x86_model_id[64]; + uint8_t x86_family; + uint8_t x86_vendor; + uint8_t x86_model; + uint8_t x86_mask; + uint32_t x86_capability[NCAPINTS]; + uint32_t x86_power; + uint32_t extended_cpuid_level; + int cpuid_level; + char x86_vendor_id[16]; + char x86_model_id[64]; /* fpu context */ - uint64_t xfeatures_mask; - uint32_t xsave_size_max; - uint32_t xsave_size; - uint32_t xstate_offsets[XFEATURE_MAX]; - uint32_t xstate_sizes[XFEATURE_MAX]; + 
uint64_t xfeatures_mask; + uint32_t xsave_size_max; + uint32_t xsave_size; + uint32_t xstate_offsets[XFEATURE_MAX]; + uint32_t xstate_sizes[XFEATURE_MAX]; - uint32_t xsaves_size; - uint32_t xstate_comp_offsets[XFEATURE_MAX]; - uint32_t xstate_comp_sizes[XFEATURE_MAX]; + uint32_t xsaves_size; + uint32_t xstate_comp_offsets[XFEATURE_MAX]; + uint32_t xstate_comp_sizes[XFEATURE_MAX]; }; typedef struct cpuinfo_x86 compel_cpuinfo_t; diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index 509f4488b..d595a68fc 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -7,27 +7,48 @@ #include -#define FP_MIN_ALIGN_BYTES 64 -#define FXSAVE_ALIGN_BYTES 16 +#define FP_MIN_ALIGN_BYTES 64 +#define FXSAVE_ALIGN_BYTES 16 -#define FP_XSTATE_MAGIC1 0x46505853U -#define FP_XSTATE_MAGIC2 0x46505845U +#define FP_XSTATE_MAGIC1 0x46505853U +#define FP_XSTATE_MAGIC2 0x46505845U #ifndef FP_XSTATE_MAGIC2_SIZE -#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) +#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) #endif -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 +#define XSTATE_FP 0x1 +#define XSTATE_SSE 0x2 +#define XSTATE_YMM 0x4 -#define FXSAVE_SIZE 512 -#define XSAVE_SIZE 4096 +#define FXSAVE_SIZE 512 +/* + * This used to be 4096 (one page). There is a comment below concerning + * this size: + * "One page should be enough for the whole xsave state ;-)" + * Which is kind of funny as it is no longer enough ;-) + * + * Older CPUs: + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) + * + * Newer CPUs (Sapphire Rapids): + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) + * + * So one page is no longer enough... 
But: + * + * Four pages should be enough for the whole xsave state ;-) + */ -#define XSAVE_HDR_SIZE 64 -#define XSAVE_HDR_OFFSET FXSAVE_SIZE +#define XSAVE_SIZE 4*4096 -#define XSAVE_YMM_SIZE 256 -#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* * List of XSAVE features Linux knows about: @@ -52,91 +73,93 @@ enum xfeature { XFEATURE_MAX, }; -#define XSTATE_CPUID 0x0000000d +#define XSTATE_CPUID 0x0000000d -#define XFEATURE_MASK_FP (1 << XFEATURE_FP) -#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) -#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) -#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) -#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) -#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) -#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) -#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) -#define XFEATURE_MASK_PT (1 << XFEATURE_PT) -#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) -#define XFEATURE_MASK_HDC (1 << XFEATURE_HDC) -#define XFEATURE_MASK_MAX (1 << XFEATURE_MAX) +#define XFEATURE_MASK_FP (1 << XFEATURE_FP) +#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) +#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) +#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) +#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) +#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) +#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) +#define XFEATURE_MASK_PT (1 << XFEATURE_PT) +#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_HDC (1 << XFEATURE_HDC) +#define XFEATURE_MASK_MAX (1 << XFEATURE_MAX) -#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) -#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) +#define XFEATURE_MASK_FPSSE 
(XFEATURE_MASK_FP | XFEATURE_MASK_SSE) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) -#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM +#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM /* Supervisor features */ -#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT | XFEATURE_HDC) +#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT | XFEATURE_HDC) /* All currently supported features */ -#define XCNTXT_MASK \ - (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | \ - XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | \ - XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM | \ - XFEATURE_MASK_PKRU | XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) +#define XFEATURE_MASK_USER \ + (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | XFEATURE_MASK_PKRU | XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) + +/* xsave structure features which is safe to fill with garbage (see validate_random_xstate()) */ +#define XFEATURE_MASK_FAULTINJ \ + (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) struct fpx_sw_bytes { - uint32_t magic1; - uint32_t extended_size; - uint64_t xstate_bv; - uint32_t xstate_size; - uint32_t padding[7]; + uint32_t magic1; + uint32_t extended_size; + uint64_t xstate_bv; + uint32_t xstate_size; + uint32_t padding[7]; }; struct i387_fxsave_struct { - uint16_t cwd; /* Control Word */ - uint16_t swd; /* Status Word */ - uint16_t twd; /* Tag Word */ - uint16_t fop; /* Last Instruction Opcode */ + uint16_t cwd; /* Control Word */ + uint16_t swd; /* Status Word */ + uint16_t twd; /* Tag Word */ + uint16_t fop; /* Last Instruction Opcode */ union { struct { - uint64_t rip; /* Instruction Pointer */ - uint64_t rdp; /* Data Pointer */ + uint64_t rip; /* Instruction Pointer */ + uint64_t rdp; /* Data Pointer */ }; struct { - uint32_t fip; /* FPU IP Offset */ - uint32_t fcs; /* 
FPU IP Selector */ - uint32_t foo; /* FPU Operand Offset */ - uint32_t fos; /* FPU Operand Selector */ + uint32_t fip; /* FPU IP Offset */ + uint32_t fcs; /* FPU IP Selector */ + uint32_t foo; /* FPU Operand Offset */ + uint32_t fos; /* FPU Operand Selector */ }; }; - uint32_t mxcsr; /* MXCSR Register State */ - uint32_t mxcsr_mask; /* MXCSR Mask */ + uint32_t mxcsr; /* MXCSR Register State */ + uint32_t mxcsr_mask; /* MXCSR Mask */ /* 8*16 bytes for each FP-reg = 128 bytes */ - uint32_t st_space[32]; + uint32_t st_space[32]; /* 16*16 bytes for each XMM-reg = 256 bytes */ - uint32_t xmm_space[64]; + uint32_t xmm_space[64]; - uint32_t padding[12]; + uint32_t padding[12]; union { - uint32_t padding1[12]; - uint32_t sw_reserved[12]; + uint32_t padding1[12]; + uint32_t sw_reserved[12]; }; } __aligned(FXSAVE_ALIGN_BYTES); struct xsave_hdr_struct { - uint64_t xstate_bv; - uint64_t xcomp_bv; - uint64_t reserved[6]; + uint64_t xstate_bv; + uint64_t xcomp_bv; + uint64_t reserved[6]; } __packed; /* * xstate_header.xcomp_bv[63] indicates that the extended_state_area * is in compacted format. */ -#define XCOMP_BV_COMPACTED_FORMAT ((uint64_t)1 << 63) +#define XCOMP_BV_COMPACTED_FORMAT ((uint64_t)1 << 63) /* * State component 2: @@ -149,21 +172,21 @@ struct xsave_hdr_struct { * The high 128 bits are stored here. */ struct ymmh_struct { - uint32_t ymmh_space[64]; + uint32_t ymmh_space[64]; } __packed; /* Intel MPX support: */ struct mpx_bndreg { - uint64_t lower_bound; - uint64_t upper_bound; + uint64_t lower_bound; + uint64_t upper_bound; } __packed; /* * State component 3 is used for the 4 128-bit bounds registers */ struct mpx_bndreg_state { - struct mpx_bndreg bndreg[4]; + struct mpx_bndreg bndreg[4]; } __packed; /* @@ -172,8 +195,8 @@ struct mpx_bndreg_state { * register BNDSTATUS. We call the pair "BNDCSR". 
*/ struct mpx_bndcsr { - uint64_t bndcfgu; - uint64_t bndstatus; + uint64_t bndcfgu; + uint64_t bndstatus; } __packed; /* @@ -181,8 +204,8 @@ struct mpx_bndcsr { */ struct mpx_bndcsr_state { union { - struct mpx_bndcsr bndcsr; - uint8_t pad_to_64_bytes[64]; + struct mpx_bndcsr bndcsr; + uint8_t pad_to_64_bytes[64]; }; } __packed; @@ -193,7 +216,7 @@ struct mpx_bndcsr_state { * k0-k7 (opmask state). */ struct avx_512_opmask_state { - uint64_t opmask_reg[8]; + uint64_t opmask_reg[8]; } __packed; /* @@ -202,7 +225,7 @@ struct avx_512_opmask_state { * ZMM0_H-ZMM15_H (ZMM_Hi256 state). */ struct avx_512_zmm_uppers_state { - uint64_t zmm_upper[16 * 4]; + uint64_t zmm_upper[16 * 4]; } __packed; /* @@ -210,7 +233,7 @@ struct avx_512_zmm_uppers_state { * ZMM16-ZMM31 (Hi16_ZMM state). */ struct avx_512_hi16_state { - uint64_t hi16_zmm[16 * 8]; + uint64_t hi16_zmm[16 * 8]; } __packed; /* @@ -218,10 +241,18 @@ struct avx_512_hi16_state { * 8 bytes long but only 4 bytes is used currently. */ struct pkru_state { - uint32_t pkru; - uint32_t pad; + uint32_t pkru; + uint32_t pad; } __packed; +/* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + uint64_t cet; /* user control-flow settings */ + uint64_t ssp; /* user shadow stack pointer */ +}; + /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. @@ -232,74 +263,78 @@ struct pkru_state { * can vary quite a bit between CPUs. * * - * One page should be enough for the whole xsave state. + * One page should be enough for the whole xsave state ;-) + * + * Of course it was not ;-) Now using four pages... 
+ * */ -#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct) - sizeof(struct cet_user_state)) /* * cpu requires it to be 64 byte aligned */ struct xsave_struct { - struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; union { /* * This ymmh is unndeed, for * backward compatibility. */ - struct ymmh_struct ymmh; - uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; + struct ymmh_struct ymmh; + uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; + struct cet_user_state cet; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { - struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; union { /* * This ymmh is unndeed, for * backward compatibility. */ - struct ymmh_struct ymmh; - uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; + struct ymmh_struct ymmh; + uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; -} __aligned(FXSAVE_ALIGN_BYTES); +}; typedef struct { /* - * The FPU xsave area must be continious and FP_MIN_ALIGN_BYTES + * The FPU xsave area must be continuous and FP_MIN_ALIGN_BYTES * aligned, thus make sure the compiler won't insert any hole here. 
*/ union { - struct xsave_struct xsave; - uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; + struct xsave_struct xsave; + uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; }; uint8_t has_fpu; } fpu_state_64_t; struct user_i387_ia32_struct { - uint32_t cwd; /* FPU Control Word */ - uint32_t swd; /* FPU Status Word */ - uint32_t twd; /* FPU Tag Word */ - uint32_t fip; /* FPU IP Offset */ - uint32_t fcs; /* FPU IP Selector */ - uint32_t foo; /* FPU Operand Pointer Offset */ - uint32_t fos; /* FPU Operand Pointer Selector */ - uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + uint32_t cwd; /* FPU Control Word */ + uint32_t swd; /* FPU Status Word */ + uint32_t twd; /* FPU Tag Word */ + uint32_t fip; /* FPU IP Offset */ + uint32_t fcs; /* FPU IP Selector */ + uint32_t foo; /* FPU Operand Pointer Offset */ + uint32_t fos; /* FPU Operand Pointer Selector */ + uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ }; typedef struct { struct { - struct user_i387_ia32_struct i387_ia32; + struct user_i387_ia32_struct i387_ia32; /* Software status information [not touched by FSAVE]: */ - uint32_t status; + uint32_t status; } fregs_state; union { - struct xsave_struct_ia32 xsave; - uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; + struct xsave_struct_ia32 xsave; + uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; } __aligned(FXSAVE_ALIGN_BYTES); } __aligned(FXSAVE_ALIGN_BYTES) fpu_state_ia32_t; @@ -308,14 +343,17 @@ typedef struct { */ typedef struct { union { - fpu_state_64_t fpu_state_64; - fpu_state_ia32_t fpu_state_ia32; + fpu_state_64_t fpu_state_64; + struct { + /* fpu_state_ia32->xsave has to be 64-byte aligned. 
*/ + uint32_t __pad[2]; + fpu_state_ia32_t fpu_state_ia32; + }; }; uint8_t has_fpu; } fpu_state_t; -extern void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, - struct i387_fxsave_struct *fxsave); +extern void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, struct i387_fxsave_struct *fxsave); #endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index e6d394989..b998c488c 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -6,57 +6,80 @@ #include #include -#define SIGMAX 64 -#define SIGMAX_OLD 31 +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +#define ARCH_HAS_PTRACE_GET_THREAD_AREA + +/* + * Linux preserves three TLS segments in GDT. + * Offsets in GDT differ between 32-bit and 64-bit machines. + * For 64-bit x86 those GDT offsets are the same + * for native and compat tasks. + */ +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 +#define GDT_ENTRY_TLS_NUM 3 +typedef struct { + user_desc_t desc[GDT_ENTRY_TLS_NUM]; +} tls_t; + +struct thread_ctx; +struct parasite_ctl; +struct parasite_thread_ctl; +extern int __compel_arch_fetch_thread_area(int tid, struct thread_ctx *th); +extern int compel_arch_fetch_thread_area(struct parasite_thread_ctl *tctl); +extern void compel_arch_get_tls_thread(struct parasite_thread_ctl *tctl, tls_t *out); +extern void compel_arch_get_tls_task(struct parasite_ctl *ctl, tls_t *out); typedef struct { - uint64_t r15; - uint64_t r14; - uint64_t r13; - uint64_t r12; - uint64_t bp; - uint64_t bx; - uint64_t r11; - uint64_t r10; - uint64_t r9; - uint64_t r8; - uint64_t ax; - uint64_t cx; - uint64_t dx; - uint64_t si; - uint64_t di; - uint64_t orig_ax; - uint64_t ip; - uint64_t cs; - uint64_t flags; - uint64_t sp; - uint64_t ss; - uint64_t fs_base; - uint64_t gs_base; - uint64_t ds; - uint64_t es; - uint64_t fs; - uint64_t gs; + 
uint64_t r15; + uint64_t r14; + uint64_t r13; + uint64_t r12; + uint64_t bp; + uint64_t bx; + uint64_t r11; + uint64_t r10; + uint64_t r9; + uint64_t r8; + uint64_t ax; + uint64_t cx; + uint64_t dx; + uint64_t si; + uint64_t di; + uint64_t orig_ax; + uint64_t ip; + uint64_t cs; + uint64_t flags; + uint64_t sp; + uint64_t ss; + uint64_t fs_base; + uint64_t gs_base; + uint64_t ds; + uint64_t es; + uint64_t fs; + uint64_t gs; } user_regs_struct64; typedef struct { - uint32_t bx; - uint32_t cx; - uint32_t dx; - uint32_t si; - uint32_t di; - uint32_t bp; - uint32_t ax; - uint32_t ds; - uint32_t es; - uint32_t fs; - uint32_t gs; - uint32_t orig_ax; - uint32_t ip; - uint32_t cs; - uint32_t flags; - uint32_t sp; - uint32_t ss; + uint32_t bx; + uint32_t cx; + uint32_t dx; + uint32_t si; + uint32_t di; + uint32_t bp; + uint32_t ax; + uint32_t ds; + uint32_t es; + uint32_t fs; + uint32_t gs; + uint32_t orig_ax; + uint32_t ip; + uint32_t cs; + uint32_t flags; + uint32_t sp; + uint32_t ss; } user_regs_struct32; /* @@ -73,22 +96,17 @@ typedef struct { short __is_native; /* use user_regs_native macro to check it */ } user_regs_struct_t; -#define NATIVE_MAGIC 0x0A -#define COMPAT_MAGIC 0x0C +#define NATIVE_MAGIC 0x0A +#define COMPAT_MAGIC 0x0C static inline bool user_regs_native(user_regs_struct_t *pregs) { return pregs->__is_native == NATIVE_MAGIC; } -#define get_user_reg(pregs, name) \ - ((user_regs_native(pregs)) ? \ - ((pregs)->native.name) : \ - ((pregs)->compat.name)) +#define get_user_reg(pregs, name) ((user_regs_native(pregs)) ? ((pregs)->native.name) : ((pregs)->compat.name)) -#define set_user_reg(pregs, name, val) \ - ((user_regs_native(pregs)) ? \ - ((pregs)->native.name = (val)) : \ - ((pregs)->compat.name = (val))) +#define set_user_reg(pregs, name, val) \ + ((user_regs_native(pregs)) ? 
((pregs)->native.name = (val)) : ((pregs)->compat.name = (val))) #if 0 typedef struct { @@ -109,12 +127,13 @@ typedef struct { typedef struct xsave_struct user_fpregs_struct_t; -#define REG_RES(regs) get_user_reg(&regs, ax) -#define REG_IP(regs) get_user_reg(&regs, ip) -#define REG_SP(regs) get_user_reg(&regs, sp) -#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax) +#define REG_RES(regs) get_user_reg(&regs, ax) +#define REG_IP(regs) get_user_reg(&regs, ip) +#define SET_REG_IP(regs, val) set_user_reg(&regs, ip, val) +#define REG_SP(regs) get_user_reg(&regs, sp) +#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax) -#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) +#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) /* * For x86_32 __NR_mmap inside the kernel represents old_mmap system @@ -124,4 +143,11 @@ typedef struct xsave_struct user_fpregs_struct_t; */ #define __NR32_mmap __NR32_mmap2 +extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); +#define compel_shstk_enabled __compel_shstk_enabled + +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h index 9f1bccdbe..caa784557 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h @@ -7,7 +7,7 @@ * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ +#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ #define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ @@ -16,7 +16,7 @@ #define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ #define
X86_EFLAGS_DF 0x00000400 /* Direction Flag */ #define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ #define X86_EFLAGS_NT 0x00004000 /* Nested Task */ #define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ #define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index 51ca023f7..4a2e67559 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -11,60 +11,60 @@ #define SIGFRAME_MAX_OFFSET 8 struct rt_sigcontext { - uint64_t r8; - uint64_t r9; - uint64_t r10; - uint64_t r11; - uint64_t r12; - uint64_t r13; - uint64_t r14; - uint64_t r15; - uint64_t rdi; - uint64_t rsi; - uint64_t rbp; - uint64_t rbx; - uint64_t rdx; - uint64_t rax; - uint64_t rcx; - uint64_t rsp; - uint64_t rip; - uint64_t eflags; - uint16_t cs; - uint16_t gs; - uint16_t fs; - uint16_t ss; - uint64_t err; - uint64_t trapno; - uint64_t oldmask; - uint64_t cr2; - uint64_t fpstate; - uint64_t reserved1[8]; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rdi; + uint64_t rsi; + uint64_t rbp; + uint64_t rbx; + uint64_t rdx; + uint64_t rax; + uint64_t rcx; + uint64_t rsp; + uint64_t rip; + uint64_t eflags; + uint16_t cs; + uint16_t gs; + uint16_t fs; + uint16_t ss; + uint64_t err; + uint64_t trapno; + uint64_t oldmask; + uint64_t cr2; + uint64_t fpstate; + uint64_t reserved1[8]; }; struct rt_sigcontext_32 { - uint32_t gs; - uint32_t fs; - uint32_t es; - uint32_t ds; - uint32_t di; - uint32_t si; - uint32_t bp; - uint32_t sp; - uint32_t bx; - uint32_t dx; - uint32_t cx; - uint32_t ax; - uint32_t trapno; - uint32_t err; - uint32_t ip; - uint32_t cs; - uint32_t flags; - uint32_t sp_at_signal; - uint32_t ss; + uint32_t gs; + uint32_t fs; + 
uint32_t es; + uint32_t ds; + uint32_t di; + uint32_t si; + uint32_t bp; + uint32_t sp; + uint32_t bx; + uint32_t dx; + uint32_t cx; + uint32_t ax; + uint32_t trapno; + uint32_t err; + uint32_t ip; + uint32_t cs; + uint32_t flags; + uint32_t sp_at_signal; + uint32_t ss; - uint32_t fpstate; - uint32_t oldmask; - uint32_t cr2; + uint32_t fpstate; + uint32_t oldmask; + uint32_t cr2; }; #include @@ -74,71 +74,70 @@ struct rt_sigcontext_32 { * when (if) other architectures will support compatible C/R */ -typedef uint32_t compat_uptr_t; -typedef uint32_t compat_size_t; -typedef uint32_t compat_sigset_word; +typedef uint32_t compat_uptr_t; +typedef uint32_t compat_size_t; +typedef uint32_t compat_sigset_word; typedef struct compat_siginfo { - int si_signo; - int si_errno; - int si_code; - int _pad[128/sizeof(int) - 3]; + int si_signo; + int si_errno; + int si_code; + int _pad[128 / sizeof(int) - 3]; } compat_siginfo_t; typedef struct compat_sigaltstack { - compat_uptr_t ss_sp; - int ss_flags; - compat_size_t ss_size; + compat_uptr_t ss_sp; + int ss_flags; + compat_size_t ss_size; } compat_stack_t; -#define _COMPAT_NSIG 64 -#define _COMPAT_NSIG_BPW 32 -#define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) +#define _COMPAT_NSIG 64 +#define _COMPAT_NSIG_BPW 32 +#define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) typedef struct { - compat_sigset_word sig[_COMPAT_NSIG_WORDS]; + compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; struct ucontext_ia32 { - unsigned int uc_flags; - unsigned int uc_link; - compat_stack_t uc_stack; - struct rt_sigcontext_32 uc_mcontext; - compat_sigset_t uc_sigmask; /* mask last for extensibility */ + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + struct rt_sigcontext_32 uc_mcontext; + compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; struct rt_sigframe_ia32 { - uint32_t pretcode; - int32_t sig; - uint32_t pinfo; - uint32_t puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; 
- char retcode[8]; + uint32_t pretcode; + int32_t sig; + uint32_t pinfo; + uint32_t puc; + compat_siginfo_t info; + struct ucontext_ia32 uc; + char retcode[8]; /* fp state follows here */ - fpu_state_t fpu_state; + fpu_state_t fpu_state; }; struct rt_sigframe_64 { - char *pretcode; - struct rt_ucontext uc; - struct rt_siginfo info; + char *pretcode; + struct rt_ucontext uc; + struct rt_siginfo info; /* fp state follows here */ - fpu_state_t fpu_state; + fpu_state_t fpu_state; }; struct rt_sigframe { union { - struct rt_sigframe_ia32 compat; - struct rt_sigframe_64 native; + struct rt_sigframe_ia32 compat; + struct rt_sigframe_64 native; }; bool is_native; }; -static inline -void rt_sigframe_copy_sigset(struct rt_sigframe *to, k_rtsigset_t *from) +static inline void rt_sigframe_copy_sigset(struct rt_sigframe *to, k_rtsigset_t *from) { size_t sz = sizeof(k_rtsigset_t); @@ -149,8 +148,7 @@ void rt_sigframe_copy_sigset(struct rt_sigframe *to, k_rtsigset_t *from) memcpy(&to->compat.uc.uc_sigmask, from, sz); } -static inline -void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) +static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) { size_t sz = sizeof(k_rtsigset_t); @@ -160,15 +158,11 @@ void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) memset(&sigframe->compat.uc.uc_sigmask, 0, sz); } -#define RT_SIGFRAME_REGIP(rt_sigframe) \ - ((rt_sigframe->is_native) ? \ - (rt_sigframe)->native.uc.uc_mcontext.rip : \ - (rt_sigframe)->compat.uc.uc_mcontext.ip) +#define RT_SIGFRAME_REGIP(rt_sigframe) \ + ((rt_sigframe->is_native) ? (rt_sigframe)->native.uc.uc_mcontext.rip : (rt_sigframe)->compat.uc.uc_mcontext.ip) -#define RT_SIGFRAME_FPU(rt_sigframe) \ - ((rt_sigframe->is_native) ? \ - (&(rt_sigframe)->native.fpu_state) : \ - (&(rt_sigframe)->compat.fpu_state)) +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ((rt_sigframe->is_native) ? 
(&(rt_sigframe)->native.fpu_state) : (&(rt_sigframe)->compat.fpu_state)) #define RT_SIGFRAME_HAS_FPU(rt_sigframe) (RT_SIGFRAME_FPU(rt_sigframe)->has_fpu) @@ -178,9 +172,28 @@ void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) * - compatible is in sys32_rt_sigreturn at arch/x86/ia32/ia32_signal.c * - native is in sys_rt_sigreturn at arch/x86/kernel/signal.c */ -#define RT_SIGFRAME_OFFSET(rt_sigframe) (((rt_sigframe)->is_native) ? 8 : 4 ) +#define RT_SIGFRAME_OFFSET(rt_sigframe) (((rt_sigframe)->is_native) ? 8 : 4) -#define USER32_CS 0x23 +#define USER32_CS 0x23 + +/* clang-format off */ +/* + * rst_sigreturn in restorer is noninline call which adds an entry to the + * shadow stack above the sigframe token; + * if shadow stack is enabled, increment the shadow stack pointer to remove + * that entry + */ +#define ARCH_SHSTK_POP() \ + asm volatile( \ + "xor %%rax, %%rax\n" \ + "rdsspq %%rax\n" \ + "cmpq $0, %%rax\n" \ + "jz 1f\n" \ + "movq $1, %%rax\n" \ + "incsspq %%rax\n" \ + "1:\n" \ + : : \ + : "rax") #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ @@ -194,7 +207,9 @@ void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define ARCH_RT_SIGRETURN_COMPAT(new_sp) \ asm volatile( \ "pushq $"__stringify(USER32_CS)" \n" \ - "pushq $1f \n" \ + "xor %%rax, %%rax \n" \ + "movl $1f, %%eax \n" \ + "pushq %%rax \n" \ "lretq \n" \ "1: \n" \ ".code32 \n" \ @@ -206,13 +221,23 @@ void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) : "rdi"(new_sp) \ : "eax", "r8", "r9", "r10", "r11", "memory") -#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ +#define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ +do { \ + if ((rt_sigframe)->is_native) { \ + ARCH_SHSTK_POP(); \ + ARCH_RT_SIGRETURN_NATIVE(new_sp); \ + } else \ + ARCH_RT_SIGRETURN_COMPAT(new_sp); \ +} while (0) + +#define ARCH_RT_SIGRETURN_DUMP(new_sp, rt_sigframe) \ do { \ if ((rt_sigframe)->is_native) \ - ARCH_RT_SIGRETURN_NATIVE(new_sp); \ + return new_sp; \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp);
\ } while (0) +/* clang-format off */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 11e7f4c91..afcf2c53b 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include #include @@ -21,29 +24,37 @@ #include "log.h" #ifndef NT_X86_XSTATE -#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ #endif + +#ifndef NT_X86_SHSTK +#define NT_X86_SHSTK 0x204 /* x86 shstk state */ +#endif + +#ifndef ARCH_SHSTK_STATUS +#define ARCH_SHSTK_STATUS 0x5005 +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#endif + #ifndef NT_PRSTATUS -#define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ +#define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ #endif /* * Injected syscall instruction */ const char code_syscall[] = { - 0x0f, 0x05, /* syscall */ - 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ + 0x0f, 0x05, /* syscall */ + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; const char code_int_80[] = { - 0xcd, 0x80, /* int $0x80 */ - 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ + 0xcd, 0x80, /* int $0x80 */ + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... 
*/ }; -static const int -code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); -static const int -code_int_80_aligned = round_up(sizeof(code_syscall), sizeof(long)); +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); +static const int code_int_80_aligned = round_up(sizeof(code_syscall), sizeof(long)); static inline __always_unused void __check_code_syscall(void) { @@ -54,22 +65,22 @@ static inline __always_unused void __check_code_syscall(void) /* 10-byte legacy floating point register */ struct fpreg { - uint16_t significand[4]; - uint16_t exponent; + uint16_t significand[4]; + uint16_t exponent; }; /* 16-byte floating point register */ struct fpxreg { - uint16_t significand[4]; - uint16_t exponent; - uint16_t padding[3]; + uint16_t significand[4]; + uint16_t exponent; + uint16_t padding[3]; }; -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) -#define FP_EXP_TAG_VALID 0 -#define FP_EXP_TAG_ZERO 1 -#define FP_EXP_TAG_SPECIAL 2 -#define FP_EXP_TAG_EMPTY 3 +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n)*16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 static inline uint32_t twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) { @@ -89,9 +100,7 @@ static inline uint32_t twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) tag = FP_EXP_TAG_SPECIAL; break; case 0x0000: - if (!st->significand[0] && - !st->significand[1] && - !st->significand[2] && + if (!st->significand[0] && !st->significand[1] && !st->significand[2] && !st->significand[3]) tag = FP_EXP_TAG_ZERO; else @@ -112,8 +121,7 @@ static inline uint32_t twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) return ret; } -void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, - struct i387_fxsave_struct *fxsave) +void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, struct i387_fxsave_struct *fxsave) { struct fpxreg *from = (struct fpxreg 
*)&fxsave->st_space[0]; struct fpreg *to = (struct fpreg *)env->st_space; @@ -137,16 +145,12 @@ void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, memcpy(&to[i], &from[i], sizeof(to[0])); } -int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, - user_regs_struct_t *regs, - user_fpregs_struct_t *fpregs) +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { bool is_native = user_regs_native(regs); - fpu_state_t *fpu_state = is_native ? - &sigframe->native.fpu_state : - &sigframe->compat.fpu_state; + fpu_state_t *fpu_state = is_native ? &sigframe->native.fpu_state : &sigframe->compat.fpu_state; if (is_native) { -#define cpreg64_native(d, s) sigframe->native.uc.uc_mcontext.d = regs->native.s +#define cpreg64_native(d, s) sigframe->native.uc.uc_mcontext.d = regs->native.s cpreg64_native(rdi, di); cpreg64_native(rsi, si); cpreg64_native(rbp, bp); @@ -170,7 +174,7 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, sigframe->is_native = true; #undef cpreg64_native } else { -#define cpreg32_compat(d) sigframe->compat.uc.uc_mcontext.d = regs->compat.d +#define cpreg32_compat(d) sigframe->compat.uc.uc_mcontext.d = regs->compat.d cpreg32_compat(gs); cpreg32_compat(fs); cpreg32_compat(es); @@ -203,34 +207,38 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, return 0; } -int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { - fpu_state_t *fpu_state = (sigframe->is_native) ? - &rsigframe->native.fpu_state : - &rsigframe->compat.fpu_state; + fpu_state_t *fpu_state = (sigframe->is_native) ? 
&rsigframe->native.fpu_state : &rsigframe->compat.fpu_state; if (sigframe->is_native) { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; if ((addr % 64ul)) { - pr_err("Unaligned address passed: %lx (native %d)\n", - addr, sigframe->is_native); + pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); return -1; } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; } else if (!sigframe->is_native) { - sigframe->compat.uc.uc_mcontext.fpstate = - (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; + sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; } return 0; } -#define get_signed_user_reg(pregs, name) \ - ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : \ - (int32_t)((pregs)->compat.name)) +#define get_signed_user_reg(pregs, name) \ + ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) + +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) +{ + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { @@ -244,27 +252,157 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) return -1; } - return 0; -} + if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { + // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. + // Since those are restored unconditionally, make sure the init values are + // filled by retrying with old PTRACE_GETFPREGS. 
+ // + // [1] Intel® 64 and IA-32 Architectures Software Developer's + // Manual Volume 1: Basic Architecture + // Section 13.6: Processor tracking of XSAVE-managed state + if (get_task_fpregs(pid, xsave)) + return -1; + } -static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) -{ - if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { - pr_perror("Can't obtain FPU registers for %d", pid); - return -1; + /* + * xsave may be on stack, if we don't clear it explicitly we get + * funky shadow stack state + */ + memset(&xsave->cet, 0, sizeof(xsave->cet)); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + unsigned long ssp = 0; + unsigned long features = 0; + + if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { + /* + * kernels that don't support shadow stack return + * -EINVAL + */ + if (errno == EINVAL) + return 0; + + pr_perror("shstk: can't get shadow stack status for %d", pid); + return -1; + } + + if (!(features & ARCH_SHSTK_SHSTK)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: can't get SSP for %d", pid); + return -1; + } + } + + xsave->cet.cet = features; + xsave->cet.ssp = ssp; + + pr_debug("%d: shstk: cet: %lx ssp: %lx\n", pid, xsave->cet.cet, xsave->cet.ssp); } return 0; } -int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, - void *arg, unsigned long flags) +static inline void fixup_mxcsr(struct xsave_struct *xsave) +{ + /* + * Right now xsave->i387.mxcsr filled with the random garbage, + * let's make it valid by applying mask which allows all + * features, except the denormals-are-zero feature bit. 
+ * + * See also fpu__init_system_mxcsr function: + * https://github.com/torvalds/linux/blob/8cb1ae19/arch/x86/kernel/fpu/init.c#L117 + */ + xsave->i387.mxcsr &= 0x0000ffbf; +} + +/* See arch/x86/kernel/fpu/xstate.c */ +static void validate_random_xstate(struct xsave_struct *xsave) +{ + struct xsave_hdr_struct *hdr = &xsave->xsave_hdr; + unsigned int i; + + /* No unknown or supervisor features may be set */ + hdr->xstate_bv &= XFEATURE_MASK_USER; + hdr->xstate_bv &= ~XFEATURE_MASK_SUPERVISOR; + hdr->xstate_bv &= XFEATURE_MASK_FAULTINJ; + + for (i = 0; i < XFEATURE_MAX; i++) { + if (!compel_fpu_has_feature(i)) + hdr->xstate_bv &= ~(1 << i); + } + + /* Userspace must use the uncompacted format */ + hdr->xcomp_bv = 0; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! + */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + memset(&hdr->reserved, 0, sizeof(hdr->reserved)); +} + +/* + * TODO: Put fault-injection under CONFIG_* and move + * extended regset corruption to generic code + */ +static int corrupt_extregs(pid_t pid) +{ + bool use_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); + user_fpregs_struct_t ext_regs; + int *rand_to = (int *)&ext_regs; + unsigned int seed, init_seed; + size_t i; + + init_seed = seed = time(NULL); + for (i = 0; i < sizeof(ext_regs) / sizeof(int); i++) + *rand_to++ = rand_r(&seed); + + /* + * Error log-level as: + * - not intended to be used outside of testing; + * - zdtm.py will grep it auto-magically from logs + * (and the seed will be known from automatic testing). + */ + pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? 
"xsave" : "fpuregs", pid, init_seed); + + fixup_mxcsr(&ext_regs); + + if (!use_xsave) { + if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { + pr_perror("Can't set FPU registers for %d", pid); + return -1; + } + } else { + struct iovec iov; + + validate_random_xstate((void *)&ext_regs); + + iov.iov_base = &ext_regs; + iov.iov_len = sizeof(ext_regs); + + if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { + pr_perror("Can't set xstate for %d", pid); + return -1; + } + } + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, + void *arg, unsigned long flags) { - user_fpregs_struct_t xsave = { }, *xs = NULL; int ret = -1; - pr_info("Dumping general registers for %d in %s mode\n", pid, - user_regs_native(regs) ? "native" : "compat"); + pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? "native" : "compat"); /* Did we come from a system call? */ if (get_signed_user_reg(regs, orig_ax) >= 0) { @@ -288,43 +426,65 @@ int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, /* * FPU fetched either via fxsave or via xsave, - * thus decode it accrodingly. + * thus decode it accordingly. 
*/ pr_info("Dumping GP/FPU registers for %d\n", pid); if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { - ret = get_task_fpregs(pid, &xsave); + ret = get_task_fpregs(pid, xs); } else if (unlikely(flags & INFECT_X86_PTRACE_MXCSR_BUG)) { /* * get_task_fpregs() will fill FP state, * get_task_xsave() will overwrite rightly sse/mmx/etc */ pr_warn("Skylake xsave fpu bug workaround used\n"); - ret = get_task_fpregs(pid, &xsave); + ret = get_task_fpregs(pid, xs); if (!ret) - ret = get_task_xsave(pid, &xsave); + ret = get_task_xsave(pid, xs); } else { - ret = get_task_xsave(pid, &xsave); + ret = get_task_xsave(pid, xs); } + if (!ret && unlikely(flags & INFECT_CORRUPT_EXTREGS)) + ret = corrupt_extregs(pid); + if (ret) goto err; - xs = &xsave; out: - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); err: return ret; } -int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4, - unsigned long arg5, - unsigned long arg6) +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { + if (ptrace(PTRACE_SETFPREGS, pid, NULL, ext_regs)) { + pr_perror("Can't set FPU registers for %d", pid); + return -1; + } + return 0; + } + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + + if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { + pr_perror("Can't set FPU registers for %d", pid); + return -1; + } + + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) { user_regs_struct_t regs = ctl->orig.regs; bool native = user_regs_native(&regs); @@ -333,51 +493,47 @@ int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, if (native) { user_regs_struct64 *r
= &regs.native; - r->ax = (uint64_t)nr; - r->di = arg1; - r->si = arg2; - r->dx = arg3; + r->ax = (uint64_t)nr; + r->di = arg1; + r->si = arg2; + r->dx = arg3; r->r10 = arg4; - r->r8 = arg5; - r->r9 = arg6; + r->r8 = arg5; + r->r9 = arg6; err = compel_execute_syscall(ctl, &regs, code_syscall); } else { user_regs_struct32 *r = &regs.compat; - r->ax = (uint32_t)nr; - r->bx = arg1; - r->cx = arg2; - r->dx = arg3; - r->si = arg4; - r->di = arg5; - r->bp = arg6; + r->ax = (uint32_t)nr; + r->bx = arg1; + r->cx = arg2; + r->dx = arg3; + r->si = arg4; + r->di = arg5; + r->bp = arg6; err = compel_execute_syscall(ctl, &regs, code_int_80); } - *ret = native ? - (long)get_user_reg(&regs, ax) : - (int)get_user_reg(&regs, ax); + *ret = native ? (long)get_user_reg(&regs, ax) : (int)get_user_reg(&regs, ax); return err; } -void *remote_mmap(struct parasite_ctl *ctl, - void *addr, size_t length, int prot, - int flags, int fd, off_t offset) +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { long map; int err; bool compat_task = !user_regs_native(&ctl->orig.regs); - err = compel_syscall(ctl, __NR(mmap, compat_task), &map, - (unsigned long)addr, length, prot, flags, fd, offset); + err = compel_syscall(ctl, __NR(mmap, compat_task), &map, (unsigned long)addr, length, prot, flags, fd, offset); if (err < 0) return NULL; if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC)) { pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, " - "check selinux execmem policy\n", ctl->rpid); + "check selinux execmem policy\n", + ctl->rpid); return NULL; } if (IS_ERR_VALUE(map)) { @@ -402,18 +558,17 @@ void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t * { set_user_reg(regs, ip, new_ip); if (stack) - set_user_reg(regs, sp, (unsigned long) stack); + set_user_reg(regs, sp, (unsigned long)stack); /* Avoid end of syscall processing */ set_user_reg(regs, orig_ax, -1); /* Make sure flags are in known state */ - set_user_reg(regs,
flags, get_user_reg(regs, flags) & - ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF)); + set_user_reg(regs, flags, get_user_reg(regs, flags) & ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF)); } -#define USER32_CS 0x23 -#define USER_CS 0x33 +#define USER32_CS 0x23 +#define USER_CS 0x33 static bool ldt_task_selectors(pid_t pid) { @@ -469,54 +624,58 @@ bool arch_can_dump_task(struct parasite_ctl *ctl) int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) { int native = compel_mode_native(ctl); - void *where = native ? - (void *)&s->native.uc.uc_stack : - (void *)&s->compat.uc.uc_stack; + void *where = native ? (void *)&s->native.uc.uc_stack : (void *)&s->compat.uc.uc_stack; long ret; int err; - err = compel_syscall(ctl, __NR(sigaltstack, !native), - &ret, 0, (unsigned long)where, - 0, 0, 0, 0); + err = compel_syscall(ctl, __NR(sigaltstack, !native), &ret, 0, (unsigned long)where, 0, 0, 0, 0); return err ? err : ret; } /* Copied from the gdb header gdb/nat/x86-dregs.h */ /* Debug registers' indices. */ -#define DR_FIRSTADDR 0 -#define DR_LASTADDR 3 -#define DR_NADDR 4 /* The number of debug address registers. */ -#define DR_STATUS 6 /* Index of debug status register (DR6). */ -#define DR_CONTROL 7 /* Index of debug control register (DR7). */ +#define DR_FIRSTADDR 0 +#define DR_LASTADDR 3 +#define DR_NADDR 4 /* The number of debug address registers. */ +#define DR_STATUS 6 /* Index of debug status register (DR6). */ +#define DR_CONTROL 7 /* Index of debug control register (DR7). */ -#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */ -#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ -#define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */ +#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */ +#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ +#define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. 
*/ /* Locally enable the break/watchpoint in the I'th debug register. */ #define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i))) int ptrace_set_breakpoint(pid_t pid, void *addr) { + k_rtsigset_t block; int ret; /* Set a breakpoint */ - if (ptrace(PTRACE_POKEUSER, pid, - offsetof(struct user, u_debugreg[DR_FIRSTADDR]), - addr)) { + if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_FIRSTADDR]), addr)) { pr_perror("Unable to setup a breakpoint into %d", pid); return -1; } /* Enable the breakpoint */ - if (ptrace(PTRACE_POKEUSER, pid, - offsetof(struct user, u_debugreg[DR_CONTROL]), - X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { + if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), + X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { pr_perror("Unable to enable the breakpoint for %d", pid); return -1; } + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. + */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); @@ -529,9 +688,7 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) int ptrace_flush_breakpoints(pid_t pid) { /* Disable the breakpoint */ - if (ptrace(PTRACE_POKEUSER, pid, - offsetof(struct user, u_debugreg[DR_CONTROL]), - 0)) { + if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), 0)) { pr_perror("Unable to disable the breakpoint for %d", pid); return -1; } @@ -563,8 +720,7 @@ int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs) } pr_err("PTRACE_GETREGSET read %zu bytes for pid %d, but native/compat regs sizes are %zu/%zu bytes\n", - iov.iov_len, pid, - sizeof(regs->native), sizeof(regs->compat)); + iov.iov_len, pid, sizeof(regs->native), 
sizeof(regs->compat)); return -1; } @@ -582,11 +738,70 @@ int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs) return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); } -#define TASK_SIZE ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE ((1UL << 47) - PAGE_SIZE) /* * Task size may be limited to 3G but we need a * higher limit, because it's backward compatible. */ -#define TASK_SIZE_IA32 (0xffffe000) +#define TASK_SIZE_IA32 (0xffffe000) -unsigned long compel_task_size(void) { return TASK_SIZE; } +unsigned long compel_task_size(void) +{ + return TASK_SIZE; +} + +bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return false; + + if (ext_regs->cet.cet & ARCH_SHSTK_SHSTK) + return true; + + return false; +} + +int parasite_setup_shstk(struct parasite_ctl *ctl, __maybe_unused user_fpregs_struct_t *ext_regs) +{ + pid_t pid = ctl->rpid; + unsigned long sa_restorer = ctl->parasite_ip; + unsigned long long ssp; + unsigned long token; + struct iovec iov; + + if (!compel_shstk_enabled(ext_regs)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: %d: cannot get SSP", pid); + return -1; + } + } + + /* The token is for 64-bit */ + token = ALIGN_DOWN(ssp, 8); + token |= (1UL << 63); + ssp = ALIGN_DOWN(ssp, 8) - 8; + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, token)) { + pr_perror("shstk: %d: failed to inject shadow stack token", pid); + return -1; + } + + ssp = ssp - sizeof(uint64_t); + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, sa_restorer)) { + pr_perror("shstk: %d: failed to inject restorer address", pid); + return -1; + } + + ssp = ssp + sizeof(uint64_t); + if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + pr_perror("shstk: %d: cannot write SSP", pid); + return -1; + } + + return 0; +} diff 
--git a/compel/arch/x86/src/lib/thread_area.c b/compel/arch/x86/src/lib/thread_area.c new file mode 100644 index 000000000..271d89dcd --- /dev/null +++ b/compel/arch/x86/src/lib/thread_area.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include "log.h" +#include "asm/infect-types.h" +#include "infect.h" +#include "infect-priv.h" + +#ifndef PTRACE_GET_THREAD_AREA +#define PTRACE_GET_THREAD_AREA 25 +#endif + +/* + * For 64-bit applications, TLS (fs_base for Glibc) is in MSR, + * which are dumped with the help of ptrace() and restored with + * arch_prctl(ARCH_SET_FS/ARCH_SET_GS). + * + * But SET_FS_BASE will update GDT if base pointer fits in 4 bytes. + * Otherwise it will set only MSR, which allows for mixed 64/32-bit + * code to use: 2 MSRs as TLS base _and_ 3 GDT entries. + * Having in sum 5 TLS pointers, 3 of which are four bytes and + * other two eight bytes: + * struct thread_struct { + * struct desc_struct tls_array[3]; + * ... + * #ifdef CONFIG_X86_64 + * unsigned long fsbase; + * unsigned long gsbase; + * #endif + * ... + * }; + * + * Most x86_64 applications don't use GDT, but mixed code (i.e. Wine) + * can use it. Be pessimistic and dump it for 64-bit applications too. + */ +int __compel_arch_fetch_thread_area(int tid, struct thread_ctx *th) +{ + bool native_mode = user_regs_native(&th->regs); + tls_t *ptls = &th->tls; + int err, i; + + /* Initialise as not present by default */ + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { + user_desc_t *d = &ptls->desc[i]; + + memset(d, 0, sizeof(user_desc_t)); + d->seg_not_present = 1; + d->entry_number = GDT_ENTRY_TLS_MIN + i; + } + + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { + user_desc_t *d = &ptls->desc[i]; + + err = ptrace(PTRACE_GET_THREAD_AREA, tid, GDT_ENTRY_TLS_MIN + i, d); + if (err) { + /* + * Ignoring absent syscall on !CONFIG_IA32_EMULATION + * where such mixed code can't run. 
+ * XXX: Add compile CONFIG_X86_IGNORE_64BIT_TLS + * (for x86_64 systems with CONFIG_IA32_EMULATION) + */ + if (errno == EIO && native_mode) + return 0; + + pr_perror("get_thread_area failed for %d", tid); + return err; + } + } + + return 0; +} + +int compel_arch_fetch_thread_area(struct parasite_thread_ctl *tctl) +{ + return __compel_arch_fetch_thread_area(tctl->tid, &tctl->th); +} + +void compel_arch_get_tls_task(struct parasite_ctl *ctl, tls_t *out) +{ + memcpy(out, &ctl->orig.tls, sizeof(tls_t)); +} + +void compel_arch_get_tls_thread(struct parasite_thread_ctl *tctl, tls_t *out) +{ + memcpy(out, &tctl->th.tls, sizeof(tls_t)); +} diff --git a/compel/include/elf32-types.h b/compel/include/elf32-types.h index b516ba17e..8f6c59960 100644 --- a/compel/include/elf32-types.h +++ b/compel/include/elf32-types.h @@ -1,16 +1,16 @@ #ifndef COMPEL_ELF32_TYPES_H__ #define COMPEL_ELF32_TYPES_H__ -#define Elf_Ehdr Elf32_Ehdr -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Rel Elf32_Rel -#define Elf_Rela Elf32_Rela +#define Elf_Ehdr Elf32_Ehdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela -#define ELF_ST_TYPE ELF32_ST_TYPE -#define ELF_ST_BIND ELF32_ST_BIND +#define ELF_ST_TYPE ELF32_ST_TYPE +#define ELF_ST_BIND ELF32_ST_BIND -#define ELF_R_SYM ELF32_R_SYM -#define ELF_R_TYPE ELF32_R_TYPE +#define ELF_R_SYM ELF32_R_SYM +#define ELF_R_TYPE ELF32_R_TYPE #endif /* COMPEL_ELF32_TYPES_H__ */ diff --git a/compel/include/elf64-types.h b/compel/include/elf64-types.h index c4d5f1c72..ce11a2a61 100644 --- a/compel/include/elf64-types.h +++ b/compel/include/elf64-types.h @@ -1,16 +1,16 @@ #ifndef COMPEL_ELF64_TYPES_H__ #define COMPEL_ELF64_TYPES_H__ -#define Elf_Ehdr Elf64_Ehdr -#define Elf_Shdr Elf64_Shdr -#define Elf_Sym Elf64_Sym -#define Elf_Rel Elf64_Rel -#define Elf_Rela Elf64_Rela +#define Elf_Ehdr Elf64_Ehdr +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define Elf_Rel Elf64_Rel 
+#define Elf_Rela Elf64_Rela -#define ELF_ST_TYPE ELF64_ST_TYPE -#define ELF_ST_BIND ELF64_ST_BIND +#define ELF_ST_TYPE ELF64_ST_TYPE +#define ELF_ST_BIND ELF64_ST_BIND -#define ELF_R_SYM ELF64_R_SYM -#define ELF_R_TYPE ELF64_R_TYPE +#define ELF_R_SYM ELF64_R_SYM +#define ELF_R_TYPE ELF64_R_TYPE #endif /* COMPEL_ELF64_TYPES_H__ */ diff --git a/compel/include/errno.h b/compel/include/errno.h index d41fd5391..b4ad1f086 100644 --- a/compel/include/errno.h +++ b/compel/include/errno.h @@ -1,9 +1,9 @@ #ifndef __COMPEL_ERRNO_H__ #define __COMPEL_ERRNO_H__ -#define ERESTARTSYS 512 -#define ERESTARTNOINTR 513 -#define ERESTARTNOHAND 514 -#define ERESTART_RESTARTBLOCK 516 +#define ERESTARTSYS 512 +#define ERESTARTNOINTR 513 +#define ERESTARTNOHAND 514 +#define ERESTART_RESTARTBLOCK 516 #endif /* __CR_ERRNO_H__ */ diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h index ec6dd455d..8e78a7f6c 100644 --- a/compel/include/infect-priv.h +++ b/compel/include/infect-priv.h @@ -3,69 +3,79 @@ #include -#define BUILTIN_SYSCALL_SIZE 8 +#define BUILTIN_SYSCALL_SIZE 8 struct thread_ctx { - k_rtsigset_t sigmask; - user_regs_struct_t regs; + k_rtsigset_t sigmask; + user_regs_struct_t regs; +#ifdef ARCH_HAS_PTRACE_GET_THREAD_AREA + tls_t tls; +#endif + user_fpregs_struct_t ext_regs; }; /* parasite control block */ struct parasite_ctl { - int rpid; /* Real pid of the victim */ - void *remote_map; - void *local_map; - void *sigreturn_addr; /* A place for the breakpoint */ - unsigned long map_length; + int rpid; /* Real pid of the victim */ + void *remote_map; + void *local_map; + void *sigreturn_addr; /* A place for the breakpoint */ + unsigned long map_length; - struct infect_ctx ictx; + struct infect_ctx ictx; /* thread leader data */ - bool daemonized; + bool daemonized; - struct thread_ctx orig; + struct thread_ctx orig; - void *rstack; /* thread leader stack*/ - struct rt_sigframe *sigframe; - struct rt_sigframe *rsigframe; /* address in a parasite */ + void 
*rstack; /* thread leader stack*/ + struct rt_sigframe *sigframe; + struct rt_sigframe *rsigframe; /* address in a parasite */ - void *r_thread_stack; /* stack for non-leader threads */ + void *r_thread_stack; /* stack for non-leader threads */ - unsigned long parasite_ip; /* service routine start ip */ + unsigned long parasite_ip; /* service routine start ip */ - unsigned int *addr_cmd; /* addr for command */ - void *addr_args; /* address for arguments */ - unsigned long args_size; - int tsock; /* transport socket for transferring fds */ + unsigned int *cmd; /* address for command */ + void *args; /* address for arguments */ + unsigned long args_size; + int tsock; /* transport socket for transferring fds */ struct parasite_blob_desc pblob; }; struct parasite_thread_ctl { - int tid; - struct parasite_ctl *ctl; - struct thread_ctx th; + int tid; + struct parasite_ctl *ctl; + struct thread_ctx th; }; -#define MEMFD_FNAME "CRIUMFD" -#define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME) +#define MEMFD_FNAME "CRIUMFD" +#define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME) struct ctl_msg; int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m); extern void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs); -extern void *remote_mmap(struct parasite_ctl *ctl, - void *addr, size_t length, int prot, - int flags, int fd, off_t offset); +extern void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, + off_t offset); extern bool arch_can_dump_task(struct parasite_ctl *ctl); -extern int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, - void *arg, unsigned long flags); +/* + * @regs: general purpose registers + * @ext_regs: extended register set (fpu/mmx/sse/etc) + * for task that is NULL, restored by sigframe on rt_sigreturn() + * @save: callback to dump all info + * @flags: see INFECT_* in infect_ctx::flags + * @pid: mystery + */ +extern int compel_get_task_regs(pid_t pid, 
user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, unsigned long flags); +extern int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs); +extern int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs); extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); -extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, - user_regs_struct_t *regs, +extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs); -extern int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe); -extern int compel_execute_syscall(struct parasite_ctl *ctl, - user_regs_struct_t *regs, const char *code_syscall); +extern int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); +extern int compel_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs, const char *code_syscall); #endif diff --git a/compel/include/log.h b/compel/include/log.h index 559f909ce..5250622c8 100644 --- a/compel/include/log.h +++ b/compel/include/log.h @@ -1,64 +1,53 @@ #ifndef COMPEL_LOG_H__ #define COMPEL_LOG_H__ -#include "uapi/compel/compel.h" -#include "uapi/compel/loglevels.h" +#include <errno.h> +#include <string.h> + +#include "uapi/compel/log.h" #ifndef LOG_PREFIX -# define LOG_PREFIX +#define LOG_PREFIX #endif static inline int pr_quelled(unsigned int loglevel) { - return compel_log_get_loglevel() < loglevel - && loglevel != COMPEL_LOG_MSG; + return compel_log_get_loglevel() < loglevel && loglevel != COMPEL_LOG_MSG; } -extern void compel_print_on_level(unsigned int loglevel, - const char *format, ...) - __attribute__ ((__format__ (__printf__, 2, 3))); +extern void compel_print_on_level(unsigned int loglevel, const char *format, ...) + __attribute__((__format__(__printf__, 2, 3))); -#define pr_msg(fmt, ...)
\ - compel_print_on_level(COMPEL_LOG_MSG, \ - fmt, ##__VA_ARGS__) +#define pr_msg(fmt, ...) compel_print_on_level(COMPEL_LOG_MSG, fmt, ##__VA_ARGS__) -#define pr_info(fmt, ...) \ - compel_print_on_level(COMPEL_LOG_INFO, \ - LOG_PREFIX fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) compel_print_on_level(COMPEL_LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) -#define pr_err(fmt, ...) \ - compel_print_on_level(COMPEL_LOG_ERROR, \ - "Error (%s:%d): " LOG_PREFIX fmt, \ - __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_err(fmt, ...) \ + compel_print_on_level(COMPEL_LOG_ERROR, "Error (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) -#define pr_err_once(fmt, ...) \ - do { \ - static bool __printed; \ - if (!__printed) { \ - pr_err(fmt, ##__VA_ARGS__); \ - __printed = 1; \ - } \ +#define pr_err_once(fmt, ...) \ + do { \ + static bool __printed; \ + if (!__printed) { \ + pr_err(fmt, ##__VA_ARGS__); \ + __printed = 1; \ + } \ } while (0) -#define pr_warn(fmt, ...) \ - compel_print_on_level(COMPEL_LOG_WARN, \ - "Warn (%s:%d): " LOG_PREFIX fmt, \ - __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_warn(fmt, ...) \ + compel_print_on_level(COMPEL_LOG_WARN, "Warn (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) -#define pr_warn_once(fmt, ...) \ - do { \ - static bool __printed; \ - if (!__printed) { \ - pr_warn(fmt, ##__VA_ARGS__); \ - __printed = 1; \ - } \ +#define pr_warn_once(fmt, ...) \ + do { \ + static bool __printed; \ + if (!__printed) { \ + pr_warn(fmt, ##__VA_ARGS__); \ + __printed = 1; \ + } \ } while (0) -#define pr_debug(fmt, ...) \ - compel_print_on_level(COMPEL_LOG_DEBUG, \ - LOG_PREFIX fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) compel_print_on_level(COMPEL_LOG_DEBUG, LOG_PREFIX fmt, ##__VA_ARGS__) -#define pr_perror(fmt, ...) \ - pr_err(fmt ": %m\n", ##__VA_ARGS__) +#define pr_perror(fmt, ...) 
pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) #endif /* COMPEL_LOG_H__ */ diff --git a/compel/include/piegen.h b/compel/include/piegen.h index fd72f9c22..d445d5349 100644 --- a/compel/include/piegen.h +++ b/compel/include/piegen.h @@ -9,19 +9,19 @@ #include "common/compiler.h" typedef struct { - char *input_filename; - char *output_filename; - char *prefix; - FILE *fout; + char *input_filename; + char *output_filename; + char *prefix; + FILE *fout; } piegen_opt_t; extern piegen_opt_t opts; -#define pr_out(fmt, ...) \ -do { \ - if (opts.fout) \ - fprintf(opts.fout, fmt, ##__VA_ARGS__); \ -} while (0) +#define pr_out(fmt, ...) \ + do { \ + if (opts.fout) \ + fprintf(opts.fout, fmt, ##__VA_ARGS__); \ + } while (0) extern int handle_binary(void *mem, size_t size); diff --git a/compel/include/ptrace.h b/compel/include/ptrace.h index 01f55c45a..00013f937 100644 --- a/compel/include/ptrace.h +++ b/compel/include/ptrace.h @@ -5,7 +5,9 @@ #include #include -#define PTRACE_SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8) +#define PTRACE_SYSCALL_TRAP 0x80 + +#define PTRACE_SI_EVENT(_si_code) (((_si_code)&0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); extern int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs); diff --git a/compel/include/rpc-pie-priv.h b/compel/include/rpc-pie-priv.h index 15f5b14ca..5a6b337b2 100644 --- a/compel/include/rpc-pie-priv.h +++ b/compel/include/rpc-pie-priv.h @@ -1,16 +1,22 @@ #ifndef __COMPEL_RPC_H__ #define __COMPEL_RPC_H__ struct ctl_msg { - uint32_t cmd; /* command itself */ - uint32_t ack; /* ack on command */ - int32_t err; /* error code on reply */ + uint32_t cmd; /* command itself */ + uint32_t ack; /* ack on command */ + int32_t err; /* error code on reply */ }; -#define ctl_msg_cmd(_cmd) \ - (struct ctl_msg){.cmd = _cmd, } +#define ctl_msg_cmd(_cmd) \ + (struct ctl_msg) \ + { \ + .cmd = _cmd, \ + } -#define ctl_msg_ack(_cmd, _err) \ - (struct ctl_msg){.cmd = _cmd, .ack = _cmd, .err = _err, } 
+#define ctl_msg_ack(_cmd, _err) \ + (struct ctl_msg) \ + { \ + .cmd = _cmd, .ack = _cmd, .err = _err, \ + } /* * NOTE: each command's args should be arch-independed sized. @@ -18,7 +24,7 @@ struct ctl_msg { * alternative type for compatible tasks in parasite-compat.h */ enum { - PARASITE_CMD_IDLE = 0, + PARASITE_CMD_IDLE = 0, PARASITE_CMD_ACK, PARASITE_CMD_INIT_DAEMON, @@ -32,19 +38,19 @@ enum { }; struct parasite_init_args { - int32_t h_addr_len; - struct sockaddr_un h_addr; - int32_t log_level; - uint64_t sigreturn_addr; - uint64_t sigframe; /* pointer to sigframe */ - futex_t daemon_connected; + int32_t h_addr_len; + struct sockaddr_un h_addr; + int32_t log_level; + uint64_t sigreturn_addr; + uint64_t sigframe; /* pointer to sigframe */ + futex_t daemon_connected; #ifdef ARCH_HAS_LONG_PAGES - uint32_t page_size; + uint32_t page_size; #endif }; struct parasite_unmap_args { - uint64_t parasite_start; - uint64_t parasite_len; + uint64_t parasite_start; + uint64_t parasite_len; }; #endif diff --git a/compel/include/shmem.h b/compel/include/shmem.h index b6f994617..a38599625 100644 --- a/compel/include/shmem.h +++ b/compel/include/shmem.h @@ -7,4 +7,3 @@ struct shmem_plugin_msg { }; #endif /* __COMPEL_PLUGIN_SHMEM_PRIV_H__ */ - diff --git a/compel/include/uapi/compel.h b/compel/include/uapi/compel.h deleted file mode 100644 index 318a472da..000000000 --- a/compel/include/uapi/compel.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef UAPI_COMPEL_H__ -#define UAPI_COMPEL_H__ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#endif /* UAPI_COMPEL_H__ */ diff --git a/compel/include/uapi/cpu.h b/compel/include/uapi/cpu.h index 6f827d447..72c8a516c 100644 --- a/compel/include/uapi/cpu.h +++ b/compel/include/uapi/cpu.h @@ -6,7 +6,7 @@ #include -extern int compel_cpuid(compel_cpuinfo_t *info); +extern int /* TODO: __must_check */ compel_cpuid(compel_cpuinfo_t *info); extern bool compel_cpu_has_feature(unsigned int feature); extern bool 
compel_fpu_has_feature(unsigned int feature); extern uint32_t compel_fpu_feature_size(unsigned int feature); diff --git a/compel/include/uapi/handle-elf.h b/compel/include/uapi/handle-elf.h index ddeecb0d5..3b6d5bfee 100644 --- a/compel/include/uapi/handle-elf.h +++ b/compel/include/uapi/handle-elf.h @@ -1,15 +1,22 @@ #ifndef __COMPEL_UAPI_HANDLE_ELF__ #define __COMPEL_UAPI_HANDLE_ELF__ -#define COMPEL_TYPE_INT (1u << 0) -#define COMPEL_TYPE_LONG (1u << 1) -#define COMPEL_TYPE_GOTPCREL (1u << 2) - +#define COMPEL_TYPE_INT (1u << 0) +#define COMPEL_TYPE_LONG (1u << 1) +#define COMPEL_TYPE_GOTPCREL (1u << 2) +#ifdef CONFIG_MIPS +#define COMPEL_TYPE_MIPS_26 (1u << 3) +#define COMPEL_TYPE_MIPS_HI16 (1u << 4) +#define COMPEL_TYPE_MIPS_LO16 (1u << 5) +#define COMPEL_TYPE_MIPS_HIGHER (1u << 6) +#define COMPEL_TYPE_MIPS_HIGHEST (1u << 7) +#define COMPEL_TYPE_MIPS_64 (1u << 8) +#endif typedef struct { - unsigned int offset; - unsigned int type; - long addend; - long value; + unsigned int offset; + unsigned int type; + long addend; + long value; } compel_reloc_t; #endif diff --git a/compel/include/uapi/infect-rpc.h b/compel/include/uapi/infect-rpc.h index 0176c1142..d0f853859 100644 --- a/compel/include/uapi/infect-rpc.h +++ b/compel/include/uapi/infect-rpc.h @@ -5,13 +5,14 @@ #include #include +#include + struct parasite_ctl; -extern int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); -extern int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); -extern int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int __must_check compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int __must_check compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); +extern int __must_check compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); extern int compel_rpc_sock(struct parasite_ctl *ctl); -#define PARASITE_USER_CMDS 64 - +#define PARASITE_USER_CMDS 64 #endif diff --git a/compel/include/uapi/infect-util.h 
b/compel/include/uapi/infect-util.h index 7307ba57a..658df9393 100644 --- a/compel/include/uapi/infect-util.h +++ b/compel/include/uapi/infect-util.h @@ -1,6 +1,24 @@ #ifndef __COMPEL_INFECT_UTIL_H__ #define __COMPEL_INFECT_UTIL_H__ + +#include "common/compiler.h" + +/** + * The length of the hash is based on what libuuid provides. + * According to the manpage this is: + * + * The uuid_unparse() function converts the supplied UUID uu from the binary + * representation into a 36-byte string (plus trailing '\0') + */ +#define RUN_ID_HASH_LENGTH 37 + +/* + * compel_run_id is a unique value of the current run. It can be used to + * generate resource ID-s to avoid conflicts with other processes. + */ +extern char compel_run_id[RUN_ID_HASH_LENGTH]; + struct parasite_ctl; -extern int compel_util_send_fd(struct parasite_ctl *ctl, int fd); +extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); extern int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd); #endif diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 08beaffcd..d21c261b7 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -11,62 +11,74 @@ #include "common/compiler.h" -#define PARASITE_START_AREA_MIN (4096) +#define PARASITE_START_AREA_MIN (4096) -extern int compel_interrupt_task(int pid); +#define PARASITE_STACK_SIZE (16 << 10) +/* + * A stack redzone is a small, protected region of memory located immediately + * after a parasite stack. It is intended to remain unchanged. While it can be + * implemented as a guard page, we want to avoid the overhead of additional + * remote system calls. 
+ */ +#define PARASITE_STACK_REDZONE 128 + +extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { - unsigned long long sigpnd; - unsigned long long shdpnd; - char state; - int ppid; - int seccomp_mode; + unsigned long long sigpnd; + unsigned long long shdpnd; + unsigned long long sigblk; + char state; + int vpid; + int ppid; + int seccomp_mode; }; -extern int compel_wait_task(int pid, int ppid, - int (*get_status)(int pid, struct seize_task_status *, void *data), - void (*free_status)(int pid, struct seize_task_status *, void *data), - struct seize_task_status *st, void *data); +extern int __must_check compel_wait_task(int pid, int ppid, + int (*get_status)(int pid, struct seize_task_status *, void *data), + void (*free_status)(int pid, struct seize_task_status *, void *data), + struct seize_task_status *st, void *data); -extern int compel_stop_task(int pid); +extern int __must_check compel_stop_task(int pid); +extern int __must_check compel_parse_stop_signo(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); +extern int compel_resume_task_sig(pid_t pid, int orig_state, int state, int stop_signo); struct parasite_ctl; struct parasite_thread_ctl; -extern struct parasite_ctl *compel_prepare(int pid); -extern struct parasite_ctl *compel_prepare_noctx(int pid); -extern int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); -extern struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid); +extern struct parasite_ctl __must_check *compel_prepare(int pid); +extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); +extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); +extern int __must_check compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, + unsigned long args_size); +extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct 
parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); -extern int compel_stop_daemon(struct parasite_ctl *ctl); -extern int compel_cure_remote(struct parasite_ctl *ctl); -extern int compel_cure_local(struct parasite_ctl *ctl); -extern int compel_cure(struct parasite_ctl *ctl); +extern int __must_check compel_start_daemon(struct parasite_ctl *ctl); +extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); +extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); +extern int __must_check compel_cure_local(struct parasite_ctl *ctl); +extern int __must_check compel_cure(struct parasite_ctl *ctl); -#define PARASITE_ARG_SIZE_MIN ( 1 << 12) +#define PARASITE_ARG_SIZE_MIN (1 << 12) -#define compel_parasite_args(ctl, type) \ - ({ \ - void *___ret; \ - BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \ - ___ret = compel_parasite_args_p(ctl); \ - ___ret; \ +#define compel_parasite_args(ctl, type) \ + ({ \ + void *___ret; \ + BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \ + ___ret = compel_parasite_args_p(ctl); \ + ___ret; \ }) extern void *compel_parasite_args_p(struct parasite_ctl *ctl); extern void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size); -extern int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4, - unsigned long arg5, - unsigned long arg6); -extern int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); -extern int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); +extern int __must_check compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, + unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, + unsigned long arg6); +extern int __must_check compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); +extern int __must_check 
compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); /* * The PTRACE_SYSCALL will trap task twice -- on @@ -80,12 +92,11 @@ enum trace_flags { TRACE_EXIT, }; -extern int compel_stop_on_syscall(int tasks, int sys_nr, - int sys_nr_compat, enum trace_flags trace); +extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat); -extern int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); +extern int __must_check compel_stop_pie(pid_t pid, void *addr, bool no_bp); -extern int compel_unmap(struct parasite_ctl *ctl, unsigned long addr); +extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); extern int compel_mode_native(struct parasite_ctl *ctl); @@ -94,78 +105,116 @@ extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; -typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) - __attribute__ ((__format__ (__printf__, 3, 4))); -typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); +typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) __attribute__((__format__(__printf__, 3, 4))); +typedef int (*save_regs_t)(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); struct infect_ctx { - int sock; + int sock; /* * Regs manipulation context. */ - save_regs_t save_regs; - make_sigframe_t make_sigframe; + save_regs_t save_regs; + make_sigframe_t make_sigframe; void *regs_arg; - unsigned long task_size; - unsigned long syscall_ip; /* entry point of infection */ - unsigned long flags; /* fine-tune (e.g. faults) */ + unsigned long task_size; + unsigned long syscall_ip; /* entry point of infection */ + unsigned long flags; /* fine-tune (e.g. 
faults) */ - void (*child_handler)(int, siginfo_t *, void *); /* hander for SIGCHLD deaths */ - struct sigaction orig_handler; + void (*child_handler)(int, siginfo_t *, void *); /* handler for SIGCHLD deaths */ + struct sigaction orig_handler; open_proc_fn open_proc; - int log_fd; /* fd for parasite code to send messages to */ + int log_fd; /* fd for parasite code to send messages to */ + unsigned long remote_map_addr; /* User-specified address where to mmap parasitic code, default not set */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); /* Don't use memfd() */ -#define INFECT_NO_MEMFD (1UL << 0) +#define INFECT_NO_MEMFD (1UL << 0) /* Make parasite connect() fail */ -#define INFECT_FAIL_CONNECT (1UL << 1) +#define INFECT_FAIL_CONNECT (1UL << 1) /* No breakpoints in pie tracking */ -#define INFECT_NO_BREAKPOINTS (1UL << 2) +#define INFECT_NO_BREAKPOINTS (1UL << 2) /* Can run parasite inside compat tasks */ -#define INFECT_COMPATIBLE (1UL << 3) +#define INFECT_COMPATIBLE (1UL << 3) /* Workaround for ptrace bug on Skylake CPUs with kernels older than v4.14 */ -#define INFECT_X86_PTRACE_MXCSR_BUG (1UL << 4) +#define INFECT_X86_PTRACE_MXCSR_BUG (1UL << 4) +/* After infecting - corrupt extended registers (fault-injection) */ +#define INFECT_CORRUPT_EXTREGS (1UL << 5) /* * There are several ways to describe a blob to compel * library. The simplest one derived from criu is to * provide it from .h files. 
*/ -#define COMPEL_BLOB_CHEADER 0x1 +#define COMPEL_BLOB_CHEADER 0x1 struct parasite_blob_desc { - unsigned parasite_type; + unsigned parasite_type; union { struct { - const void *mem; - size_t bsize; - size_t nr_gotpcrel; - unsigned long parasite_ip_off; - unsigned long addr_cmd_off; - unsigned long addr_arg_off; - compel_reloc_t *relocs; - unsigned int nr_relocs; + const void *mem; + size_t bsize; + unsigned long parasite_ip_off; + unsigned long cmd_off; + unsigned long args_ptr_off; + unsigned long got_off; + unsigned long args_off; + unsigned long data_off; + compel_reloc_t *relocs; + unsigned int nr_relocs; } hdr; }; }; extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *); -extern int compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); +extern int __must_check compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); -extern void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs); +extern void compel_relocs_apply(void *mem, void *vbase, struct parasite_blob_desc *pbd); +extern void compel_relocs_apply_mips(void *mem, void *vbase, struct parasite_blob_desc *pbd); extern unsigned long compel_task_size(void); extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl); +extern uint64_t compel_get_leader_ip(struct parasite_ctl *ctl); +extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); + +void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); +void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); + +extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); + +#ifndef compel_host_supports_gcs +static inline bool compel_host_supports_gcs(void) +{ + return false; +} +#define compel_host_supports_gcs +#endif + +#ifndef compel_shstk_enabled +static inline bool 
compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + return false; +} +#define compel_shstk_enabled +#endif + +#ifndef parasite_setup_shstk +static inline int parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs) +{ + return 0; +} +#define parasite_setup_shstk parasite_setup_shstk +#endif + #endif diff --git a/compel/include/uapi/ksigset.h b/compel/include/uapi/ksigset.h index f6b124bf3..e2465041d 100644 --- a/compel/include/uapi/ksigset.h +++ b/compel/include/uapi/ksigset.h @@ -22,4 +22,10 @@ static inline void ksigaddset(k_rtsigset_t *set, int _sig) int sig = _sig - 1; set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } + +static inline void ksigdelset(k_rtsigset_t *set, int _sig) +{ + int sig = _sig - 1; + set->sig[sig / _NSIG_BPW] &= ~(1UL << (sig % _NSIG_BPW)); +} #endif diff --git a/compel/include/uapi/loglevels.h b/compel/include/uapi/loglevels.h index 7bf88475d..7a49825d2 100644 --- a/compel/include/uapi/loglevels.h +++ b/compel/include/uapi/loglevels.h @@ -6,15 +6,14 @@ * also by log functions in the std plugin. 
*/ -enum __compel_log_levels -{ - COMPEL_LOG_MSG, /* Print message regardless of log level */ - COMPEL_LOG_ERROR, /* Errors only, when we're in trouble */ - COMPEL_LOG_WARN, /* Warnings */ - COMPEL_LOG_INFO, /* Informative, everything is fine */ - COMPEL_LOG_DEBUG, /* Debug only */ +enum __compel_log_levels { + COMPEL_LOG_MSG, /* Print message regardless of log level */ + COMPEL_LOG_ERROR, /* Errors only, when we're in trouble */ + COMPEL_LOG_WARN, /* Warnings */ + COMPEL_LOG_INFO, /* Informative, everything is fine */ + COMPEL_LOG_DEBUG, /* Debug only */ - COMPEL_DEFAULT_LOGLEVEL = COMPEL_LOG_WARN + COMPEL_DEFAULT_LOGLEVEL = COMPEL_LOG_WARN }; #endif /* UAPI_COMPEL_LOGLEVELS_H__ */ diff --git a/compel/include/uapi/plugins.h b/compel/include/uapi/plugins.h index e9ebfb67f..658c95fc9 100644 --- a/compel/include/uapi/plugins.h +++ b/compel/include/uapi/plugins.h @@ -1,33 +1,31 @@ #ifndef UAPI_COMPEL_PLUGIN_H__ #define UAPI_COMPEL_PLUGIN_H__ -#define __init __attribute__((__used__)) __attribute__ ((__section__(".compel.init"))) -#define __exit __attribute__((__used__)) __attribute__ ((__section__(".compel.exit"))) +#define __init __attribute__((__used__)) __attribute__((__section__(".compel.init"))) +#define __exit __attribute__((__used__)) __attribute__((__section__(".compel.exit"))) #ifndef __ASSEMBLY__ typedef struct { - const char *name; - int (*init)(void); - void (*exit)(void); + const char *name; + int (*init)(void); + void (*exit)(void); } plugin_init_t; -#define plugin_register(___desc) \ - static const plugin_init_t * const \ - ___ptr__##___desc __init = &___desc; +#define plugin_register(___desc) static const plugin_init_t *const ___ptr__##___desc __init = &___desc; -#define PLUGIN_REGISTER(___id, ___name, ___init, ___exit) \ - static const plugin_init_t __plugin_desc_##___id = { \ - .name = ___name, \ - .init = ___init, \ - .exit = ___exit, \ - }; \ +#define PLUGIN_REGISTER(___id, ___name, ___init, ___exit) \ + static const plugin_init_t 
__plugin_desc_##___id = { \ + .name = ___name, \ + .init = ___init, \ + .exit = ___exit, \ + }; \ plugin_register(__plugin_desc_##___id); -#define PLUGIN_REGISTER_DUMMY(___id) \ - static const plugin_init_t __plugin_desc_##___id = { \ - .name = #___id, \ - }; \ +#define PLUGIN_REGISTER_DUMMY(___id) \ + static const plugin_init_t __plugin_desc_##___id = { \ + .name = #___id, \ + }; \ plugin_register(__plugin_desc_##___id); #endif /* __ASSEMBLY__ */ diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index 4df00b6e1..558124fbd 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -1,6 +1,7 @@ #ifndef UAPI_COMPEL_PTRACE_H__ #define UAPI_COMPEL_PTRACE_H__ +#include "common/compiler.h" /* * We'd want to include both sys/ptrace.h and linux/ptrace.h, * hoping that most definitions come from either one or another. @@ -18,40 +19,40 @@ */ #ifndef PTRACE_SEIZE -# define PTRACE_SEIZE 0x4206 +#define PTRACE_SEIZE 0x4206 #endif #ifndef PTRACE_O_SUSPEND_SECCOMP -# define PTRACE_O_SUSPEND_SECCOMP (1 << 21) +#define PTRACE_O_SUSPEND_SECCOMP (1 << 21) #endif #ifndef PTRACE_INTERRUPT -# define PTRACE_INTERRUPT 0x4207 +#define PTRACE_INTERRUPT 0x4207 #endif #ifndef PTRACE_PEEKSIGINFO -#define PTRACE_PEEKSIGINFO 0x4209 +#define PTRACE_PEEKSIGINFO 0x4209 /* Read signals from a shared (process wide) queue */ -#define PTRACE_PEEKSIGINFO_SHARED (1 << 0) +#define PTRACE_PEEKSIGINFO_SHARED (1 << 0) #endif #ifndef PTRACE_GETREGSET -# define PTRACE_GETREGSET 0x4204 -# define PTRACE_SETREGSET 0x4205 +#define PTRACE_GETREGSET 0x4204 +#define PTRACE_SETREGSET 0x4205 #endif #ifndef PTRACE_GETSIGMASK -# define PTRACE_GETSIGMASK 0x420a -# define PTRACE_SETSIGMASK 0x420b +#define PTRACE_GETSIGMASK 0x420a +#define PTRACE_SETSIGMASK 0x420b #endif #ifndef PTRACE_SECCOMP_GET_FILTER -#define PTRACE_SECCOMP_GET_FILTER 0x420c +#define PTRACE_SECCOMP_GET_FILTER 0x420c #endif #ifndef PTRACE_SECCOMP_GET_METADATA -# define PTRACE_SECCOMP_GET_METADATA 0x420d 
+#define PTRACE_SECCOMP_GET_METADATA 0x420d #endif /* PTRACE_SECCOMP_GET_METADATA */ /* @@ -60,23 +61,48 @@ * own identical definition for a while. */ typedef struct { - uint64_t filter_off; /* Input: which filter */ - uint64_t flags; /* Output: filter's flags */ + uint64_t filter_off; /* Input: which filter */ + uint64_t flags; /* Output: filter's flags */ } seccomp_metadata_t; +#ifndef PTRACE_GET_RSEQ_CONFIGURATION +#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f + +struct __ptrace_rseq_configuration { + uint64_t rseq_abi_pointer; + uint32_t rseq_abi_size; + uint32_t signature; + uint32_t flags; + uint32_t pad; +}; +#endif + #ifdef PTRACE_EVENT_STOP -# if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ -# undef PTRACE_EVENT_STOP -# endif +#if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ +#undef PTRACE_EVENT_STOP +#endif #endif #ifndef PTRACE_EVENT_STOP -# define PTRACE_EVENT_STOP 128 +#define PTRACE_EVENT_STOP 128 +#endif + +/* + * Amazon Linux 2 uses glibc 2.26. PTRACE_ARCH_PRCTL was added in glibc 2.27. + * This allows CRIU to build on Amazon Linux 2. + * + * Note that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the + * preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol + * that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to + * decide if PTRACE_ARCH_PRCTL is available or not. + */ +#if defined(__x86_64__) && !defined(PT_ARCH_PRCTL) +#define PTRACE_ARCH_PRCTL 30 /* From asm/ptrace-abi.h. 
*/ #endif extern int ptrace_suspend_seccomp(pid_t pid); -extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); -extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); -extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); +extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); +extern int __must_check ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); +extern int __must_check ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); #endif /* UAPI_COMPEL_PTRACE_H__ */ diff --git a/compel/include/uapi/sigframe-common.h b/compel/include/uapi/sigframe-common.h index fc93c5480..3062d1d38 100644 --- a/compel/include/uapi/sigframe-common.h +++ b/compel/include/uapi/sigframe-common.h @@ -5,58 +5,57 @@ #define UAPI_COMPEL_SIGFRAME_COMMON_H__ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ -# error "Direct inclusion is forbidden, use instead" +#error "Direct inclusion is forbidden, use instead" #endif +#include "common/compiler.h" #include #include struct rt_sigframe; #ifndef SIGFRAME_MAX_OFFSET -# define SIGFRAME_MAX_OFFSET RT_SIGFRAME_OFFSET(0) +#define SIGFRAME_MAX_OFFSET RT_SIGFRAME_OFFSET(0) #endif -#define RESTORE_STACK_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) +#define RESTORE_STACK_ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) /* sigframe should be aligned on 64 byte for x86 and 8 bytes for arm */ -#define RESTORE_STACK_SIGFRAME \ - RESTORE_STACK_ALIGN(sizeof(struct rt_sigframe) + SIGFRAME_MAX_OFFSET, 64) +#define RESTORE_STACK_SIGFRAME RESTORE_STACK_ALIGN(sizeof(struct rt_sigframe) + SIGFRAME_MAX_OFFSET, 64) #ifndef __ARCH_SI_PREAMBLE_SIZE -# define __ARCH_SI_PREAMBLE_SIZE (3 * sizeof(int)) +#define __ARCH_SI_PREAMBLE_SIZE (3 * sizeof(int)) #endif -#define SI_MAX_SIZE 128 +#define SI_MAX_SIZE 128 #ifndef SI_PAD_SIZE -# define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) +#define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) 
#endif typedef struct rt_siginfo { - int si_signo; - int si_errno; - int si_code; - int _pad[SI_PAD_SIZE]; + int si_signo; + int si_errno; + int si_code; + int _pad[SI_PAD_SIZE]; } rt_siginfo_t; typedef struct rt_sigaltstack { - void *ss_sp; - int ss_flags; - size_t ss_size; + void *ss_sp; + int ss_flags; + size_t ss_size; } rt_stack_t; struct rt_ucontext { - unsigned long uc_flags; - struct rt_ucontext *uc_link; - rt_stack_t uc_stack; - struct rt_sigcontext uc_mcontext; - k_rtsigset_t uc_sigmask; /* mask last for extensibility */ - int _unused[32 - (sizeof (k_rtsigset_t) / sizeof (int))]; - unsigned long uc_regspace[128] __attribute__((aligned(8))); + unsigned long uc_flags; + struct rt_ucontext *uc_link; + rt_stack_t uc_stack; + struct rt_sigcontext uc_mcontext; + k_rtsigset_t uc_sigmask; /* mask last for extensibility */ + int _unused[32 - (sizeof(k_rtsigset_t) / sizeof(int))]; + unsigned long uc_regspace[128] __attribute__((aligned(8))); }; -extern int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, - struct rt_sigframe *rframe); +extern int __must_check sigreturn_prep_fpu_frame(struct rt_sigframe *frame, struct rt_sigframe *rframe); #endif /* UAPI_COMPEL_SIGFRAME_COMMON_H__ */ diff --git a/compel/include/uapi/task-state.h b/compel/include/uapi/task-state.h index 84a2a0ba5..ac1baafb5 100644 --- a/compel/include/uapi/task-state.h +++ b/compel/include/uapi/task-state.h @@ -5,15 +5,14 @@ * Task state, as returned by compel_wait_task() * and used in arguments to compel_resume_task(). */ -enum __compel_task_state -{ - COMPEL_TASK_ALIVE = 0x01, - COMPEL_TASK_DEAD = 0x02, - COMPEL_TASK_STOPPED = 0x03, - COMPEL_TASK_ZOMBIE = 0x06, +enum __compel_task_state { + COMPEL_TASK_ALIVE = 0x01, + COMPEL_TASK_DEAD = 0x02, + COMPEL_TASK_STOPPED = 0x03, + COMPEL_TASK_ZOMBIE = 0x06, /* Don't ever change the above values, they are used by CRIU! 
*/ - COMPEL_TASK_MAX = 0x7f + COMPEL_TASK_MAX = 0x7f }; #endif /* __COMPEL_UAPI_TASK_STATE_H__ */ diff --git a/compel/plugins/Makefile b/compel/plugins/Makefile index a326e2a66..e5fa781ac 100644 --- a/compel/plugins/Makefile +++ b/compel/plugins/Makefile @@ -16,7 +16,12 @@ asflags-y += -I compel/include/uapi # General compel includes ccflags-y += -iquote compel/include + +ifeq ($(ARCH),mips) +ccflags-y += -mno-abicalls -fno-pic -fno-stack-protector +else ccflags-y += -fpie -fno-stack-protector +endif # General compel/plugins includes ccflags-y += -iquote $(obj)/include @@ -28,7 +33,12 @@ asflags-y += -iquote $(PLUGIN_ARCH_DIR)/include asflags-y += -iquote $(PLUGIN_ARCH_DIR) # General flags for assembly +ifeq ($(ARCH),mips) +asflags-y += -mno-abicalls -fno-pic -Wstrict-prototypes +else asflags-y += -fpie -Wstrict-prototypes +endif + asflags-y += -nostdlib -fomit-frame-pointer asflags-y += -fno-stack-protector ldflags-y += -z noexecstack @@ -53,11 +63,15 @@ std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/parasite-head.o target += fds fds-lib-y += fds/fds.o -ifeq ($(SRCARCH),x86) +ifeq ($(ARCH),x86) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o endif -ifeq ($(SRCARCH),ppc64) +ifeq ($(ARCH),mips) + std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o +endif + +ifeq ($(ARCH),ppc64) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcmp.o endif diff --git a/compel/plugins/fds/fds.c b/compel/plugins/fds/fds.c index 7ed94509d..c0c6a2131 100644 --- a/compel/plugins/fds/fds.c +++ b/compel/plugins/fds/fds.c @@ -9,8 +9,8 @@ #include "common/compiler.h" #include "common/bug.h" -#define __sys(foo) sys_##foo -#define __sys_err(ret) ret +#define __sys(foo) sys_##foo +#define __sys_err(ret) ret #include "common/scm.h" diff --git a/compel/plugins/include/uapi/plugin-fds.h b/compel/plugins/include/uapi/plugin-fds.h index cececb21d..e995b4b66 100644 --- a/compel/plugins/include/uapi/plugin-fds.h +++ b/compel/plugins/include/uapi/plugin-fds.h @@ -1,7 +1,7 @@ 
#ifndef COMPEL_PLUGIN_STD_STD_H__ #define COMPEL_PLUGIN_STD_STD_H__ -extern int fds_send_fd(int fd); +extern int __must_check fds_send_fd(int fd); extern int fds_recv_fd(void); #endif /* COMPEL_PLUGIN_STD_STD_H__ */ diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h index 800df2509..a729abbd2 100644 --- a/compel/plugins/include/uapi/std/infect.h +++ b/compel/plugins/include/uapi/std/infect.h @@ -1,14 +1,19 @@ #ifndef COMPEL_PLUGIN_STD_INFECT_H__ #define COMPEL_PLUGIN_STD_INFECT_H__ +#include "common/compiler.h" + extern int parasite_get_rpc_sock(void); -extern int parasite_service(unsigned int cmd, void *args); + +extern unsigned int __export_parasite_service_cmd; +extern void *__export_parasite_service_args_ptr; +extern unsigned long __must_check parasite_service(void); /* * Must be supplied by user plugins. */ -extern int parasite_daemon_cmd(int cmd, void *args); -extern int parasite_trap_cmd(int cmd, void *args); +extern int __must_check parasite_daemon_cmd(int cmd, void *args); +extern int __must_check parasite_trap_cmd(int cmd, void *args); extern void parasite_cleanup(void); /* diff --git a/compel/plugins/include/uapi/std/log.h b/compel/plugins/include/uapi/std/log.h index f21b6df0d..0b53567a0 100644 --- a/compel/plugins/include/uapi/std/log.h +++ b/compel/plugins/include/uapi/std/log.h @@ -2,8 +2,9 @@ #define COMPEL_PLUGIN_STD_LOG_H__ #include "compel/loglevels.h" +#include "common/compiler.h" -#define STD_LOG_SIMPLE_CHUNK 256 +#define STD_LOG_SIMPLE_CHUNK 256 extern void std_log_set_fd(int fd); extern void std_log_set_loglevel(enum __compel_log_levels level); @@ -22,8 +23,8 @@ extern int std_gettimeofday(struct timeval *tv, struct timezone *tz); extern int std_vprint_num(char *buf, int blen, int num, char **ps); extern void std_sprintf(char output[STD_LOG_SIMPLE_CHUNK], const char *format, ...) 
- __attribute__ ((__format__ (__printf__, 2, 3))); + __attribute__((__format__(__printf__, 2, 3))); extern void print_on_level(unsigned int loglevel, const char *format, ...) - __attribute__ ((__format__ (__printf__, 2, 3))); + __attribute__((__format__(__printf__, 2, 3))); #endif /* COMPEL_PLUGIN_STD_LOG_H__ */ diff --git a/compel/plugins/include/uapi/std/string.h b/compel/plugins/include/uapi/std/string.h index c2e4b9345..1c68cb5f4 100644 --- a/compel/plugins/include/uapi/std/string.h +++ b/compel/plugins/include/uapi/std/string.h @@ -6,20 +6,18 @@ #include /* Standard file descriptors. */ -#define STDIN_FILENO 0 /* Standard input. */ -#define STDOUT_FILENO 1 /* Standard output. */ -#define STDERR_FILENO 2 /* Standard error output. */ - +#define STDIN_FILENO 0 /* Standard input. */ +#define STDOUT_FILENO 1 /* Standard output. */ +#define STDERR_FILENO 2 /* Standard error output. */ extern void std_dputc(int fd, char c); extern void std_dputs(int fd, const char *s); extern void std_vdprintf(int fd, const char *format, va_list args); -extern void std_dprintf(int fd, const char *format, ...) - __attribute__ ((__format__ (__printf__, 2, 3))); +extern void std_dprintf(int fd, const char *format, ...) __attribute__((__format__(__printf__, 2, 3))); -#define std_printf(fmt, ...) std_dprintf(STDOUT_FILENO, fmt, ##__VA_ARGS__) -#define std_puts(s) std_dputs(STDOUT_FILENO, s) -#define std_putchar(c) std_dputc(STDOUT_FILENO, c) +#define std_printf(fmt, ...) 
std_dprintf(STDOUT_FILENO, fmt, ##__VA_ARGS__) +#define std_puts(s) std_dputs(STDOUT_FILENO, s) +#define std_putchar(c) std_dputc(STDOUT_FILENO, c) extern unsigned long std_strtoul(const char *nptr, char **endptr, int base); extern int std_strcmp(const char *cs, const char *ct); diff --git a/compel/plugins/include/uapi/std/syscall-types.h b/compel/plugins/include/uapi/std/syscall-types.h index 57865e741..1eea99daa 100644 --- a/compel/plugins/include/uapi/std/syscall-types.h +++ b/compel/plugins/include/uapi/std/syscall-types.h @@ -39,11 +39,13 @@ struct msghdr; struct rusage; struct iocb; struct pollfd; +struct clone_args; +struct open_how; typedef unsigned long aio_context_t; #ifndef F_GETFD -# define F_GETFD 1 +#define F_GETFD 1 #endif struct krlimit { @@ -56,7 +58,6 @@ typedef int kernel_timer_t; #include - extern long sys_preadv_raw(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h); static inline long sys_preadv(int fd, struct iovec *iov, unsigned long nr, off_t off) @@ -66,7 +67,7 @@ static inline long sys_preadv(int fd, struct iovec *iov, unsigned long nr, off_t #elif BITS_PER_LONG == 32 return sys_preadv_raw(fd, iov, nr, off, ((uint64_t)off) >> 32); #else -# error "BITS_PER_LONG isn't defined" +#error "BITS_PER_LONG isn't defined" #endif } diff --git a/compel/plugins/shmem/shmem.c b/compel/plugins/shmem/shmem.c index 695d1931f..2b402f926 100644 --- a/compel/plugins/shmem/shmem.c +++ b/compel/plugins/shmem/shmem.c @@ -12,8 +12,7 @@ void *shmem_create(unsigned long size) void *mem; struct shmem_plugin_msg spi; - mem = (void *)sys_mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, 0, 0); + mem = (void *)sys_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); if (mem == MAP_FAILED) return NULL; diff --git a/compel/plugins/std/fds.c b/compel/plugins/std/fds.c index 499102788..6a0757992 100644 --- a/compel/plugins/std/fds.c +++ b/compel/plugins/std/fds.c @@ -10,7 +10,7 @@ #include 
"common/compiler.h" #include "common/bug.h" -#define __sys(foo) sys_##foo -#define __sys_err(ret) ret +#define __sys(foo) sys_##foo +#define __sys_err(ret) ret #include "common/scm-code.c" diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index d5e1b4354..034201320 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -5,9 +5,9 @@ #include "common/lock.h" #include "common/page.h" -#define pr_err(fmt, ...) print_on_level(1, fmt, ##__VA_ARGS__) -#define pr_info(fmt, ...) print_on_level(3, fmt, ##__VA_ARGS__) -#define pr_debug(fmt, ...) print_on_level(4, fmt, ##__VA_ARGS__) +#define pr_err(fmt, ...) print_on_level(1, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) print_on_level(3, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) print_on_level(4, fmt, ##__VA_ARGS__) #include "common/bug.h" @@ -16,6 +16,10 @@ #include "rpc-pie-priv.h" +#ifndef ARCH_RT_SIGRETURN_DUMP +#define ARCH_RT_SIGRETURN_DUMP ARCH_RT_SIGRETURN +#endif + static int tsock = -1; static struct rt_sigframe *sigframe; @@ -27,7 +31,7 @@ static struct rt_sigframe *sigframe; */ static unsigned __page_size; -unsigned __attribute((weak)) page_size(void) +unsigned long __attribute((weak)) page_size(void) { return __page_size; } @@ -51,8 +55,7 @@ static int __parasite_daemon_reply_ack(unsigned int cmd, int err) return -1; } - pr_debug("__sent ack msg: %d %d %d\n", - m.cmd, m.ack, m.err); + pr_debug("__sent ack msg: %d %d %d\n", m.cmd, m.ack, m.err); return 0; } @@ -64,16 +67,14 @@ static int __parasite_daemon_wait_msg(struct ctl_msg *m) pr_debug("Daemon waits for command\n"); while (1) { - *m = (struct ctl_msg){ }; + *m = (struct ctl_msg){}; ret = sys_recvfrom(tsock, m, sizeof(*m), MSG_WAITALL, NULL, 0); if (ret != sizeof(*m)) { - pr_err("Trimmed message received (%d/%d)\n", - (int)sizeof(*m), ret); + pr_err("Trimmed message received (%d/%d)\n", (int)sizeof(*m), ret); return -1; } - pr_debug("__fetched msg: %d %d %d\n", - m->cmd, m->ack, m->err); + 
pr_debug("__fetched msg: %d %d %d\n", m->cmd, m->ack, m->err); return 0; } @@ -82,32 +83,32 @@ static int __parasite_daemon_wait_msg(struct ctl_msg *m) /* Core infect code */ -static noinline void fini_sigreturn(unsigned long new_sp) +static noinline unsigned long fini_sigreturn(unsigned long new_sp) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_DUMP(new_sp, sigframe); + return new_sp; } -static int fini(void) +static unsigned long fini(void) { unsigned long new_sp; parasite_cleanup(); new_sp = (long)sigframe + RT_SIGFRAME_OFFSET(sigframe); - pr_debug("%ld: new_sp=%lx ip %lx\n", sys_gettid(), - new_sp, RT_SIGFRAME_REGIP(sigframe)); + pr_debug("%ld: new_sp=%lx ip %lx\n", sys_gettid(), new_sp, RT_SIGFRAME_REGIP(sigframe)); sys_close(tsock); std_log_set_fd(-1); - fini_sigreturn(new_sp); + return fini_sigreturn(new_sp); BUG(); return -1; } -static noinline __used int noinline parasite_daemon(void *args) +static noinline __used unsigned long parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; @@ -144,18 +145,16 @@ static noinline __used int noinline parasite_daemon(void *args) } out: - fini(); - - return 0; + return fini(); } -static noinline __used int parasite_init_daemon(void *data) +static noinline __used unsigned long parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; args->sigreturn_addr = (uint64_t)(uintptr_t)fini_sigreturn; - sigframe = (void*)(uintptr_t)args->sigframe; + sigframe = (void *)(uintptr_t)args->sigframe; #ifdef ARCH_HAS_LONG_PAGES __page_size = args->page_size; #endif @@ -182,22 +181,33 @@ static noinline __used int parasite_init_daemon(void *data) } else goto err; - parasite_daemon(data); + return parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); - fini(); - BUG(); - - return -1; + return fini(); } #ifndef __parasite_entry -# define __parasite_entry +#define __parasite_entry #endif -int __used __parasite_entry parasite_service(unsigned int cmd, void *args) +/* + * 
__export_parasite_service_{cmd,args} serve as arguments to the + * parasite_service() function. We use these global variables to make it + * easier to pass arguments when invoking from ptrace. + * + * We need the linker to allocate these variables. Hence the dummy + * initialization. Otherwise, we end up with COMMON symbols. + */ +unsigned int __export_parasite_service_cmd = 0; +void *__export_parasite_service_args_ptr = NULL; + +unsigned long __used __parasite_entry parasite_service(void) { + unsigned int cmd = __export_parasite_service_cmd; + void *args = __export_parasite_service_args_ptr; + pr_info("Parasite cmd %d/%x process\n", cmd, cmd); if (cmd == PARASITE_CMD_INIT_DAEMON) diff --git a/compel/plugins/std/log.c b/compel/plugins/std/log.c index f9be432ea..005348246 100644 --- a/compel/plugins/std/log.c +++ b/compel/plugins/std/log.c @@ -225,6 +225,22 @@ done: print_string(s, b); } +static void print_num_u(unsigned long num, struct simple_buf *b) +{ + char buf[22], *s; + + buf[21] = '\0'; + s = &buf[21]; + + do { + s--; + *s = (num % 10) + '0'; + num /= 10; + } while (num > 0); + + print_string(s, b); +} + static void hexdigit(unsigned int v, char *to, char **z) { *to = "0123456789abcdef"[v & 0xf]; @@ -329,10 +345,17 @@ static void sbuf_printf(struct simple_buf *b, const char *format, va_list args) case 'p': print_hex_l((unsigned long)va_arg(args, void *), b); break; - default: - print_string("UNKNOWN FORMAT ", b); - sbuf_putc(b, *s); + case 'u': + if (along) + print_num_u(va_arg(args, unsigned long), b); + else + print_num_u(va_arg(args, unsigned), b); break; + default: + print_string("\nError: Unknown printf format %", b); + sbuf_putc(b, *s); + sbuf_putc(b, '\n'); + return; } s++; } diff --git a/compel/plugins/std/std.c b/compel/plugins/std/std.c index 82f51eac4..01f88ef80 100644 --- a/compel/plugins/std/std.c +++ b/compel/plugins/std/std.c @@ -46,11 +46,9 @@ err: return ret; } -#define plugin_init_count(size) ((size) / (sizeof(plugin_init_t *))) +#define 
plugin_init_count(size) ((size) / (sizeof(plugin_init_t *))) -int __export_std_compel_start(struct prologue_init_args *args, - const plugin_init_t * const *init_array, - size_t init_size) +int __export_std_compel_start(struct prologue_init_args *args, const plugin_init_t *const *init_array, size_t init_size) { unsigned int i; int ret = 0; diff --git a/compel/plugins/std/string.c b/compel/plugins/std/string.c index 85bede803..d67e0d1a9 100644 --- a/compel/plugins/std/string.c +++ b/compel/plugins/std/string.c @@ -100,18 +100,11 @@ void std_vdprintf(int fd, const char *format, va_list args) std_dputs(fd, va_arg(args, char *)); break; case 'd': - __std_vprint_long(buf, sizeof(buf), - along ? - va_arg(args, long) : - (long)va_arg(args, int), - &t); + __std_vprint_long(buf, sizeof(buf), along ? va_arg(args, long) : (long)va_arg(args, int), &t); std_dputs(fd, t); break; case 'x': - __std_vprint_long_hex(buf, sizeof(buf), - along ? - va_arg(args, long) : - (long)va_arg(args, int), + __std_vprint_long_hex(buf, sizeof(buf), along ? va_arg(args, long) : (long)va_arg(args, int), &t); std_dputs(fd, t); break; @@ -130,9 +123,7 @@ void std_dprintf(int fd, const char *format, ...) 
static inline bool __isspace(unsigned char c) { - return c == ' ' || c == '\f' || - c == '\n' || c == '\r' || - c == '\t' || c == '\v'; + return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v'; } static unsigned char __tolower(unsigned char c) @@ -142,8 +133,7 @@ static unsigned char __tolower(unsigned char c) static inline bool __isalpha(unsigned char c) { - return ((c <= 'Z' && c >= 'A') || - (c <= 'z' && c >= 'a')); + return ((c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a')); } static inline bool __isdigit(unsigned char c) @@ -161,7 +151,12 @@ static unsigned int __conv_val(unsigned char c) if (__isdigit(c)) return c - '0'; else if (__isalpha(c)) - return &conv_tab[__tolower(c)] - conv_tab; + /** + * If we want the value of something which __isalpha() == true + * it has to be base > 10. 'A' = 10, 'B' = 11 ... 'Z' = 35 + */ + return __tolower(c) - 'a' + 10; + return -1u; } @@ -208,7 +203,7 @@ unsigned long std_strtoul(const char *nptr, char **endptr, int base) if (__isspace(*s)) continue; if (!__isalnum(*s)) - goto fin; + goto fin; v = __conv_val(*s); if (v == -1u || v > base) goto fin; @@ -222,7 +217,6 @@ fin: return neg ? (unsigned long)-num : (unsigned long)num; } - /* * C compiler is free to insert implicit calls to memcmp, memset, * memcpy and memmove, assuming they are available during linking. 
@@ -267,7 +261,7 @@ void *memset(void *s, const int c, size_t count) size_t i = 0; while (i < count) - dest[i++] = (char) c; + dest[i++] = (char)c; return s; } diff --git a/compel/src/lib/handle-elf.c b/compel/src/lib/handle-elf.c index ca7c53b71..e4b8728ce 100644 --- a/compel/src/lib/handle-elf.c +++ b/compel/src/lib/handle-elf.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include @@ -12,14 +12,13 @@ #include #include -#include "uapi/compel.h" - #include "handle-elf.h" #include "piegen.h" #include "log.h" -piegen_opt_t opts = {}; - +#ifdef CONFIG_MIPS +#include "ldsodefs.h" +#endif /* Check if pointer is out-of-bound */ static bool __ptr_oob(const uintptr_t ptr, const uintptr_t start, const size_t size) { @@ -29,44 +28,39 @@ static bool __ptr_oob(const uintptr_t ptr, const uintptr_t start, const size_t s } /* Check if pointed structure's end is out-of-bound */ -static bool __ptr_struct_end_oob(const uintptr_t ptr, const size_t struct_size, - const uintptr_t start, const size_t size) +static bool __ptr_struct_end_oob(const uintptr_t ptr, const size_t struct_size, const uintptr_t start, + const size_t size) { /* the last byte of the structure should be inside [begin, end) */ return __ptr_oob(ptr + struct_size - 1, start, size); } /* Check if pointed structure is out-of-bound */ -static bool __ptr_struct_oob(const uintptr_t ptr, const size_t struct_size, - const uintptr_t start, const size_t size) +static bool __ptr_struct_oob(const uintptr_t ptr, const size_t struct_size, const uintptr_t start, const size_t size) { - return __ptr_oob(ptr, start, size) || - __ptr_struct_end_oob(ptr, struct_size, start, size); + return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size); } -static bool test_pointer(const void *ptr, const void *start, const size_t size, - const char *name, const char *file, const int line) +static bool test_pointer(const void *ptr, const void *start, const size_t size, const char *name, const char 
*file, + const int line) { if (__ptr_oob((const uintptr_t)ptr, (const uintptr_t)start, size)) { - pr_err("Corrupted pointer %p (%s) at %s:%d\n", - ptr, name, file, line); + pr_err("Corrupted pointer %p (%s) at %s:%d\n", ptr, name, file, line); return true; } return false; } -#define ptr_func_exit(__ptr) \ - do { \ - if (test_pointer((__ptr), mem, size, #__ptr, \ - __FILE__, __LINE__)) { \ - free(sec_hdrs); \ - return -1; \ - } \ +#define ptr_func_exit(__ptr) \ + do { \ + if (test_pointer((__ptr), mem, size, #__ptr, __FILE__, __LINE__)) { \ + free(sec_hdrs); \ + return -1; \ + } \ } while (0) #ifdef ELF_PPC64 -static int do_relative_toc(long value, uint16_t *location, - unsigned long mask, int complain_signed) +static int do_relative_toc(long value, uint16_t *location, unsigned long mask, int complain_signed) { if (complain_signed && (value + 0x8000 > 0xffff)) { pr_err("TOC16 relocation overflows (%ld)\n", value); @@ -74,8 +68,7 @@ static int do_relative_toc(long value, uint16_t *location, } if ((~mask & 0xffff) & value) { - pr_err("bad TOC16 relocation (%ld) (0x%lx)\n", - value, (~mask & 0xffff) & value); + pr_err("bad TOC16 relocation (%ld) (0x%lx)\n", value, (~mask & 0xffff) & value); return -1; } @@ -90,23 +83,24 @@ static bool is_header_supported(Elf_Ehdr *hdr) return false; if ((hdr->e_type != ET_REL #ifdef NO_RELOCS - && hdr->e_type != ET_EXEC + && hdr->e_type != ET_EXEC #endif - ) || hdr->e_version != EV_CURRENT) + ) || + hdr->e_version != EV_CURRENT) return false; return true; } static const char *get_strings_section(Elf_Ehdr *hdr, uintptr_t mem, size_t size) { - size_t sec_table_size = ((size_t) hdr->e_shentsize) * hdr->e_shnum; + size_t sec_table_size = ((size_t)hdr->e_shentsize) * hdr->e_shnum; uintptr_t sec_table = mem + hdr->e_shoff; Elf_Shdr *secstrings_hdr; uintptr_t addr; if (__ptr_struct_oob(sec_table, sec_table_size, mem, size)) { - pr_err("Section table [%#zx, %#zx) is out of [%#zx, %#zx)\n", - sec_table, sec_table + sec_table_size, mem, mem + 
size); + pr_err("Section table [%#zx, %#zx) is out of [%#zx, %#zx)\n", sec_table, sec_table + sec_table_size, + mem, mem + size); return NULL; } @@ -114,24 +108,22 @@ static const char *get_strings_section(Elf_Ehdr *hdr, uintptr_t mem, size_t size * strings section header's offset in section headers table is * (size of section header * index of string section header) */ - addr = sec_table + ((size_t) hdr->e_shentsize) * hdr->e_shstrndx; - if (__ptr_struct_oob(addr, sizeof(Elf_Shdr), - sec_table, sec_table_size)) { - pr_err("String section header @%#zx is out of [%#zx, %#zx)\n", - addr, sec_table, sec_table + sec_table_size); + addr = sec_table + ((size_t)hdr->e_shentsize) * hdr->e_shstrndx; + if (__ptr_struct_oob(addr, sizeof(Elf_Shdr), sec_table, sec_table_size)) { + pr_err("String section header @%#zx is out of [%#zx, %#zx)\n", addr, sec_table, + sec_table + sec_table_size); return NULL; } - secstrings_hdr = (void*)addr; + secstrings_hdr = (void *)addr; addr = mem + secstrings_hdr->sh_offset; if (__ptr_struct_oob(addr, secstrings_hdr->sh_size, mem, size)) { - pr_err("String section @%#zx size %#lx is out of [%#zx, %#zx)\n", - addr, (unsigned long)secstrings_hdr->sh_size, - mem, mem + size); + pr_err("String section @%#zx size %#lx is out of [%#zx, %#zx)\n", addr, + (unsigned long)secstrings_hdr->sh_size, mem, mem + size); return NULL; } - return (void*)addr; + return (void *)addr; } /* @@ -156,11 +148,11 @@ int __handle_elf(void *mem, size_t size) int64_t toc_offset = 0; #endif int ret = -EINVAL; + unsigned long data_off = 0; pr_debug("Header\n"); pr_debug("------------\n"); - pr_debug("\ttype 0x%x machine 0x%x version 0x%x\n", - (unsigned)hdr->e_type, (unsigned)hdr->e_machine, + pr_debug("\ttype 0x%x machine 0x%x version 0x%x\n", (unsigned)hdr->e_type, (unsigned)hdr->e_machine, (unsigned)hdr->e_version); if (!is_header_supported(hdr)) { @@ -189,8 +181,7 @@ int __handle_elf(void *mem, size_t size) symtab_hdr = sh; ptr_func_exit(&secstrings[sh->sh_name]); - 
pr_debug("\t index %-2zd type 0x%-2x name %s\n", i, - (unsigned)sh->sh_type, &secstrings[sh->sh_name]); + pr_debug("\t index %-2zd type 0x%-2x name %s\n", i, (unsigned)sh->sh_type, &secstrings[sh->sh_name]); sec_hdrs[i] = sh; @@ -202,6 +193,23 @@ int __handle_elf(void *mem, size_t size) #endif } + /* Calculate section addresses with proper alignment. + * Note: some but not all linkers precalculate this information. + */ + for (i = 0, k = 0; i < hdr->e_shnum; i++) { + Elf_Shdr *sh = sec_hdrs[i]; + if (!(sh->sh_flags & SHF_ALLOC)) + continue; + if (sh->sh_addralign > 0 && k % sh->sh_addralign != 0) { + k += sh->sh_addralign - k % sh->sh_addralign; + } + if (sh->sh_addr && sh->sh_addr != k) + pr_info("Overriding unexpected precalculated address of section (section %s addr 0x%lx expected 0x%lx)\n", + &secstrings[sh->sh_name], (unsigned long)sh->sh_addr, (unsigned long)k); + sh->sh_addr = k; + k += sh->sh_size; + } + if (!symtab_hdr) { pr_err("No symbol table present\n"); goto err; @@ -228,7 +236,7 @@ int __handle_elf(void *mem, size_t size) } pr_out("/* Autogenerated from %s */\n", opts.input_filename); - pr_out("#include \n"); + pr_out("#include \n"); for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { Elf_Sym *sym = &symbols[i]; @@ -243,8 +251,8 @@ int __handle_elf(void *mem, size_t size) continue; pr_debug("\ttype 0x%-2x bind 0x%-2x shndx 0x%-4x value 0x%-2lx name %s\n", - (unsigned)ELF_ST_TYPE(sym->st_info), (unsigned)ELF_ST_BIND(sym->st_info), - (unsigned)sym->st_shndx, (unsigned long)sym->st_value, name); + (unsigned)ELF_ST_TYPE(sym->st_info), (unsigned)ELF_ST_BIND(sym->st_info), + (unsigned)sym->st_shndx, (unsigned long)sym->st_value, name); #ifdef ELF_PPC64 if (!sym->st_value && !strncmp(name, ".TOC.", 6)) { if (!toc_offset) { @@ -257,18 +265,15 @@ int __handle_elf(void *mem, size_t size) #endif if (strncmp(name, "__export", 8)) continue; - if ((sym->st_shndx && sym->st_shndx < hdr->e_shnum) || - sym->st_shndx == SHN_ABS) { + if ((sym->st_shndx 
&& sym->st_shndx < hdr->e_shnum) || sym->st_shndx == SHN_ABS) { if (sym->st_shndx == SHN_ABS) { sh_src = NULL; } else { sh_src = sec_hdrs[sym->st_shndx]; ptr_func_exit(sh_src); } - pr_out("#define %s_sym%s 0x%lx\n", - opts.prefix, name, - (unsigned long)(sym->st_value + - (sh_src ? sh_src->sh_addr : 0))); + pr_out("#define %s_sym%s 0x%lx\n", opts.prefix, name, + (unsigned long)(sym->st_value + (sh_src ? sh_src->sh_addr : 0))); } } @@ -286,9 +291,8 @@ int __handle_elf(void *mem, size_t size) sh_rel = sec_hdrs[sh->sh_info]; ptr_func_exit(sh_rel); - pr_debug("\tsection %2zd type 0x%-2x link 0x%-2x info 0x%-2x name %s\n", i, - (unsigned)sh->sh_type, (unsigned)sh->sh_link, - (unsigned)sh->sh_info, &secstrings[sh->sh_name]); + pr_debug("\tsection %2zd type 0x%-2x link 0x%-2x info 0x%-2x name %s\n", i, (unsigned)sh->sh_type, + (unsigned)sh->sh_link, (unsigned)sh->sh_info, &secstrings[sh->sh_name]); for (k = 0; k < sh->sh_size / sh->sh_entsize; k++) { int64_t __maybe_unused addend64, __maybe_unused value64; @@ -315,8 +319,7 @@ int __handle_elf(void *mem, size_t size) pr_debug("\t\tr_offset 0x%-4lx r_info 0x%-4lx / sym 0x%-2lx type 0x%-2lx symsecoff 0x%-4lx\n", (unsigned long)r->rel.r_offset, (unsigned long)r->rel.r_info, - (unsigned long)ELF_R_SYM(r->rel.r_info), - (unsigned long)ELF_R_TYPE(r->rel.r_info), + (unsigned long)ELF_R_SYM(r->rel.r_info), (unsigned long)ELF_R_TYPE(r->rel.r_info), (unsigned long)sh_rel->sh_addr); if (sym->st_shndx == SHN_UNDEF) { @@ -326,8 +329,7 @@ int __handle_elf(void *mem, size_t size) * Their type is STT_NOTYPE, so report any * other one. */ - if (ELF32_ST_TYPE(sym->st_info) != STT_NOTYPE - || strncmp(name, ".TOC.", 6)) { + if (ELF32_ST_TYPE(sym->st_info) != STT_NOTYPE || strncmp(name, ".TOC.", 6)) { pr_err("Unexpected undefined symbol:%s\n", name); goto err; } @@ -335,6 +337,15 @@ int __handle_elf(void *mem, size_t size) pr_err("Unexpected undefined symbol: `%s'. 
External symbol in PIE?\n", name); goto err; #endif + } else if (sym->st_shndx == SHN_COMMON) { + /* + * To support COMMON symbols, we could + * allocate these variables somewhere, + * perhaps somewhere near the GOT table. + * For now, we punt. + */ + pr_err("Unsupported COMMON symbol: `%s'. Try initializing the variable\n", name); + goto err; } if (sh->sh_type == SHT_REL) { @@ -357,8 +368,8 @@ int __handle_elf(void *mem, size_t size) Elf_Shdr *sh_src; if ((unsigned)sym->st_shndx > (unsigned)hdr->e_shnum) { - pr_err("Unexpected symbol section index %u/%u\n", - (unsigned)sym->st_shndx, hdr->e_shnum); + pr_err("Unexpected symbol section index %u/%u\n", (unsigned)sym->st_shndx, + hdr->e_shnum); goto err; } sh_src = sec_hdrs[sym->st_shndx]; @@ -391,11 +402,10 @@ int __handle_elf(void *mem, size_t size) * * Here we are only handle the case '3' which is the most commonly seen. */ -#define LOCAL_OFFSET(s) ((s->st_other >> 5) & 0x7) +#define LOCAL_OFFSET(s) ((s->st_other >> 5) & 0x7) if (LOCAL_OFFSET(sym)) { if (LOCAL_OFFSET(sym) != 3) { - pr_err("Unexpected local offset value %d\n", - LOCAL_OFFSET(sym)); + pr_err("Unexpected local offset value %d\n", LOCAL_OFFSET(sym)); goto err; } pr_debug("\t\t\tUsing local offset\n"); @@ -405,11 +415,70 @@ int __handle_elf(void *mem, size_t size) #endif switch (ELF_R_TYPE(r->rel.r_info)) { +#ifdef CONFIG_MIPS + case R_MIPS_PC16: + /* s+a-p relative */ + *((int32_t *)where) = *((int32_t *)where) | ((value32 + addend32 - place) >> 2); + break; + + case R_MIPS_26: + /* local : (((A << 2) | (P & 0xf0000000) + S) >> 2 + * external : (sign–extend(A < 2) + S) >> 2 + */ + + if (((unsigned)ELF_ST_BIND(sym->st_info) == 0x1) || + ((unsigned)ELF_ST_BIND(sym->st_info) == 0x2)) { + /* bind type local is 0x0 ,global is 0x1,WEAK is 0x2 */ + addend32 = value32; + } + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_26, " + ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_26 */\n", + (unsigned int)place, addend32, value32); + break; + + case 
R_MIPS_32: + /* S+A */ + break; + + case R_MIPS_64: + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_64, " + ".addend = %-8ld, .value = 0x%-16lx, }, /* R_MIPS_64 */\n", + (unsigned int)place, (long)addend64, (long)value64); + break; + + case R_MIPS_HIGHEST: + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_HIGHEST, " + ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_HIGHEST */\n", + (unsigned int)place, addend32, value32); + break; + + case R_MIPS_HIGHER: + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_HIGHER, " + ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_HIGHER */\n", + (unsigned int)place, addend32, value32); + break; + + case R_MIPS_HI16: + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_HI16, " + ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_HI16 */\n", + (unsigned int)place, addend32, value32); + break; + + case R_MIPS_LO16: + if ((unsigned)ELF_ST_BIND(sym->st_info) == 0x1) { + /* bind type local is 0x0 ,global is 0x1 */ + addend32 = value32; + } + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_MIPS_LO16, " + ".addend = %-8d, .value = 0x%-16x, }, /* R_MIPS_LO16 */\n", + (unsigned int)place, addend32, value32); + break; + +#endif #ifdef ELF_PPC64 case R_PPC64_REL24: /* Update PC relative offset, linker has not done this yet */ - pr_debug("\t\t\tR_PPC64_REL24 at 0x%-4lx val 0x%lx\n", - place, value64); + pr_debug("\t\t\tR_PPC64_REL24 at 0x%-4lx val 0x%lx\n", place, value64); /* Convert value to relative */ value64 -= place; if (value64 + 0x2000000 > 0x3ffffff || (value64 & 3) != 0) { @@ -417,60 +486,54 @@ int __handle_elf(void *mem, size_t size) goto err; } /* Only replace bits 2 through 26 */ - *(uint32_t *)where = (*(uint32_t *)where & ~0x03fffffc) | - (value64 & 0x03fffffc); + *(uint32_t *)where = (*(uint32_t *)where & ~0x03fffffc) | (value64 & 0x03fffffc); break; case R_PPC64_ADDR32: case R_PPC64_REL32: - pr_debug("\t\t\tR_PPC64_ADDR32 at 0x%-4lx val 0x%x\n", - place, (unsigned int)(value32 + addend32)); + 
pr_debug("\t\t\tR_PPC64_ADDR32 at 0x%-4lx val 0x%x\n", place, + (unsigned int)(value32 + addend32)); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " " .addend = %-8d, .value = 0x%-16x, " "}, /* R_PPC64_ADDR32 */\n", - (unsigned int) place, addend32, value32); + (unsigned int)place, addend32, value32); break; case R_PPC64_ADDR64: case R_PPC64_REL64: - pr_debug("\t\t\tR_PPC64_ADDR64 at 0x%-4lx val 0x%lx\n", - place, value64 + addend64); + pr_debug("\t\t\tR_PPC64_ADDR64 at 0x%-4lx val 0x%lx\n", place, value64 + addend64); pr_out("\t{ .offset = 0x%-8x, .type = COMPEL_TYPE_LONG," " .addend = %-8ld, .value = 0x%-16lx, " "}, /* R_PPC64_ADDR64 */\n", - (unsigned int) place, (long)addend64, (long)value64); + (unsigned int)place, (long)addend64, (long)value64); break; case R_PPC64_TOC16_HA: - pr_debug("\t\t\tR_PPC64_TOC16_HA at 0x%-4lx val 0x%lx\n", - place, value64 + addend64 - toc_offset + 0x8000); - if (do_relative_toc((value64 + addend64 - toc_offset + 0x8000) >> 16, - where, 0xffff, 1)) + pr_debug("\t\t\tR_PPC64_TOC16_HA at 0x%-4lx val 0x%lx\n", place, + value64 + addend64 - toc_offset + 0x8000); + if (do_relative_toc((value64 + addend64 - toc_offset + 0x8000) >> 16, where, 0xffff, 1)) goto err; break; case R_PPC64_TOC16_LO: - pr_debug("\t\t\tR_PPC64_TOC16_LO at 0x%-4lx val 0x%lx\n", - place, value64 + addend64 - toc_offset); - if (do_relative_toc(value64 + addend64 - toc_offset, - where, 0xffff, 1)) + pr_debug("\t\t\tR_PPC64_TOC16_LO at 0x%-4lx val 0x%lx\n", place, + value64 + addend64 - toc_offset); + if (do_relative_toc(value64 + addend64 - toc_offset, where, 0xffff, 1)) goto err; break; case R_PPC64_TOC16_LO_DS: - pr_debug("\t\t\tR_PPC64_TOC16_LO_DS at 0x%-4lx val 0x%lx\n", - place, value64 + addend64 - toc_offset); - if (do_relative_toc(value64 + addend64 - toc_offset, - where, 0xfffc, 0)) + pr_debug("\t\t\tR_PPC64_TOC16_LO_DS at 0x%-4lx val 0x%lx\n", place, + value64 + addend64 - toc_offset); + if (do_relative_toc(value64 + addend64 - toc_offset, where, 
0xfffc, 0)) goto err; break; case R_PPC64_REL16_HA: value64 += addend64 - place; - pr_debug("\t\t\tR_PPC64_REL16_HA at 0x%-4lx val 0x%lx\n", - place, value64); + pr_debug("\t\t\tR_PPC64_REL16_HA at 0x%-4lx val 0x%lx\n", place, value64); /* check that we are dealing with the addis 2,12 instruction */ - if (((*(uint32_t*)where) & 0xffff0000) != 0x3c4c0000) { + if (((*(uint32_t *)where) & 0xffff0000) != 0x3c4c0000) { pr_err("Unexpected instruction for R_PPC64_REL16_HA\n"); goto err; } @@ -479,10 +542,9 @@ int __handle_elf(void *mem, size_t size) case R_PPC64_REL16_LO: value64 += addend64 - place; - pr_debug("\t\t\tR_PPC64_REL16_LO at 0x%-4lx val 0x%lx\n", - place, value64); + pr_debug("\t\t\tR_PPC64_REL16_LO at 0x%-4lx val 0x%lx\n", place, value64); /* check that we are dealing with the addi 2,2 instruction */ - if (((*(uint32_t*)where) & 0xffff0000) != 0x38420000) { + if (((*(uint32_t *)where) & 0xffff0000) != 0x38420000) { pr_err("Unexpected instruction for R_PPC64_REL16_LO\n"); goto err; } @@ -492,7 +554,7 @@ int __handle_elf(void *mem, size_t size) #endif /* ELF_PPC64 */ #ifdef ELF_X86_64 - case R_X86_64_32: /* Symbol + Addend (4 bytes) */ + case R_X86_64_32: /* Symbol + Addend (4 bytes) */ case R_X86_64_32S: /* Symbol + Addend (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " @@ -506,14 +568,16 @@ int __handle_elf(void *mem, size_t size) (unsigned int)place, (long)addend64, (long)value64); break; case R_X86_64_PC32: /* Symbol + Addend - Place (4 bytes) */ - pr_debug("\t\t\t\tR_X86_64_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); + pr_debug("\t\t\t\tR_X86_64_PC32 at 0x%-4lx val 0x%x\n", place, + value32 + addend32 - (int32_t)place); /* * R_X86_64_PC32 are relative, patch them inplace. 
*/ *((int32_t *)where) = value32 + addend32 - place; break; case R_X86_64_PLT32: /* ProcLinkage + Addend - Place (4 bytes) */ - pr_debug("\t\t\t\tR_X86_64_PLT32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); + pr_debug("\t\t\t\tR_X86_64_PLT32 at 0x%-4lx val 0x%x\n", place, + value32 + addend32 - (int32_t)place); /* * R_X86_64_PLT32 are relative, patch them inplace. */ @@ -538,7 +602,8 @@ int __handle_elf(void *mem, size_t size) (unsigned int)place, addend32, value32); break; case R_386_PC32: /* Symbol + Addend - Place */ - pr_debug("\t\t\t\tR_386_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); + pr_debug("\t\t\t\tR_386_PC32 at 0x%-4lx val 0x%x\n", place, + value32 + addend32 - (int32_t)place); /* * R_386_PC32 are relative, patch them inplace. */ @@ -549,7 +614,7 @@ int __handle_elf(void *mem, size_t size) #ifdef ELF_S390 /* * See also arch/s390/kernel/module.c/apply_rela(): - * A PLT reads the GOT (global offest table). We can handle it like + * A PLT reads the GOT (global offset table). We can handle it like * R_390_PC32DBL because we have linked statically. 
*/ case R_390_PLT32DBL: /* PC relative on a PLT (predure link table) */ @@ -573,14 +638,13 @@ int __handle_elf(void *mem, size_t size) #endif default: pr_err("Unsupported relocation of type %lu\n", - (unsigned long)ELF_R_TYPE(r->rel.r_info)); + (unsigned long)ELF_R_TYPE(r->rel.r_info)); goto err; } } } #endif /* !NO_RELOCS */ pr_out("};\n"); - pr_out("static __maybe_unused size_t %s_nr_gotpcrel = %zd;\n", opts.prefix, nr_gotpcrel); pr_out("static __maybe_unused const char %s_blob[] = {\n\t", opts.prefix); @@ -592,11 +656,11 @@ int __handle_elf(void *mem, size_t size) if (!(sh->sh_flags & SHF_ALLOC) || !sh->sh_size) continue; - shdata = mem + sh->sh_offset; + shdata = mem + sh->sh_offset; pr_debug("Copying section '%s'\n" "\tstart:0x%lx (gap:0x%lx) size:0x%lx\n", - &secstrings[sh->sh_name], (unsigned long) sh->sh_addr, - (unsigned long)(sh->sh_addr - k), (unsigned long) sh->sh_size); + &secstrings[sh->sh_name], (unsigned long)sh->sh_addr, (unsigned long)(sh->sh_addr - k), + (unsigned long)sh->sh_size); /* write 0 in the gap between the 2 sections */ for (; k < sh->sh_addr; k++) { @@ -610,39 +674,44 @@ int __handle_elf(void *mem, size_t size) pr_out("\n\t"); pr_out("0x%02x,", shdata[j]); } + + if (!strcmp(&secstrings[sh->sh_name], ".data")) + data_off = sh->sh_addr; } pr_out("};\n"); pr_out("\n"); - pr_out("static void __maybe_unused %s_setup_c_header(struct parasite_ctl *ctl)\n", - opts.prefix); - pr_out( -"{\n" -" struct parasite_blob_desc *pbd;\n" -"\n" -" pbd = compel_parasite_blob_desc(ctl);\n" -" pbd->parasite_type = COMPEL_BLOB_CHEADER;\n" -); + pr_out("static void __maybe_unused %s_setup_c_header_desc(struct parasite_blob_desc *pbd, bool native)\n", + opts.prefix); + pr_out("{\n" + " pbd->parasite_type = COMPEL_BLOB_CHEADER;\n"); pr_out("\tpbd->hdr.mem = %s_blob;\n", opts.prefix); - pr_out("\tpbd->hdr.bsize = sizeof(%s_blob);\n", - opts.prefix); - pr_out("\tpbd->hdr.nr_gotpcrel = %s_nr_gotpcrel;\n", opts.prefix); - pr_out("\tif (compel_mode_native(ctl))\n"); 
+ pr_out("\tpbd->hdr.bsize = sizeof(%s_blob);\n", opts.prefix); + pr_out("\tif (native)\n"); pr_out("\t\tpbd->hdr.parasite_ip_off = " - "%s_sym__export_parasite_head_start;\n", opts.prefix); + "%s_sym__export_parasite_head_start;\n", + opts.prefix); pr_out("#ifdef CONFIG_COMPAT\n"); pr_out("\telse\n"); pr_out("\t\tpbd->hdr.parasite_ip_off = " - "%s_sym__export_parasite_head_start_compat;\n", opts.prefix); + "%s_sym__export_parasite_head_start_compat;\n", + opts.prefix); pr_out("#endif /* CONFIG_COMPAT */\n"); - pr_out("\tpbd->hdr.addr_cmd_off = " - "%s_sym__export_parasite_cmd;\n", opts.prefix); - pr_out("\tpbd->hdr.addr_arg_off = " - "%s_sym__export_parasite_args;\n", opts.prefix); + pr_out("\tpbd->hdr.cmd_off = %s_sym__export_parasite_service_cmd;\n", opts.prefix); + pr_out("\tpbd->hdr.args_ptr_off = %s_sym__export_parasite_service_args_ptr;\n", opts.prefix); + pr_out("\tpbd->hdr.got_off = round_up(pbd->hdr.bsize, sizeof(long));\n"); + pr_out("\tpbd->hdr.args_off = pbd->hdr.got_off + %zd*sizeof(long);\n", nr_gotpcrel); + pr_out("\tpbd->hdr.data_off = %#lx;\n", data_off); pr_out("\tpbd->hdr.relocs = %s_relocs;\n", opts.prefix); pr_out("\tpbd->hdr.nr_relocs = " - "sizeof(%s_relocs) / sizeof(%s_relocs[0]);\n", - opts.prefix, opts.prefix); + "sizeof(%s_relocs) / sizeof(%s_relocs[0]);\n", + opts.prefix, opts.prefix); pr_out("}\n"); + pr_out("\n"); + pr_out("static void __maybe_unused %s_setup_c_header(struct parasite_ctl *ctl)\n", opts.prefix); + pr_out("{\n"); + pr_out("\t%s_setup_c_header_desc(compel_parasite_blob_desc(ctl), compel_mode_native(ctl));\n", opts.prefix); + pr_out("}\n"); + ret = 0; err: free(sec_hdrs); diff --git a/compel/src/lib/infect-rpc.c b/compel/src/lib/infect-rpc.c index 265a4ad2f..6b19fa6bd 100644 --- a/compel/src/lib/infect-rpc.c +++ b/compel/src/lib/infect-rpc.c @@ -19,8 +19,7 @@ static int __parasite_send_cmd(int sockfd, struct ctl_msg *m) pr_perror("Failed to send command %d to daemon", m->cmd); return -1; } else if (ret != sizeof(*m)) { - 
pr_err("Message to daemon is trimmed (%d/%d)\n", - (int)sizeof(*m), ret); + pr_err("Message to daemon is trimmed (%d/%d)\n", (int)sizeof(*m), ret); return -1; } @@ -42,12 +41,10 @@ int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m) pr_perror("Failed to read ack"); return -1; } else if (ret != sizeof(*m)) { - pr_err("Message reply from daemon is trimmed (%d/%d)\n", - (int)sizeof(*m), ret); + pr_err("Message reply from daemon is trimmed (%d/%d)\n", (int)sizeof(*m), ret); return -1; } - pr_debug("Fetched ack: %d %d %d\n", - m->cmd, m->ack, m->err); + pr_debug("Fetched ack: %d %d %d\n", m->cmd, m->ack, m->err); if (m->cmd != cmd || m->ack != cmd) { pr_err("Communication error, this is not " @@ -68,8 +65,7 @@ int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl) return -1; if (m.err != 0) { - pr_err("Command %d for daemon failed with %d\n", - cmd, m.err); + pr_err("Command %d for daemon failed with %d\n", cmd, m.err); return -1; } diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c index 5d6d0ddd8..dc57e28f7 100644 --- a/compel/src/lib/infect-util.c +++ b/compel/src/lib/infect-util.c @@ -7,6 +7,8 @@ #include "infect-rpc.h" #include "infect-util.h" +char compel_run_id[RUN_ID_HASH_LENGTH]; + int compel_util_send_fd(struct parasite_ctl *ctl, int fd) { int sk; diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index f0bcaf334..22fcf24fa 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -27,19 +28,16 @@ #include "rpc-pie-priv.h" #include "infect-util.h" -#define __sys(foo) foo -#define __sys_err(ret) (-errno) +#define __sys(foo) foo +#define __sys_err(ret) (-errno) #include "common/scm.h" #include "common/scm-code.c" #ifndef UNIX_PATH_MAX -#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \ - (size_t)((struct sockaddr_un *) 0)->sun_path) +#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) 
#endif -#define PARASITE_STACK_SIZE (16 << 10) - #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -92,6 +90,12 @@ static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(aux, "SigBlk:", 7)) { + if (sscanf(aux + 7, "%llx", &ss->sigblk) != 1) + goto err_parse; + + continue; + } } fclose(f); @@ -105,7 +109,7 @@ err_parse: int compel_stop_task(int pid) { int ret; - struct seize_task_status ss; + struct seize_task_status ss = {}; ret = compel_interrupt_task(pid); if (ret == 0) @@ -186,6 +190,29 @@ static int skip_sigstop(int pid, int nr_signals) return 0; } +#define SIG_MASK(sig) (1ULL << ((sig)-1)) + +#define SIG_IN_MASK(sig, mask) ((sig) > 0 && (sig) <= SIGMAX && (SIG_MASK(sig) & (mask))) + +#define SUPPORTED_STOP_MASK ((1ULL << (SIGSTOP - 1)) | (1ULL << (SIGTSTP - 1))) + +static inline int sig_stop(int sig) +{ + return SIG_IN_MASK(sig, SUPPORTED_STOP_MASK); +} + +int compel_parse_stop_signo(int pid) +{ + siginfo_t si; + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) < 0) { + pr_perror("SEIZE %d: can't parse stopped siginfo", pid); + return -1; + } + + return si.si_signo; +} + /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace @@ -193,13 +220,12 @@ static int skip_sigstop(int pid, int nr_signals) * of it so the task would not know if it was saddled * up with someone else. 
*/ -int compel_wait_task(int pid, int ppid, - int (*get_status)(int pid, struct seize_task_status *, void *), - void (*free_status)(int pid, struct seize_task_status *, void *), - struct seize_task_status *ss, void *data) +int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_task_status *, void *), + void (*free_status)(int pid, struct seize_task_status *, void *), struct seize_task_status *ss, + void *data) { siginfo_t si; - int status, nr_sigstop; + int status, nr_stopsig; int ret = 0, ret2, wait_errno = 0; /* @@ -232,8 +258,8 @@ try_again: if (pid == getpid()) pr_err("The criu itself is within dumped tree.\n"); else - pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n", - pid, ss->state, ret, wait_errno); + pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n", pid, ss->state, ret, + wait_errno); return -1; } @@ -244,8 +270,7 @@ try_again: } if ((ppid != -1) && (ss->ppid != ppid)) { - pr_err("Task pid reused while suspending (%d: %d -> %d)\n", - pid, ppid, ss->ppid); + pr_err("Task pid reused while suspending (%d: %d -> %d)\n", pid, ppid, ss->ppid); goto err; } @@ -267,8 +292,7 @@ try_again: * handle one and repeat. */ - if (ptrace(PTRACE_CONT, pid, NULL, - (void *)(unsigned long)si.si_signo)) { + if (ptrace(PTRACE_CONT, pid, NULL, (void *)(unsigned long)si.si_signo)) { pr_perror("Can't continue signal handling, aborting"); goto err; } @@ -278,20 +302,53 @@ try_again: goto try_again; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } + if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; - nr_sigstop = 0; - if (ss->sigpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (ss->shdpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (si.si_signo == SIGSTOP) - nr_sigstop++; + /* + * FIXME(issues/1429): parasite code contains instructions that trigger + * SIGTRAP to stop at certain points. 
In such cases, the kernel sends a + * force SIGTRAP that can't be ignored and if it is blocked, the kernel + * resets its signal handler to a default one and unblocks it. It means + * that if we want to save the origin signal handler, we need to run a + * parasite code with the unblocked SIGTRAP. + */ + if ((ss->sigpnd | ss->shdpnd) & (1 << (SIGTRAP - 1))) { + pr_err("Can't dump the %d thread with a pending SIGTRAP.\n", pid); + goto err; + } - if (nr_sigstop) { - if (skip_sigstop(pid, nr_sigstop)) - goto err_stop; + nr_stopsig = 0; + if (SIG_IN_MASK(SIGSTOP, ss->sigpnd)) + nr_stopsig++; + if (SIG_IN_MASK(SIGSTOP, ss->shdpnd)) + nr_stopsig++; + + if (SIG_IN_MASK(SIGTSTP, ss->sigpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + if (SIG_IN_MASK(SIGTSTP, ss->shdpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + + if (sig_stop(si.si_signo)) + nr_stopsig++; + + if (nr_stopsig) { + if (skip_sigstop(pid, nr_stopsig)) { + /* + * Make sure that the task is stopped by a supported stop signal and + * send it again to restore task state before criu intervention. + */ + if (sig_stop(si.si_signo)) + kill(pid, si.si_signo); + else + kill(pid, SIGSTOP); + goto err; + } return COMPEL_TASK_STOPPED; } @@ -303,8 +360,6 @@ try_again: goto err; } -err_stop: - kill(pid, SIGSTOP); err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); @@ -313,6 +368,13 @@ err: int compel_resume_task(pid_t pid, int orig_st, int st) { + return compel_resume_task_sig(pid, orig_st, st, SIGSTOP); +} + +int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) +{ + int ret = 0; + pr_debug("\tUnseizing %d into %d\n", pid, st); if (st == COMPEL_TASK_DEAD) { @@ -333,17 +395,29 @@ int compel_resume_task(pid_t pid, int orig_st, int st) * task with STOP in queue that would get lost after * detach, so stop it again. 
*/ - if (orig_st == COMPEL_TASK_STOPPED) - kill(pid, SIGSTOP); - } else + if (orig_st == COMPEL_TASK_STOPPED) { + /* + * Check that stop_signo contains a supported stop signal. + * If it doesn't, then send SIGSTOP. It makes sense in the case + * when we get COMPEL_TASK_STOPPED from old image, + * where stop_signo was not yet supported. + */ + if (sig_stop(stop_signo)) + kill(pid, stop_signo); + else + kill(pid, SIGSTOP); + } + } else { pr_err("Unknown final state %d\n", st); + ret = -1; + } if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) { pr_perror("Unable to detach from %d", pid); return -1; } - return 0; + return ret; } static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) @@ -351,8 +425,7 @@ static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) int sun_len; saddr->sun_family = AF_UNIX; - snprintf(saddr->sun_path, UNIX_PATH_MAX, - "X/crtools-pr-%d", key); + snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%s", key, compel_run_id); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; @@ -360,8 +433,7 @@ static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) return sun_len; } -static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid, - struct parasite_init_args *args) +static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid, struct parasite_init_args *args) { int ssock = -1; socklen_t sk_len; @@ -378,7 +450,7 @@ static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid, goto err; } - if (getsockname(ssock, (struct sockaddr *) &addr, &sk_len) < 0) { + if (getsockname(ssock, (struct sockaddr *)&addr, &sk_len) < 0) { pr_perror("Unable to get name for a socket"); return -1; } @@ -413,8 +485,8 @@ err: static int setup_child_handler(struct parasite_ctl *ctl) { struct sigaction sa = { - .sa_sigaction = ctl->ictx.child_handler, - .sa_flags = SA_SIGINFO | SA_RESTART, + .sa_sigaction = ctl->ictx.child_handler, + .sa_flags = SA_SIGINFO | SA_RESTART, }; sigemptyset(&sa.sa_mask); @@ -437,12 +509,17 @@ static int
restore_child_handler(struct parasite_ctl *ctl) return 0; } -static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, - user_regs_struct_t *regs, struct thread_ctx *octx) +static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, user_regs_struct_t *regs, + struct thread_ctx *octx) { k_rtsigset_t block; ksigfillset(&block); + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. + */ + ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { pr_perror("Can't block signals for %d", pid); goto err_sig; @@ -471,7 +548,7 @@ err_sig: return -1; } -static int restore_thread_ctx(int pid, struct thread_ctx *ctx) +static int restore_thread_ctx(int pid, struct thread_ctx *ctx, bool restore_ext_regs) { int ret = 0; @@ -479,6 +556,10 @@ static int restore_thread_ctx(int pid, struct thread_ctx *ctx) pr_perror("Can't restore registers (pid: %d)", pid); ret = -1; } + + if (restore_ext_regs && compel_set_task_ext_regs(pid, &ctx->ext_regs)) + ret = -1; + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { pr_perror("Can't block signals"); ret = -1; @@ -487,11 +568,9 @@ static int restore_thread_ctx(int pid, struct thread_ctx *ctx) return ret; } - /* we run at @regs->ip */ -static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, - user_regs_struct_t *regs, - struct thread_ctx *octx) +static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t *regs, struct thread_ctx *octx, + bool may_use_extended_regs) { siginfo_t siginfo; int status; @@ -508,7 +587,7 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, } if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); goto err; } @@ -519,12 +598,11 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, if (ptrace_get_regs(pid, regs)) { 
pr_perror("Can't obtain registers (pid: %d)", pid); - goto err; + goto err; } if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) { - pr_debug("** delivering signal %d si_code=%d\n", - siginfo.si_signo, siginfo.si_code); + pr_debug("** delivering signal %d si_code=%d\n", siginfo.si_signo, siginfo.si_code); pr_err("Unexpected %d task interruption, aborting\n", pid); goto err; @@ -536,15 +614,13 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, */ ret = 0; err: - if (restore_thread_ctx(pid, octx)) + if (restore_thread_ctx(pid, octx, may_use_extended_regs)) ret = -1; return ret; } - -int compel_execute_syscall(struct parasite_ctl *ctl, - user_regs_struct_t *regs, const char *code_syscall) +int compel_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs, const char *code_syscall) { pid_t pid = ctl->rpid; int err; @@ -555,18 +631,16 @@ int compel_execute_syscall(struct parasite_ctl *ctl, * we will need it to restore original program content. */ memcpy(code_orig, code_syscall, sizeof(code_orig)); - if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip, - (void *)code_orig, sizeof(code_orig))) { + if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip, (void *)code_orig, sizeof(code_orig))) { pr_err("Can't inject syscall blob (pid: %d)\n", pid); return -1; } err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); if (!err) - err = parasite_trap(ctl, pid, regs, &ctl->orig); + err = parasite_trap(ctl, pid, regs, &ctl->orig, false); - if (ptrace_poke_area(pid, (void *)code_orig, - (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) { + if (ptrace_poke_area(pid, (void *)code_orig, (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) { pr_err("Can't restore syscall blob (pid: %d)\n", ctl->rpid); err = -1; } @@ -581,7 +655,7 @@ int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, &regs, &ctl->orig); if (!ret) - ret = parasite_trap(ctl,
ctl->rpid, ret_regs ? ret_regs : &regs, &ctl->orig); + ret = parasite_trap(ctl, ctl->rpid, ret_regs ? ret_regs : &regs, &ctl->orig, false); return ret; } @@ -606,9 +680,9 @@ static int parasite_init_daemon(struct parasite_ctl *ctl) struct parasite_init_args *args; pid_t pid = ctl->rpid; user_regs_struct_t regs; - struct ctl_msg m = { }; + struct ctl_msg m = {}; - *ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON; + *ctl->cmd = PARASITE_CMD_INIT_DAEMON; args = compel_parasite_args(ctl, struct parasite_init_args); @@ -647,12 +721,11 @@ static int parasite_init_daemon(struct parasite_ctl *ctl) pr_info("Wait for parasite being daemonized...\n"); if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) { - pr_err("Can't switch parasite %d to daemon mode %d\n", - pid, m.err); + pr_err("Can't switch parasite %d to daemon mode %d\n", pid, m.err); goto err; } - ctl->sigreturn_addr = (void*)(uintptr_t)args->sigreturn_addr; + ctl->sigreturn_addr = (void *)(uintptr_t)args->sigreturn_addr; ctl->daemonized = true; pr_info("Parasite %d has been switched to daemon mode\n", pid); return 0; @@ -664,35 +737,41 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; + user_fpregs_struct_t ext_regs; /* * Get task registers before going daemon, since the - * compel_get_task_regs needs to call ptrace on _stopped_ task, + * compel_get_task_regs() needs to call ptrace on _stopped_ task, * while in daemon it is not such.
*/ - if (get_task_regs(pid, &ctl->orig.regs, ictx->save_regs, - ictx->regs_arg, ictx->flags)) { + if (compel_get_task_regs(pid, &ctl->orig.regs, &ext_regs, ictx->save_regs, ictx->regs_arg, ictx->flags)) { pr_err("Can't obtain regs for thread %d\n", pid); return -1; } + if (__compel_arch_fetch_thread_area(pid, &ctl->orig)) { + pr_err("Can't get thread area of %d\n", pid); + return -1; + } + if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) return -1; + if (parasite_setup_shstk(ctl, &ext_regs)) + return -1; + if (parasite_init_daemon(ctl)) return -1; return 0; } -static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size) +static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size, int remote_prot) { int fd; - ctl->remote_map = remote_mmap(ctl, NULL, size, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_ANONYMOUS | MAP_SHARED, -1, 0); + ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0); if (!ctl->remote_map) { pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->rpid); return -1; @@ -700,13 +779,12 @@ static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size) ctl->map_length = round_up(size, page_size()); - fd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "map_files/%lx-%lx", - (long)ctl->remote_map, (long)ctl->remote_map + ctl->map_length); + fd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "map_files/%lx-%lx", (long)ctl->remote_map, + (long)ctl->remote_map + ctl->map_length); if (fd < 0) return -1; - ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FILE, fd, 0); + ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FILE, fd, 0); close(fd); if (ctl->local_map == MAP_FAILED) { @@ -718,14 +796,25 @@ static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size) return 0; } -static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) 
+static void parasite_memfd_close(struct parasite_ctl *ctl, int fd) +{ + bool compat = !compel_mode_native(ctl); + long ret; + int err; + + err = compel_syscall(ctl, __NR(close, compat), &ret, fd, 0, 0, 0, 0, 0); + if (err || ret) + pr_err("Can't close memfd\n"); +} + +static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, int remote_prot) { void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; + bool compat_task = !compel_mode_native(ctl); uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; - int ret, fd, lfd; - bool __maybe_unused compat_task = !compel_mode_native(ctl); + int ret, fd, lfd, remote_flags; if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; @@ -737,14 +826,12 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) return -1; } - ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret, - (unsigned long)where, 0, 0, 0, 0, 0); + ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret, (unsigned long)where, 0, 0, 0, 0, 0); if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) { - fd = (int)(long)sret; + fd = (int)sret; if (fd >= 0) - compel_syscall(ctl, __NR(close, compat_task), &sret, - fd, 0, 0, 0, 0, 0); + parasite_memfd_close(ctl, fd); pr_err("Can't restore memfd args (pid: %d)\n", pid); return -1; } @@ -752,7 +839,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) if (ret < 0) return ret; - fd = (int)(long)sret; + fd = (int)sret; if (fd == -ENOSYS) return 1; if (fd < 0) { @@ -771,23 +858,24 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) goto err_cure; } - ctl->remote_map = remote_mmap(ctl, NULL, size, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FILE | MAP_SHARED, fd, 0); + remote_flags = MAP_FILE | MAP_SHARED; + if (ctl->ictx.remote_map_addr){ + remote_flags |= MAP_FIXED_NOREPLACE; + } + ctl->remote_map = remote_mmap(ctl, (void 
*)ctl->ictx.remote_map_addr, size, remote_prot, remote_flags, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; } - ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FILE, lfd, 0); + ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FILE, lfd, 0); if (ctl->local_map == MAP_FAILED) { ctl->local_map = NULL; pr_perror("Can't lmap memfd for parasite blob"); goto err_curef; } - compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + parasite_memfd_close(ctl, fd); close(lfd); pr_info("Set up parasite blob using memfd\n"); @@ -796,25 +884,39 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) err_curef: close(lfd); err_cure: - compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + parasite_memfd_close(ctl, fd); return -1; } -void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs) +void compel_relocs_apply(void *mem, void *vbase, struct parasite_blob_desc *pbd) { - size_t i, j; + compel_reloc_t *elf_relocs = pbd->hdr.relocs; + size_t nr_relocs = pbd->hdr.nr_relocs; + size_t i, j; + void **got = mem + pbd->hdr.got_off; + + /* + * parasite_service() reads the value of __export_parasite_service_args_ptr. + * The reason it is set here is that semantically, we are doing a symbol + * resolution on parasite_service_args, and it turns out to be relocatable. 
+ */ + *(void **)(mem + pbd->hdr.args_ptr_off) = vbase + pbd->hdr.args_off; + +#ifdef CONFIG_MIPS + compel_relocs_apply_mips(mem, vbase, pbd); +#else for (i = 0, j = 0; i < nr_relocs; i++) { if (elf_relocs[i].type & COMPEL_TYPE_LONG) { long *where = mem + elf_relocs[i].offset; - long *p = mem + size; if (elf_relocs[i].type & COMPEL_TYPE_GOTPCREL) { int *value = (int *)where; int rel; - p[j] = (long)vbase + elf_relocs[i].value; - rel = (unsigned)((void *)&p[j] - (void *)mem) - elf_relocs[i].offset + elf_relocs[i].addend; + got[j] = vbase + elf_relocs[i].value; + rel = (unsigned)((void *)&got[j] - (void *)mem) - elf_relocs[i].offset + + elf_relocs[i].addend; *value = rel; j++; @@ -826,26 +928,52 @@ void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *el } else BUG(); } +#endif } -static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) +long remote_mprotect(struct parasite_ctl *ctl, void *addr, size_t len, int prot) { - int ret; + long ret; + int err; + bool compat_task = !user_regs_native(&ctl->orig.regs); - ret = parasite_memfd_exchange(ctl, size); - if (ret == 1) { - pr_info("MemFD parasite doesn't work, goto legacy mmap\n"); - ret = parasite_mmap_exchange(ctl, size); + err = compel_syscall(ctl, __NR(mprotect, compat_task), &ret, (unsigned long)addr, len, prot, 0, 0, 0); + if (err < 0) { + pr_err("compel_syscall for mprotect failed\n"); + return -1; } return ret; } -static inline unsigned long total_pie_size(size_t blob_size) +static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) { - return round_up(blob_size, page_size()); + int ret, remote_prot; + + if (ctl->pblob.hdr.data_off) + remote_prot = PROT_READ | PROT_EXEC; + else + remote_prot = PROT_READ | PROT_WRITE | PROT_EXEC; + + ret = parasite_memfd_exchange(ctl, size, remote_prot); + if (ret == 1) { + pr_info("MemFD parasite doesn't work, goto legacy mmap\n"); + ret = parasite_mmap_exchange(ctl, size, remote_prot); + if (ret) + return ret; + } + 
+ if (!ctl->pblob.hdr.data_off) + return 0; + + ret = remote_mprotect(ctl, ctl->remote_map + ctl->pblob.hdr.data_off, size - ctl->pblob.hdr.data_off, + PROT_READ | PROT_WRITE); + if (ret) + pr_err("remote_mprotect failed\n"); + + return ret; } -int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { int ret; unsigned long p, map_exchange_size, parasite_size = 0; @@ -866,11 +994,39 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l * without using ptrace at all. */ - parasite_size = total_pie_size(ctl->pblob.hdr.bsize); + /* + * The parasite memory layout is the following: + * Low address start first. + * The number in parenthesis denotes the size of the section. + * The arrow on the right shows the different variables that + * corresponds to a given offset. + * +------------------------------------------------------+ <--- 0 + * | Parasite blob (sizeof(parasite_blob)) | + * +------------------------------------------------------+ <--- hdr.bsize + * align 8 + * +------------------------------------------------------+ <--- hdr.got_off + * | GOT Table (nr_gotpcrel * sizeof(long)) | + * +------------------------------------------------------+ <--- hdr.args_off + * | Args area (args_size) | + * +------------------------------------------------------+ + * align 64 + * +------------------------------------------------------+ <--- ctl->rsigframe + * | sigframe (RESTORE_STACK_SIGFRAME) | ctl->sigframe + * +------------------------------------------------------+ + * | main stack (PARASITE_STACK_SIZE) | + * +------------------------------------------------------+ <--- ctl->rstack + * | compel_run_in_thread stack (PARASITE_STACK_SIZE) | + * +------------------------------------------------------+ <--- ctl->r_thread_stack + * map_exchange_size + */ + parasite_size = ctl->pblob.hdr.args_off; - 
ctl->args_size = round_up(args_size, PAGE_SIZE); + ctl->args_size = args_size; parasite_size += ctl->args_size; + /* RESTORE_STACK_SIGFRAME needs a 64 bytes alignment */ + parasite_size = round_up(parasite_size, 64); + map_exchange_size = parasite_size; map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE; if (nr_threads > 1) @@ -883,44 +1039,79 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map); ctl->parasite_ip = (unsigned long)(ctl->remote_map + ctl->pblob.hdr.parasite_ip_off); - ctl->addr_cmd = ctl->local_map + ctl->pblob.hdr.addr_cmd_off; - ctl->addr_args = ctl->local_map + ctl->pblob.hdr.addr_arg_off; + ctl->cmd = ctl->local_map + ctl->pblob.hdr.cmd_off; + ctl->args = ctl->local_map + ctl->pblob.hdr.args_off; + + /* + * args must be 4 bytes aligned as we use futexes() on them. It is + * already the case, as args follows the GOT table, which is 8 bytes + * aligned. + */ + if ((unsigned long)ctl->args & (4 - 1)) { + pr_err("BUG: args are not 4 bytes aligned: %p\n", ctl->args); + goto err; + } memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); - if (ctl->pblob.hdr.nr_relocs) - compel_relocs_apply(ctl->local_map, ctl->remote_map, ctl->pblob.hdr.bsize, - ctl->pblob.hdr.relocs, ctl->pblob.hdr.nr_relocs); + compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. 
+ */ + __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; - ctl->rsigframe = ctl->remote_map + p; - ctl->sigframe = ctl->local_map + p; + ctl->rsigframe = ctl->remote_map + p; + ctl->sigframe = ctl->local_map + p; p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p; + ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; + + /* + * x86-64 ABI requires a 16 bytes aligned stack. + * It is already the case as RESTORE_STACK_SIGFRAME is a multiple of + * 64, and PARASITE_STACK_SIZE is 0x4000. + */ + if ((unsigned long)ctl->rstack & (16 - 1)) { + pr_err("BUG: stack is not 16 bytes aligned: %p\n", ctl->rstack); + goto err; + } if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p; + ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; } ret = arch_fetch_sas(ctl, ctl->rsigframe); if (ret) { - pr_err("Can't fetch sigaltstack for task %d (ret %d)\n", - ctl->rpid, ret); + pr_err("Can't fetch sigaltstack for task %d (ret %d)\n", ctl->rpid, ret); goto err; } - if (parasite_start_daemon(ctl)) - goto err; - return 0; err: return -1; } +int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +{ + if (compel_infect_no_daemon(ctl, nr_threads, args_size)) + return -1; + + if (parasite_start_daemon(ctl)) + return -1; + + return 0; +} + struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) { struct parasite_thread_ctl *tctl; @@ -1099,14 +1290,12 @@ static void handle_sigchld(int signal, siginfo_t *siginfo, void *data) if (pid <= 0) return; - pr_err("si_code=%d si_pid=%d si_status=%d\n", - siginfo->si_code, siginfo->si_pid, siginfo->si_status); + pr_err("si_code=%d si_pid=%d si_status=%d\n", siginfo->si_code, siginfo->si_pid, siginfo->si_status); if (WIFEXITED(status)) pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status)); else if (WIFSIGNALED(status)) - 
pr_err("%d was killed by %d unexpectedly: %s\n", - pid, WTERMSIG(status), strsignal(WTERMSIG(status))); + pr_err("%d was killed by %d unexpectedly: %s\n", pid, WTERMSIG(status), strsignal(WTERMSIG(status))); else if (WIFSTOPPED(status)) pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status)); @@ -1119,7 +1308,7 @@ struct plain_regs_struct { user_fpregs_struct_t fpregs; }; -static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) +static int save_regs_plain(pid_t pid, void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; @@ -1197,9 +1386,8 @@ err: static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) { - void *addr = (void *) REG_IP(*regs); - return addr >= ctl->remote_map && - addr < ctl->remote_map + ctl->map_length; + void *addr = (void *)REG_IP(*regs); + return addr >= ctl->remote_map && addr < ctl->remote_map + ctl->map_length; } static int parasite_fini_seized(struct parasite_ctl *ctl) @@ -1207,7 +1395,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 0; - enum trace_flags flag; /* stop getting chld from parasite -- we're about to step-by-step it */ if (restore_child_handler(ctl)) @@ -1227,7 +1414,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); return -1; } @@ -1248,16 +1435,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return -1; /* Go to sigreturn as closer as we can */ - ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, - ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ret = compel_stop_pie(pid, ctl->sigreturn_addr, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; - if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), - 
__NR(rt_sigreturn, 1), flag)) - return -1; - - if (ptrace_flush_breakpoints(pid)) + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; /* @@ -1269,6 +1451,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return 0; } +int compel_start_daemon(struct parasite_ctl *ctl) +{ + return parasite_start_daemon(ctl); +} + int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { @@ -1293,6 +1480,7 @@ int compel_stop_daemon(struct parasite_ctl *ctl) int compel_cure_remote(struct parasite_ctl *ctl) { long ret; + int err; if (compel_stop_daemon(ctl)) return -1; @@ -1300,12 +1488,13 @@ int compel_cure_remote(struct parasite_ctl *ctl) if (!ctl->remote_map) return 0; - compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, - (unsigned long)ctl->remote_map, ctl->map_length, - 0, 0, 0, 0); + err = compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, (unsigned long)ctl->remote_map, + ctl->map_length, 0, 0, 0, 0); + if (err) + return err; + if (ret) { - pr_err("munmap for remote map %p, %lu returned %lu\n", - ctl->remote_map, ctl->map_length, ret); + pr_err("munmap for remote map %p, %lu returned %lu\n", ctl->remote_map, ctl->map_length, ret); return -1; } @@ -1340,7 +1529,7 @@ int compel_cure(struct parasite_ctl *ctl) void *compel_parasite_args_p(struct parasite_ctl *ctl) { - return ctl->addr_args; + return ctl->args; } void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size) @@ -1358,11 +1547,11 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) user_regs_struct_t regs = octx->regs; int ret; - *ctl->addr_cmd = cmd; + *ctl->cmd = cmd; ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, &regs, octx); if (ret == 0) - ret = parasite_trap(ctl, pid, &regs, octx); + ret = parasite_trap(ctl, pid, &regs, octx, true); if (ret == 0) ret = (int)REG_RES(regs); @@ -1374,7 +1563,7 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int
cmd) /* * compel_unmap() is used for unmapping parasite and restorer blobs. - * A blob can contain code for unmapping itself, so the porcess is + * A blob can contain code for unmapping itself, so the process is * trapped on the exit from the munmap syscall. */ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) @@ -1387,21 +1576,24 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) if (ret) goto err; - ret = compel_stop_on_syscall(1, __NR(munmap, 0), - __NR(munmap, 1), TRACE_ENTER); + ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1)); - if (restore_thread_ctx(pid, &ctl->orig)) + /* + * Don't touch extended registers here: they were restored + * with rt_sigreturn from sigframe. + */ + if (restore_thread_ctx(pid, &ctl->orig, false)) ret = -1; err: return ret; } -int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) +int compel_stop_pie(pid_t pid, void *addr, bool no_bp) { int ret; if (no_bp) { - pr_debug("Force no-breakpoints restore\n"); + pr_debug("Force no-breakpoints restore of %d\n", pid); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); @@ -1413,7 +1605,6 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) * PIE will stop on a breakpoint, next * stop after that will be syscall enter. 
*/ - *tf = TRACE_EXIT; return 0; } @@ -1426,39 +1617,34 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) pr_perror("Unable to restart the %d process", pid); return -1; } - - *tf = TRACE_ENTER; return 0; } static bool task_is_trapped(int status, pid_t pid) { - if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + if (WIFSTOPPED(status) && (WSTOPSIG(status) & ~PTRACE_SYSCALL_TRAP) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); if (WIFEXITED(status)) pr_err("Task exited with %d\n", WEXITSTATUS(status)); if (WIFSIGNALED(status)) - pr_err("Task signaled with %d: %s\n", - WTERMSIG(status), strsignal(WTERMSIG(status))); + pr_err("Task signaled with %d: %s\n", WTERMSIG(status), strsignal(WTERMSIG(status))); if (WIFSTOPPED(status)) - pr_err("Task stopped with %d: %s\n", - WSTOPSIG(status), strsignal(WSTOPSIG(status))); + pr_err("Task stopped with %d: %s\n", WSTOPSIG(status), strsignal(WSTOPSIG(status))); if (WIFCONTINUED(status)) pr_err("Task continued\n"); return false; } -static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, - const int sys_nr, const int sys_nr_compat) +static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const int sys_nr, const int sys_nr_compat) { const char *mode = user_regs_native(regs) ? "native" : "compat"; int req_sysnr = user_regs_native(regs) ? 
sys_nr : sys_nr_compat; - pr_debug("%d (%s) is going to execute the syscall %lu, required is %d\n", - pid, mode, REG_SYSCALL_NR(*regs), req_sysnr); + pr_debug("%d (%s) is going to execute the syscall %lu, required is %d\n", pid, mode, REG_SYSCALL_NR(*regs), + req_sysnr); return (REG_SYSCALL_NR(*regs) == req_sysnr); } @@ -1470,17 +1656,13 @@ static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ -int compel_stop_on_syscall(int tasks, - const int sys_nr, const int sys_nr_compat, - enum trace_flags trace) +int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) { + enum trace_flags trace = tasks > 1 ? TRACE_ALL : TRACE_ENTER; user_regs_struct_t regs; int status, ret; pid_t pid; - if (tasks > 1) - trace = TRACE_ALL; - /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); @@ -1494,6 +1676,18 @@ int compel_stop_on_syscall(int tasks, pr_debug("%d was trapped\n", pid); + if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) { + /* + * On some platforms such as ARM64, it is impossible to + * pass through a breakpoint, so let's clear it right + * after it has been triggered. 
+ */ + if (ptrace_flush_breakpoints(pid)) { + pr_err("Unable to clear breakpoints\n"); + return -1; + } + goto goon; + } if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); @@ -1532,7 +1726,7 @@ int compel_stop_on_syscall(int tasks, tasks--; continue; } -goon: + goon: ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); if (ret) { pr_perror("ptrace"); @@ -1563,9 +1757,9 @@ k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl) return thread_ctx_sigmask(&ctl->orig); } -int compel_get_thread_regs(struct parasite_thread_ctl *tctl, save_regs_t save, void * arg) +int compel_get_thread_regs(struct parasite_thread_ctl *tctl, save_regs_t save, void *arg) { - return get_task_regs(tctl->tid, &tctl->th.regs, save, arg, tctl->ctl->ictx.flags); + return compel_get_task_regs(tctl->tid, &tctl->th.regs, &tctl->th.ext_regs, save, arg, tctl->ctl->ictx.flags); } struct infect_ctx *compel_infect_ctx(struct parasite_ctl *ctl) @@ -1587,3 +1781,31 @@ uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl) { return REG_SP(tctl->th.regs); } + +uint64_t compel_get_leader_ip(struct parasite_ctl *ctl) +{ + return REG_IP(ctl->orig.regs); +} + +uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl) +{ + return REG_IP(tctl->th.regs); +} + +void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v) +{ + SET_REG_IP(ctl->orig.regs, v); +} + +void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) +{ + SET_REG_IP(tctl->th.regs, v); +} + +void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack) +{ + if (rstack) + *rstack = ctl->rstack; + if (r_thread_stack) + *r_thread_stack = ctl->r_thread_stack; +} diff --git a/compel/src/lib/log.c b/compel/src/lib/log.c index d195343e4..c86be02c5 100644 --- a/compel/src/lib/log.c +++ b/compel/src/lib/log.c @@ -4,11 +4,8 @@ #include #include #include - #include -#include - #include "log.h" static unsigned int current_loglevel = COMPEL_DEFAULT_LOGLEVEL; diff --git 
a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c index 9142bac42..717ee2839 100644 --- a/compel/src/lib/ptrace.c +++ b/compel/src/lib/ptrace.c @@ -23,7 +23,7 @@ int ptrace_suspend_seccomp(pid_t pid) { - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) < 0) { pr_perror("suspending seccomp failed"); return -1; } @@ -34,46 +34,73 @@ int ptrace_suspend_seccomp(pid_t pid) int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes) { unsigned long w; - if (bytes & (sizeof(long) - 1)) + int old_errno = errno; + + if (bytes & (sizeof(long) - 1)) { + pr_err("Peek request with non-word size %ld\n", bytes); return -1; + } + + errno = 0; for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *d = dst, *a = addr; + d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); - if (d[w] == -1U && errno) + if (d[w] == -1U && errno) { + pr_perror("PEEKDATA failed"); goto err; + } } + errno = old_errno; return 0; err: - return -2; + return -errno; } int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes) { unsigned long w; - if (bytes & (sizeof(long) - 1)) + + if (bytes & (sizeof(long) - 1)) { + pr_err("Poke request with non-word size %ld\n", bytes); return -1; + } + for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *s = src, *a = addr; - if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) + + if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) { + pr_perror("POKEDATA failed"); goto err; + } } return 0; err: - return -2; + return -errno; } /* don't swap big space, it might overflow the stack */ int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes) { void *t = alloca(bytes); + int err; - if (ptrace_peek_area(pid, t, dst, bytes)) - return -1; + err = ptrace_peek_area(pid, t, dst, bytes); + if (err) + return err; - if (ptrace_poke_area(pid, src, dst, bytes)) { - if (ptrace_poke_area(pid, t, dst, bytes)) - return -2; - return -1; + err = 
ptrace_poke_area(pid, src, dst, bytes); + if (err) { + int err2; + + pr_err("Can't poke %d @ %p from %p sized %ld\n", pid, dst, src, bytes); + + err2 = ptrace_poke_area(pid, t, dst, bytes); + if (err2) { + pr_err("Can't restore the original data with poke\n"); + return err2; + } + return err; } memcpy(src, t, bytes); @@ -81,7 +108,8 @@ int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes) return 0; } -int __attribute__((weak)) ptrace_get_regs(int pid, user_regs_struct_t *regs) { +int __attribute__((weak)) ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ struct iovec iov; iov.iov_base = regs; diff --git a/compel/src/main.c b/compel/src/main.c index 51bac099f..21e06d7dd 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -13,53 +13,62 @@ #include #include -#include "uapi/compel/compel.h" - #include "version.h" #include "piegen.h" #include "log.h" -#define CFLAGS_DEFAULT_SET \ - "-Wstrict-prototypes " \ +#define CFLAGS_DEFAULT_SET \ + "-Wstrict-prototypes " \ + "-ffreestanding " \ "-fno-stack-protector -nostdlib -fomit-frame-pointer " -#define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" -#define COMPEL_CFLAGS_NOPIC CFLAGS_DEFAULT_SET "-fno-pic" +#define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" +#define COMPEL_CFLAGS_NOPIC CFLAGS_DEFAULT_SET "-fno-pic" #ifdef NO_RELOCS -#define COMPEL_LDFLAGS_COMMON "-z noexecstack -T " +#define COMPEL_LDFLAGS_COMMON "-z noexecstack -T " #else -#define COMPEL_LDFLAGS_COMMON "-r -z noexecstack -T " +#define COMPEL_LDFLAGS_COMMON "-r -z noexecstack -T " #endif typedef struct { - const char *arch; // dir name under arch/ - const char *cflags; - const char *cflags_compat; + const char *arch; // dir name under arch/ + const char *cflags; + const char *cflags_compat; } flags_t; static const flags_t flags = { #if defined CONFIG_X86_64 - .arch = "x86", - .cflags = COMPEL_CFLAGS_PIE, - .cflags_compat = COMPEL_CFLAGS_NOPIC, + .arch = "x86", + .cflags = COMPEL_CFLAGS_PIE, + .cflags_compat = COMPEL_CFLAGS_NOPIC, #elif 
defined CONFIG_AARCH64 - .arch = "aarch64", - .cflags = COMPEL_CFLAGS_PIE, + .arch = "aarch64", + .cflags = COMPEL_CFLAGS_PIE, #elif defined(CONFIG_ARMV6) || defined(CONFIG_ARMV7) - .arch = "arm", - .cflags = COMPEL_CFLAGS_PIE, + .arch = "arm", + .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_PPC64 - .arch = "ppc64", - .cflags = COMPEL_CFLAGS_PIE, + .arch = "ppc64", + .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_S390 - .arch = "s390", - .cflags = COMPEL_CFLAGS_PIE, + .arch = "s390", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_MIPS + .arch = "mips", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_LOONGARCH64 + .arch = "loongarch64", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_RISCV64 + .arch = "riscv64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif }; +piegen_opt_t opts = {}; const char *uninst_root; static int piegen(void) @@ -92,7 +101,6 @@ static int piegen(void) } if (handle_binary(mem, st.st_size)) { - close(fd), fd = -1; unlink(opts.output_filename); goto err; } @@ -100,8 +108,7 @@ static int piegen(void) ret = 0; err: - if (fd >= 0) - close(fd); + close(fd); if (opts.fout) fclose(opts.fout); if (!ret) @@ -122,23 +129,23 @@ static void cli_log(unsigned int lvl, const char *fmt, va_list parms) vfprintf(f, fmt, parms); } -static int usage(int rc) { +static int usage(int rc) +{ FILE *out = (rc == 0) ? 
stdout : stderr; fprintf(out, -"Usage:\n" -" compel [--compat] includes | cflags | ldflags\n" -" compel plugins [PLUGIN_NAME ...]\n" -" compel [--compat] [--static] libs\n" -" compel -f FILE -o FILE [-p NAME] [-l N] hgen\n" -" -f, --file FILE input (parasite object) file name\n" -" -o, --output FILE output (header) file name\n" -" -p, --prefix NAME prefix for var names\n" -" -l, --log-level NUM log level (default: %d)\n" -" compel -h|--help\n" -" compel -V|--version\n" -, COMPEL_DEFAULT_LOGLEVEL -); + "Usage:\n" + " compel [--compat] includes | cflags | ldflags\n" + " compel plugins [PLUGIN_NAME ...]\n" + " compel [--compat] [--static] libs\n" + " compel -f FILE -o FILE [-p NAME] [-l N] hgen\n" + " -f, --file FILE input (parasite object) file name\n" + " -o, --output FILE output (header) file name\n" + " -p, --prefix NAME prefix for var names\n" + " -l, --log-level NUM log level (default: %d)\n" + " compel -h|--help\n" + " compel -V|--version\n", + COMPEL_DEFAULT_LOGLEVEL); return rc; } @@ -181,13 +188,9 @@ static void print_ldflags(bool compat) printf("%s", COMPEL_LDFLAGS_COMMON); if (uninst_root) { - printf("%s/arch/%s/scripts/compel-pack%s.lds.S\n", - uninst_root, flags.arch, compat_str); - } - else { - printf("%s/compel/scripts/compel-pack%s.lds.S\n", - LIBEXECDIR, compat_str); - + printf("%s/arch/%s/scripts/compel-pack%s.lds.S\n", uninst_root, flags.arch, compat_str); + } else { + printf("%s/compel/scripts/compel-pack%s.lds.S\n", LIBEXECDIR, compat_str); } } @@ -196,8 +199,7 @@ static void print_plugin(const char *name) const char suffix[] = ".lib.a"; if (uninst_root) - printf("%s/plugins/%s%s\n", - uninst_root, name, suffix); + printf("%s/plugins/%s%s\n", uninst_root, name, suffix); else printf("%s/compel/%s%s\n", LIBEXECDIR, name, suffix); } @@ -224,8 +226,7 @@ static int print_libs(bool is_static) return 1; } printf("%s/%s\n", uninst_root, STATIC_LIB); - } - else { + } else { printf("%s/%s\n", LIBDIR, (is_static) ? 
STATIC_LIB : DYN_LIB); } @@ -257,8 +258,7 @@ static char *gen_prefix(const char *path) for (i = len - 1; i >= 0; i--) { if (!p1 && path[i] == '.') { p2 = path + i - 1; - } - else if (!p1 && path[i] == '/') { + } else if (!p1 && path[i] == '/') { p1 = path + i + 1; break; } @@ -309,15 +309,15 @@ int main(int argc, char *argv[]) static const char short_opts[] = "csf:o:p:hVl:"; static struct option long_opts[] = { - { "compat", no_argument, 0, 'c' }, - { "static", no_argument, 0, 's' }, - { "file", required_argument, 0, 'f' }, - { "output", required_argument, 0, 'o' }, - { "prefix", required_argument, 0, 'p' }, - { "help", no_argument, 0, 'h' }, - { "version", no_argument, 0, 'V' }, - { "log-level", required_argument, 0, 'l' }, - { }, + { "compat", no_argument, 0, 'c' }, + { "static", no_argument, 0, 's' }, + { "file", required_argument, 0, 'f' }, + { "output", required_argument, 0, 'o' }, + { "prefix", required_argument, 0, 'p' }, + { "help", no_argument, 0, 'h' }, + { "version", no_argument, 0, 'V' }, + { "log-level", required_argument, 0, 'l' }, + {}, }; uninst_root = getenv("COMPEL_UNINSTALLED_ROOTDIR"); @@ -349,16 +349,12 @@ int main(int argc, char *argv[]) case 'h': return usage(0); case 'V': - printf("Version: %d.%d.%d\n", - COMPEL_SO_VERSION_MAJOR, - COMPEL_SO_VERSION_MINOR, + printf("Version: %d.%d.%d\n", COMPEL_SO_VERSION_MAJOR, COMPEL_SO_VERSION_MINOR, COMPEL_SO_VERSION_SUBLEVEL); exit(0); - break; default: // '?' 
// error message already printed by getopt_long() return usage(1); - break; } } diff --git a/compel/test/Makefile b/compel/test/Makefile new file mode 100644 index 000000000..f46a821ee --- /dev/null +++ b/compel/test/Makefile @@ -0,0 +1,21 @@ +all: fdspy infect rsys stack + +fdspy: + $(Q) $(MAKE) -C fdspy + $(Q) $(MAKE) -C fdspy run +.PHONY: fdspy + +infect: + $(Q) $(MAKE) -C infect + $(Q) $(MAKE) -C infect run +.PHONY: infect + +rsys: + $(Q) $(MAKE) -C rsys + $(Q) $(MAKE) -C rsys run +.PHONY: rsys + +stack: + $(Q) $(MAKE) -C stack + $(Q) $(MAKE) -C stack run +.PHONY: stack diff --git a/compel/test/fdspy/Makefile b/compel/test/fdspy/Makefile index 027c373fe..82d9fdc0b 100644 --- a/compel/test/fdspy/Makefile +++ b/compel/test/fdspy/Makefile @@ -5,6 +5,10 @@ COMPEL := ../../../compel/compel-host all: victim spy +run: + ./spy +.PHONY: run + clean: rm -f victim rm -f spy diff --git a/compel/test/fdspy/parasite.c b/compel/test/fdspy/parasite.c index c14064b36..2399cc3ba 100644 --- a/compel/test/fdspy/parasite.c +++ b/compel/test/fdspy/parasite.c @@ -1,20 +1,28 @@ #include -#include #include +#include /* * Stubs for std compel plugin. 
*/ -int compel_main(void *arg_p, unsigned int arg_s) { return 0; } -int parasite_trap_cmd(int cmd, void *args) { return 0; } -void parasite_cleanup(void) { } +int compel_main(void *arg_p, unsigned int arg_s) +{ + return 0; +} +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} +void parasite_cleanup(void) +{ +} -#define PARASITE_CMD_GETFD PARASITE_USER_CMDS +#define PARASITE_CMD_GETFD PARASITE_USER_CMDS int parasite_daemon_cmd(int cmd, void *args) { if (cmd == PARASITE_CMD_GETFD) - fds_send_fd(2); + return (fds_send_fd(2) < 0); return 0; } diff --git a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 258e3ab75..41de99e20 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -5,10 +5,13 @@ #include #include -#include +#include +#include +#include + #include "parasite.h" -#define PARASITE_CMD_GETFD PARASITE_USER_CMDS +#define PARASITE_CMD_GETFD PARASITE_USER_CMDS static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { @@ -18,7 +21,11 @@ static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) static int do_infection(int pid, int *stolen_fd) { -#define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) +#define err_and_ret(msg) \ + do { \ + fprintf(stderr, msg); \ + return -1; \ + } while (0) int state; struct parasite_ctl *ctl; @@ -101,8 +108,14 @@ static int check_pipe_ends(int wfd, int rfd) } printf("Check pipe ends are connected\n"); - write(wfd, "1", 2); - read(rfd, aux, sizeof(aux)); + if (write(wfd, "1", 2) != 2) { + fprintf(stderr, "write to pipe failed\n"); + return 0; + } + if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { + fprintf(stderr, "read from pipe failed\n"); + return 0; + } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); return 0; @@ -126,14 +139,22 @@ int main(int argc, char **argv) printf("Run the victim\n"); pid = vfork(); if (pid == 0) { - close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); - close(p_out[0]); dup2(p_out[1], 1); 
close(p_out[1]); - close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } - close(p_in[0]); close(p_out[1]); close(p_err[1]); + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); /* * Now do the infection with parasite.c diff --git a/compel/test/infect/Makefile b/compel/test/infect/Makefile index 4dedf33c9..85efa5fd9 100644 --- a/compel/test/infect/Makefile +++ b/compel/test/infect/Makefile @@ -3,8 +3,17 @@ CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host +ifeq ($(GCS_ENABLE),1) +CFLAGS += -mbranch-protection=standard -DGCS_TEST_ENABLE=1 +LDFLAGS += -z experimental-gcs=check +endif + all: victim spy +run: + ./spy +.PHONY: run + clean: rm -f victim rm -f spy @@ -13,7 +22,7 @@ clean: rm -f parasite.o victim: victim.c - $(CC) $(CFLAGS) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) diff --git a/compel/test/infect/parasite.c b/compel/test/infect/parasite.c index f18580966..ad13bd25d 100644 --- a/compel/test/infect/parasite.c +++ b/compel/test/infect/parasite.c @@ -6,11 +6,16 @@ /* * Stubs for std compel plugin. 
*/ -int parasite_trap_cmd(int cmd, void *args) { return 0; } -void parasite_cleanup(void) { } +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} +void parasite_cleanup(void) +{ +} -#define PARASITE_CMD_INC PARASITE_USER_CMDS -#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 int parasite_daemon_cmd(int cmd, void *args) { diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index a5aba7308..143946941 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -3,11 +3,13 @@ #include #include -#include +#include +#include + #include "parasite.h" -#define PARASITE_CMD_INC PARASITE_USER_CMDS -#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { @@ -17,7 +19,11 @@ static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) static int do_infection(int pid) { -#define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) +#define err_and_ret(msg) \ + do { \ + fprintf(stderr, msg); \ + return -1; \ + } while (0) int state; struct parasite_ctl *ctl; @@ -88,15 +94,15 @@ static inline int chk(int fd, int val) int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) - return 0; + return 1; printf("%d, want %d\n", v, val); - return v == val; + return v != val; } int main(int argc, char **argv) { - int p_in[2], p_out[2], p_err[2], pid, i, pass = 1; + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; /* * Prepare IO-s and fork the victim binary @@ -106,16 +112,27 @@ int main(int argc, char **argv) return -1; } +#ifdef GCS_TEST_ENABLE + setenv("GLIBC_TUNABLES", "glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2", 1); +#endif pid = vfork(); if (pid == 0) { - close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); - close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); - 
close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } - close(p_in[0]); close(p_out[1]); close(p_err[1]); + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); /* * Tell the little guy some numbers @@ -128,9 +145,11 @@ int main(int argc, char **argv) return 1; printf("Checking the victim alive\n"); - pass = chk(p_out[0], 1); - pass = chk(p_out[0], 42); - if (!pass) + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) return 1; /* @@ -162,14 +181,14 @@ int main(int argc, char **argv) printf("Checking the result\n"); /* These two came from parasite */ - pass = chk(p_out[0], 138); - pass = chk(p_out[0], 403); + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); /* These two came from post-infect */ - pass = chk(p_out[0], 1234); - pass = chk(p_out[0], 4096); + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); - if (pass) + if (!err) printf("All OK\n"); else printf("Something went WRONG\n"); diff --git a/compel/test/rsys/Makefile b/compel/test/rsys/Makefile index 3babda18f..53400498e 100644 --- a/compel/test/rsys/Makefile +++ b/compel/test/rsys/Makefile @@ -5,6 +5,10 @@ COMPEL := ../../../compel/compel-host all: victim spy +run: + ./spy +.PHONY: run + clean: rm -f victim rm -f spy diff --git a/compel/test/rsys/spy.c b/compel/test/rsys/spy.c index f5c999d5a..4a6fcef29 100644 --- a/compel/test/rsys/spy.c +++ b/compel/test/rsys/spy.c @@ -4,7 +4,8 @@ #include #include -#include +#include +#include static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { @@ -14,7 +15,11 @@ static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) static int do_rsetsid(int pid) { -#define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) +#define err_and_ret(msg) \ + do { \ + 
fprintf(stderr, msg); \ + return -1; \ + } while (0) int state; long ret; @@ -63,7 +68,9 @@ static inline int chk(int fd, int val) { int v = 0; - read(fd, &v, sizeof(v)); + if (read(fd, &v, sizeof(v)) != sizeof(v)) { + fprintf(stderr, "read failed\n"); + } printf("%d, want %d\n", v, val); return v == val; } @@ -82,21 +89,32 @@ int main(int argc, char **argv) pid = vfork(); if (pid == 0) { - close(p_in[1]); dup2(p_in[0], 0); close(p_in[0]); - close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]); - close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); execl("./victim", "victim", NULL); exit(1); } - close(p_in[0]); close(p_out[1]); close(p_err[1]); + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); sid = getsid(0); /* * Kick the victim once */ i = 0; - write(p_in[1], &i, sizeof(i)); + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) { + fprintf(stderr, "write to pipe failed\n"); + return -1; + } printf("Checking the victim session to be %d\n", sid); pass = chk(p_out[0], sid); @@ -114,7 +132,10 @@ int main(int argc, char **argv) /* * Kick the victim again so it tells new session */ - write(p_in[1], &i, sizeof(i)); + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) { + fprintf(stderr, "write to pipe failed\n"); + return -1; + } /* * Stop the victim and check the intrusion went well diff --git a/compel/test/rsys/victim.c b/compel/test/rsys/victim.c index 2f1943d0c..85cb7cb89 100644 --- a/compel/test/rsys/victim.c +++ b/compel/test/rsys/victim.c @@ -9,7 +9,8 @@ int main(int argc, char **argv) break; i = getsid(0); - write(1, &i, sizeof(i)); + if (write(1, &i, sizeof(i)) != sizeof(i)) + break; } return 0; diff --git a/compel/test/stack/.gitignore b/compel/test/stack/.gitignore new file mode 100644 index 000000000..0a554758d --- /dev/null +++ b/compel/test/stack/.gitignore @@ -0,0 +1,4 @@ +parasite.h 
+parasite.po +spy +victim diff --git a/compel/test/stack/Makefile b/compel/test/stack/Makefile new file mode 100644 index 000000000..bacfad962 --- /dev/null +++ b/compel/test/stack/Makefile @@ -0,0 +1,32 @@ +CC := gcc +CFLAGS ?= -O2 -g -Wall -Werror + +COMPEL := ../../../compel/compel-host + +all: victim spy + +run: + ./spy +.PHONY: run + +clean: + rm -f victim + rm -f spy + rm -f parasite.h + rm -f parasite.po + rm -f parasite.o + +victim: victim.c + $(CC) $(CFLAGS) -o $@ $^ + +spy: spy.c parasite.h + $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) + +parasite.h: parasite.po + $(COMPEL) hgen -o $@ -f $< + +parasite.po: parasite.o + ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) + +parasite.o: parasite.c + $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/compel/test/stack/parasite.c b/compel/test/stack/parasite.c new file mode 100644 index 000000000..ad13bd25d --- /dev/null +++ b/compel/test/stack/parasite.c @@ -0,0 +1,38 @@ +#include + +#include +#include + +/* + * Stubs for std compel plugin. 
+ */ +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} +void parasite_cleanup(void) +{ +} + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +int parasite_daemon_cmd(int cmd, void *args) +{ + int v; + + switch (cmd) { + case PARASITE_CMD_INC: + v = (*(int *)args) + 1; + break; + case PARASITE_CMD_DEC: + v = (*(int *)args) - 1; + break; + default: + v = -1; + break; + } + + sys_write(1, &v, sizeof(int)); + return 0; +} diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c new file mode 100644 index 000000000..184c8ab31 --- /dev/null +++ b/compel/test/stack/spy.c @@ -0,0 +1,294 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "parasite.h" + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +#define err_and_ret(msg) \ + do { \ + fprintf(stderr, msg); \ + return -1; \ + } while (0) + +void *saved_data = NULL; + +#define SAVED_DATA_MAX page_size() + +void cleanup_saved_data(void) +{ + free(saved_data); +} + +static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) +{ + printf("\tLC%u: ", lvl); + vprintf(fmt, parms); +} + +static void *get_parasite_rstack_start(struct parasite_ctl *ctl) +{ + void *rstack, *r_thread_stack, *rstack_start; + + compel_get_stack(ctl, &rstack, &r_thread_stack); + + rstack_start = rstack; + if (r_thread_stack != NULL && r_thread_stack < rstack_start) + rstack_start = r_thread_stack; + + return rstack_start; +} + +static void *read_proc_mem(int pid, void *offset, size_t len) +{ + char victim_mem_path[6 + 11 + 4 + 1]; + int written; + int fd; + void *data; + ssize_t mem_read; + + written = snprintf(victim_mem_path, sizeof(victim_mem_path), "/proc/%d/mem", pid); + if (written < 0 || written >= sizeof(victim_mem_path)) { + fprintf(stderr, "Failed to create path string to victim's /proc/%d/mem file\n", pid); + return NULL; + } + + fd = 
open(victim_mem_path, O_RDONLY); + if (fd < 0) { + perror("Failed to open victim's /proc/$pid/mem file"); + return NULL; + } + + data = malloc(len); + if (data == NULL) { + perror("Can't allocate memory to read victim's /proc/$pid/mem file"); + return NULL; + } + + mem_read = pread(fd, data, len, (off_t)offset); + if (mem_read == -1) { + perror("Failed to read victim's /proc/$pid/mem file"); + goto freebuf; + } + + return data; + +freebuf: + free(data); + return NULL; +} + +static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) +{ + if (saved_data != NULL) { + void *current_data; + + current_data = read_proc_mem(pid, stack, saved_data_size); + if (current_data == NULL) + return -1; + + if (memcmp(saved_data, current_data, saved_data_size) != 0) + return 1; + } + + return 0; +} + +static int do_infection(int pid) +{ + int state; + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + int *arg; + void *stack; + size_t saved_data_size = PARASITE_STACK_REDZONE; + int saved_data_check; + + compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); + + printf("Stopping task\n"); + state = compel_stop_task(pid); + if (state < 0) + err_and_ret("Can't stop task\n"); + + printf("Preparing parasite ctl\n"); + ctl = compel_prepare(pid); + if (!ctl) + err_and_ret("Can't prepare for infection\n"); + + printf("Configuring contexts\n"); + + /* + * First -- the infection context. Most of the stuff + * is already filled by compel_prepare(), just set the + * log descriptor for parasite side, library cannot + * live w/o it. 
+ */ + ictx = compel_infect_ctx(ctl); + ictx->log_fd = STDERR_FILENO; + + parasite_setup_c_header(ctl); + + printf("Infecting\n"); + if (compel_infect_no_daemon(ctl, 1, sizeof(int))) + err_and_ret("Can't infect victim\n"); + + if (atexit(cleanup_saved_data)) + err_and_ret("Can't register cleanup function with atexit\n"); + + stack = get_parasite_rstack_start(ctl); + + if (compel_start_daemon(ctl)) + err_and_ret("Can't start daemon in victim\n"); + + /* + * Now get the area with arguments and run two + * commands one by one. + */ + arg = compel_parasite_args(ctl, int); + + printf("Running cmd 1\n"); + *arg = 137; + if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) + err_and_ret("Can't run parasite command 1\n"); + + printf("Running cmd 2\n"); + *arg = 404; + if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) + err_and_ret("Can't run parasite command 2\n"); + + saved_data_check = check_saved_data(ctl, pid, stack, saved_data, saved_data_size); + if (saved_data_check == -1) + err_and_ret("Could not check saved data\n"); + if (saved_data_check != 0) + err_and_ret("Saved data unexpectedly modified\n"); + + /* + * Done. Cure and resume the task. 
+ */ + printf("Curing\n"); + if (compel_cure(ctl)) + err_and_ret("Can't cure victim\n"); + + if (compel_resume_task(pid, state, state)) + err_and_ret("Can't unseize task\n"); + + printf("Done\n"); + + return 0; +} + +static inline int chk(int fd, int val) +{ + int v = 0; + + if (read(fd, &v, sizeof(v)) != sizeof(v)) + return 1; + + printf("%d, want %d\n", v, val); + return v != val; +} + +int main(int argc, char **argv) +{ + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; + + /* + * Prepare IO-s and fork the victim binary + */ + if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { + perror("Can't make pipe"); + return -1; + } + + pid = vfork(); + if (pid == 0) { + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); + execl("./victim", "victim", NULL); + exit(1); + } + + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); + + /* + * Tell the little guy some numbers + */ + i = 1; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 42; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + printf("Checking the victim alive\n"); + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) + return 1; + + /* + * Now do the infection with parasite.c + */ + + printf("Infecting the victim\n"); + if (do_infection(pid)) + return 1; + + /* + * Tell the victim some more stuff to check it's alive + */ + i = 1234; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 4096; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + /* + * Stop the victim and check the infection went well + */ + printf("Closing victim stdin\n"); + close(p_in[1]); + printf("Waiting for victim to die\n"); + wait(NULL); + + printf("Checking the result\n"); + + /* These two came from parasite */ + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); + + /* These two came from post-infect */ + err |= 
chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); + + if (!err) + printf("All OK\n"); + else + printf("Something went WRONG\n"); + + return 0; +} diff --git a/compel/test/stack/victim.c b/compel/test/stack/victim.c new file mode 100644 index 000000000..f94613fa1 --- /dev/null +++ b/compel/test/stack/victim.c @@ -0,0 +1,16 @@ +#include + +int main(int argc, char **argv) +{ + int i; + + while (1) { + if (read(0, &i, sizeof(i)) != sizeof(i)) + break; + + if (write(1, &i, sizeof(i)) != sizeof(i)) + break; + } + + return 0; +} diff --git a/contrib/apt-install b/contrib/apt-install new file mode 100755 index 000000000..676e0f794 --- /dev/null +++ b/contrib/apt-install @@ -0,0 +1,23 @@ +#!/bin/bash + +set -e -x + +export DEBIAN_FRONTEND=noninteractive + +install_retry_counter=0 +max_apt_retries=5 + +# This function loops a couple of times over apt-get, hoping to +# avoid CI errors due to errors during apt-get +# hashsum mismatches, DNS errors and similar things +while true; do + (( install_retry_counter+=1 )) + if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then + exit 1 + fi + apt-get update -y && apt-get install -y --no-install-recommends "$@" && break + + # In case it is a network error let's wait a bit. 
+ echo "Retrying attempt ${install_retry_counter}" + sleep "${install_retry_counter}" +done diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst deleted file mode 100644 index 049bbd82d..000000000 --- a/contrib/debian/dev-packages.lst +++ /dev/null @@ -1,20 +0,0 @@ -# Required packages for development in Debian -build-essential -libprotobuf-dev -libprotobuf-c-dev -protobuf-c-compiler -protobuf-compiler -python-protobuf -libnet-dev - -# Extra packages, required for testing and building other tools -pkg-config -libnl-3-dev -python-ipaddr -libbsd0 -libbsd-dev -iproute2 -libcap-dev -libaio-dev -python-yaml -libnl-route-3-dev diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh new file mode 100755 index 000000000..c47fb9fe0 --- /dev/null +++ b/contrib/dependencies/apk-packages.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env sh + +apk add --no-cache \ + asciidoctor \ + bash \ + build-base \ + coreutils \ + e2fsprogs \ + elfutils-dev \ + git \ + gnutls-dev \ + go \ + ip6tables \ + iproute2 \ + iptables \ + iptables-legacy \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libcap-utils \ + libdrm-dev \ + libnet-dev \ + libnl3-dev \ + libtraceevent-dev \ + libtracefs-dev \ + nftables \ + nftables-dev \ + perl \ + pkgconfig \ + procps \ + protobuf-c-compiler \ + protobuf-c-dev \ + protobuf-dev \ + py3-importlib-metadata \ + py3-pip \ + py3-protobuf \ + py3-yaml \ + python3 \ + sudo \ + tar \ + util-linux \ + util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh new file mode 100755 index 000000000..30ce6874c --- /dev/null +++ b/contrib/dependencies/apt-cross-packages.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + crossbuild-essential-"${DEBIAN_ARCH}" \ + iproute2:"${DEBIAN_ARCH}" \ + libaio-dev:"${DEBIAN_ARCH}" \ + libbz2-dev:"${DEBIAN_ARCH}" \ + libc6-"${DEBIAN_ARCH}"-cross \ + libc6-dev-"${DEBIAN_ARCH}"-cross \ + libcap-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + libelf-dev:"${DEBIAN_ARCH}" \ + libexpat1-dev:"${DEBIAN_ARCH}" \ + libgnutls28-dev:"${DEBIAN_ARCH}" \ + libnet-dev:"${DEBIAN_ARCH}" \ + libnftables-dev:"${DEBIAN_ARCH}" \ + libnl-3-dev:"${DEBIAN_ARCH}" \ + libnl-route-3-dev:"${DEBIAN_ARCH}" \ + libprotobuf-c-dev:"${DEBIAN_ARCH}" \ + libprotobuf-dev:"${DEBIAN_ARCH}" \ + libssl-dev:"${DEBIAN_ARCH}" \ + libtraceevent-dev:"${DEBIAN_ARCH}" \ + libtracefs-dev:"${DEBIAN_ARCH}" \ + ncurses-dev:"${DEBIAN_ARCH}" \ + uuid-dev:"${DEBIAN_ARCH}" \ + build-essential \ + pkg-config \ + git \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh new file mode 100755 index 000000000..7963be7b4 --- /dev/null +++ b/contrib/dependencies/apt-packages.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + asciidoctor \ + bash \ + bsdmainutils \ + build-essential \ + gdb \ + git-core \ + iproute2 \ + iptables \ + kmod \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libdrm-dev \ + libelf-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnet-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + libperl-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + libtraceevent-dev \ + libtracefs-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-importlib-metadata \ + python3-pip \ + python3-protobuf \ + python3-yaml \ + time \ + util-linux \ + uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh new file mode 100755 index 000000000..793f267a5 --- /dev/null +++ b/contrib/dependencies/dnf-packages.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env sh + +dnf install -y \ + asciidoc \ + binutils \ + elfutils-libelf-devel \ + gcc \ + git \ + glibc-devel \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libbpf-devel \ + libbsd-devel \ + libcap-devel \ + libdrm-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + libtraceevent-devel \ + libtracefs-devel \ + libuuid-devel \ + make \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + protobuf-c-devel \ + protobuf-compiler \ + protobuf-devel \ + python-devel \ + python3-importlib-metadata \ + python3-protobuf \ + python3-pyyaml \ + python3-setuptools \ + python3-wheel \ + rubygem-asciidoctor \ + xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh new file mode 100755 index 000000000..260797606 --- /dev/null +++ b/contrib/dependencies/pacman-packages.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env sh + +pacman -Syu --noconfirm \ + asciidoctor \ + base-devel \ + bash \ + coreutils \ + diffutils \ + git \ + gnutls \ + go \ + iproute2 \ + iptables \ + libaio \ + libbsd \ + libcap \ + 
libdrm \ + libelf \ + libnet \ + libnl \ + libtraceevent \ + libtracefs \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + python-importlib-metadata \ + python-pip \ + python-protobuf \ + python-yaml \ + sudo \ + tar \ + util-linux \ + util-linux-libs diff --git a/contrib/docker_cr.sh b/contrib/docker_cr.sh index 9b43d8ba1..04ef676cd 100755 --- a/contrib/docker_cr.sh +++ b/contrib/docker_cr.sh @@ -418,7 +418,7 @@ resolve_path() { local p p="${2}" - if which realpath > /dev/null; then + if command -v realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" @@ -427,7 +427,7 @@ resolve_path() { resolve_cmd() { local cpath - cpath=$(which "${2}") + cpath=$(command -v "${2}") resolve_path "${1}" "${cpath}" } diff --git a/coredump/coredump b/coredump/coredump new file mode 100755 index 000000000..5b3e6f366 --- /dev/null +++ b/coredump/coredump @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +import platform +import argparse +import os +import sys + +import criu_coredump + +PLATFORMS = ["aarch64", "armv7l", "x86_64"] + + +def coredump(opts): + generator = criu_coredump.coredump_generator() + cores = generator(os.path.realpath(opts['in'])) + for pid in cores: + if opts['pid'] and pid != opts['pid']: + continue + with open(os.path.realpath(opts['out']) + "/core." 
+ str(pid), 'wb+') as f: + cores[pid].write(f) + + +def main(): + desc = 'CRIU core dump' + parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument('-i', + '--in', + default='.', + help='directory where to get images from') + parser.add_argument('-p', + '--pid', + type=int, + help='generate coredump for specific pid(all pids py default)') + parser.add_argument('-o', + '--out', + default='.', + help='directory to write coredumps to') + + opts = vars(parser.parse_args()) + + if platform.machine() not in PLATFORMS: + print("ERROR: %s is only supported on: %s" % (sys.argv[0], ', '.join(PLATFORMS))) + sys.exit(1) + + try: + coredump(opts) + except SystemExit as error: + print('ERROR: %s' % error) + print('Exiting') + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/coredump/criu-coredump b/coredump/criu-coredump deleted file mode 100755 index 25c188c6b..000000000 --- a/coredump/criu-coredump +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python2 -import argparse -import os - -import criu_coredump - -def coredump(opts): - generator = criu_coredump.coredump_generator() - cores = generator(os.path.realpath(opts['in'])) - for pid in cores: - if opts['pid'] and pid != opts['pid']: - continue - with open(os.path.realpath(opts['out'])+"/core."+str(pid), 'w+') as f: - cores[pid].write(f) - - -def main(): - desc = 'CRIU core dump' - parser = argparse.ArgumentParser(description=desc, - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument('-i', - '--in', - default = '.', - help = 'directory where to get images from') - parser.add_argument('-p', - '--pid', - type = int, - help = 'generate coredump for specific pid(all pids py default)') - parser.add_argument('-o', - '--out', - default = '.', - help = 'directory to write coredumps to') - - opts = vars(parser.parse_args()) - - coredump(opts) - -if __name__ == '__main__': - main() diff --git a/coredump/criu_coredump/__init__.py 
b/coredump/criu_coredump/__init__.py index 213af42ec..c1a437cf4 100644 --- a/coredump/criu_coredump/__init__.py +++ b/coredump/criu_coredump/__init__.py @@ -1,2 +1 @@ -from coredump import * -import elf +from .coredump import coredump_generator diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index bc53a7705..acb806ace 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -29,9 +29,12 @@ # 4) VMAs contents; # import io -import elf +import sys import ctypes +import platform + from pycriu import images +from . import elf # Some memory-related constants PAGESIZE = 4096 @@ -51,6 +54,8 @@ status = { "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, + "VMA_AREA_MEMFD": 1 << 14, + "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } @@ -88,11 +93,16 @@ class coredump: for note in self.notes: buf.write(note.nhdr) buf.write(note.owner) - buf.write("\0" * (8 - len(note.owner))) + buf.write(b"\0" * (8 - len(note.owner))) buf.write(note.data) - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(self.vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} + + offset = ctypes.sizeof(ehdr[bits]()) + offset += (len(self.vmas) + 1) * ctypes.sizeof(phdr[bits]()) filesz = 0 for note in self.notes: @@ -127,6 +137,20 @@ class coredump_generator: reg_files = None # reg-files; pagemaps = {} # pagemap by pid; + # thread info key based on the current arch + thread_info_key = { + "aarch64": "ti_aarch64", + "armv7l": "ti_arm", + "x86_64": "thread_info", + } + + machine = platform.machine() # current arch + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} # 32 or 64 bits Ehdr + nhdr = {"32bit": elf.Elf32_Nhdr, "64bit": elf.Elf64_Nhdr} # 32 or 64 bits 
Nhdr + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} # 32 or 64 bits Phdr + def _img_open_and_strip(self, name, single=False, pid=None): """ Load criu image and strip it from magic and redundant list. @@ -136,7 +160,7 @@ class coredump_generator: path += "-" + str(pid) path += ".img" - with open(path) as f: + with open(path, 'rb') as f: img = images.load(f) if single: @@ -177,7 +201,7 @@ class coredump_generator: for p in self.coredumps: if pid and p != pid: continue - with open(coredumps_dir + "/" + "core." + str(p), 'w+') as f: + with open(coredumps_dir + "/" + "core." + str(p), 'wb+') as f: self.coredumps[p].write(f) def _gen_coredump(self, pid): @@ -198,44 +222,62 @@ class coredump_generator: """ Generate elf header for process pid with program headers phdrs. """ - ehdr = elf.Elf64_Ehdr() + ei_class = {"32bit": elf.ELFCLASS32, "64bit": elf.ELFCLASS64} + + ehdr = self.ehdr[self.bits]() ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 - ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 + ehdr.e_ident[elf.EI_CLASS] = ei_class[self.bits] ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT + if self.machine == "armv7l": + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_ARM + else: + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_NONE + ehdr.e_type = elf.ET_CORE - ehdr.e_machine = elf.EM_X86_64 + ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT - ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) - #FIXME Case len(phdrs) > PN_XNUM should be handled properly. 
+ ehdr.e_phoff = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_ehsize = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_phentsize = ctypes.sizeof(self.phdr[self.bits]()) + # FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. ehdr.e_phnum = len(phdrs) return ehdr + def _get_e_machine(self): + """ + Get the e_machine field based on the current architecture. + """ + e_machine_dict = { + "aarch64": elf.EM_AARCH64, + "armv7l": elf.EM_ARM, + "x86_64": elf.EM_X86_64, + } + return e_machine_dict[self.machine] + def _gen_phdrs(self, pid, notes, vmas): """ Generate program headers for process pid. """ phdrs = [] - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + offset = ctypes.sizeof(self.ehdr[self.bits]()) + offset += (len(vmas) + 1) * ctypes.sizeof(self.phdr[self.bits]()) filesz = 0 for note in notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 # PT_NOTE - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_NOTE phdr.p_offset = offset @@ -255,7 +297,7 @@ class coredump_generator: for vma in vmas: offset += filesz filesz = vma.filesz - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_LOAD phdr.p_align = PAGESIZE @@ -295,7 +337,7 @@ class coredump_generator: prpsinfo.pr_state = 3 # Don't even ask me why it is so, just borrowed from linux # source and made pr_state match. - prpsinfo.pr_sname = '.' if prpsinfo.pr_state > 5 else "RSDTZW" [ + prpsinfo.pr_sname = b'.' 
if prpsinfo.pr_state > 5 else b"RSDTZW" [ prpsinfo.pr_state] prpsinfo.pr_zomb = 1 if prpsinfo.pr_state == 4 else 0 prpsinfo.pr_nice = core["thread_core"][ @@ -307,17 +349,19 @@ class coredump_generator: prpsinfo.pr_ppid = pstree["ppid"] prpsinfo.pr_pgrp = pstree["pgid"] prpsinfo.pr_sid = pstree["sid"] - prpsinfo.pr_fname = core["tc"]["comm"] - prpsinfo.pr_psargs = self._gen_cmdline(pid) + # prpsinfo.pr_psargs has a limit of 80 characters which means it will + # fail here if the cmdline is longer than 80 + prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] + prpsinfo.pr_fname = core["tc"]["comm"].encode() - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO note = elf_note() note.data = prpsinfo - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -327,82 +371,110 @@ class coredump_generator: Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] - regs = core["thread_info"]["gpregs"] + regs = self._get_gpregs(core) pstree = self.pstree[pid] prstatus = elf.elf_prstatus() ctypes.memset(ctypes.addressof(prstatus), 0, ctypes.sizeof(prstatus)) - #FIXME setting only some of the fields for now. Revisit later. + # FIXME setting only some of the fields for now. Revisit later. 
prstatus.pr_pid = tid prstatus.pr_ppid = pstree["ppid"] prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] - prstatus.pr_reg.r15 = regs["r15"] - prstatus.pr_reg.r14 = regs["r14"] - prstatus.pr_reg.r13 = regs["r13"] - prstatus.pr_reg.r12 = regs["r12"] - prstatus.pr_reg.rbp = regs["bp"] - prstatus.pr_reg.rbx = regs["bx"] - prstatus.pr_reg.r11 = regs["r11"] - prstatus.pr_reg.r10 = regs["r10"] - prstatus.pr_reg.r9 = regs["r9"] - prstatus.pr_reg.r8 = regs["r8"] - prstatus.pr_reg.rax = regs["ax"] - prstatus.pr_reg.rcx = regs["cx"] - prstatus.pr_reg.rdx = regs["dx"] - prstatus.pr_reg.rsi = regs["si"] - prstatus.pr_reg.rdi = regs["di"] - prstatus.pr_reg.orig_rax = regs["orig_ax"] - prstatus.pr_reg.rip = regs["ip"] - prstatus.pr_reg.cs = regs["cs"] - prstatus.pr_reg.eflags = regs["flags"] - prstatus.pr_reg.rsp = regs["sp"] - prstatus.pr_reg.ss = regs["ss"] - prstatus.pr_reg.fs_base = regs["fs_base"] - prstatus.pr_reg.gs_base = regs["gs_base"] - prstatus.pr_reg.ds = regs["ds"] - prstatus.pr_reg.es = regs["es"] - prstatus.pr_reg.fs = regs["fs"] - prstatus.pr_reg.gs = regs["gs"] + self._set_pr_regset(prstatus.pr_reg, regs) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS note = elf_note() note.data = prstatus - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note + def _get_gpregs(self, core): + """ + Get the general purpose registers based on the current architecture. + """ + thread_info_key = self.thread_info_key[self.machine] + thread_info = core[thread_info_key] + + return thread_info["gpregs"] + + def _set_pr_regset(self, pr_reg, regs): + """ + Set the pr_reg struct based on the current architecture. 
+ """ + if self.machine == "aarch64": + pr_reg.regs = (ctypes.c_ulonglong * len(regs["regs"]))(*regs["regs"]) + pr_reg.sp = regs["sp"] + pr_reg.pc = regs["pc"] + pr_reg.pstate = regs["pstate"] + elif self.machine == "armv7l": + pr_reg.r0 = regs["r0"] + pr_reg.r1 = regs["r1"] + pr_reg.r2 = regs["r2"] + pr_reg.r3 = regs["r3"] + pr_reg.r4 = regs["r4"] + pr_reg.r5 = regs["r5"] + pr_reg.r6 = regs["r6"] + pr_reg.r7 = regs["r7"] + pr_reg.r8 = regs["r8"] + pr_reg.r9 = regs["r9"] + pr_reg.r10 = regs["r10"] + pr_reg.fp = regs["fp"] + pr_reg.ip = regs["ip"] + pr_reg.sp = regs["sp"] + pr_reg.lr = regs["lr"] + pr_reg.pc = regs["pc"] + pr_reg.cpsr = regs["cpsr"] + pr_reg.orig_r0 = regs["orig_r0"] + elif self.machine == "x86_64": + pr_reg.r15 = regs["r15"] + pr_reg.r14 = regs["r14"] + pr_reg.r13 = regs["r13"] + pr_reg.r12 = regs["r12"] + pr_reg.rbp = regs["bp"] + pr_reg.rbx = regs["bx"] + pr_reg.r11 = regs["r11"] + pr_reg.r10 = regs["r10"] + pr_reg.r9 = regs["r9"] + pr_reg.r8 = regs["r8"] + pr_reg.rax = regs["ax"] + pr_reg.rcx = regs["cx"] + pr_reg.rdx = regs["dx"] + pr_reg.rsi = regs["si"] + pr_reg.rdi = regs["di"] + pr_reg.orig_rax = regs["orig_ax"] + pr_reg.rip = regs["ip"] + pr_reg.cs = regs["cs"] + pr_reg.eflags = regs["flags"] + pr_reg.rsp = regs["sp"] + pr_reg.ss = regs["ss"] + pr_reg.fs_base = regs["fs_base"] + pr_reg.gs_base = regs["gs_base"] + pr_reg.ds = regs["ds"] + pr_reg.es = regs["es"] + pr_reg.fs = regs["fs"] + pr_reg.gs = regs["gs"] + def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. 
""" core = self.cores[tid] - regs = core["thread_info"]["fpregs"] + regs = self._get_fpregs(core) fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) - fpregset.cwd = regs["cwd"] - fpregset.swd = regs["swd"] - fpregset.ftw = regs["twd"] - fpregset.fop = regs["fop"] - fpregset.rip = regs["rip"] - fpregset.rdp = regs["rdp"] - fpregset.mxcsr = regs["mxcsr"] - fpregset.mxcr_mask = regs["mxcsr_mask"] - fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( - *regs["st_space"]) - fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( - *regs["xmm_space"]) - #fpregset.padding = regs["padding"] unused + self._set_fpregset(fpregset, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -411,7 +483,87 @@ class coredump_generator: note = elf_note() note.data = fpregset - note.owner = "CORE" + note.owner = b"CORE" + note.nhdr = nhdr + + return note + + def _get_fpregs(self, core): + """ + Get the floating point register dictionary based on the current architecture. + """ + fpregs_key_dict = {"aarch64": "fpsimd", "x86_64": "fpregs"} + fpregs_key = fpregs_key_dict[self.machine] + + thread_info_key = self.thread_info_key[self.machine] + + return core[thread_info_key][fpregs_key] + + def _set_fpregset(self, fpregset, regs): + """ + Set the fpregset struct based on the current architecture. 
+ """ + if self.machine == "aarch64": + fpregset.vregs = (ctypes.c_ulonglong * len(regs["vregs"]))(*regs["vregs"]) + fpregset.fpsr = regs["fpsr"] + fpregset.fpcr = regs["fpcr"] + elif self.machine == "x86_64": + fpregset.cwd = regs["cwd"] + fpregset.swd = regs["swd"] + fpregset.ftw = regs["twd"] + fpregset.fop = regs["fop"] + fpregset.rip = regs["rip"] + fpregset.rdp = regs["rdp"] + fpregset.mxcsr = regs["mxcsr"] + fpregset.mxcr_mask = regs["mxcsr_mask"] + fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( + *regs["st_space"]) + fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( + *regs["xmm_space"]) + + def _gen_arm_tls(self, tid): + """ + Generate NT_ARM_TLS note for thread tid of process pid. + """ + core = self.cores[tid] + tls = ctypes.c_ulonglong(core["ti_aarch64"]["tls"]) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(ctypes.c_ulonglong) + nhdr.n_type = elf.NT_ARM_TLS + + note = elf_note() + note.data = tls + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + + def _gen_arm_vfp(self, tid): + """ + Generate NT_ARM_VFP note for thread tid of process pid. 
+ """ + core = self.cores[tid] + fpstate = core["ti_arm"]["fpstate"] + + data = elf.vfp_hard_struct() + ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) + + data.vfp_regs = (ctypes.c_uint64 * len(fpstate["vfp_regs"]))(*fpstate["vfp_regs"]) + data.fpexc = fpstate["fpexc"] + data.fpscr = fpstate["fpscr"] + data.fpinst = fpstate["fpinst"] + data.fpinst2 = fpstate["fpinst2"] + + nhdr = elf.Elf32_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(data) + nhdr.n_type = elf.NT_ARM_VFP + + note = elf_note() + note.data = data + note.owner = b"LINUX" note.nhdr = nhdr return note @@ -452,7 +604,7 @@ class coredump_generator: note = elf_note() note.data = data - note.owner = "LINUX" + note.owner = b"LINUX" note.nhdr = nhdr return note @@ -465,14 +617,14 @@ class coredump_generator: # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO note = elf_note() note.data = siginfo - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -482,24 +634,29 @@ class coredump_generator: Generate NT_AUXV note for thread tid of process pid. 
""" mm = self.mms[pid] - num_auxv = len(mm["mm_saved_auxv"]) / 2 + num_auxv = len(mm["mm_saved_auxv"]) // 2 - class elf_auxv(ctypes.Structure): + class elf32_auxv(ctypes.Structure): + _fields_ = [("auxv", elf.Elf32_auxv_t * num_auxv)] + + class elf64_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] - auxv = elf_auxv() + elf_auxv = {"32bit": elf32_auxv(), "64bit": elf64_auxv()} + + auxv = elf_auxv[self.bits] for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i + 1] - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 - nhdr.n_descsz = ctypes.sizeof(elf_auxv()) + nhdr.n_descsz = ctypes.sizeof(elf_auxv[self.bits]) nhdr.n_type = elf.NT_AUXV note = elf_note() note.data = auxv - note.owner = "CORE" + note.owner = b"CORE" note.nhdr = nhdr return note @@ -523,11 +680,10 @@ class coredump_generator: continue shmid = vma["shmid"] - size = vma["end"] - vma["start"] - off = vma["pgoff"] / PAGESIZE + off = vma["pgoff"] // PAGESIZE files = self.reg_files - fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] + fname = next(filter(lambda x: x["id"] == shmid, files))["name"] info = mmaped_file_info() info.start = vma["start"] @@ -570,17 +726,17 @@ class coredump_generator: setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - setattr(data, "name" + str(i), info.name) + setattr(data, "name" + str(i), info.name.encode()) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() - nhdr.n_namesz = 5 #XXX strlen + 1 + nhdr.n_namesz = 5 # strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) nhdr.n_type = elf.NT_FILE note = elf_note() note.nhdr = nhdr - note.owner = "CORE" + note.owner = b"CORE" note.data = data return note @@ -589,9 +745,15 @@ class coredump_generator: notes = [] notes.append(self._gen_prstatus(pid, tid)) - notes.append(self._gen_fpregset(pid, tid)) - 
notes.append(self._gen_x86_xstate(pid, tid)) + if self.machine != "armv7l": + notes.append(self._gen_fpregset(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) + if self.machine == "aarch64": + notes.append(self._gen_arm_tls(tid)) + elif self.machine == "armv7l": + notes.append(self._gen_arm_vfp(tid)) + elif self.machine == "x86_64": + notes.append(self._gen_x86_xstate(pid, tid)) return notes @@ -632,7 +794,9 @@ class coredump_generator: off = 0 # in pages for m in pagemap[1:]: found = False - for i in range(m["nr_pages"]): + num_pages = m.get("nr_pages", m["compat_nr_pages"]) + + for i in range(num_pages): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True break @@ -641,12 +805,11 @@ class coredump_generator: if not found: continue - if "in_parent" in m and m["in_parent"] == True: + if "in_parent" in m and m["in_parent"]: ppid = self.pstree[pid]["ppid"] return self._get_page(ppid, page_no) else: - with open(self._imgs_dir + "/" + "pages-" + str(pages_id) + - ".img") as f: + with open(self._imgs_dir + "/pages-%s.img" % pages_id, 'rb') as f: f.seek(off * PAGESIZE) return f.read(PAGESIZE) @@ -659,16 +822,16 @@ class coredump_generator: f = None if size == 0: - return "" + return b"" if vma["status"] & status["VMA_AREA_VVAR"]: - #FIXME this is what gdb does, as vvar vma + # FIXME this is what gdb does, as vvar vma # is not readable from userspace? - return "\0" * size + return b"\0" * size elif vma["status"] & status["VMA_AREA_VSYSCALL"]: - #FIXME need to dump it with criu or read from + # FIXME need to dump it with criu or read from # current process. 
- return "\0" * size + return b"\0" * size if vma["status"] & status["VMA_FILE_SHARED"] or \ vma["status"] & status["VMA_FILE_PRIVATE"]: @@ -677,9 +840,13 @@ class coredump_generator: off = vma["pgoff"] files = self.reg_files - fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] + fname = next(filter(lambda x: x["id"] == shmid, files))["name"] + + try: + f = open(fname, 'rb') + except FileNotFoundError: + sys.exit('Required file %s not found.' % fname) - f = open(fname) f.seek(off) start = vma["start"] @@ -701,10 +868,10 @@ class coredump_generator: # a file, and changed ones -- from pages.img. # Finally, if no page is found neither in pages.img nor # in file, hole in inserted -- a page filled with zeroes. - start_page = start / PAGESIZE - end_page = end / PAGESIZE + start_page = start // PAGESIZE + end_page = end // PAGESIZE - buf = "" + buf = b"" for page_no in range(start_page, end_page + 1): page = None @@ -712,17 +879,17 @@ class coredump_generator: # and choose appropriate. page_mem = self._get_page(pid, page_no) - if f != None: + if f is not None: page = f.read(PAGESIZE) - if page_mem != None: + if page_mem is not None: # Page from pages.img has higher priority - # than one from maped file on disk. + # than one from mapped file on disk. page = page_mem - if page == None: + if page is None: # Hole - page = PAGESIZE * "\0" + page = PAGESIZE * b"\0" # If it is a start or end page, we need to read # only part of it. @@ -742,7 +909,7 @@ class coredump_generator: buf += page[n_skip:n_skip + n_read] # Don't forget to close file. - if f != None: + if f is not None: f.close() return buf @@ -764,25 +931,25 @@ class coredump_generator: chunk = self._gen_mem_chunk(pid, vma, size) # Replace all '\0's with spaces. - return chunk.replace('\0', ' ') + return chunk.replace(b'\0', b' ') def _get_vma_dump_size(self, vma): """ Calculate amount of vma to put into core dump. 
""" - if vma["status"] & status["VMA_AREA_VVAR"] or \ - vma["status"] & status["VMA_AREA_VSYSCALL"] or \ - vma["status"] & status["VMA_AREA_VDSO"]: + if (vma["status"] & status["VMA_AREA_VVAR"] or + vma["status"] & status["VMA_AREA_VSYSCALL"] or + vma["status"] & status["VMA_AREA_VDSO"]): size = vma["end"] - vma["start"] elif vma["prot"] == 0: size = 0 - elif vma["prot"] & prot["PROT_READ"] and \ - vma["prot"] & prot["PROT_EXEC"]: + elif (vma["prot"] & prot["PROT_READ"] and + vma["prot"] & prot["PROT_EXEC"]): size = PAGESIZE - elif vma["status"] & status["VMA_ANON_SHARED"] or \ - vma["status"] & status["VMA_FILE_SHARED"] or \ - vma["status"] & status["VMA_ANON_PRIVATE"] or \ - vma["status"] & status["VMA_FILE_PRIVATE"]: + elif (vma["status"] & status["VMA_ANON_SHARED"] or + vma["status"] & status["VMA_FILE_SHARED"] or + vma["status"] & status["VMA_ANON_PRIVATE"] or + vma["status"] & status["VMA_FILE_PRIVATE"]): size = vma["end"] - vma["start"] else: size = 0 @@ -821,10 +988,6 @@ class coredump_generator: vmas = [] for vma in mm["vmas"]: - size = self._get_vma_dump_size(vma) - - chunk = self._gen_mem_chunk(pid, vma, size) - v = vma_class() v.filesz = self._get_vma_dump_size(vma) v.data = self._gen_mem_chunk(pid, vma, v.filesz) diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index e65919e6b..2911f491e 100644 --- a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -1,5 +1,14 @@ # Define structures and constants for generating elf file. 
import ctypes +import platform + +MACHINE = platform.machine() + +Elf32_Half = ctypes.c_uint16 # typedef uint16_t Elf32_Half; +Elf32_Word = ctypes.c_uint32 # typedef uint32_t Elf32_Word; +Elf32_Addr = ctypes.c_uint32 # typedef uint32_t Elf32_Addr; +Elf32_Off = ctypes.c_uint32 # typedef uint32_t Elf32_Off; +Elf32_Xword = ctypes.c_uint64 # typedef uint64_t Elf32_Xword; Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; @@ -7,7 +16,7 @@ Elf64_Addr = ctypes.c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = ctypes.c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = ctypes.c_uint64 # typedef uint64_t Elf64_Xword; -# Elf64_Ehdr related constants. +# Elf_Ehdr related constants. # e_ident size. EI_NIDENT = 16 # #define EI_NIDENT (16) @@ -16,58 +25,84 @@ EI_MAG0 = 0 # #define EI_MAG0 0 /* File identification by ELFMAG0 = 0x7f # #define ELFMAG0 0x7f /* Magic number byte 0 */ EI_MAG1 = 1 # #define EI_MAG1 1 /* File identification byte 1 index */ -ELFMAG1 = ord( - 'E') # #define ELFMAG1 'E' /* Magic number byte 1 */ +ELFMAG1 = ord('E') # #define ELFMAG1 'E' /* Magic number byte 1 */ EI_MAG2 = 2 # #define EI_MAG2 2 /* File identification byte 2 index */ -ELFMAG2 = ord( - 'L') # #define ELFMAG2 'L' /* Magic number byte 2 */ +ELFMAG2 = ord('L') # #define ELFMAG2 'L' /* Magic number byte 2 */ EI_MAG3 = 3 # #define EI_MAG3 3 /* File identification byte 3 index */ -ELFMAG3 = ord( - 'F') # #define ELFMAG3 'F' /* Magic number byte 3 */ +ELFMAG3 = ord('F') # #define ELFMAG3 'F' /* Magic number byte 3 */ EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index */ EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ +EI_OSABI = 7 # #define EI_OSABI 7 /* OS ABI identification */ + EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ +ELFCLASS32 = 1 # #define ELFCLASS32 1 /* 32-bit objects */ ELFCLASS64 = 2 # #define 
ELFCLASS64 2 /* 64-bit objects */ # Legal values for e_type (object file type). ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). +EM_ARM = 40 # #define EM_ARM 40 /* ARM */ EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ +EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ +# Legal values for e_osabi +ELFOSABI_NONE = 0 # #define ELFOSABI_NONE 0 /* UNIX System V ABI */ +ELFOSABI_ARM = 97 # #define ELFOSABI_ARM 97 /* ARM */ + + +class Elf32_Ehdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("e_ident", + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf32_Half), # Elf32_Half e_type; + ("e_machine", Elf32_Half), # Elf32_Half e_machine; + ("e_version", Elf32_Word), # Elf32_Word e_version; + ("e_entry", Elf32_Addr), # Elf32_Addr e_entry; + ("e_phoff", Elf32_Off), # Elf32_Off e_phoff; + ("e_shoff", Elf32_Off), # Elf32_Off e_shoff; + ("e_flags", Elf32_Word), # Elf32_Word e_flags; + ("e_ehsize", Elf32_Half), # Elf32_Half e_ehsize; + ("e_phentsize", Elf32_Half), # Elf32_Half e_phentsize; + ("e_phnum", Elf32_Half), # Elf32_Half e_phnum; + ("e_shentsize", Elf32_Half), # Elf32_Half e_shentsize; + ("e_shnum", Elf32_Half), # Elf32_Half e_shnum; + ("e_shstrndx", Elf32_Half) # Elf32_Half e_shstrndx; + ] # } Elf32_Ehdr; + class Elf64_Ehdr(ctypes.Structure): # typedef struct - _fields_ = [ # { + _fields_ = [ ("e_ident", - ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; - ("e_type", Elf64_Half), # Elf64_Half e_type; - ("e_machine", Elf64_Half), # Elf64_Half e_machine; - ("e_version", Elf64_Word), # Elf64_Word e_version; - ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; - ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; - ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; - ("e_flags", Elf64_Word), # Elf64_Word e_flags; - ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; - 
("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; - ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; - ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; - ("e_shnum", Elf64_Half), # Elf64_Half e_shnum; - ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf64_Half), # Elf64_Half e_type; + ("e_machine", Elf64_Half), # Elf64_Half e_machine; + ("e_version", Elf64_Word), # Elf64_Word e_version; + ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; + ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; + ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; + ("e_flags", Elf64_Word), # Elf64_Word e_flags; + ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; + ("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; + ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; + ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; + ("e_shnum", Elf64_Half), # Elf64_Half e_shnum; + ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; ] # } Elf64_Ehdr; -# Elf64_Phdr related constants. +# Elf_Phdr related constants. # Legal values for p_type (segment type). 
PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ @@ -79,20 +114,51 @@ PF_W = 1 << 1 # #define PF_W (1 << 1) /* Segment is writable PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ +class Elf32_Phdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("p_type", Elf32_Word), # Elf32_Word p_type; + ("p_offset", Elf32_Off), # Elf32_Off p_offset; + ("p_vaddr", Elf32_Addr), # Elf32_Addr p_vaddr; + ("p_paddr", Elf32_Addr), # Elf32_Addr p_paddr; + ("p_filesz", Elf32_Word), # Elf32_Word p_filesz; + ("p_memsz", Elf32_Word), # Elf32_Word p_memsz; + ("p_flags", Elf32_Word), # Elf32_Word p_flags; + ("p_align", Elf32_Word), # Elf32_Word p_align; + ] # } Elf32_Phdr; + + class Elf64_Phdr(ctypes.Structure): # typedef struct - _fields_ = [ # { - ("p_type", Elf64_Word), # Elf64_Word p_type; - ("p_flags", Elf64_Word), # Elf64_Word p_flags; - ("p_offset", Elf64_Off), # Elf64_Off p_offset; - ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; - ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; - ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; - ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; - ("p_align", Elf64_Xword), # Elf64_Xword p_align; + _fields_ = [ + ("p_type", Elf64_Word), # Elf64_Word p_type; + ("p_flags", Elf64_Word), # Elf64_Word p_flags; + ("p_offset", Elf64_Off), # Elf64_Off p_offset; + ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; + ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; + ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; + ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; + ("p_align", Elf64_Xword), # Elf64_Xword p_align; ] # } Elf64_Phdr; -# Elf64_auxv_t related constants. +# Elf_auxv_t related constants. 
+ + +class _Elf32_auxv_t_U(ctypes.Union): + _fields_ = [("a_val", ctypes.c_uint32)] + + +class Elf32_auxv_t(ctypes.Structure): # typedef struct + _fields_ = [ + ("a_type", + ctypes.c_uint32), # uint32_t a_type; /* Entry type */ + ("a_un", _Elf32_auxv_t_U) # union + + # uint32_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; + ] # } Elf32_auxv_t; class _Elf64_auxv_t_U(ctypes.Union): @@ -100,78 +166,150 @@ class _Elf64_auxv_t_U(ctypes.Union): class Elf64_auxv_t(ctypes.Structure): # typedef struct - _fields_ = [ # { + _fields_ = [ ("a_type", - ctypes.c_uint64), # uint64_t a_type; /* Entry type */ - ("a_un", _Elf64_auxv_t_U) # union - # { - # uint64_t a_val; /* Integer value */ - # /* We use to have pointer elements added here. We cannot do that, - # though, since it does not work when using 32-bit definitions - # on 64-bit platforms and vice versa. */ - # } a_un; + ctypes.c_uint64), # uint64_t a_type; /* Entry type */ + ("a_un", _Elf64_auxv_t_U) # union + + # uint64_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; ] # } Elf64_auxv_t; -# Elf64_Nhdr related constants. +# Elf_Nhdr related constants. 
-NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ -NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ -NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ -NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ -NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, -# size might increase */ -NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped -# files */ -NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ +NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ +NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ +NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ +NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ +NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ +NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_VFP = 0x400 # #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ + + +class Elf32_Nhdr(ctypes.Structure): # typedef struct + _fields_ = [ + ( + "n_namesz", Elf32_Word + ), # Elf32_Word n_namesz; /* Length of the note's name. */ + ( + "n_descsz", Elf32_Word + ), # Elf32_Word n_descsz; /* Length of the note's descriptor. */ + ( + "n_type", Elf32_Word + ), # Elf32_Word n_type; /* Type of the note. */ + ] # } Elf32_Nhdr; class Elf64_Nhdr(ctypes.Structure): # typedef struct - _fields_ = [ # { + _fields_ = [ ( "n_namesz", Elf64_Word - ), # Elf64_Word n_namesz; /* Length of the note's name. */ + ), # Elf64_Word n_namesz; /* Length of the note's name. 
*/ ( "n_descsz", Elf64_Word - ), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ + ), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ ("n_type", Elf64_Word - ), # Elf64_Word n_type; /* Type of the note. */ + ), # Elf64_Word n_type; /* Type of the note. */ ] # } Elf64_Nhdr; -# Elf64_Shdr related constants. +# Elf_Shdr related constants. -class Elf64_Shdr(ctypes.Structure): # typedef struct - _fields_ = [ # { +class Elf32_Shdr(ctypes.Structure): + _fields_ = [ ( + # Section name (string tbl index) + "sh_name", Elf32_Word + ), + ( + # Section type + "sh_type", Elf32_Word + ), + ( + # Section flags + "sh_flags", Elf32_Word + ), + ( + # Section virtual addr at execution + "sh_addr", Elf32_Addr + ), + ( + # Section file offset + "sh_offset", Elf32_Off + ), + ( + # Section size in bytes + "sh_size", Elf32_Word + ), + ( + # Link to another section + "sh_link", Elf32_Word + ), + ( + # Additional section information + "sh_info", Elf32_Word + ), + ( + # Section alignment + "sh_addralign", Elf32_Word + ), + ( + # Entry size if section holds table + "sh_entsize", Elf32_Word + ) + ] + + +class Elf64_Shdr(ctypes.Structure): + _fields_ = [ + ( + # Section name (string tbl index) "sh_name", Elf64_Word - ), # Elf64_Word sh_name; /* Section name (string tbl index) */ - ("sh_type", Elf64_Word - ), # Elf64_Word sh_type; /* Section type */ - ("sh_flags", Elf64_Xword - ), # Elf64_Xword sh_flags; /* Section flags */ + ), ( + # Section type + "sh_type", Elf64_Word + ), + ( + # Section flags + "sh_flags", Elf64_Xword + ), + ( + # Section virtual addr at execution "sh_addr", Elf64_Addr - ), # Elf64_Addr sh_addr; /* Section virtual addr at execution */ + ), ( + # Section file offset "sh_offset", Elf64_Off - ), # Elf64_Off sh_offset; /* Section file offset */ + ), ( + # Section size in bytes "sh_size", Elf64_Xword - ), # Elf64_Xword sh_size; /* Section size in bytes */ + ), ( + # Link to another section "sh_link", Elf64_Word - ), # Elf64_Word sh_link; /* Link to 
another section */ + ), ( + # Additional section information "sh_info", Elf64_Word - ), # Elf64_Word sh_info; /* Additional section information */ - ("sh_addralign", Elf64_Xword - ), # Elf64_Xword sh_addralign; /* Section alignment */ + ), ( + # Section alignment + "sh_addralign", Elf64_Xword + ), + ( + # Entry size if section holds table "sh_entsize", Elf64_Xword - ) # Elf64_Xword sh_entsize; /* Entry size if section holds table */ - ] # } Elf64_Shdr; + ) + ] # elf_prstatus related constants. @@ -179,507 +317,753 @@ class Elf64_Shdr(ctypes.Structure): # typedef struct # Signal info. class elf_siginfo(ctypes.Structure): # struct elf_siginfo - _fields_ = [ # { - ("si_signo", ctypes.c_int - ), # int si_signo; /* Signal number. */ - ("si_code", ctypes.c_int - ), # int si_code; /* Extra code. */ - ("si_errno", ctypes.c_int - ) # int si_errno; /* Errno. */ - ] # }; + _fields_ = [ + ( + # Signal number + "si_signo", ctypes.c_int + ), + ( + # Extra code + "si_code", ctypes.c_int + ), + ( + # Errno + "si_errno", ctypes.c_int + ) + ] # A time value that is accurate to the nearest # microsecond but also has a range of years. class timeval(ctypes.Structure): # struct timeval - _fields_ = [ # { - ("tv_sec", - ctypes.c_long), # __time_t tv_sec; /* Seconds. */ - ("tv_usec", ctypes.c_long - ) # __suseconds_t tv_usec; /* Microseconds. */ - ] # }; + _fields_ = [ + ( + # __time_t tv_sec; /* Seconds. */ + "tv_sec", ctypes.c_long + ), + ( + # __suseconds_t tv_usec; /* Microseconds. 
*/ + "tv_usec", ctypes.c_long + ) + ] -class user_regs_struct(ctypes.Structure): # struct user_regs_struct - _fields_ = [ # { +class x86_64_user_regs_struct(ctypes.Structure): # struct x86_64_user_regs_struct + _fields_ = [ ("r15", - ctypes.c_ulonglong), # __extension__ unsigned long long int r15; + ctypes.c_ulonglong), # __extension__ unsigned long long int r15; ("r14", - ctypes.c_ulonglong), # __extension__ unsigned long long int r14; + ctypes.c_ulonglong), # __extension__ unsigned long long int r14; ("r13", - ctypes.c_ulonglong), # __extension__ unsigned long long int r13; + ctypes.c_ulonglong), # __extension__ unsigned long long int r13; ("r12", - ctypes.c_ulonglong), # __extension__ unsigned long long int r12; + ctypes.c_ulonglong), # __extension__ unsigned long long int r12; ("rbp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rbp; + ctypes.c_ulonglong), # __extension__ unsigned long long int rbp; ("rbx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rbx; + ctypes.c_ulonglong), # __extension__ unsigned long long int rbx; ("r11", - ctypes.c_ulonglong), # __extension__ unsigned long long int r11; + ctypes.c_ulonglong), # __extension__ unsigned long long int r11; ("r10", - ctypes.c_ulonglong), # __extension__ unsigned long long int r10; + ctypes.c_ulonglong), # __extension__ unsigned long long int r10; ("r9", - ctypes.c_ulonglong), # __extension__ unsigned long long int r9; + ctypes.c_ulonglong), # __extension__ unsigned long long int r9; ("r8", - ctypes.c_ulonglong), # __extension__ unsigned long long int r8; + ctypes.c_ulonglong), # __extension__ unsigned long long int r8; ("rax", - ctypes.c_ulonglong), # __extension__ unsigned long long int rax; + ctypes.c_ulonglong), # __extension__ unsigned long long int rax; ("rcx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rcx; + ctypes.c_ulonglong), # __extension__ unsigned long long int rcx; ("rdx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdx; + 
ctypes.c_ulonglong), # __extension__ unsigned long long int rdx; ("rsi", - ctypes.c_ulonglong), # __extension__ unsigned long long int rsi; + ctypes.c_ulonglong), # __extension__ unsigned long long int rsi; ("rdi", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdi; + ctypes.c_ulonglong), # __extension__ unsigned long long int rdi; ("orig_rax", ctypes.c_ulonglong - ), # __extension__ unsigned long long int orig_rax; + ), # __extension__ unsigned long long int orig_rax; ("rip", - ctypes.c_ulonglong), # __extension__ unsigned long long int rip; + ctypes.c_ulonglong), # __extension__ unsigned long long int rip; ("cs", - ctypes.c_ulonglong), # __extension__ unsigned long long int cs; + ctypes.c_ulonglong), # __extension__ unsigned long long int cs; ("eflags", - ctypes.c_ulonglong), # __extension__ unsigned long long int eflags; + ctypes.c_ulonglong), # __extension__ unsigned long long int eflags; ("rsp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rsp; + ctypes.c_ulonglong), # __extension__ unsigned long long int rsp; ("ss", - ctypes.c_ulonglong), # __extension__ unsigned long long int ss; + ctypes.c_ulonglong), # __extension__ unsigned long long int ss; ("fs_base", ctypes.c_ulonglong - ), # __extension__ unsigned long long int fs_base; + ), # __extension__ unsigned long long int fs_base; ("gs_base", ctypes.c_ulonglong - ), # __extension__ unsigned long long int gs_base; + ), # __extension__ unsigned long long int gs_base; ("ds", - ctypes.c_ulonglong), # __extension__ unsigned long long int ds; + ctypes.c_ulonglong), # __extension__ unsigned long long int ds; ("es", - ctypes.c_ulonglong), # __extension__ unsigned long long int es; + ctypes.c_ulonglong), # __extension__ unsigned long long int es; ("fs", - ctypes.c_ulonglong), # __extension__ unsigned long long int fs; + ctypes.c_ulonglong), # __extension__ unsigned long long int fs; ("gs", ctypes.c_ulonglong - ) # __extension__ unsigned long long int gs; - ] # }; + ) # __extension__ 
unsigned long long int gs; + ] -#elf_greg_t = ctypes.c_ulonglong -#ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) -#elf_gregset_t = elf_greg_t*ELF_NGREG -elf_gregset_t = user_regs_struct +class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_struct + _fields_ = [ + ("regs", + ctypes.c_ulonglong * 31), # unsigned long long int regs[31]; + ("sp", + ctypes.c_ulonglong), # unsigned long long int sp; + ("pc", + ctypes.c_ulonglong), # unsigned long long int pc; + ("pstate", + ctypes.c_ulonglong), # unsigned long long int pstate; + ] + + +class arm_user_regs_struct(ctypes.Structure): # struct arm_user_regs_struct + _fields_ = [ + ("r0", + ctypes.c_ulong), # unsigned long int r0; + ("r1", + ctypes.c_ulong), # unsigned long int r1; + ("r2", + ctypes.c_ulong), # unsigned long int r2; + ("r3", + ctypes.c_ulong), # unsigned long int r3; + ("r4", + ctypes.c_ulong), # unsigned long int r4; + ("r5", + ctypes.c_ulong), # unsigned long int r5; + ("r6", + ctypes.c_ulong), # unsigned long int r6; + ("r7", + ctypes.c_ulong), # unsigned long int r7; + ("r8", + ctypes.c_ulong), # unsigned long int r8; + ("r9", + ctypes.c_ulong), # unsigned long int r9; + ("r10", + ctypes.c_ulong), # unsigned long int r10; + ("fp", + ctypes.c_ulong), # unsigned long int fp; + ("ip", + ctypes.c_ulong), # unsigned long int ip; + ("sp", + ctypes.c_ulong), # unsigned long int sp; + ("lr", + ctypes.c_ulong), # unsigned long int lr; + ("pc", + ctypes.c_ulong), # unsigned long int pc; + ("cpsr", + ctypes.c_ulong), # unsigned long int cpsr; + ("orig_r0", + ctypes.c_ulong), # unsigned long int orig_r0; + ] + + +# elf_greg_t = ctypes.c_ulonglong +# ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) +# elf_gregset_t = elf_greg_t*ELF_NGREG +user_regs_dict = { + "aarch64": aarch64_user_regs_struct, + "armv7l": arm_user_regs_struct, + "x86_64": x86_64_user_regs_struct, +} + +try: + elf_gregset_t = user_regs_dict[MACHINE] +except
KeyError: + raise ValueError("Current architecture %s is not supported." % MACHINE) class elf_prstatus(ctypes.Structure): # struct elf_prstatus - _fields_ = [ # { + _fields_ = [ ( + # Info associated with signal + # struct elf_siginfo pr_info; "pr_info", elf_siginfo - ), # struct elf_siginfo pr_info; /* Info associated with signal. */ - ("pr_cursig", ctypes.c_short - ), # short int pr_cursig; /* Current signal. */ + ), ( + # Current signal + # short int pr_cursig; + "pr_cursig", ctypes.c_short + ), + ( + # Set of pending signals + # unsigned long int pr_sigpend; "pr_sigpend", ctypes.c_ulong - ), # unsigned long int pr_sigpend; /* Set of pending signals. */ + ), ( + # Set of held signals + # unsigned long int pr_sighold; "pr_sighold", ctypes.c_ulong - ), # unsigned long int pr_sighold; /* Set of held signals. */ - ("pr_pid", ctypes.c_int), # __pid_t pr_pid; - ("pr_ppid", ctypes.c_int), # __pid_t pr_ppid; - ("pr_pgrp", ctypes.c_int), # __pid_t pr_pgrp; - ("pr_sid", ctypes.c_int), # __pid_t pr_sid; - ("pr_utime", - timeval), # struct timeval pr_utime; /* User time. */ - ("pr_stime", timeval - ), # struct timeval pr_stime; /* System time. */ + ), ( + # Process ID + # __pid_t pr_pid; + "pr_pid", ctypes.c_int + ), + ( + # Parent process ID + # __pid_t pr_ppid; + "pr_ppid", ctypes.c_int + ), + ( + # Process group ID + # __pid_t pr_pgrp; + "pr_pgrp", ctypes.c_int + ), + ( + # Session ID + # __pid_t pr_sid; + "pr_sid", ctypes.c_int + ), + ( + # User time + # struct timeval pr_utime; + "pr_utime", timeval + ), + ( + # System time + # struct timeval pr_stime; + "pr_stime", timeval + ), + ( + # Cumulative user time + # struct timeval pr_cutime; "pr_cutime", timeval - ), # struct timeval pr_cutime; /* Cumulative user time. */ + ), ( + # Cumulative system time + # struct timeval pr_cstime; "pr_cstime", timeval - ), # struct timeval pr_cstime; /* Cumulative system time. */ - ("pr_reg", elf_gregset_t - ), # elf_gregset_t pr_reg; /* GP registers.
*/ + ), ( + # GP registers + # elf_gregset_t pr_reg; + "pr_reg", elf_gregset_t + ), + ( + # True if math copro being used + # int pr_fpvalid; "pr_fpvalid", ctypes.c_int - ) # int pr_fpvalid; /* True if math copro being used. */ - ] # }; + ) + ] # elf_prpsinfo related constants. -ELF_PRARGSZ = 80 # #define ELF_PRARGSZ (80) /* Number of chars for args. */ +# Number of chars for args +# #define ELF_PRARGSZ (80) +ELF_PRARGSZ = 80 class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo - _fields_ = [ # { + _fields_ = [ ( + # Numeric process state + # char pr_state; "pr_state", ctypes.c_byte - ), # char pr_state; /* Numeric process state. */ + ), ( + # Char for pr_state + # char pr_sname; "pr_sname", ctypes.c_char - ), # char pr_sname; /* Char for pr_state. */ - ("pr_zomb", ctypes.c_byte - ), # char pr_zomb; /* Zombie. */ - ("pr_nice", ctypes.c_byte - ), # char pr_nice; /* Nice val. */ - ("pr_flag", ctypes.c_ulong - ), # unsigned long int pr_flag; /* Flags. */ - # #if __WORDSIZE == 32 - # unsigned short int pr_uid; - # unsigned short int pr_gid; - # #else - ("pr_uid", ctypes.c_uint), # unsigned int pr_uid; - ("pr_gid", ctypes.c_uint), # unsigned int pr_gid; - # #endif - ("pr_pid", ctypes.c_int), # int pr_pid, pr_ppid, pr_pgrp, pr_sid; + ), + ( + # Zombie + # char pr_zomb; + "pr_zomb", ctypes.c_byte + ), + ( + # Nice value + # char pr_nice; + "pr_nice", ctypes.c_byte + ), + ( + # Flags + # unsigned long int pr_flag; + "pr_flag", ctypes.c_ulong + ), + ( + # User ID + # unsigned int pr_uid; + "pr_uid", ctypes.c_uint + ), + ( + # Group ID + # unsigned int pr_gid; + "pr_gid", ctypes.c_uint + ), + ("pr_pid", ctypes.c_int), ("pr_ppid", ctypes.c_int), ("pr_pgrp", ctypes.c_int), ("pr_sid", ctypes.c_int), - # /* Lots missing */ + # /* Lots missing */ ( + # Filename of executable + # char pr_fname[16]; "pr_fname", ctypes.c_char * 16 - ), # char pr_fname[16]; /* Filename of executable. 
*/ + ), ( + # Initial part of arg list + # char pr_psargs[ELF_PRARGSZ]; "pr_psargs", ctypes.c_char * ELF_PRARGSZ - ) # char pr_psargs[ELF_PRARGSZ]; /* Initial part of arg list. */ - ] # }; + ) + ] -class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct - _fields_ = [ # { - ("cwd", ctypes.c_ushort), # unsigned short int cwd; - ("swd", ctypes.c_ushort), # unsigned short int swd; - ("ftw", ctypes.c_ushort), # unsigned short int ftw; - ("fop", ctypes.c_ushort), # unsigned short int fop; - ("rip", - ctypes.c_ulonglong), # __extension__ unsigned long long int rip; - ("rdp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdp; - ("mxcsr", ctypes.c_uint), # unsigned int mxcsr; - ("mxcr_mask", ctypes.c_uint), # unsigned int mxcr_mask; - ( - "st_space", ctypes.c_uint * 32 - ), # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - ( - "xmm_space", ctypes.c_uint * 64 - ), # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ - ("padding", - ctypes.c_uint * 24), # unsigned int padding[24]; - ] # }; +class x86_64_user_fpregs_struct(ctypes.Structure): # struct x86_64_user_fpregs_struct + _fields_ = [ + # unsigned short int cwd; + ("cwd", ctypes.c_ushort), + # unsigned short int swd; + ("swd", ctypes.c_ushort), + # unsigned short int ftw; + ("ftw", ctypes.c_ushort), + # unsigned short int fop; + ("fop", ctypes.c_ushort), + # __extension__ unsigned long long int rip; + ("rip", ctypes.c_ulonglong), + # __extension__ unsigned long long int rdp; + ("rdp", ctypes.c_ulonglong), + # unsigned int mxcsr; + ("mxcsr", ctypes.c_uint), + # unsigned int mxcr_mask; + ("mxcr_mask", ctypes.c_uint), + # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + ("st_space", ctypes.c_uint * 32), + # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + ("xmm_space", ctypes.c_uint * 64), + # unsigned int padding[24]; + ("padding", ctypes.c_uint * 24), + ] -elf_fpregset_t = 
user_fpregs_struct +class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpregs_struct + _fields_ = [ + # unsigned long long int vregs[64]; + ("vregs", ctypes.c_ulonglong * 64), + # unsigned int fpsr; + ("fpsr", ctypes.c_uint), + # unsigned int fpcr; + ("fpcr", ctypes.c_uint), + # unsigned int padding[2]; + ("padding", ctypes.c_uint * 2), + ] + + +user_fpregs_dict = { + "aarch64": aarch64_user_fpregs_struct, + "armv7l": None, + "x86_64": x86_64_user_fpregs_struct, +} + +try: + elf_fpregset_t = user_fpregs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." % MACHINE) # siginfo_t related constants. _SI_MAX_SIZE = 128 -_SI_PAD_SIZE = (_SI_MAX_SIZE / ctypes.sizeof(ctypes.c_int)) - 4 +_SI_PAD_SIZE = (_SI_MAX_SIZE // ctypes.sizeof(ctypes.c_int)) - 4 -# /* kill(). */ -class _siginfo_t_U_kill(ctypes.Structure): # struct - _fields_ = [ # { - ("si_pid", ctypes.c_int - ), # __pid_t si_pid; /* Sending process ID. */ +# /* kill(). */ +class _siginfo_t_U_kill(ctypes.Structure): # struct + _fields_ = [ ( + # Sending process ID + # __pid_t si_pid; + "si_pid", ctypes.c_int + ), + ( + # Real user ID of sending process + # __uid_t si_uid; "si_uid", ctypes.c_uint - ) # __uid_t si_uid; /* Real user ID of sending process. */ - ] # } _kill; + ) + ] # } _kill; # Type for data associated with a signal. class sigval_t(ctypes.Union): # typedef union sigval - _fields_ = [ # { - ("sival_int", ctypes.c_int), # int sival_int; - ("sical_ptr", ctypes.c_void_p), # void *sival_ptr; - ] # } sigval_t; + _fields_ = [ + ("sival_int", ctypes.c_int), # int sival_int; + ("sical_ptr", ctypes.c_void_p), # void *sival_ptr; + ] # } sigval_t; - # /* POSIX.1b timers. */ -class _siginfo_t_U_timer(ctypes.Structure): # struct - _fields_ = [ # { - ("si_tid", - ctypes.c_int), # int si_tid; /* Timer ID. */ - ("si_overrun", ctypes.c_int - ), # int si_overrun; /* Overrun count. */ - ("si_sigval", sigval_t - ) # sigval_t si_sigval; /* Signal value. 
*/ - ] # } _timer; - - - # /* POSIX.1b signals. */ -class _siginfo_t_U_rt(ctypes.Structure): # struct - _fields_ = [ # { - ("si_pid", ctypes.c_int - ), # __pid_t si_pid; /* Sending process ID. */ +# /* POSIX.1b timers. */ +class _siginfo_t_U_timer(ctypes.Structure): # struct + _fields_ = [ ( + # Timer ID + # int si_tid; + "si_tid", ctypes.c_int + ), + ( + # Overrun count + # int si_overrun; + "si_overrun", ctypes.c_int + ), + ( + # Signal value + # sigval_t si_sigval; + "si_sigval", sigval_t + ) + ] # } _timer; + + +# /* POSIX.1b signals. */ +class _siginfo_t_U_rt(ctypes.Structure): # struct + _fields_ = [ + ( + # Sending process ID + # __pid_t si_pid; + "si_pid", ctypes.c_int + ), + ( + # Real user ID of sending process + # __uid_t si_uid; "si_uid", ctypes.c_uint - ), # __uid_t si_uid; /* Real user ID of sending process. */ - ("si_sigval", sigval_t - ) # sigval_t si_sigval; /* Signal value. */ - ] # } _rt; - - - # /* SIGCHLD. */ -class _siginfo_t_U_sigchld(ctypes.Structure): # struct - _fields_ = [ # { - ("si_pid", - ctypes.c_int), # __pid_t si_pid; /* Which child. */ + ), ( + # Signal value + # sigval_t si_sigval; + "si_sigval", sigval_t + ) + ] # } _rt; + + +# /* SIGCHLD. */ +class _siginfo_t_U_sigchld(ctypes.Structure): # struct + _fields_ = [ + ( + # Which child + # __pid_t si_pid; + "si_pid", ctypes.c_int + ), + ( + # Real user ID of sending process + # __uid_t si_uid; "si_uid", ctypes.c_uint - ), # __uid_t si_uid; /* Real user ID of sending process. */ - ("si_status", ctypes.c_int - ), # int si_status; /* Exit value or signal. */ - ("si_utime", ctypes.c_long), # __sigchld_clock_t si_utime; - ("si_stime", ctypes.c_long) # __sigchld_clock_t si_stime; - ] # } _sigchld; - - - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ -class _siginfo_t_U_sigfault(ctypes.Structure): # struct - _fields_ = [ # { - ("si_addr", ctypes.c_void_p - ), # void *si_addr; /* Faulting insn/memory ref. 
*/ + ), ( + # Exit value or signal + # int si_status; + "si_status", ctypes.c_int + ), + ( + # __sigchld_clock_t si_utime; + "si_utime", ctypes.c_long + ), + ( + # __sigchld_clock_t si_stime; + "si_stime", ctypes.c_long + ) + ] # } _sigchld; + + +# /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ +class _siginfo_t_U_sigfault(ctypes.Structure): # struct + _fields_ = [ + ( + # Faulting insn/memory ref + # void *si_addr; + "si_addr", ctypes.c_void_p + ), + ( + # Valid LSB of the reported address + # short int si_addr_lsb; "si_addr_lsb", ctypes.c_short - ) # short int si_addr_lsb; /* Valid LSB of the reported address. */ - ] # } _sigfault; + ) + ] # } _sigfault; - # /* SIGPOLL. */ -class _siginfo_t_U_sigpoll(ctypes.Structure): # struct - _fields_ = [ # { - ("si_band", ctypes.c_long - ), # long int si_band; /* Band event for SIGPOLL. */ - ("si_fd", ctypes.c_int) # int si_fd; - ] # } _sigpoll; +# /* SIGPOLL. */ +class _siginfo_t_U_sigpoll(ctypes.Structure): # struct + _fields_ = [ + ( + # Band event for SIGPOLL + # long int si_band; + "si_band", ctypes.c_long + ), + ( + # int si_fd; + "si_fd", ctypes.c_int + ) + ] # } _sigpoll; - # /* SIGSYS. */ -class _siginfo_t_U_sigsys(ctypes.Structure): # struct - _fields_ = [ # { +# /* SIGSYS. */ +class _siginfo_t_U_sigsys(ctypes.Structure): # struct + _fields_ = [ ("_call_addr", ctypes.c_void_p - ), # void *_call_addr; /* Calling user insn. */ + ), # void *_call_addr; /* Calling user insn. */ ( "_syscall", ctypes.c_int - ), # int _syscall; /* Triggering system call number. */ + ), # int _syscall; /* Triggering system call number. */ ("_arch", ctypes.c_uint - ) # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - ] # } _sigsys; + ) # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + ] # } _sigsys; -class _siginfo_t_U(ctypes.Union): # union - _fields_ = [ # { +class _siginfo_t_U(ctypes.Union): # union + _fields_ = [ ("_pad", - ctypes.c_int * _SI_PAD_SIZE), # int _pad[__SI_PAD_SIZE]; - # - # /* kill(). 
*/ - ("_kill", _siginfo_t_U_kill), # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # } _kill; - # - # /* POSIX.1b timers. */ - ("_timer", _siginfo_t_U_timer), # struct - # { - # int si_tid; /* Timer ID. */ - # int si_overrun; /* Overrun count. */ - # sigval_t si_sigval; /* Signal value. */ - # } _timer; - # - # /* POSIX.1b signals. */ - ("_rt", _siginfo_t_U_rt), # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # sigval_t si_sigval; /* Signal value. */ - # } _rt; - # - # /* SIGCHLD. */ - ("_sigchld", _siginfo_t_U_sigchld), # struct - # { - # __pid_t si_pid; /* Which child. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # int si_status; /* Exit value or signal. */ - # __sigchld_clock_t si_utime; - # __sigchld_clock_t si_stime; - # } _sigchld; - # - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ - ("_sigfault", _siginfo_t_U_sigfault), # struct - # { - # void *si_addr; /* Faulting insn/memory ref. */ - # short int si_addr_lsb; /* Valid LSB of the reported address. */ - # } _sigfault; - # - # /* SIGPOLL. */ - ("_sigpoll", _siginfo_t_U_sigpoll), # struct - # { - # long int si_band; /* Band event for SIGPOLL. */ - # int si_fd; - # } _sigpoll; - # - # /* SIGSYS. */ - ("_sigsys", _siginfo_t_U_sigpoll) # struct - # { - # void *_call_addr; /* Calling user insn. */ - # int _syscall; /* Triggering system call number. */ - # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - # } _sigsys; - ] # } _sifields; + ctypes.c_int * _SI_PAD_SIZE), # int _pad[__SI_PAD_SIZE]; + + # /* kill(). */ + ("_kill", _siginfo_t_U_kill), # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # } _kill; + + # /* POSIX.1b timers. */ + ("_timer", _siginfo_t_U_timer), # struct + + # int si_tid; /* Timer ID. */ + # int si_overrun; /* Overrun count. 
*/ + # sigval_t si_sigval; /* Signal value. */ + # } _timer; + + # /* POSIX.1b signals. */ + ("_rt", _siginfo_t_U_rt), # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # sigval_t si_sigval; /* Signal value. */ + # } _rt; + + # /* SIGCHLD. */ + ("_sigchld", _siginfo_t_U_sigchld), # struct + + # __pid_t si_pid; /* Which child. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # int si_status; /* Exit value or signal. */ + # __sigchld_clock_t si_utime; + # __sigchld_clock_t si_stime; + # } _sigchld; + + # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ + ("_sigfault", _siginfo_t_U_sigfault), # struct + + # void *si_addr; /* Faulting insn/memory ref. */ + # short int si_addr_lsb; /* Valid LSB of the reported address. */ + # } _sigfault; + + # /* SIGPOLL. */ + ("_sigpoll", _siginfo_t_U_sigpoll), # struct + + # long int si_band; /* Band event for SIGPOLL. */ + # int si_fd; + # } _sigpoll; + + # /* SIGSYS. */ + ("_sigsys", _siginfo_t_U_sigpoll) # struct + + # void *_call_addr; /* Calling user insn. */ + # int _syscall; /* Triggering system call number. */ + # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + # } _sigsys; + ] # } _sifields; class siginfo_t(ctypes.Structure): # typedef struct - _fields_ = [ # { - ("si_signo", ctypes.c_int - ), # int si_signo; /* Signal number. */ + _fields_ = [ ( + # Signal number + # int si_signo; + "si_signo", ctypes.c_int + ), + ( + # If non-zero, an errno value associated with + # int si_errno; "si_errno", ctypes.c_int - ), # int si_errno; /* If non-zero, an errno value associated with - # this signal, as defined in . */ - ("si_code", ctypes.c_int - ), # int si_code; /* Signal code. */ + ), + ( + # Signal code - this signal, as defined in + # int si_code; + "si_code", ctypes.c_int + ), + ( + # Union + "_sifields", _siginfo_t_U + ) + + # int _pad[__SI_PAD_SIZE]; # - ("_sifields", _siginfo_t_U) # union - # { - # int _pad[__SI_PAD_SIZE]; + # /* kill(). 
*/ + # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # } _kill; # - # /* kill(). */ - # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # } _kill; + # /* POSIX.1b timers. */ + # struct + + # int si_tid; /* Timer ID. */ + # int si_overrun; /* Overrun count. */ + # sigval_t si_sigval; /* Signal value. */ + # } _timer; # - # /* POSIX.1b timers. */ - # struct - # { - # int si_tid; /* Timer ID. */ - # int si_overrun; /* Overrun count. */ - # sigval_t si_sigval; /* Signal value. */ - # } _timer; + # /* POSIX.1b signals. */ + # struct + + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # sigval_t si_sigval; /* Signal value. */ + # } _rt; # - # /* POSIX.1b signals. */ - # struct - # { - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # sigval_t si_sigval; /* Signal value. */ - # } _rt; + # /* SIGCHLD. */ + # struct + + # __pid_t si_pid; /* Which child. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # int si_status; /* Exit value or signal. */ + # __sigchld_clock_t si_utime; + # __sigchld_clock_t si_stime; + # } _sigchld; # - # /* SIGCHLD. */ - # struct - # { - # __pid_t si_pid; /* Which child. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # int si_status; /* Exit value or signal. */ - # __sigchld_clock_t si_utime; - # __sigchld_clock_t si_stime; - # } _sigchld; + # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ + # struct + + # void *si_addr; /* Faulting insn/memory ref. */ + # short int si_addr_lsb; /* Valid LSB of the reported address. */ + # } _sigfault; # - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ - # struct - # { - # void *si_addr; /* Faulting insn/memory ref. */ - # short int si_addr_lsb; /* Valid LSB of the reported address. */ - # } _sigfault; + # /* SIGPOLL. 
*/ + # struct + + # long int si_band; /* Band event for SIGPOLL. */ + # int si_fd; + # } _sigpoll; # - # /* SIGPOLL. */ - # struct - # { - # long int si_band; /* Band event for SIGPOLL. */ - # int si_fd; - # } _sigpoll; - # - # /* SIGSYS. */ - # struct - # { - # void *_call_addr; /* Calling user insn. */ - # int _syscall; /* Triggering system call number. */ - # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - # } _sigsys; - # } _sifields; - ] # } siginfo_t __SI_ALIGNMENT; + # /* SIGSYS. */ + # struct + + # void *_call_addr; /* Calling user insn. */ + # int _syscall; /* Triggering system call number. */ + # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + # } _sigsys; + # } _sifields; + ] # } siginfo_t __SI_ALIGNMENT; # xsave related. class ymmh_struct(ctypes.Structure): # struct ymmh_struct { - _fields_ = [("ymmh_space", 64 * ctypes.c_uint - ) # u32 ymmh_space[64]; - ] # } __packed; + _fields_ = [ + # u32 ymmh_space[64]; + ("ymmh_space", 64 * ctypes.c_uint) + ] # } __packed; class xsave_hdr_struct(ctypes.Structure): # struct xsave_hdr_struct { _fields_ = [ - ("xstate_bv", ctypes.c_ulonglong - ), # u64 xstate_bv; - ("reserved1", ctypes.c_ulonglong * - 2), # u64 reserved1[2]; - ("reserved2", ctypes.c_ulonglong * 5 - ) # u64 reserved2[5]; + # u64 xstate_bv; + ("xstate_bv", ctypes.c_ulonglong), + # u64 reserved1[2]; + ("reserved1", ctypes.c_ulonglong * 2), + # u64 reserved2[5]; + ("reserved2", ctypes.c_ulonglong * 5) ] # } __packed; class i387_fxsave_struct(ctypes.Structure): # struct i387_fxsave_struct { _fields_ = [ ( + # Control Word + # u16 cwd; "cwd", ctypes.c_ushort - ), # u16 cwd; /* Control Word */ + ), ( + # Status Word + # u16 swd; "swd", ctypes.c_ushort - ), # u16 swd; /* Status Word */ + ), ( + # Tag Word + # u16 twd; "twd", ctypes.c_ushort - ), # u16 twd; /* Tag Word */ + ), ( + # Last Instruction Opcode + # u16 fop; "fop", ctypes.c_ushort - ), # u16 fop; /* Last Instruction Opcode */ - # union { - # struct { + ), + # union { + # struct { ( + # 
Instruction Pointer + # u64 rip; "rip", ctypes.c_ulonglong - ), # u64 rip; /* Instruction Pointer */ + ), ( + # Data Pointer + # u64 rdp; "rdp", ctypes.c_ulonglong - ), # u64 rdp; /* Data Pointer */ - # }; - # struct { - # u32 fip; /* FPU IP Offset */ - # u32 fcs; /* FPU IP Selector */ - # u32 foo; /* FPU Operand Offset */ - # u32 fos; /* FPU Operand Selector */ - # }; - # }; + ), + + # struct { + # u32 fip; /* FPU IP Offset */ + # u32 fcs; /* FPU IP Selector */ + # u32 foo; /* FPU Operand Offset */ + # u32 fos; /* FPU Operand Selector */ + ( + # MXCSR Register State + # u32 mxcsr; "mxcsr", ctypes.c_uint - ), # u32 mxcsr; /* MXCSR Register State */ + ), ( + # MXCSR Mask + # u32 mxcsr_mask; "mxcsr_mask", ctypes.c_uint - ), # u32 mxcsr_mask; /* MXCSR Mask */ - # - # /* 8*16 bytes for each FP-reg = 128 bytes */ - ("st_space", ctypes.c_uint * 32 - ), # u32 st_space[32]; - # - # /* 16*16 bytes for each XMM-reg = 256 bytes */ - ("xmm_space", ctypes.c_uint * 64 - ), # u32 xmm_space[64]; - # - ("padding", ctypes.c_uint * 12 - ), # u32 padding[12]; - # - # union { - ("padding1", ctypes.c_uint * 12 - ) # u32 padding1[12]; - # u32 sw_reserved[12]; - # }; - # + ), + # 8*16 bytes for each FP-reg = 128 bytes + ( + # u32 st_space[32]; + "st_space", ctypes.c_uint * 32 + ), + # 16*16 bytes for each XMM-reg = 256 bytes + ( + # u32 xmm_space[64]; + "xmm_space", ctypes.c_uint * 64 + ), + ( + # u32 padding[12]; + "padding", ctypes.c_uint * 12 + ), + # union { + ( + # u32 padding1[12]; + "padding1", ctypes.c_uint * 12 + ) + # u32 sw_reserved[12]; ] # } __aligned(16); class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { _fields_ = [ - ("i387", - i387_fxsave_struct), # struct i387_fxsave_struct i387; - ("xsave_hdr", xsave_hdr_struct - ), # struct xsave_hdr_struct xsave_hdr; - ("ymmh", ymmh_struct) # struct ymmh_struct ymmh; + # struct i387_fxsave_struct i387; + ("i387", i387_fxsave_struct), + # struct xsave_hdr_struct xsave_hdr; + ("xsave_hdr", xsave_hdr_struct), + # struct 
ymmh_struct ymmh; + ("ymmh", ymmh_struct) ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; + + +class vfp_hard_struct(ctypes.Structure): # struct vfp_hard_struct { + _fields_ = [ + ("vfp_regs", ctypes.c_ulonglong * 32), # __u64 fpregs[32]; + ("fpexc", ctypes.c_ulong), # __u32 fpexc; + ("fpscr", ctypes.c_ulong), # __u32 fpscr; + ("fpinst", ctypes.c_ulong), # __u32 fpinst; + ("fpinst2", ctypes.c_ulong), # __u32 fpinst2; + ] # }; diff --git a/coredump/pycriu b/coredump/pycriu index d13a8790a..d1b6ed5c4 120000 --- a/coredump/pycriu +++ b/coredump/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/crit/.gitignore b/crit/.gitignore new file mode 100644 index 000000000..10c8ab186 --- /dev/null +++ b/crit/.gitignore @@ -0,0 +1,4 @@ +crit.egg-info/ +build/ +dist/ +version.py diff --git a/crit/Makefile b/crit/Makefile index 988b481b6..33bd68eed 100644 --- a/crit/Makefile +++ b/crit/Makefile @@ -1,13 +1,25 @@ +VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) -all-y += crit +all-y += ${VERSION_FILE} +cleanup-y += ${VERSION_FILE} -crit/crit: crit/crit-$(PYTHON) - $(Q) cp $^ $@ -crit: crit/crit -.PHONY: crit +${VERSION_FILE}: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ -clean-crit: - $(Q) $(RM) crit/crit -.PHONY: clean-crit -clean: clean-crit -mrproper: clean +install: ${VERSION_FILE} +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./crit +else + $(E) " SKIP INSTALL crit" +endif +.PHONY: install + +uninstall: +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +else + $(E) " SKIP UNINSTALL crit" +endif +.PHONY: uninstall diff --git a/crit/crit-python2 b/crit/crit-python2 deleted file mode 100755 index b0b7d3c3a..000000000 --- a/crit/crit-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -from pycriu import 
cli - -if __name__ == '__main__': - cli.main() diff --git a/crit/crit/__init__.py b/crit/crit/__init__.py new file mode 100644 index 000000000..58f3ace6c --- /dev/null +++ b/crit/crit/__init__.py @@ -0,0 +1 @@ +from .version import __version__ diff --git a/lib/py/cli.py b/crit/crit/__main__.py similarity index 86% rename from lib/py/cli.py rename to crit/crit/__main__.py index da343022e..bce523445 100755 --- a/lib/py/cli.py +++ b/crit/crit/__main__.py @@ -1,28 +1,39 @@ -from __future__ import print_function +#!/usr/bin/env python3 import argparse import sys import json import os import pycriu +from . import __version__ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: - return sys.stdin + if sys.stdin.isatty(): + # If we are reading from a terminal (not a pipe) we want text input and not binary + return sys.stdin + return sys.stdin.buffer -def outf(opts): +def outf(opts, decode): + # Decode means from protobuf to JSON. + # Use text when writing to JSON else use binaray mode if opts['out']: - return open(opts['out'], 'w+') + mode = 'wb+' + if decode: + mode = 'w+' + return open(opts['out'], mode) else: - return sys.stdout + if decode: + return sys.stdout + return sys.stdout.buffer def dinf(opts, name): - return open(os.path.join(opts['dir'], name)) + return open(os.path.join(opts['dir'], name), mode='rb') def decode(opts): @@ -31,23 +42,29 @@ def decode(opts): try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: - print("Unknown magic %#x.\n"\ - "Maybe you are feeding me an image with "\ - "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) + print("Unknown magic %#x.\n" + "Maybe you are feeding me an image with " + "raw data(i.e. pages.img)?" 
% exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: indent = 4 - f = outf(opts) + f = outf(opts, True) json.dump(img, f, indent=indent) if f == sys.stdout: f.write("\n") def encode(opts): - img = json.load(inf(opts)) - pycriu.images.dump(img, outf(opts)) + try: + img = json.load(inf(opts)) + except UnicodeDecodeError: + print("Cannot read JSON.\n" + "Maybe you are feeding me an image with protobuf data? " + "Encode expects JSON input.", file=sys.stderr) + sys.exit(1) + pycriu.images.dump(img, outf(opts, False)) def info(opts): @@ -115,7 +132,7 @@ def ftype_find_in_files(opts, ft, fid): if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] - except: + except Exception: files_img = [] if len(files_img) == 0: @@ -131,9 +148,12 @@ def ftype_find_in_files(opts, ft, fid): def ftype_find_in_image(opts, ft, fid, img): f = ftype_find_in_files(opts, ft, fid) if f: - return f[ft['field']] + if ft['field'] in f: + return f[ft['field']] + else: + return None - if ft['img'] == None: + if ft['img'] is None: ft['img'] = pycriu.images.load(dinf(opts, img))['entries'] for f in ft['img']: if f['id'] == fid: @@ -303,12 +323,12 @@ def explore_rss(opts): pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) - while vmas[vmi]['end'] <= pm['vaddr']: + while vmi < len(vmas) and vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' - while vmas[vmi]['start'] < pme: + while vmi < len(vmas) and vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' @@ -345,6 +365,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') @@ -354,8 +375,7 @@ def main(): 'decode', help='convert criu image from binary type to json') 
decode_parser.add_argument( '--pretty', - help= - 'Multiline with indents and some numerical fields in field-specific format', + help='Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', diff --git a/crit/pycriu b/crit/pycriu deleted file mode 120000 index d13a8790a..000000000 --- a/crit/pycriu +++ /dev/null @@ -1 +0,0 @@ -../lib/py/ \ No newline at end of file diff --git a/crit/pyproject.toml b/crit/pyproject.toml new file mode 100644 index 000000000..f0b185eb7 --- /dev/null +++ b/crit/pyproject.toml @@ -0,0 +1,22 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "crit" +description = "CRiu Image Tool" +authors = [ + {name = "CRIU team", email = "criu@lists.linux.dev"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[project.scripts] +crit = "crit.__main__:main" + +[tool.setuptools] +packages = ["crit"] + +[tool.setuptools.dynamic] +version = {attr = "crit.__version__"} diff --git a/crit/setup.cfg b/crit/setup.cfg new file mode 100644 index 000000000..37895923f --- /dev/null +++ b/crit/setup.cfg @@ -0,0 +1,20 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = crit +description = CRiu Image Tool +author = CRIU team +author_email = criu@lists.linux.dev +license = GPLv2 +version = attr: crit.__version__ + +[options] +packages = crit +python_requires = >=3.6 + +[options.entry_points] +console_scripts = + crit = crit.__main__:main diff --git a/crit/crit-python3 b/crit/setup.py old mode 100755 new mode 100644 similarity index 55% rename from crit/crit-python3 rename to crit/setup.py index 80467cba7..618ac1de4 --- 
a/crit/crit-python3 +++ b/crit/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import setuptools -from pycriu import cli if __name__ == '__main__': - cli.main() + setuptools.setup() diff --git a/criu/Makefile b/criu/Makefile index 4134e5052..bafdd980b 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -2,7 +2,7 @@ # 6a8d90f5fec4 "attr: Allow attribute type 0" WRAPFLAGS += -Wl,--wrap=nla_parse,--wrap=nlmsg_parse -ARCH_DIR := criu/arch/$(SRCARCH) +ARCH_DIR := criu/arch/$(ARCH) PIE_DIR := criu/pie export ARCH_DIR PIE_DIR @@ -27,7 +27,7 @@ CFLAGS += -iquote include CFLAGS += -iquote images CFLAGS += -iquote $(ARCH_DIR)/include CFLAGS += -iquote . -CFLAGS += $(shell pkg-config --cflags libnl-3.0) +CFLAGS += $(shell $(PKG_CONFIG) --cflags libnl-3.0) CFLAGS += $(CONFIG-DEFINES) ifeq ($(GMON),1) @@ -85,8 +85,29 @@ $(obj)/%: pie $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ +UNIT-BUILTINS += $(obj)/util.o +UNIT-BUILTINS += $(obj)/config.o +UNIT-BUILTINS += $(obj)/log.o +UNIT-BUILTINS += $(obj)/string.o +UNIT-BUILTINS += $(obj)/unittest/built-in.o + +$(obj)/unittest/Makefile: ; + +$(obj)/unittest/%: .FORCE + +$(obj)/unittest/built-in.o: .FORCE + $(Q) $(MAKE) $(call build-as,Makefile,criu/unittest) all + +$(obj)/unittest/unittest: $(UNIT-BUILTINS) + $(call msg-link, $@) + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ + +unittest: $(obj)/unittest/unittest + $(Q) $(obj)/unittest/$@ + +.PHONY: unittest # # Clean the most, except generated c files @@ -97,6 +118,7 @@ subclean: $(Q) $(MAKE) $(build)=$(ARCH_DIR) clean $(Q) $(MAKE) $(call build-as,Makefile.library,$(PIE_DIR)) clean $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) clean + $(Q) $(MAKE) $(call build-as,Makefile,criu/unittest) clean $(Q) $(MAKE) $(build)=$(PIE_DIR) clean .PHONY: subclean cleanup-y += $(obj)/criu 
@@ -123,11 +145,14 @@ install: $(obj)/criu $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts + $(E) " INSTALL " scripts/criu-ns + $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR) .PHONY: install uninstall: $(E) " UNINSTALL" criu $(Q) $(RM) $(addprefix $(DESTDIR)$(SBINDIR)/,criu) + $(Q) $(RM) $(addprefix $(DESTDIR)$(SBINDIR)/,criu-ns) $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/criu/scripts/,systemd-autofs-restart.sh) .PHONY: uninstall diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 4588ea5b8..ba6132d2f 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -5,6 +5,7 @@ ldflags-y += -r obj-y += action-scripts.o obj-y += external.o obj-y += aio.o +obj-y += apparmor.o obj-y += bfd.o obj-y += bitmap.o obj-y += cgroup.o @@ -29,6 +30,7 @@ obj-y += files-reg.o obj-y += fsnotify.o obj-y += image-desc.o obj-y += image.o +obj-y += img-streamer.o obj-y += ipc_ns.o obj-y += irmap.o obj-y += kcmp-ids.o @@ -37,7 +39,9 @@ obj-y += libnetlink.o obj-y += log.o obj-y += lsm.o obj-y += mem.o +obj-y += memfd.o obj-y += mount.o +obj-y += mount-v2.o obj-y += filesystems.o obj-y += namespaces.o obj-y += netfilter.o @@ -70,6 +74,7 @@ obj-y += sk-unix.o obj-y += sockets.o obj-y += stats.o obj-y += string.o +obj-y += setproctitle.o obj-y += sysctl.o obj-y += sysfs_parse.o obj-y += timerfd.o @@ -86,10 +91,17 @@ obj-y += config.o obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o +obj-y += timens.o +obj-y += timer.o +obj-y += sigact.o +obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) +obj-y += pidfd-store.o +obj-y += hugetlb.o 
+obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/Makefile.packages b/criu/Makefile.packages index f380fa2f0..3e2e6efd1 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,7 +6,7 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel -REQ-RPM-PKG-NAMES += $(PYTHON)-future +REQ-RPM-PKG-NAMES += libuuid-devel REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -14,25 +14,20 @@ REQ-DEB-PKG-NAMES += libprotobuf-dev REQ-DEB-PKG-NAMES += libprotobuf-c-dev REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler -REQ-DEB-PKG-NAMES += python-protobuf +REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev +REQ-DEB-PKG-NAMES += uuid-dev -REQ-DEB-PKG-TEST-NAMES += python-yaml +REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev -ifeq ($(PYTHON),python3) -REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -else -REQ-DEB-PKG-NAMES += python-future -REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml -endif -export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet +export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet -luuid check-packages-failed: $(warning Can not find some of the required libraries) diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 2f7617c0f..6f7900186 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -18,17 +18,20 @@ #include "common/scm.h" static const char *action_names[ACT_MAX] = { - [ ACT_PRE_DUMP ] = "pre-dump", - [ ACT_POST_DUMP ] = "post-dump", - [ ACT_PRE_RESTORE ] = "pre-restore", - [ ACT_POST_RESTORE ] = "post-restore", - [ ACT_NET_LOCK ] = "network-lock", - [ ACT_NET_UNLOCK ] = "network-unlock", - [ ACT_SETUP_NS ] = "setup-namespaces", - [ ACT_POST_SETUP_NS ] = "post-setup-namespaces", - [ ACT_PRE_RESUME ] = "pre-resume", - [ 
ACT_POST_RESUME ] = "post-resume", - [ ACT_ORPHAN_PTS_MASTER ] = "orphan-pts-master", + [ACT_PRE_STREAM] = "pre-stream", + [ACT_PRE_DUMP] = "pre-dump", + [ACT_POST_DUMP] = "post-dump", + [ACT_PRE_RESTORE] = "pre-restore", + [ACT_POST_RESTORE] = "post-restore", + [ACT_NET_LOCK] = "network-lock", + [ACT_NET_UNLOCK] = "network-unlock", + [ACT_SETUP_NS] = "setup-namespaces", + [ACT_POST_SETUP_NS] = "post-setup-namespaces", + [ACT_PRE_RESUME] = "pre-resume", + [ACT_POST_RESUME] = "post-resume", + [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", + [ACT_STATUS_READY] = "status-ready", + [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { @@ -36,11 +39,7 @@ struct script { char *path; }; -enum { - SCRIPTS_NONE, - SCRIPTS_SHELL, - SCRIPTS_RPC -}; +enum { SCRIPTS_NONE, SCRIPTS_SHELL, SCRIPTS_RPC }; static int scripts_mode = SCRIPTS_NONE; static LIST_HEAD(scripts); @@ -51,8 +50,11 @@ static int run_shell_scripts(const char *action) struct script *script; static unsigned env_set = 0; -#define ENV_IMGDIR 0x1 -#define ENV_ROOTPID 0x2 +#define ENV_IMGDIR 0x1 +#define ENV_ROOTPID 0x2 + + if (list_empty(&scripts)) + return 0; if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); @@ -61,7 +63,7 @@ static int run_shell_scripts(const char *action) if (!(env_set & ENV_IMGDIR)) { char image_dir[PATH_MAX]; - sprintf(image_dir, "/proc/%ld/fd/%d", (long) getpid(), get_service_fd(IMG_FD_OFF)); + sprintf(image_dir, "/proc/%ld/fd/%d", (long)getpid(), get_service_fd(IMG_FD_OFF)); if (setenv("CRTOOLS_IMAGE_DIR", image_dir, 1)) { pr_perror("Can't set CRTOOLS_IMAGE_DIR=%s", image_dir); return -1; @@ -87,8 +89,7 @@ static int run_shell_scripts(const char *action) list_for_each_entry(script, &scripts, node) { int err; pr_debug("\t[%s]\n", script->path); - err = cr_system(-1, -1, -1, script->path, - (char *[]) { script->path, NULL }, 0); + err = cr_system(-1, -1, -1, script->path, (char *[]){ script->path, NULL }, 0); if (err) 
pr_err("Script %s exited with %d\n", script->path, err); retval |= err; @@ -115,6 +116,20 @@ int rpc_send_fd(enum script_actions act, int fd) return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } +int rpc_query_external_files(void) +{ + int rpc_sk; + + if (scripts_mode != SCRIPTS_RPC) + return 0; + + rpc_sk = get_service_fd(RPC_SK_OFF); + if (rpc_sk < 0) + return -1; + + return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); +} + int run_scripts(enum script_actions act) { int ret = 0; @@ -122,23 +137,24 @@ int run_scripts(enum script_actions act) pr_debug("Running %s scripts\n", action); - if (scripts_mode == SCRIPTS_NONE) + switch (scripts_mode) { + case SCRIPTS_NONE: return 0; - - if (scripts_mode == SCRIPTS_RPC) { + case SCRIPTS_RPC: ret = rpc_send_fd(act, -1); - goto out; - } - - if (scripts_mode == SCRIPTS_SHELL) { + if (ret) + break; + /* Enable scripts from config file in RPC mode (fallthrough) */ + case SCRIPTS_SHELL: ret = run_shell_scripts(action); - goto out; + break; + default: + BUG(); } - BUG(); -out: if (ret) pr_err("One of more action scripts failed\n"); + return ret; } @@ -146,14 +162,19 @@ int add_script(char *path) { struct script *script; - BUG_ON(scripts_mode == SCRIPTS_RPC); - scripts_mode = SCRIPTS_SHELL; + /* Set shell mode when a script is added but don't overwrite RPC mode */ + if (scripts_mode == SCRIPTS_NONE) + scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) - return 1; + return -1; - script->path = path; + script->path = xstrdup(path); + if (!script->path) { + xfree(script); + return -1; + } list_add(&script->node, &scripts); return 0; @@ -161,10 +182,17 @@ int add_script(char *path) int add_rpc_notify(int sk) { - BUG_ON(scripts_mode == SCRIPTS_SHELL); + int fd; + + fd = dup(sk); + if (fd < 0) { + pr_perror("dup() failed"); + return -1; + } + scripts_mode = SCRIPTS_RPC; - if (install_service_fd(RPC_SK_OFF, dup(sk)) < 0) + if 
(install_service_fd(RPC_SK_OFF, fd) < 0) return -1; return 0; diff --git a/criu/aio.c b/criu/aio.c index 45651f2d3..d2831a204 100644 --- a/criu/aio.c +++ b/criu/aio.c @@ -11,7 +11,7 @@ #include "parasite.h" #include "parasite-syscall.h" #include "images/mm.pb-c.h" -#include +#include "compel/infect.h" #define NR_IOEVENTS_IN_NPAGES(npages) ((PAGE_SIZE * (npages) - sizeof(struct aio_ring)) / sizeof(struct io_event)) @@ -38,8 +38,7 @@ int dump_aio_ring(MmEntry *mme, struct vma_area *vma) } mme->aios[nr] = re; mme->n_aios = nr + 1; - pr_info("Dumping AIO ring @%"PRIx64"-%"PRIx64"\n", - vma->e->start, vma->e->end); + pr_info("Dumping AIO ring @%" PRIx64 "-%" PRIx64 "\n", vma->e->start, vma->e->end); return 0; } @@ -56,7 +55,7 @@ void free_aios(MmEntry *mme) unsigned int aio_estimate_nr_reqs(unsigned int size) { - unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE); + unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size / PAGE_SIZE); if (size & ~PAGE_MASK) { pr_err("Ring size is not aligned\n"); @@ -82,8 +81,7 @@ unsigned int aio_estimate_nr_reqs(unsigned int size) unsigned long aio_rings_args_size(struct vm_area_list *vmas) { - return sizeof(struct parasite_check_aios_args) + - vmas->nr_aios * sizeof(struct parasite_aio); + return sizeof(struct parasite_check_aios_args) + vmas->nr_aios * sizeof(struct parasite_aio); } int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas) @@ -111,8 +109,7 @@ int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas) if (!vma_area_is(vma, VMA_AREA_AIORING)) continue; - pr_debug(" `- Ring #%ld @%"PRIx64"\n", - (long)(pa - &aa->ring[0]), vma->e->start); + pr_debug(" `- Ring #%ld @%" PRIx64 "\n", (long)(pa - &aa->ring[0]), vma->e->start); pa->ctx = vma->e->start; pa->size = vma->e->end - vma->e->start; pa++; diff --git a/criu/apparmor.c b/criu/apparmor.c new file mode 100644 index 000000000..48b639216 --- /dev/null +++ b/criu/apparmor.c @@ -0,0 +1,811 @@ +#include +#include +#include 
+#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "imgset.h" +#include "pstree.h" +#include "util.h" +#include "string.h" +#include "lsm.h" +#include "cr_options.h" +#include "kerndat.h" + +#include "protobuf.h" +#include "images/inventory.pb-c.h" +#include "images/apparmor.pb-c.h" + +/* + * Apparmor stacked profile checkpoint restore. Previously, we just saved the + * profile that was in use by the task, and we expected it to be present on the + * target host. Now with stacking, containers are able to load their own + * profiles, so we can't rely on this. + * + * The basic idea here is that there is some (collection) of (potentially + * nested) namespaces that a container uses. We don't collect everything on the + * host level, but we *do* collect everything inside the namespace; a container + * could have loaded a profile but not yet used it when we start to checkpoint. + * + * Thus, the old code that saves and restores AA profiles is still relevant, we + * just need to add the new code in this file to walk the namespace and dump + * any blobs in that AA namespace, and then restore these blobs on restore so + * that the profiles the old code tries to use are actually present. 
+ */ + +static AaNamespace **namespaces = NULL; +static int n_namespaces = 0; + +static AaNamespace *new_namespace(char *name, AaNamespace *parent) +{ + void *m; + AaNamespace *ret; + + ret = xzalloc(sizeof(*ret)); + if (!ret) + return NULL; + aa_namespace__init(ret); + + ret->name = xstrdup(name); + if (!ret->name) { + xfree(ret); + return NULL; + } + + if (parent) { + m = xrealloc(parent->namespaces, sizeof(*parent->namespaces) * (parent->n_namespaces + 1)); + if (!m) { + xfree(ret->name); + xfree(ret); + return NULL; + } + + parent->namespaces = m; + parent->namespaces[parent->n_namespaces++] = ret; + } + + m = xrealloc(namespaces, sizeof(*namespaces) * (n_namespaces + 1)); + if (!m) { + if (parent) + parent->n_namespaces--; + + xfree(ret->name); + xfree(ret); + return NULL; + } + + namespaces = m; + namespaces[n_namespaces++] = ret; + + return ret; +} + +static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) +{ + AaPolicy *cur; + int fd, my_offset, ret; + struct stat sb; + ssize_t n; + void *m; + FILE *f; + + my_offset = snprintf(path + offset, PATH_MAX - offset, "%s/", dir); + if (my_offset < 0 || my_offset >= PATH_MAX - offset) { + pr_err("snprintf failed\n"); + return -1; + } + my_offset += offset; + + pr_info("dumping profile %s\n", path); + + cur = xmalloc(sizeof(*cur)); + if (!cur) + return -1; + aa_policy__init(cur); + + __strlcat(path + my_offset, "name", PATH_MAX - my_offset); + f = fopen(path, "r"); + if (!f) { + xfree(cur); + pr_perror("failed to open %s", path); + return -1; + } + + ret = fscanf(f, "%ms", &cur->name); + fclose(f); + if (ret != 1) { + xfree(cur); + pr_perror("couldn't scanf %s", path); + return -1; + } + + __strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("failed to open aa policy %s", path); + goto err; + } + + if (fstat(fd, &sb) < 0) { + pr_perror("failed to stat %s", path); + goto close; + } + + cur->blob.len = sb.st_size; + 
cur->blob.data = xmalloc(sb.st_size); + if (!cur->blob.data) + goto close; + + n = read(fd, cur->blob.data, sb.st_size); + if (n < 0) { + pr_perror("failed to read %s", path); + goto close; + } + + if (n != sb.st_size) { + pr_err("didn't read all of %s\n", path); + goto close; + } + + close(fd); + + m = xrealloc(ns->policies, sizeof(*ns->policies) * (ns->n_policies + 1)); + if (!m) + goto err; + ns->policies = m; + ns->policies[ns->n_policies++] = cur; + + return 0; + +close: + close(fd); + +err: + xfree(cur->name); + xfree(cur); + return -1; +} + +char *ns_path; +int sort_err; + +static int no_dirdots(const struct dirent *de) +{ + return !dir_dots(de); +} + +static int by_time(const struct dirent **de1, const struct dirent **de2) +{ + char path[PATH_MAX]; + struct stat sb1, sb2; + + snprintf(path, sizeof(path), "%s/%s", ns_path, (*de1)->d_name); + if (stat(path, &sb1) < 0) { + pr_perror("couldn't stat %s", path); + sort_err = errno; + return 0; + } + + snprintf(path, sizeof(path), "%s/%s", ns_path, (*de2)->d_name); + if (stat(path, &sb2) < 0) { + pr_perror("couldn't state %s", path); + sort_err = errno; + return 0; + } + + if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) { + if (sb1.st_mtim.tv_nsec < sb2.st_mtim.tv_nsec) + return -1; + if (sb1.st_mtim.tv_nsec == sb2.st_mtim.tv_nsec) + return 0; + return 1; + } else { + if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) + return -1; + return 1; + } +} + +static int walk_namespace(char *path, size_t offset, AaNamespace *ns) +{ + DIR *dir = NULL; + struct dirent *de, **namelist = NULL; + int ret = -1, n_names = 0, i; + size_t my_offset; + + /* collect all the child namespaces */ + strcat(path, "/namespaces/"); + my_offset = offset + 12; + + dir = opendir(path); + if (!dir) + goto out; + + while ((de = readdir(dir))) { + AaNamespace *cur; + + if (dir_dots(de)) + continue; + + path[my_offset] = '\0'; + strcat(path, de->d_name); + + cur = new_namespace(de->d_name, ns); + if (!cur) + goto out; + + if (walk_namespace(path, 
my_offset + strlen(de->d_name), cur) < 0) { + aa_namespace__free_unpacked(cur, NULL); + ns->n_namespaces--; + goto out; + } + } + + closedir(dir); + dir = NULL; + + /* now collect the profiles for this namespace */ + path[offset] = '\0'; + strcat(path, "/profiles/"); + my_offset = offset + 10; + + sort_err = 0; + ns_path = path; + n_names = scandir(path, &namelist, no_dirdots, by_time); + if (n_names < 0 || sort_err != 0) { + pr_perror("scandir failed"); + goto out; + } + + for (i = 0; i < n_names; i++) { + de = namelist[i]; + + path[my_offset] = 0; + if (collect_profile(path, my_offset, de->d_name, ns) < 0) + goto out; + } + + ret = 0; +out: + if (dir) + closedir(dir); + + if (namelist) { + for (i = 0; i < n_names; i++) + xfree(namelist[i]); + xfree(namelist); + } + + return ret; +} + +int collect_aa_namespace(char *profile) +{ + char path[PATH_MAX], *namespace, *end; + int ret, i; + AaNamespace *ns; + + if (!profile) + return 0; + + namespace = strchr(profile, ':'); + if (!namespace) + return 0; /* no namespace to dump */ + namespace ++; + + if (!kdat.apparmor_ns_dumping_enabled) { + pr_warn("Apparmor namespace present but dumping not enabled\n"); + return 0; + } + + /* XXX: this is not strictly correct; if something is using namespace + * views, extra //s can indicate a namespace separation. However, I + * think only the apparmor developers use this feature :) + */ + end = strchr(namespace, ':'); + if (!end) { + pr_err("couldn't find AA namespace end in: %s\n", namespace); + return -1; + } + + *end = '\0'; + + for (i = 0; i < n_namespaces; i++) { + /* did we already dump this namespace? 
*/ + if (!strcmp(namespaces[i]->name, namespace)) { + *end = ':'; + return 0; + } + } + + pr_info("dumping AA namespace %s\n", namespace); + + ns = new_namespace(namespace, NULL); + *end = ':'; + if (!ns) + return -1; + + ret = snprintf(path, sizeof(path), AA_SECURITYFS_PATH "/policy/namespaces/%s", ns->name); + if (ret < 0 || ret >= sizeof(path)) { + pr_err("snprintf failed?\n"); + goto err; + } + + if (walk_namespace(path, ret, ns) < 0) { + pr_err("walking AA namespace %s failed\n", ns->name); + goto err; + } + + return 0; + +err: + aa_namespace__free_unpacked(ns, NULL); + n_namespaces--; + return -1; +} + +/* An AA profile that allows everything that the parasite needs to do */ +#define PARASITE_PROFILE \ + ("profile %s {\n" \ + " /** rwmlkix,\n" \ + " unix,\n" \ + " capability,\n" \ + " signal,\n" \ + "}\n") + +char policydir[PATH_MAX] = ".criu.temp-aa-policy.XXXXXX"; +char cachedir[PATH_MAX]; + +struct apparmor_parser_args { + char *cache; + char *file; +}; + +static int apparmor_parser_exec(void *data) +{ + struct apparmor_parser_args *args = data; + + execlp("apparmor_parser", "apparmor_parser", "-QWL", args->cache, args->file, NULL); + + return -1; +} + +static int apparmor_cache_exec(void *data) +{ + execlp("apparmor_parser", "apparmor_parser", "--cache-loc", "/", "--print-cache-dir", (char *)NULL); + + return -1; +} + +static void *get_suspend_policy(char *name, off_t *len) +{ + char policy[1024], file[PATH_MAX], cache[PATH_MAX], clean_name[PATH_MAX]; + void *ret = NULL; + int n, fd, policy_len, i; + struct stat sb; + struct apparmor_parser_args args = { + .cache = cache, + .file = file, + }; + + *len = 0; + + policy_len = snprintf(policy, sizeof(policy), PARASITE_PROFILE, name); + if (policy_len < 0 || policy_len >= sizeof(policy)) { + pr_err("policy name %s too long\n", name); + return NULL; + } + + /* policy names can have /s, but file paths can't */ + for (i = 0; name[i]; i++) { + if (i == PATH_MAX) { + pr_err("name %s too long\n", name); + return 
NULL;
+		}
+
+		clean_name[i] = name[i] == '/' ? '.' : name[i];
+	}
+	clean_name[i] = 0;
+
+	n = snprintf(file, sizeof(file), "%s/%s", policydir, clean_name);
+	if (n < 0 || n >= sizeof(file)) { /* truncation is relative to the destination buffer, not to policy[] */
+		pr_err("policy name %s too long\n", clean_name);
+		return NULL;
+	}
+
+	n = snprintf(cache, sizeof(cache), "%s/cache", policydir);
+	if (n < 0 || n >= sizeof(cache)) { /* same: check against cache[], the buffer written */
+		pr_err("policy dir too long\n");
+		return NULL;
+	}
+
+	fd = open(file, O_CREAT | O_WRONLY, 0600);
+	if (fd < 0) {
+		pr_perror("couldn't create %s", file);
+		return NULL;
+	}
+
+	n = write(fd, policy, policy_len);
+	close(fd);
+	if (n < 0 || n != policy_len) {
+		pr_perror("couldn't write policy for %s", file);
+		return NULL;
+	}
+
+	n = run_command(cachedir, sizeof(cachedir), apparmor_cache_exec, NULL);
+	if (n < 0) {
+		pr_err("apparmor parsing failed %d\n", n);
+		return NULL;
+	}
+
+	n = run_command(NULL, 0, apparmor_parser_exec, &args);
+	if (n < 0) {
+		pr_err("apparmor parsing failed %d\n", n);
+		return NULL;
+	}
+
+	n = snprintf(file, sizeof(file), "%s/cache/%s/%s", policydir, cachedir, clean_name);
+	if (n < 0 || n >= sizeof(file)) { /* file[] is reused for the compiled-cache path */
+		pr_err("policy name %s too long\n", clean_name);
+		return NULL;
+	}
+
+	fd = open(file, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("couldn't open %s", file);
+		return NULL;
+	}
+
+	if (fstat(fd, &sb) < 0) {
+		pr_perror("couldn't stat fd");
+		goto out;
+	}
+
+	ret = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (ret == MAP_FAILED) {
+		pr_perror("mmap of %s failed", file);
+		ret = NULL;
+		goto out;
+	}
+
+	*len = sb.st_size;
+out:
+	close(fd);
+	return ret;
+}
+
+#define NEXT_AA_TOKEN(pos) \
+	while (*pos) { \
+		if (*pos == '/' && *(pos + 1) && *(pos + 1) == '/' && *(pos + 2) && *(pos + 2) == '&') { \
+			pos += 3; \
+			break; \
+		} \
+		if (*pos == ':' && *(pos + 1) && *(pos + 1) == '/' && *(pos + 2) && *(pos + 2) == '/') { \
+			pos += 3; \
+			break; \
+		} \
+		pos++; \
+	}
+
+static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrite, bool 
suspend)
+{
+	int i, my_offset, ret;
+	char *rewrite_pos = rewrite, namespace[PATH_MAX];
+
+	if (rewrite && suspend) {
+		pr_err("requesting aa rewriting and suspension at the same time is not supported\n");
+		return -1;
+	}
+
+	if (!rewrite) {
+		__strlcpy(namespace, ns->name, sizeof(namespace)); /* strncpy(.., size - 1) left the final byte unset for a maximal-length name */
+	} else {
+		NEXT_AA_TOKEN(rewrite_pos);
+
+		switch (*rewrite_pos) {
+		case ':': {
+			char tmp, *end;
+
+			end = strchr(rewrite_pos + 1, ':');
+			if (!end) {
+				pr_err("invalid namespace %s\n", rewrite_pos);
+				return -1;
+			}
+
+			tmp = *end;
+			*end = 0;
+			__strlcpy(namespace, rewrite_pos + 1, sizeof(namespace));
+			*end = tmp;
+
+			break;
+		}
+		default:
+			__strlcpy(namespace, ns->name, sizeof(namespace));
+			for (i = 0; i < ns->n_policies; i++) {
+				if (strcmp(ns->policies[i]->name, rewrite_pos))
+					pr_warn("binary rewriting of apparmor policies not supported right now, not renaming %s to %s\n",
+						ns->policies[i]->name, rewrite_pos);
+			}
+		}
+	}
+
+	my_offset = snprintf(path + offset, PATH_MAX - offset, "/namespaces/%s", ns->name);
+	if (my_offset < 0 || my_offset >= PATH_MAX - offset) {
+		pr_err("snprintf'd too many characters\n");
+		return -1;
+	}
+
+	if (!suspend && mkdir(path, 0755) < 0 && errno != EEXIST) {
+		pr_perror("failed to create namespace %s", path);
+		goto fail;
+	}
+
+	for (i = 0; i < ns->n_namespaces; i++) { /* descend into each child; passing ns itself re-entered this namespace until the path overflowed */
+		if (write_aa_policy(ns->namespaces[i], path, offset + my_offset, rewrite_pos, suspend) < 0)
+			goto fail;
+	}
+
+	ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace");
+	if (ret < 0 || ret >= PATH_MAX - offset - my_offset) {
+		pr_err("snprintf failed\n");
+		goto fail;
+	}
+
+	for (i = 0; i < ns->n_policies; i++) {
+		AaPolicy *p = ns->policies[i];
+		void *data = p->blob.data;
+		int fd, n;
+		off_t len = p->blob.len;
+
+		fd = open(path, O_WRONLY);
+		if (fd < 0) {
+			pr_perror("couldn't open apparmor load file %s", path);
+			goto fail;
+		}
+
+		if (suspend) {
+			pr_info("suspending policy %s\n", p->name);
+			data = get_suspend_policy(p->name, &len);
+
if (!data) {
+				close(fd);
+				goto fail;
+			}
+		}
+
+		n = write(fd, data, len);
+		close(fd);
+		if (suspend && munmap(data, len) < 0) {
+			pr_perror("failed to munmap");
+			goto fail;
+		}
+
+		if (n != len) {
+			pr_perror("write AA policy %s in %s failed", p->name, namespace);
+			goto fail;
+		}
+
+		if (!suspend)
+			pr_info("wrote aa policy %s: %s %d\n", path, p->name, n);
+	}
+
+	return 0;
+
+fail:
+	if (!suspend) {
+		path[offset + my_offset] = 0;
+		rmdir(path);
+	}
+
+	pr_err("failed to write policy in AA namespace %s\n", namespace);
+	return -1;
+}
+
+static int do_suspend(bool suspend)
+{
+	int i;
+
+	for (i = 0; i < n_namespaces; i++) {
+		AaNamespace *ns = namespaces[i];
+		char path[PATH_MAX] = AA_SECURITYFS_PATH "/policy";
+
+		if (write_aa_policy(ns, path, strlen(path), opts.lsm_profile, suspend) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+int suspend_aa(void)
+{
+	int ret;
+	if (!mkdtemp(policydir)) {
+		pr_perror("failed to make AA policy dir");
+		return -1;
+	}
+
+	ret = do_suspend(true);
+	if (rmrf(policydir) < 0)
+		pr_err("failed removing policy dir %s\n", policydir);
+
+	return ret;
+}
+
+int unsuspend_aa(void)
+{
+	return do_suspend(false);
+}
+
+int dump_aa_namespaces(void)
+{
+	ApparmorEntry *ae = NULL;
+	int ret;
+
+	if (n_namespaces == 0)
+		return 0;
+
+	ae = xmalloc(sizeof(*ae));
+	if (!ae)
+		return -1;
+	apparmor_entry__init(ae);
+
+	ae->n_namespaces = n_namespaces;
+	ae->namespaces = namespaces;
+
+	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_APPARMOR), ae, PB_APPARMOR);
+
+	apparmor_entry__free_unpacked(ae, NULL);
+	n_namespaces = 0; /* the array is gone: an empty count keeps the "== 0" guard above and new_namespace() indexing valid */
+	namespaces = NULL;
+
+	return ret;
+}
+
+bool check_aa_ns_dumping(void)
+{
+	char contents[49];
+	int major, minor, ret;
+	FILE *f;
+
+	f = fopen(AA_SECURITYFS_PATH "/features/domain/stack", "r");
+	if (!f)
+		return false;
+
+	ret = fscanf(f, "%48s", contents);
+	fclose(f);
+	if (ret != 1) {
+		pr_err("scanning aa stack feature failed\n");
+		return false;
+	}
+
+	if (strcmp("yes", contents)) {
+		pr_warn("aa stack 
featured disabled: %s\n", contents);
+		return false;
+	}
+
+	f = fopen(AA_SECURITYFS_PATH "/features/domain/version", "r");
+	if (!f)
+		return false;
+
+	ret = fscanf(f, "%d.%d", &major, &minor);
+	fclose(f);
+	if (ret != 2) {
+		pr_err("scanning aa stack version failed\n");
+		return false;
+	}
+
+	return major > 1 || (major == 1 && minor >= 2); /* need >= 1.2; comparing the fields independently rejected e.g. 2.0 */
+}
+
+int prepare_apparmor_namespaces(void)
+{
+	struct cr_img *img;
+	int ret, i;
+	ApparmorEntry *ae;
+
+	img = open_image(CR_FD_APPARMOR, O_RSTR);
+	if (!img)
+		return -1;
+
+	ret = pb_read_one_eof(img, &ae, PB_APPARMOR);
+	close_image(img);
+	if (ret <= 0)
+		return 0; /* there was no AA namespace entry */
+
+	if (!ae) {
+		pr_err("missing aa namespace entry\n");
+		return -1;
+	}
+
+	/* no real reason we couldn't do this in parallel, but usually we
+	 * expect one namespace so there's probably not a lot to be gained.
+	 */
+	for (i = 0; i < ae->n_namespaces; i++) {
+		char path[PATH_MAX] = AA_SECURITYFS_PATH "/policy";
+
+		if (write_aa_policy(ae->namespaces[i], path, strlen(path), opts.lsm_profile, false) < 0) {
+			ret = -1;
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	apparmor_entry__free_unpacked(ae, NULL);
+	return ret;
+}
+
+int render_aa_profile(char **out, const char *cur)
+{
+	const char *pos;
+	int n_namespaces = 0, n_profiles = 0;
+	bool last_namespace = false;
+
+	/* no rewriting necessary */
+	if (!opts.lsm_supplied) {
+		*out = xsprintf("changeprofile %s", cur);
+		if (!*out)
+			return -1;
+
+		return 0;
+	}
+
+	/* user asked to re-write to an unconfined profile */
+	if (!opts.lsm_profile) {
+		*out = NULL;
+		return 0;
+	}
+
+	pos = opts.lsm_profile;
+	while (*pos) {
+		switch (*pos) {
+		case ':':
+			n_namespaces++;
+			break;
+		default:
+			n_profiles++;
+		}
+
+		NEXT_AA_TOKEN(pos);
+	}
+
+	/* special case: there is no namespacing or stacking; we can just
+	 * changeprofile to the rewritten string
+	 */
+	if (n_profiles == 1 && n_namespaces == 0) {
+		*out = xsprintf("changeprofile %s", opts.lsm_profile);
+		if (!*out)
+			return -1;
+
+
pr_info("rewrote apparmor profile from %s to %s\n", cur, *out); + return 0; + } + + pos = cur; + while (*pos) { + switch (*pos) { + case ':': + n_namespaces--; + last_namespace = true; + break; + default: + n_profiles--; + } + + NEXT_AA_TOKEN(pos); + + if (n_profiles == 0 && n_namespaces == 0) + break; + } + + *out = xsprintf("changeprofile %s//%s%s", opts.lsm_profile, last_namespace ? "" : "&", pos); + if (!*out) + return -1; + + pr_info("rewrote apparmor profile from %s to %s\n", cur, *out); + return 0; +} diff --git a/criu/arch/aarch64/Makefile b/criu/arch/aarch64/Makefile index b26487367..b87fcaa5b 100644 --- a/criu/arch/aarch64/Makefile +++ b/criu/arch/aarch64/Makefile @@ -6,3 +6,4 @@ obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o +obj-y += gcs.o \ No newline at end of file diff --git a/criu/arch/aarch64/cpu.c b/criu/arch/aarch64/cpu.c index 34313fb15..97a883b8c 100644 --- a/criu/arch/aarch64/cpu.c +++ b/criu/arch/aarch64/cpu.c @@ -1,4 +1,4 @@ -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index f98743a23..2e89f9ce3 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -1,5 +1,6 @@ #include #include +#include #include @@ -11,6 +12,7 @@ #include "common/compiler.h" #include #include "asm/dump.h" +#include "asm/gcs-types.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" @@ -19,13 +21,138 @@ #include "util.h" #include "cpu.h" #include "restorer.h" -#include +#include "compel/infect.h" +#include "pstree.h" +#include -#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e +/* + * cr_user_pac_* are a copy of the corresponding uapi structs + * in arch/arm64/include/uapi/asm/ptrace.h + */ +struct cr_user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; -int save_task_regs(void *x, user_regs_struct_t *regs, 
user_fpregs_struct_t *fpsimd) +struct cr_user_pac_generic_keys { + __uint128_t apgakey; +}; + +/* + * The following HWCAP constants are copied from + * arch/arm64/include/uapi/asm/hwcap.h + */ +#ifndef HWCAP_PACA +#define HWCAP_PACA (1 << 30) +#endif + +#ifndef HWCAP_PACG +#define HWCAP_PACG (1UL << 31) +#endif + +/* + * The following NT_ARM_PAC constants are copied from + * include/uapi/linux/elf.h + */ +#ifndef NT_ARM_PACA_KEYS +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#endif + +#ifndef NT_ARM_PACG_KEYS +#define NT_ARM_PACG_KEYS 0x408 +#endif + +#ifndef NT_ARM_PAC_ENABLED_KEYS +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. */ +#endif + +extern unsigned long getauxval(unsigned long type); + +#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e + +static int save_pac_keys(int pid, CoreEntry *core) +{ + struct cr_user_pac_address_keys paca; + struct cr_user_pac_generic_keys pacg; + PacKeys *pac_entry; + long pac_enabled_key; + struct iovec iov; + int ret; + + unsigned long hwcaps = getauxval(AT_HWCAP); + + pac_entry = xmalloc(sizeof(PacKeys)); + if (!pac_entry) + return -1; + core->ti_aarch64->pac_keys = pac_entry; + pac_keys__init(pac_entry); + + if (hwcaps & HWCAP_PACA) { + PacAddressKeys *pac_address_keys; + + pr_debug("%d: Dumping address authentication keys\n", pid); + iov.iov_base = &paca; + iov.iov_len = sizeof(paca); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { + pr_perror("Failed to get address authentication key for %d", pid); + return -1; + } + pac_address_keys = xmalloc(sizeof(PacAddressKeys)); + if (!pac_address_keys) + return -1; + pac_address_keys__init(pac_address_keys); + pac_entry->pac_address_keys = pac_address_keys; + pac_address_keys->apiakey_lo = paca.apiakey; + pac_address_keys->apiakey_hi = paca.apiakey >> 64; + pac_address_keys->apibkey_lo = paca.apibkey; + pac_address_keys->apibkey_hi = paca.apibkey >> 64; + 
pac_address_keys->apdakey_lo = paca.apdakey; + pac_address_keys->apdakey_hi = paca.apdakey >> 64; + pac_address_keys->apdbkey_lo = paca.apdbkey; + pac_address_keys->apdbkey_hi = paca.apdbkey >> 64; + + iov.iov_base = &pac_enabled_key; + iov.iov_len = sizeof(pac_enabled_key); + ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); + if (ret) { + pr_perror("Failed to get authentication key mask for %d", pid); + return -1; + } + + pac_address_keys->pac_enabled_key = pac_enabled_key; + + } + if (hwcaps & HWCAP_PACG) { + PacGenericKeys *pac_generic_keys; + + pr_debug("%d: Dumping generic authentication keys\n", pid); + iov.iov_base = &pacg; + iov.iov_len = sizeof(pacg); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to get a generic authantication key for %d", pid); + return -1; + } + pac_generic_keys = xmalloc(sizeof(PacGenericKeys)); + if (!pac_generic_keys) + return -1; + pac_generic_keys__init(pac_generic_keys); + pac_entry->pac_generic_keys = pac_generic_keys; + pac_generic_keys->apgakey_lo = pacg.apgakey; + pac_generic_keys->apgakey_hi = pacg.apgakey >> 64; + } + return 0; +} + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; + struct cr_user_gcs gcs_live; + struct iovec gcs_iov = { + .iov_base = &gcs_live, + .iov_len = sizeof(gcs_live), + }; CoreEntry *core = x; // Save the Aarch64 CPU state @@ -35,15 +162,26 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsi assign_reg(core->ti_aarch64->gpregs, regs, pc); assign_reg(core->ti_aarch64->gpregs, regs, pstate); - // Save the FP/SIMD state - for (i = 0; i < 32; ++i) - { - core->ti_aarch64->fpsimd->vregs[2*i] = fpsimd->vregs[i]; - core->ti_aarch64->fpsimd->vregs[2*i + 1] = fpsimd->vregs[i] >> 64; + for (i = 0; i < 32; ++i) { + core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->fpstate.vregs[i]; + core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->fpstate.vregs[i] >> 64; + } 
+ assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpsr); + assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpcr); + + if (save_pac_keys(pid, core)) + return -1; + + /* Save the GCS state */ + if (compel_host_supports_gcs()) { + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_perror("Failed to get GCS for %d", pid); + return -1; + } + core->ti_aarch64->gcs->gcspr_el0 = gcs_live.gcspr_el0; + core->ti_aarch64->gcs->features_enabled = gcs_live.features_enabled; } - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); return 0; } @@ -53,6 +191,7 @@ int arch_alloc_thread_info(CoreEntry *core) ThreadInfoAarch64 *ti_aarch64; UserAarch64RegsEntry *gpregs; UserAarch64FpsimdContextEntry *fpsimd; + UserAarch64GcsEntry *gcs; ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); if (!ti_aarch64) @@ -65,7 +204,7 @@ int arch_alloc_thread_info(CoreEntry *core) goto err; user_aarch64_regs_entry__init(gpregs); - gpregs->regs = xmalloc(31*sizeof(uint64_t)); + gpregs->regs = xmalloc(31 * sizeof(uint64_t)); if (!gpregs->regs) goto err; gpregs->n_regs = 31; @@ -77,11 +216,20 @@ int arch_alloc_thread_info(CoreEntry *core) goto err; user_aarch64_fpsimd_context_entry__init(fpsimd); ti_aarch64->fpsimd = fpsimd; - fpsimd->vregs = xmalloc(64*sizeof(fpsimd->vregs[0])); + fpsimd->vregs = xmalloc(64 * sizeof(fpsimd->vregs[0])); fpsimd->n_vregs = 64; if (!fpsimd->vregs) goto err; + /* Allocate & init GCS */ + if (compel_host_supports_gcs()) { + gcs = xmalloc(sizeof(*gcs)); + if (!gcs) + goto err; + user_aarch64_gcs_entry__init(gcs); + ti_aarch64->gcs = gcs; + } + return 0; err: return -1; @@ -94,6 +242,12 @@ void arch_free_thread_info(CoreEntry *core) xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } + if (CORE_THREAD_ARCH_INFO(core)->pac_keys) { + PacKeys *pac_entry = CORE_THREAD_ARCH_INFO(core)->pac_keys; + xfree(pac_entry->pac_address_keys); + 
xfree(pac_entry->pac_generic_keys); + xfree(pac_entry); + } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); @@ -105,25 +259,38 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int i; struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs; if (core->ti_aarch64->fpsimd->n_vregs != 64) return 1; for (i = 0; i < 32; ++i) - fpsimd->vregs[i] = (__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i] | - ((__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i + 1] << 64); + fpsimd->vregs[i] = (__uint128_t)core->ti_aarch64->fpsimd->vregs[2 * i] | + ((__uint128_t)core->ti_aarch64->fpsimd->vregs[2 * i + 1] << 64); assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpsr); assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpcr); fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (compel_host_supports_gcs()) { + gcs = RT_SIGFRAME_GCS(sigframe); + + pr_debug("sigframe gcspr %llx enabled %llx\n", gcs->gcspr, gcs->features_enabled); + + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = core->ti_aarch64->gcs->gcspr_el0 - 8; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + } + return 0; } int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) { -#define CPREG1(d) f->uc.uc_mcontext.d = r->d +#define CPREG1(d) f->uc.uc_mcontext.d = r->d int i; @@ -137,3 +304,83 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) return 0; } + +int arch_ptrace_restore(int pid, struct pstree_item *item) +{ + unsigned long hwcaps = getauxval(AT_HWCAP); + struct cr_user_pac_address_keys upaca; + struct cr_user_pac_generic_keys upacg; + PacAddressKeys *paca; + PacGenericKeys *pacg; + long pac_enabled_keys; + struct iovec iov; + int ret; + + + pr_debug("%d: Restoring PAC keys\n", pid); + + paca = &rsti(item)->arch_info.pac_address_keys; + pacg = &rsti(item)->arch_info.pac_generic_keys; + if 
(rsti(item)->arch_info.has_paca) {
+		if (!(hwcaps & HWCAP_PACA)) { /* address keys were dumped but this host lacks HWCAP_PACA */
+			pr_err("PACA support is required from the source system.\n");
+			return 1;
+		}
+		pac_enabled_keys = rsti(item)->arch_info.pac_address_keys.pac_enabled_key;
+
+		upaca.apiakey = paca->apiakey_lo + ((__uint128_t)paca->apiakey_hi << 64);
+		upaca.apibkey = paca->apibkey_lo + ((__uint128_t)paca->apibkey_hi << 64);
+		upaca.apdakey = paca->apdakey_lo + ((__uint128_t)paca->apdakey_hi << 64);
+		upaca.apdbkey = paca->apdbkey_lo + ((__uint128_t)paca->apdbkey_hi << 64);
+
+		iov.iov_base = &upaca;
+		iov.iov_len = sizeof(upaca);
+
+		if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) {
+			pr_perror("Failed to set address authentication keys for %d", pid);
+			return 1;
+		}
+		iov.iov_base = &pac_enabled_keys;
+		iov.iov_len = sizeof(pac_enabled_keys);
+		if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) {
+			pr_perror("Failed to set enabled key mask for %d", pid);
+			return 1;
+		}
+	}
+
+	if (rsti(item)->arch_info.has_pacg) {
+		if (!(hwcaps & HWCAP_PACG)) {
+			pr_err("PACG support is required from the source system.\n");
+			return 1;
+		}
+		upacg.apgakey = pacg->apgakey_lo + ((__uint128_t)pacg->apgakey_hi << 64);
+		iov.iov_base = &upacg;
+		iov.iov_len = sizeof(upacg);
+		if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) {
+			pr_perror("Failed to set the generic authentication key for %d", pid);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+void arch_rsti_init(struct pstree_item *p)
+{
+	PacKeys *pac_keys = p->core[0]->ti_aarch64->pac_keys;
+
+	rsti(p)->arch_info.has_paca = false;
+	rsti(p)->arch_info.has_pacg = false;
+
+	if (!pac_keys)
+		return;
+
+	if (pac_keys->pac_address_keys) {
+		rsti(p)->arch_info.has_paca = true;
+		rsti(p)->arch_info.pac_address_keys = *pac_keys->pac_address_keys;
+	}
+	if (pac_keys->pac_generic_keys) {
+		rsti(p)->arch_info.has_pacg = true;
+		rsti(p)->arch_info.pac_generic_keys = *pac_keys->pac_generic_keys;
+	}
+}
diff --git a/criu/arch/aarch64/gcs.c 
b/criu/arch/aarch64/gcs.c new file mode 100644 index 000000000..4bdb9d2e4 --- /dev/null +++ b/criu/arch/aarch64/gcs.c @@ -0,0 +1,157 @@ +#include +#include + +#include +#include + +#include "asm/gcs-types.h" +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +#include +#include + +static bool task_has_gcs_enabled(UserAarch64GcsEntry *gcs) +{ + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; +} + +static bool host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool task_needs_gcs(struct pstree_item *item, CoreEntry *core) +{ + UserAarch64GcsEntry *gcs; + + if (!task_alive(item)) + return false; + + gcs = core->ti_aarch64->gcs; + + if (task_has_gcs_enabled(gcs)) { + if (!host_supports_gcs()) { + pr_warn_once("Restoring task with GCS on non-GCS host\n"); + return false; + } + + pr_info("Restoring task with GCS\n"); + return true; + } + + pr_info("Restoring a task without GCS\n"); + return false; +} + +static int gcs_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *gcs) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, gcs->gcspr_el0)) { + unsigned long premapped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + gcs->vma_start = vma->e->start; + gcs->vma_size = size; + gcs->premapped_addr = premapped_addr; + + return 0; + } + } + + pr_err("Unable to find a shadow stack vma: %lx\n", gcs->gcspr_el0); + return -1; +} + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + int i; + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *gcs = &ta->shstk; + + if (!task_needs_gcs(item, core)) + return 0; + + gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; + gcs->features_enabled = 
core->ti_aarch64->gcs->features_enabled;
+
+	if (gcs_prepare_task(vmas, gcs)) {
+		pr_err("gcs: failed to prepare shadow stack memory\n");
+		return -1;
+	}
+
+	for (i = 0; i < item->nr_threads; i++) {
+		struct thread_restore_args *thread_args = &args_array[i];
+
+		core = item->core[i];
+		gcs = &thread_args->shstk;
+
+		gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0;
+		gcs->features_enabled = core->ti_aarch64->gcs->features_enabled;
+
+		if (gcs_prepare_task(vmas, gcs)) {
+			pr_err("gcs: failed to prepare GCS memory\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core,
+			  int (*func)(void *arg), void *arg)
+{
+	int fret;
+	unsigned long flags = PR_SHADOW_STACK_ENABLE |
+			      PR_SHADOW_STACK_PUSH |
+			      PR_SHADOW_STACK_WRITE;
+
+	long ret, x1_after, x8_after;
+
+	/* If task doesn't need GCS, just call func */
+	if (!task_needs_gcs(item, core)) {
+		return func(arg);
+	}
+
+	pr_debug("gcs: GCS enable SVC about to fire: x8=%d x0=%d x1=0x%lx\n",
+		 __NR_prctl, PR_SET_SHADOW_STACK_STATUS, flags);
+
+	asm volatile(
+		"mov x0, %3\n" // x0 = PR_SET_SHADOW_STACK_STATUS (75)
+		"mov x1, %4\n" // x1 = flags
+		"mov x2, xzr\n" // x2 = 0
+		"mov x3, xzr\n" // x3 = 0
+		"mov x4, xzr\n" // x4 = 0
+		"mov x8, %5\n" // x8 = __NR_prctl (167)
+		"svc #0\n" // Invoke syscall
+		"mov %0, x0\n" // Capture return value
+		"mov %1, x1\n" // Capture x1 after
+		"mov %2, x8\n" // Capture x8 after
+		: "=r"(ret), "=r"(x1_after), "=r"(x8_after)
+		: "i"(PR_SET_SHADOW_STACK_STATUS), // x0 - %3rd
+		  "r"(flags), // x1 - %4th
+		  "i"(__NR_prctl) // x8 - %5th
+		: "x0", "x1", "x2", "x3", "x4", "x8", "memory", "cc");
+
+	pr_info("gcs: after SVC: ret=%ld x1=%ld x8=%ld\n", ret, x1_after, x8_after);
+
+	if (ret != 0) {
+		int err = (int)-ret; /* the raw SVC bypasses libc: the kernel returns -errno in x0 and errno is never set */
+		pr_err("gcs: failed to enable GCS: ret=%ld errno=%d (%s)\n", ret, err, strerror(err));
+		return -1;
+	}
+
+	fret = func(arg);
+	exit(fret);
+
+	return -1;
+}
diff --git a/criu/arch/aarch64/include/asm/dump.h 
b/criu/arch/aarch64/include/asm/dump.h index bc3dbcf3a..ecab061c3 100644 --- a/criu/arch/aarch64/include/asm/dump.h +++ b/criu/arch/aarch64/include/asm/dump.h @@ -1,11 +1,10 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); - static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_aarch64->tls = tls; diff --git a/criu/arch/aarch64/include/asm/gcs.h b/criu/arch/aarch64/include/asm/gcs.h new file mode 100644 index 000000000..28faa23b7 --- /dev/null +++ b/criu/arch/aarch64/include/asm/gcs.h @@ -0,0 +1,196 @@ +#ifndef __CR_ASM_GCS_H__ +#define __CR_ASM_GCS_H__ + +#include + +struct rst_shstk_info { + unsigned long vma_start; /* start of GCS VMA */ + unsigned long vma_size; /* size of GCS VMA */ + unsigned long premapped_addr; /* premapped buffer */ + unsigned long tmp_gcs; /* temp area for GCS if needed */ + u64 gcspr_el0; /* GCS pointer */ + u64 features_enabled; /* GCS flags */ +}; + +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_gcs_prepare + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline + +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *gcs, void *ptr) +{ + gcs->tmp_gcs = (long unsigned)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size + +#ifdef CR_NOGLIBC +#include +#include +#include "vma.h" + 
+static inline unsigned long gcs_map(unsigned long addr, unsigned long size, unsigned int flags) +{ + long gcspr = sys_map_shadow_stack(addr, size, flags); + pr_info("gcs: syscall: map_shadow_stack at=%lx size=%ld\n", addr, size); + + if (gcspr < 0) { + pr_err("gcs: failed to map GCS at %lx: %ld\n", addr, gcspr); + return -1; + } + + if (addr && gcspr != addr) { + pr_err("gcs: address mismatch: need %lx, got %lx\n", addr, gcspr); + return -1; + } + + pr_info("gcs: mmapped GCS at %lx\n", gcspr); + + return gcspr; +} + +/* clang-format off */ +static always_inline void gcsss1(unsigned long *Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static always_inline unsigned long *gcsss2(void) +{ + unsigned long *Xt; + + asm volatile ( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +static inline void gcsstr(unsigned long addr, unsigned long val) +{ + asm volatile( + "mov x0, %0\n" + "mov x1, %1\n" + ".inst 0xd91f1c01\n" // GCSSTR x1, [x0] + "mov x0, #0\n" + : + : "r"(addr), "r"(val) + : "x0", "x1", "memory"); +} +/* clang-format on */ + +static always_inline int gcs_restore(struct rst_shstk_info *gcs) +{ + unsigned long gcspr, val; + + if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { + return 0; + } + + gcspr = gcs->gcspr_el0 - 8; + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8); + pr_debug("gcs: [0] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8) | GCS_CAP_VALID_TOKEN; + gcspr -= 8; + pr_debug("gcs: [1] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + pr_debug("gcs: about to switch stacks via GCSSS1 to: %lx\n", gcspr); + gcsss1((unsigned long *)gcspr); + return 0; +} +#define arch_shstk_restore gcs_restore + +static always_inline int gcs_vma_restore(VmaEntry *vma_entry) +{ + unsigned long shstk, i, ret; + unsigned long *gcs_data = (void *)vma_premmaped_start(vma_entry); + unsigned 
long vma_size = vma_entry_len(vma_entry);
+
+	shstk = gcs_map(0, vma_size, SHADOW_STACK_SET_TOKEN);
+	/* gcs_map() logs the cause and returns (unsigned long)-1; the old "shstk < 0" test on an unsigned value never fired */
+	if (shstk == (unsigned long)-1)
+		return -1;
+
+	/* restore shadow stack contents */
+	for (i = 0; i < vma_size / 8; i++)
+		gcsstr(shstk + i * 8, gcs_data[i]);
+
+	pr_debug("unmap %lx %ld\n", (unsigned long)gcs_data, vma_size);
+	ret = sys_munmap(gcs_data, vma_size);
+	if ((long)ret < 0) { /* ret is declared unsigned long; compare as signed so a -errno result is seen */
+		pr_err("Failed to unmap premmaped shadow stack\n");
+		return ret;
+	}
+
+	vma_premmaped_start(vma_entry) = shstk;
+
+	return 0;
+}
+#define shstk_vma_restore gcs_vma_restore
+
+static always_inline int gcs_switch_to_restorer(struct rst_shstk_info *gcs)
+{
+	int ret;
+	unsigned long *ssp;
+	unsigned long addr;
+	unsigned long gcspr;
+
+	if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) {
+		return 0;
+	}
+
+	pr_debug("gcs->premapped_addr + gcs->vma_size = %lx\n", gcs->premapped_addr + gcs->vma_size);
+	pr_debug("gcs->tmp_gcs = %lx\n", gcs->tmp_gcs);
+	addr = gcs->tmp_gcs;
+
+	if (addr % PAGE_SIZE != 0) {
+		pr_err("gcs: 0x%lx not page-aligned to size 0x%lx\n", addr, PAGE_SIZE);
+		return -1;
+	}
+
+	ret = sys_munmap((void *)addr, PAGE_SIZE);
+	if (ret < 0) {
+		pr_err("gcs: Failed to unmap aarea for dumpee GCS VMAs\n");
+		return -1;
+	}
+
+	gcspr = gcs_map(addr, PAGE_SIZE, SHADOW_STACK_SET_TOKEN);
+
+	if (gcspr == -1) {
+		pr_err("gcs: failed to gcs_map(%lx, %lx)\n", (unsigned long)addr, PAGE_SIZE);
+		return -1;
+	}
+
+	ssp = (unsigned long *)(addr + PAGE_SIZE - 8);
+	gcsss1(ssp);
+
+	return 0;
+}
+#define arch_shstk_switch_to_restorer gcs_switch_to_restorer
+
+#endif /* CR_NOGLIBC */
+
+#endif /* __CR_ASM_GCS_H__ */
diff --git a/criu/arch/aarch64/include/asm/kerndat.h b/criu/arch/aarch64/include/asm/kerndat.h
index 60956b573..bb70cf6cf 100644
--- a/criu/arch/aarch64/include/asm/kerndat.h
+++ b/criu/arch/aarch64/include/asm/kerndat.h
@@ -1,7 +1,7 @@
 #ifndef __CR_ASM_KERNDAT_H__
 #define __CR_ASM_KERNDAT_H__
 
-#define 
kdat_compatible_cr() 0 -#define kdat_can_map_vdso() 0 +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/aarch64/include/asm/parasite.h b/criu/arch/aarch64/include/asm/parasite.h index 2a1e1c12e..cdcbc7909 100644 --- a/criu/arch/aarch64/include/asm/parasite.h +++ b/criu/arch/aarch64/include/asm/parasite.h @@ -4,7 +4,7 @@ static inline void arch_get_tls(tls_t *ptls) { tls_t tls; - asm("mrs %0, tpidr_el0" : "=r" (tls)); + asm("mrs %0, tpidr_el0" : "=r"(tls)); *ptls = tls; } diff --git a/criu/arch/aarch64/include/asm/restore.h b/criu/arch/aarch64/include/asm/restore.h index 3d794ffb5..c79605c40 100644 --- a/criu/arch/aarch64/include/asm/restore.h +++ b/criu/arch/aarch64/include/asm/restore.h @@ -5,6 +5,7 @@ #include "images/core.pb-c.h" +/* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ @@ -16,13 +17,23 @@ "r"(restore_task_exec_start), \ "r"(task_args) \ : "x0", "memory") +/* clang-format on */ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_aarch64->tls; } - int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); +#define ARCH_RST_INFO y +struct rst_arch_info { + bool has_paca, has_pacg; + PacAddressKeys pac_address_keys; + PacGenericKeys pac_generic_keys; +}; + +int arch_ptrace_restore(int pid, struct pstree_item *item); +void arch_rsti_init(struct pstree_item *current); + #endif diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index f502cdcaf..8f3edc257 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,14 +1,16 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" +#include "asm/gcs.h" #include "images/core.pb-c.h" #include +/* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ 
thread_args, clone_restore_fn) \ asm volatile( \ @@ -42,6 +44,68 @@ "r"(&thread_args[i]) \ : "x0", "x1", "x2", "x3", "x8", "memory") +/* + * Based on sysdeps/unix/sysv/linux/aarch64/clone.S + * + * int clone(int (*fn)(void *arg), x0 + * void *child_stack, x1 + * int flags, x2 + * void *arg, x3 + * pid_t *ptid, x4 + * struct user_desc *tls, x5 + * pid_t *ctid); x6 + * + * int clone3(struct clone_args *args, x0 + * size_t size); x1 + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. + * + */ + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* In contrast to the clone() wrapper above this does not put + * the thread function and its arguments on the child stack, + * but uses registers to pass these parameters to the child process. + * Based on the glibc clone() wrapper at + * sysdeps/unix/sysv/linux/aarch64/clone.S. + */ \ + "clone3_emul: \n" \ + /* + * Based on the glibc clone() wrapper, which uses x10 and x11 + * to save the arguments for the child process, this does the same. + * x10 for the thread function and x11 for the thread arguments. 
+ */ \ + "mov x10, %3 /* clone_restore_fn */ \n" \ + "mov x11, %4 /* args */ \n" \ + "mov x0, %1 /* &clone_args */ \n" \ + "mov x1, %2 /* size */ \n" \ + /* Load syscall number */ \ + "mov x8, #"__stringify(__NR_clone3)" \n" \ + /* Do the syscall */ \ + "svc #0 \n" \ + \ + "cbz x0, clone3_thread_run \n" \ + \ + "mov %0, x0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + /* Move args to x0 */ \ + "mov x0, x11 \n" \ + /* Jump to clone_restore_fn */ \ + "br x10 \n" \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "x0", "x1", "x8", "x10", "x11", "memory") + #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ @@ -50,20 +114,25 @@ : \ : "r"(ret) \ : "sp", "x0", "memory") +/* clang-format on */ - -#define arch_map_vdso(map, compat) -1 +#define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserAarch64RegsEntry *r); int restore_nonsigframe_gpregs(UserAarch64RegsEntry *r); static inline void restore_tls(tls_t *ptls) { - asm("msr tpidr_el0, %0" : : "r" (*ptls)); + asm("msr tpidr_el0, %0" : : "r"(*ptls)); } -static inline void *alloc_compat_syscall_stack(void) { return NULL; } -static inline void free_compat_syscall_stack(void *stack32) { } +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; diff --git a/criu/arch/aarch64/include/asm/thread_pointer.h b/criu/arch/aarch64/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/aarch64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h index e79f86698..db118cafd 100644 --- a/criu/arch/aarch64/include/asm/types.h +++ b/criu/arch/aarch64/include/asm/types.h @@ -12,7 +12,7 @@ #include -#define core_is_compat(core) false +#define core_is_compat(core) false typedef UserAarch64RegsEntry UserRegsEntry; @@ -22,10 +22,27 @@ typedef UserAarch64RegsEntry UserRegsEntry; #define TI_SP(core) ((core)->ti_aarch64->gpregs->sp) -static inline void *decode_pointer(uint64_t v) { return (void*)v; } -static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } +#define TI_IP(core) ((core)->ti_aarch64->gpregs->pc) + +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} + +/** + * See also: + * * arch/arm64/include/uapi/asm/auxvec.h + * * include/linux/auxvec.h + * * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 22 +#define AT_VECTOR_SIZE_ARCH 2 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) -#define AT_VECTOR_SIZE 40 typedef uint64_t auxv_t; typedef uint64_t 
tls_t; diff --git a/criu/arch/aarch64/include/asm/vdso.h b/criu/arch/aarch64/include/asm/vdso.h index 8a65e0947..e77d04cd5 100644 --- a/criu/arch/aarch64/include/asm/vdso.h +++ b/criu/arch/aarch64/include/asm/vdso.h @@ -9,23 +9,20 @@ * This is a minimal amount of symbols * we should support at the moment. */ -#define VDSO_SYMBOL_MAX 4 -#define VDSO_SYMBOL_GTOD 2 +#define VDSO_SYMBOL_MAX 4 +#define VDSO_SYMBOL_GTOD 2 /* * Workaround for VDSO array symbol table's relocation. * XXX: remove when compel/piegen will support aarch64. */ -static const char* __maybe_unused aarch_vdso_symbol1 = "__kernel_clock_getres"; -static const char* __maybe_unused aarch_vdso_symbol2 = "__kernel_clock_gettime"; -static const char* __maybe_unused aarch_vdso_symbol3 = "__kernel_gettimeofday"; -static const char* __maybe_unused aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__kernel_clock_getres"; \ + const char *aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ + const char *aarch_vdso_symbol3 = "__kernel_gettimeofday"; \ + const char *aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; -#define ARCH_VDSO_SYMBOLS \ - aarch_vdso_symbol1, \ - aarch_vdso_symbol2, \ - aarch_vdso_symbol3, \ - aarch_vdso_symbol4 +#define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4 extern void write_intraprocedure_branch(unsigned long to, unsigned long from); diff --git a/criu/arch/aarch64/sigframe.c b/criu/arch/aarch64/sigframe.c index be57c1670..8096fab66 100644 --- a/criu/arch/aarch64/sigframe.c +++ b/criu/arch/aarch64/sigframe.c @@ -2,8 +2,7 @@ #include #include "asm/sigframe.h" -int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } diff --git a/criu/arch/aarch64/vdso-pie.c b/criu/arch/aarch64/vdso-pie.c index 53d83cbe7..8b9d97274 100644 --- 
a/criu/arch/aarch64/vdso-pie.c +++ b/criu/arch/aarch64/vdso-pie.c @@ -8,13 +8,12 @@ #include "common/bug.h" #ifdef LOG_PREFIX -# undef LOG_PREFIX +#undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " -int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, - struct vdso_symtable *to, struct vdso_symtable *from, - bool __always_unused compat_vdso) +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, + struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; @@ -22,12 +21,10 @@ int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, if (vdso_symbol_empty(&from->symbols[i])) continue; - pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", - base_from, from->symbols[i].offset, - base_to, to->symbols[i].offset, i); + pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, from->symbols[i].offset, base_to, + to->symbols[i].offset, i); - write_intraprocedure_branch(base_to + to->symbols[i].offset, - base_from + from->symbols[i].offset); + write_intraprocedure_branch(base_to + to->symbols[i].offset, base_from + from->symbols[i].offset); } return 0; diff --git a/criu/arch/arm/cpu.c b/criu/arch/arm/cpu.c index 34313fb15..97a883b8c 100644 --- a/criu/arch/arm/cpu.c +++ b/criu/arch/arm/cpu.c @@ -1,4 +1,4 @@ -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c index c216cdc5c..6a5e4c89a 100644 --- a/criu/arch/arm/crtools.c +++ b/criu/arch/arm/crtools.c @@ -18,12 +18,11 @@ #include "elf.h" #include "parasite-syscall.h" #include "restorer.h" +#include "compel/infect.h" -#include +#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) -#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) - -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, 
user_fpregs_struct_t *fpregs) { CoreEntry *core = x; @@ -48,7 +47,6 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpre assign_reg(core->ti_arm->gpregs, regs, cpsr); core->ti_arm->gpregs->orig_r0 = regs->ARM_ORIG_r0; - // Save the VFP state memcpy(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, &fpregs->fpregs, sizeof(fpregs->fpregs)); @@ -78,7 +76,7 @@ int arch_alloc_thread_info(CoreEntry *core) goto err; user_arm_vfpstate_entry__init(fpstate); ti_arm->fpstate = fpstate; - fpstate->vfp_regs = xmalloc(32*sizeof(unsigned long long)); + fpstate->vfp_regs = xmalloc(32 * sizeof(unsigned long long)); fpstate->n_vfp_regs = 32; if (!fpstate->vfp_regs) goto err; @@ -114,8 +112,8 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r) { -#define CPREG1(d) f->sig.uc.uc_mcontext.arm_##d = r->d -#define CPREG2(d, s) f->sig.uc.uc_mcontext.arm_##d = r->s +#define CPREG1(d) f->sig.uc.uc_mcontext.arm_##d = r->d +#define CPREG2(d, s) f->sig.uc.uc_mcontext.arm_##d = r->s CPREG1(r0); CPREG1(r1); diff --git a/criu/arch/arm/include/asm/dump.h b/criu/arch/arm/include/asm/dump.h index 2382ba42c..b0ac5715d 100644 --- a/criu/arch/arm/include/asm/dump.h +++ b/criu/arch/arm/include/asm/dump.h @@ -1,11 +1,10 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); - static inline void core_put_tls(CoreEntry *core, tls_t tls) { core->ti_arm->tls = tls; diff --git a/criu/arch/arm/include/asm/kerndat.h b/criu/arch/arm/include/asm/kerndat.h index 60956b573..bb70cf6cf 100644 --- a/criu/arch/arm/include/asm/kerndat.h +++ b/criu/arch/arm/include/asm/kerndat.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_KERNDAT_H__ #define 
__CR_ASM_KERNDAT_H__ -#define kdat_compatible_cr() 0 -#define kdat_can_map_vdso() 0 +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/arm/include/asm/parasite.h b/criu/arch/arm/include/asm/parasite.h index 0ed320ba6..5911ef6b7 100644 --- a/criu/arch/arm/include/asm/parasite.h +++ b/criu/arch/arm/include/asm/parasite.h @@ -1,9 +1,10 @@ #ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ +/* kuser_get_tls() kernel-provided user-helper, the address is emulated */ static inline void arch_get_tls(tls_t *ptls) { - *ptls = ((tls_t (*)(void))0xffff0fe0)(); + *ptls = ((tls_t(*)(void))0xffff0fe0)(); } #endif diff --git a/criu/arch/arm/include/asm/restore.h b/criu/arch/arm/include/asm/restore.h index 4c64d58ef..09ff691a5 100644 --- a/criu/arch/arm/include/asm/restore.h +++ b/criu/arch/arm/include/asm/restore.h @@ -5,6 +5,7 @@ #include "images/core.pb-c.h" +/* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ @@ -16,14 +17,14 @@ : "r"(new_sp), \ "r"(restore_task_exec_start), \ "r"(task_args) \ - : "sp", "r0", "r1", "memory") + : "r0", "r1", "memory") +/* clang-format on */ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { *ptls = pcore->ti_arm->tls; } - int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif diff --git a/criu/arch/arm/include/asm/restorer.h b/criu/arch/arm/include/asm/restorer.h index 217d920e8..d35fda4bc 100644 --- a/criu/arch/arm/include/asm/restorer.h +++ b/criu/arch/arm/include/asm/restorer.h @@ -6,6 +6,7 @@ #include +/* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ @@ -43,6 +44,63 @@ "r"(&thread_args[i]) \ : "r0", "r1", "r2", "r3", "r7", "memory") + +/* + * The clone3() assembler wrapper is based on the clone() wrapper above + * and on code from the glibc wrapper at + * 
sysdeps/unix/sysv/linux/arm/clone.S + * + * For arm it is necessary to change the child stack as on x86_64 as + * it seems there are not registers which stay the same over a syscall + * like on s390x, ppc64le and aarch64. + * + * Changing the child stack means that this code has to deal with the + * kernel doing stack + stack_size implicitly. + * + * int clone3(struct clone_args *args, size_t size) + */ + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + /* Load thread stack pointer */ \ + "ldr r1, [%3] \n" \ + /* Load thread stack size */ \ + "mov r2, %4 \n" \ + /* Goto to the end of stack */ \ + "add r1, r1, r2 \n" \ + /* Load thread function and arguments and push on stack */ \ + "mov r2, %6 /* args */ \n" \ + "str r2, [r1, #4] /* args */ \n" \ + "mov r2, %5 /* function */ \n" \ + "str r2, [r1] /* function */ \n" \ + "mov r0, %1 /* clone_args */ \n" \ + "mov r1, %2 /* size */ \n" \ + "mov r7, #"__stringify(__NR_clone3)" \n" \ + "svc #0 \n" \ + \ + "cmp r0, #0 \n" \ + "beq thread3_run \n" \ + \ + "mov %0, r0 \n" \ + "b clone3_end \n" \ + \ + "thread3_run: \n" \ + "pop { r1 } \n" \ + "pop { r0 } \n" \ + "bx r1 \n" \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(&clone_args.stack), \ + "r"(clone_args.stack_size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "r0", "r1", "r2", "r7", "memory") + #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ @@ -51,32 +109,35 @@ : \ : "r"(ret) \ : "memory") +/* clang-format on */ - -#define arch_map_vdso(map, compat) -1 +#define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r); int restore_nonsigframe_gpregs(UserArmRegsEntry *r); #define ARCH_HAS_SHMAT_HOOK -unsigned long arch_shmat(int shmid, void *shmaddr, - int shmflg, unsigned long size); +unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size); -static inline void 
restore_tls(tls_t *ptls) { - asm ( - "mov r7, #15 \n" - "lsl r7, #16 \n" - "mov r0, #5 \n" - "add r7, r0 \n" /* r7 = 0xF005 */ - "ldr r0, [%0] \n" - "svc #0 \n" - : - : "r"(ptls) - : "r0", "r7" - ); +static inline void restore_tls(tls_t *ptls) +{ + asm("mov r7, #15 \n" + "lsl r7, #16 \n" + "mov r0, #5 \n" + "add r7, r0 \n" /* r7 = 0xF005 */ + "ldr r0, [%0] \n" + "svc #0 \n" + : + : "r"(ptls) + : "r0", "r7"); } -static inline void *alloc_compat_syscall_stack(void) { return NULL; } -static inline void free_compat_syscall_stack(void *stack32) { } +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; diff --git a/criu/arch/arm/include/asm/thread_pointer.h b/criu/arch/arm/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/arm/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . 
*/ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/arm/include/asm/types.h b/criu/arch/arm/include/asm/types.h index 32612a692..93d2dc23d 100644 --- a/criu/arch/arm/include/asm/types.h +++ b/criu/arch/arm/include/asm/types.h @@ -11,7 +11,7 @@ #include -#define core_is_compat(core) false +#define core_is_compat(core) false typedef UserArmRegsEntry UserRegsEntry; @@ -21,10 +21,18 @@ typedef UserArmRegsEntry UserRegsEntry; #define TI_SP(core) ((core)->ti_arm->gpregs->sp) -static inline void *decode_pointer(u64 v) { return (void*)(u32)v; } -static inline u64 encode_pointer(void *p) { return (u32)p; } +#define TI_IP(core) ((core)->ti_arm->gpregs->ip) -#define AT_VECTOR_SIZE 40 +static inline void *decode_pointer(u64 v) +{ + return (void *)(u32)v; +} +static inline u64 encode_pointer(void *p) +{ + return (u32)p; +} + +#define AT_VECTOR_SIZE 40 typedef uint32_t auxv_t; typedef uint32_t tls_t; diff --git a/criu/arch/arm/include/asm/vdso.h b/criu/arch/arm/include/asm/vdso.h index f57790ac2..5787bfe17 100644 --- a/criu/arch/arm/include/asm/vdso.h +++ b/criu/arch/arm/include/asm/vdso.h @@ -9,10 +9,11 @@ * * Poke from kernel file arch/arm/vdso/vdso.lds.S */ -#define VDSO_SYMBOL_MAX 2 -#define VDSO_SYMBOL_GTOD 1 -#define ARCH_VDSO_SYMBOLS \ - "__vdso_clock_gettime", \ - "__vdso_gettimeofday" +#define VDSO_SYMBOL_MAX 2 +#define VDSO_SYMBOL_GTOD 1 +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol2 = "__vdso_gettimeofday"; +#define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, #endif /* __CR_ASM_VDSO_H__ */ diff --git a/criu/arch/arm/restorer.c b/criu/arch/arm/restorer.c index 588c1c074..fd4b636b0 100644 --- a/criu/arch/arm/restorer.c +++ b/criu/arch/arm/restorer.c @@ -41,8 +41,7 @@ int 
restore_nonsigframe_gpregs(UserArmRegsEntry *r) * allocated with shmat(shmid, NULL, shmflg). */ #define SHMLBA (4UL * PAGE_SIZE) -unsigned long arch_shmat(int shmid, void *shmaddr, - int shmflg, unsigned long size) +unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size) { unsigned long smap; @@ -65,8 +64,7 @@ unsigned long arch_shmat(int shmid, void *shmaddr, pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached (e.g., ARMv7 -> ARMv6)\n"); pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n"); - smap = sys_mremap(smap, size, size, - MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr); + smap = sys_mremap(smap, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr); if (IS_ERR_VALUE(smap)) pr_err("mremap() for shmem failed: %d\n", (int)smap); return smap; diff --git a/criu/arch/arm/sigframe.c b/criu/arch/arm/sigframe.c index be57c1670..8096fab66 100644 --- a/criu/arch/arm/sigframe.c +++ b/criu/arch/arm/sigframe.c @@ -2,8 +2,7 @@ #include #include "asm/sigframe.h" -int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } diff --git a/criu/arch/arm/vdso-pie.c b/criu/arch/arm/vdso-pie.c index 0ec8bd9a8..ecfe6a498 100644 --- a/criu/arch/arm/vdso-pie.c +++ b/criu/arch/arm/vdso-pie.c @@ -9,32 +9,31 @@ #include "common/bug.h" #ifdef LOG_PREFIX -# undef LOG_PREFIX +#undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static void insert_trampoline(uintptr_t from, uintptr_t to) { struct { - uint32_t ldr_pc; - uint32_t imm32; - uint32_t guards; + uint32_t ldr_pc; + uint32_t imm32; + uint32_t guards; } __packed jmp = { - .ldr_pc = 0xe51ff004, /* ldr pc, [pc, #-4] */ - .imm32 = to, - .guards = 0xe1200070, /* bkpt 0x0000 */ + .ldr_pc = 0xe51ff004, /* ldr pc, [pc, #-4] */ + .imm32 = to, + .guards = 0xe1200070, /* bkpt 0x0000 */ }; - void 
*iflush_start = (void *)from; - void *iflush_end = iflush_start + sizeof(jmp); + void *iflush_start = (void *)from; + void *iflush_end = iflush_start + sizeof(jmp); memcpy((void *)from, &jmp, sizeof(jmp)); __builtin___clear_cache(iflush_start, iflush_end); } -int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, - struct vdso_symtable *sto, struct vdso_symtable *sfrom, - bool compat_vdso) +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) { unsigned int i; @@ -44,9 +43,8 @@ int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, if (vdso_symbol_empty(&sfrom->symbols[i])) continue; - pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", - base_from, sfrom->symbols[i].offset, - base_to, sto->symbols[i].offset, i); + pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); from = base_from + sfrom->symbols[i].offset; to = base_to + sto->symbols[i].offset; diff --git a/criu/arch/loongarch64/Makefile b/criu/arch/loongarch64/Makefile new file mode 100644 index 000000000..4bd99eb7e --- /dev/null +++ b/criu/arch/loongarch64/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + +ccflags-y += -iquote $(obj)/include +ccflags-y += -iquote criu/include -iquote include +ccflags-y += $(COMPEL_UAPI_INCLUDES) + +asflags-y += -Wstrict-prototypes +asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o diff --git a/criu/arch/loongarch64/cpu.c b/criu/arch/loongarch64/cpu.c new file mode 100644 index 000000000..5559c4288 --- /dev/null +++ b/criu/arch/loongarch64/cpu.c @@ -0,0 +1,31 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + 
return 0;
+}
+
+int cpuinfo_dump(void)
+{
+	if (cpu_init())
+		return -1;
+	if (cpu_dump_cpuinfo())
+		return -1;
+	return 0;
+}
+
+int cpuinfo_check(void)
+{
+	return 0;
+}
diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c
new file mode 100644
index 000000000..783951b5b
--- /dev/null
+++ b/criu/arch/loongarch64/crtools.c
@@ -0,0 +1,141 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "types.h"
+#include "log.h"
+#include "asm/restorer.h"
+#include "asm/parasite-syscall.h"
+#include
+#include "asm/dump.h"
+#include "cr_options.h"
+#include "common/compiler.h"
+#include "restorer.h"
+#include "parasite-syscall.h"
+#include "util.h"
+#include "cpu.h"
+#include
+#include "kerndat.h"
+
+#include "protobuf.h"
+#include "images/core.pb-c.h"
+#include "images/creds.pb-c.h"
+
+#define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e
+
+/*
+ * Save the register state of a task into its image entry.
+ *
+ * @x is the CoreEntry whose loongarch64 thread info receives the values;
+ * @regs and @fpregs hold the general-purpose and floating-point registers
+ * to copy. @pid is not used by this implementation. Always returns 0.
+ */
+int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs)
+{
+	int i;
+	CoreEntry *core = x;
+	UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs;
+	UserLoongarch64FpregsEntry *fprs = core->ti_loongarch64->fpregs;
+	for (i = 0; i < GPR_NUM; i++)
+		assign_reg(gprs, regs, regs[i]);
+	assign_reg(gprs, regs, pc);
+
+	/* Fill the image entry (fprs) from the captured FP state (fpregs). */
+	for (i = 0; i < FPR_NUM; i++)
+		assign_reg(fprs, fpregs, regs[i]);
+	assign_reg(fprs, fpregs, fcc);
+	assign_reg(fprs, fpregs, fcsr);
+	return 0;
+}
+
+/*
+ * Allocate core->ti_loongarch64 together with its gpregs and fpregs entries
+ * and their register arrays (GPR_NUM and FPR_NUM slots respectively).
+ *
+ * Each entry is linked into the thread info right after it is initialised,
+ * so arch_free_thread_info() can release a partially built structure.
+ * Returns 0 on success, -1 when an allocation fails.
+ */
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoLoongarch64 *ti_loongarch64;
+	UserLoongarch64GpregsEntry *gpregs;
+	UserLoongarch64FpregsEntry *fpregs;
+
+	ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64));
+	if (!ti_loongarch64)
+		goto err;
+	thread_info_loongarch64__init(ti_loongarch64);
+	core->ti_loongarch64 = ti_loongarch64;
+
+	gpregs = xmalloc(sizeof(*gpregs));
+	if (!gpregs)
+		goto err;
+	user_loongarch64_gpregs_entry__init(gpregs);
+	ti_loongarch64->gpregs = gpregs;
+	gpregs->n_regs = GPR_NUM;
+	gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t));
+	if (!gpregs->regs)
+		goto err;
+
+	fpregs = xmalloc(sizeof(*fpregs));
+	if (!fpregs)
+		goto err;
+	user_loongarch64_fpregs_entry__init(fpregs);
+	ti_loongarch64->fpregs = fpregs;
+	fpregs->n_regs = FPR_NUM;
+	fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t));
+	if (!fpregs->regs)
+		goto err;
+
+	return 0;
+err:
+	return -1;
+}
+
+/*
+ * Free the thread info attached to @core along with its register entries,
+ * then clear the pointer. Entries that were never allocated are skipped.
+ */
+void arch_free_thread_info(CoreEntry *core)
+{
+	if (CORE_THREAD_ARCH_INFO(core)) {
+		if (CORE_THREAD_ARCH_INFO(core)->fpregs) {
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs);
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpregs);
+		}
+		if (CORE_THREAD_ARCH_INFO(core)->gpregs) {
+			xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs);
+			xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+		}
+		xfree(CORE_THREAD_ARCH_INFO(core));
+		CORE_THREAD_ARCH_INFO(core) = NULL;
+	}
+}
+
+/* Load the FPU context of @sigframe from the fpregs entry of @core; returns 0. */
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+	fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe);
+	UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs;
+
+	memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs));
+	fpu->fcc = fpregs->fcc;
+	fpu->fcsr = fpregs->fcsr;
+	return 0;
+}
+
+/* Load the signal context of @sigframe (regs and pc) from @r; returns 0. */
+int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r)
+{
+	sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe);
+	memcpy(sc->regs, r->regs, sizeof(sc->regs));
+	sc->pc = r->pc;
+	return 0;
+}
diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h
new file mode 100644
index 000000000..a1c0c4c58
--- /dev/null
+++ b/criu/arch/loongarch64/include/asm/dump.h
@@ -0,0 +1,15 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+static inline void core_put_tls(CoreEntry *core, tls_t tls)
+{
+	core->ti_loongarch64->tls = tls;
+}
+
+#define get_task_futex_robust_list_compat(pid, info) -1
+
+#endif
diff --git a/criu/arch/loongarch64/include/asm/int.h
b/criu/arch/loongarch64/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/kerndat.h b/criu/arch/loongarch64/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/loongarch64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/parasite-syscall.h b/criu/arch/loongarch64/include/asm/parasite-syscall.h new file mode 100644 index 000000000..6008c3792 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/loongarch64/include/asm/parasite.h b/criu/arch/loongarch64/include/asm/parasite.h new file mode 100644 index 000000000..b64cb3185 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite.h @@ -0,0 +1,11 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm volatile("or %0, $zero, $tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/loongarch64/include/asm/restore.h b/criu/arch/loongarch64/include/asm/restore.h new file mode 100644 index 000000000..d956231c8 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restore.h @@ -0,0 +1,33 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ +({ \ + uint64_t save_sp; \ + asm volatile("or %0, $zero, $sp" : 
"=r"(save_sp) : :"memory"); \ + asm volatile( \ + "or $a0, $zero, %2 \n" \ + "or $sp, $zero, %0 \n" \ + "jirl $ra, %1, 0 \n" \ + : \ + : "r"(new_sp & ~15), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "$a0", "memory"); \ + asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ +}) + +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_loongarch64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/loongarch64/include/asm/restorer.h b/criu/arch/loongarch64/include/asm/restorer.h new file mode 100644 index 000000000..7a0d35c5b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restorer.h @@ -0,0 +1,97 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include "asm/types.h" +#include +#include "images/core.pb-c.h" +#include +#include + +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld.d $a1, %2 \n" \ + "addi.d $a1, $a1, -16 \n" \ + "st.d %5, $a1, 0 \n" \ + "st.d %6, $a1, 8 \n" \ + "or $a0, $zero, %1 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone_end \n" \ + \ + "thread_run: \n" \ + "ld.d $a1, $sp, 0 \n" \ + "ld.d $a0, $sp, 8 \n" \ + "jirl $ra, $a1, 0 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "ZB"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(&clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + "or $a0, $zero, %1 \n" \ + "or $a1, $zero, %2 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, 
$zero, "__stringify(__NR_clone3)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, clone3_thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + "or $a0, $zero, $a3 \n" \ + "jirl $ra, $a2, 0 \n" \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") +/* clang-format on */ + +static inline void restore_tls(tls_t *ptls) +{ + asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); + +#define arch_map_vdso(map, compat) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/thread_pointer.h b/criu/arch/loongarch64/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/loongarch64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
/*
 * Return the numeric value of pointer @p as a 64-bit unsigned integer.
 */
static inline uint64_t encode_pointer(void *p)
{
	uint64_t encoded = (uint64_t)p;

	return encoded;
}

/*
 * Inverse of encode_pointer(): reinterpret the 64-bit integer @v as a
 * pointer.
 */
static inline void *decode_pointer(uint64_t v)
{
	void *decoded = (void *)v;

	return decoded;
}
/*
 * Overwrite the code at address @from with a small jump stub whose
 * destination is @to.
 *
 * The stub is a packed 24-byte record: four 32-bit instruction words
 * followed by the 64-bit destination address. Because of the four leading
 * uint32_t fields, imm64 sits 16 bytes after the first instruction word.
 *
 * Presumably "pcaddi $x, 4" produces the address four instruction words
 * ahead (i.e. the address of imm64), ldptr.d loads the destination from
 * there and jirl jumps to it -- confirm against the LoongArch ISA manual.
 * The "break 0" word is never reached if the jump is taken.
 *
 * NOTE(review): no instruction-cache synchronisation is issued after the
 * copy -- confirm that none is required on this architecture.
 */
static void insert_trampoline(uintptr_t from, uintptr_t to)
{
	struct {
		uint32_t pcaddi;
		uint32_t ldptr;
		uint32_t jirl;
		uint32_t guards;
		uint64_t imm64;
	} __packed jmp = {
		.pcaddi = 0x18000095, /* pcaddi $x, 4 */
		.ldptr = 0x260002b5, /* ldptr.d $x, $x, 0 */
		.jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */
		.guards = 0x002a0000, /* break 0 */
		.imm64 = to, /* absolute destination, read by the ldptr.d above */
	};
	/* Patch the stub over whatever code currently lives at @from. */
	memcpy((void *)from, &jmp, sizeof(jmp));
}
/*
 * Run cpu_init() and then cpu_dump_cpuinfo().
 *
 * Returns 0 when both succeed and -1 as soon as one of them reports a
 * non-zero result; cpu_dump_cpuinfo() is not called if cpu_init() fails.
 */
int cpuinfo_dump(void)
{
	/* || short-circuits, so the second step is skipped after a failure. */
	int failed = cpu_init() || cpu_dump_cpuinfo();

	return failed ? -1 : 0;
}
core->ti_mips->gpregs->r22 = regs->regs[22]; + core->ti_mips->gpregs->r23 = regs->regs[23]; + core->ti_mips->gpregs->r24 = regs->regs[24]; + core->ti_mips->gpregs->r25 = regs->regs[25]; + core->ti_mips->gpregs->r26 = regs->regs[26]; + core->ti_mips->gpregs->r27 = regs->regs[27]; + core->ti_mips->gpregs->r28 = regs->regs[28]; + core->ti_mips->gpregs->r29 = regs->regs[29]; + core->ti_mips->gpregs->r30 = regs->regs[30]; + core->ti_mips->gpregs->r31 = regs->regs[31]; + + core->ti_mips->gpregs->lo = regs->lo; + core->ti_mips->gpregs->hi = regs->hi; + core->ti_mips->gpregs->cp0_epc = regs->cp0_epc; + core->ti_mips->gpregs->cp0_badvaddr = regs->cp0_badvaddr; + core->ti_mips->gpregs->cp0_status = regs->cp0_status; + core->ti_mips->gpregs->cp0_cause = regs->cp0_cause; + + core->ti_mips->fpregs->r0 = fpregs->regs[0]; + core->ti_mips->fpregs->r1 = fpregs->regs[1]; + core->ti_mips->fpregs->r2 = fpregs->regs[2]; + core->ti_mips->fpregs->r3 = fpregs->regs[3]; + core->ti_mips->fpregs->r4 = fpregs->regs[4]; + core->ti_mips->fpregs->r5 = fpregs->regs[5]; + core->ti_mips->fpregs->r6 = fpregs->regs[6]; + core->ti_mips->fpregs->r7 = fpregs->regs[7]; + core->ti_mips->fpregs->r8 = fpregs->regs[8]; + core->ti_mips->fpregs->r9 = fpregs->regs[9]; + core->ti_mips->fpregs->r10 = fpregs->regs[10]; + core->ti_mips->fpregs->r11 = fpregs->regs[11]; + core->ti_mips->fpregs->r12 = fpregs->regs[12]; + core->ti_mips->fpregs->r13 = fpregs->regs[13]; + core->ti_mips->fpregs->r14 = fpregs->regs[14]; + core->ti_mips->fpregs->r15 = fpregs->regs[15]; + core->ti_mips->fpregs->r16 = fpregs->regs[16]; + core->ti_mips->fpregs->r17 = fpregs->regs[17]; + core->ti_mips->fpregs->r18 = fpregs->regs[18]; + core->ti_mips->fpregs->r19 = fpregs->regs[19]; + core->ti_mips->fpregs->r20 = fpregs->regs[20]; + core->ti_mips->fpregs->r21 = fpregs->regs[21]; + core->ti_mips->fpregs->r22 = fpregs->regs[22]; + core->ti_mips->fpregs->r23 = fpregs->regs[23]; + core->ti_mips->fpregs->r24 = fpregs->regs[24]; + 
core->ti_mips->fpregs->r25 = fpregs->regs[25]; + core->ti_mips->fpregs->r26 = fpregs->regs[26]; + core->ti_mips->fpregs->r27 = fpregs->regs[27]; + core->ti_mips->fpregs->r28 = fpregs->regs[28]; + core->ti_mips->fpregs->r29 = fpregs->regs[29]; + core->ti_mips->fpregs->r30 = fpregs->regs[30]; + core->ti_mips->fpregs->r31 = fpregs->regs[31]; + core->ti_mips->fpregs->fpu_fcr31 = fpregs->fpu_fcr31; + core->ti_mips->fpregs->fpu_id = fpregs->fpu_id; + + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoMips *ti_mips; + UserMipsRegsEntry *gpregs; + UserMipsFpregsEntry *fpregs; + + ti_mips = xmalloc(sizeof(*ti_mips)); + if (!ti_mips) + goto err; + + thread_info_mips__init(ti_mips); + core->ti_mips = ti_mips; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) { + xfree(ti_mips); + goto err; + } + + user_mips_regs_entry__init(gpregs); + ti_mips->gpregs = gpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) { + xfree(ti_mips); + xfree(gpregs); + goto err; + } + + user_mips_fpregs_entry__init(fpregs); + ti_mips->fpregs = fpregs; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (!core->ti_mips) + return; + + if (core->ti_mips->gpregs) + xfree(core->ti_mips->gpregs); + + if (core->ti_mips->fpregs) + xfree(core->ti_mips->fpregs); + + xfree(core->ti_mips); +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + struct rt_sigframe *f = sigframe; + UserMipsFpregsEntry *r = core->ti_mips->fpregs; + + f->rs_uc.uc_mcontext.sc_fpregs[0] = r->r0; + f->rs_uc.uc_mcontext.sc_fpregs[1] = r->r1; + f->rs_uc.uc_mcontext.sc_fpregs[2] = r->r2; + f->rs_uc.uc_mcontext.sc_fpregs[3] = r->r3; + f->rs_uc.uc_mcontext.sc_fpregs[4] = r->r4; + f->rs_uc.uc_mcontext.sc_fpregs[5] = r->r5; + f->rs_uc.uc_mcontext.sc_fpregs[6] = r->r6; + f->rs_uc.uc_mcontext.sc_fpregs[7] = r->r7; + f->rs_uc.uc_mcontext.sc_fpregs[8] = r->r8; + f->rs_uc.uc_mcontext.sc_fpregs[9] = r->r9; + f->rs_uc.uc_mcontext.sc_fpregs[10] = r->r10; + 
f->rs_uc.uc_mcontext.sc_fpregs[11] = r->r11; + f->rs_uc.uc_mcontext.sc_fpregs[12] = r->r12; + f->rs_uc.uc_mcontext.sc_fpregs[13] = r->r13; + f->rs_uc.uc_mcontext.sc_fpregs[14] = r->r14; + f->rs_uc.uc_mcontext.sc_fpregs[15] = r->r15; + f->rs_uc.uc_mcontext.sc_fpregs[16] = r->r16; + f->rs_uc.uc_mcontext.sc_fpregs[17] = r->r17; + f->rs_uc.uc_mcontext.sc_fpregs[18] = r->r18; + f->rs_uc.uc_mcontext.sc_fpregs[19] = r->r19; + f->rs_uc.uc_mcontext.sc_fpregs[20] = r->r20; + f->rs_uc.uc_mcontext.sc_fpregs[21] = r->r21; + f->rs_uc.uc_mcontext.sc_fpregs[22] = r->r22; + f->rs_uc.uc_mcontext.sc_fpregs[23] = r->r23; + f->rs_uc.uc_mcontext.sc_fpregs[24] = r->r24; + f->rs_uc.uc_mcontext.sc_fpregs[25] = r->r25; + f->rs_uc.uc_mcontext.sc_fpregs[26] = r->r26; + f->rs_uc.uc_mcontext.sc_fpregs[27] = r->r27; + f->rs_uc.uc_mcontext.sc_fpregs[28] = r->r28; + f->rs_uc.uc_mcontext.sc_fpregs[29] = r->r29; + f->rs_uc.uc_mcontext.sc_fpregs[30] = r->r30; + f->rs_uc.uc_mcontext.sc_fpregs[31] = r->r31; + + return 0; +} + +int restore_gpregs(struct rt_sigframe *f, UserMipsRegsEntry *r) +{ + f->rs_uc.uc_mcontext.sc_regs[0] = r->r0; + f->rs_uc.uc_mcontext.sc_regs[1] = r->r1; + f->rs_uc.uc_mcontext.sc_regs[2] = r->r2; + f->rs_uc.uc_mcontext.sc_regs[3] = r->r3; + f->rs_uc.uc_mcontext.sc_regs[4] = r->r4; + f->rs_uc.uc_mcontext.sc_regs[5] = r->r5; + f->rs_uc.uc_mcontext.sc_regs[6] = r->r6; + f->rs_uc.uc_mcontext.sc_regs[7] = r->r7; + f->rs_uc.uc_mcontext.sc_regs[8] = r->r8; + f->rs_uc.uc_mcontext.sc_regs[9] = r->r9; + f->rs_uc.uc_mcontext.sc_regs[10] = r->r10; + f->rs_uc.uc_mcontext.sc_regs[11] = r->r11; + f->rs_uc.uc_mcontext.sc_regs[12] = r->r12; + f->rs_uc.uc_mcontext.sc_regs[13] = r->r13; + f->rs_uc.uc_mcontext.sc_regs[14] = r->r14; + f->rs_uc.uc_mcontext.sc_regs[15] = r->r15; + f->rs_uc.uc_mcontext.sc_regs[16] = r->r16; + f->rs_uc.uc_mcontext.sc_regs[17] = r->r17; + f->rs_uc.uc_mcontext.sc_regs[18] = r->r18; + f->rs_uc.uc_mcontext.sc_regs[19] = r->r19; + f->rs_uc.uc_mcontext.sc_regs[20] = r->r20; + 
f->rs_uc.uc_mcontext.sc_regs[21] = r->r21; + f->rs_uc.uc_mcontext.sc_regs[22] = r->r22; + f->rs_uc.uc_mcontext.sc_regs[23] = r->r23; + f->rs_uc.uc_mcontext.sc_regs[24] = r->r24; + f->rs_uc.uc_mcontext.sc_regs[25] = r->r25; + f->rs_uc.uc_mcontext.sc_regs[26] = r->r26; + f->rs_uc.uc_mcontext.sc_regs[27] = r->r27; + f->rs_uc.uc_mcontext.sc_regs[28] = r->r28; + f->rs_uc.uc_mcontext.sc_regs[29] = r->r29; + f->rs_uc.uc_mcontext.sc_regs[30] = r->r30; + f->rs_uc.uc_mcontext.sc_regs[31] = r->r31; + + f->rs_uc.uc_mcontext.sc_mdlo = r->lo; + f->rs_uc.uc_mcontext.sc_mdhi = r->hi; + f->rs_uc.uc_mcontext.sc_pc = r->cp0_epc; + + return 0; +} + +int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) +{ + return 0; +} diff --git a/criu/arch/mips/include/asm/dump.h b/criu/arch/mips/include/asm/dump.h new file mode 100644 index 000000000..ec59b051b --- /dev/null +++ b/criu/arch/mips/include/asm/dump.h @@ -0,0 +1,14 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); +extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_mips->tls = tls; +} + +#endif diff --git a/criu/arch/mips/include/asm/int.h b/criu/arch/mips/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/mips/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/mips/include/asm/kerndat.h b/criu/arch/mips/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/mips/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 
/*
 * JUMP_TO_RESTORER_BLOB - switch to a new stack and call the restorer entry.
 *
 * @new_sp:                  value loaded into the stack pointer ($29).
 * @restore_task_exec_start: address called through $25.
 * @task_args:               value handed to the callee in $4.
 *
 * All three operands use the "r" constraint: every one of them is consumed
 * by a "move" instruction, which only accepts register operands. The
 * previous "g" constraint on new_sp also allowed the compiler to pick a
 * memory or immediate operand, which "move" cannot encode.
 *
 * NOTE(review): jalr writes the return address into $31 and the callee may
 * touch memory, yet neither "$31" nor "memory" is in the clobber list.
 * Presumably this is tolerated because the restorer does not return here --
 * confirm.
 */
#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args)	\
	asm volatile(								\
		"move $4, %0 \n"	/* $4 = task_args */			\
		"move $25, %1 \n"	/* $25 = entry point */			\
		"move $5, %2 \n"	/* $5 = new_sp */			\
		"move $29, $5 \n"	/* switch stacks */			\
		"jalr $25 \n"							\
		"nop \n"		/* branch delay slot */			\
		:								\
		: "r"(task_args), "r"(restore_task_exec_start),			\
		  "r"(new_sp)							\
		: "$25", "$4", "$5")
+#include "images/core.pb-c.h" +#include +#include + +static inline void restore_tls(tls_t *ptls) +{ + /* clang-format off */ + asm volatile("move $4, %0 \n" + "li $2, " __stringify(__NR_set_thread_area) " \n" + "syscall \n" + : + : "r"(*ptls) + : "$4", "$2", "memory"); + /* clang-format on */ +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} + +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "ld $5,%2 \n" /* a1 = new_sp */ \ + "dsubu $5,32 \n" \ + "sd %5,0($5) \n" \ + "sd %6,8($5) \n" \ + "sd %1,16($5) \n" \ + "move $4,%1 \n" /* a0=flags */ \ + "move $6,%3 \n" /* a2=parent_tid */ \ + "li $7,0 \n" /* a3 = tls is 0 */ \ + "move $8,%4 \n" /* a4 = child_tid */ \ + "li $2, "__stringify(__NR_clone)" \n" \ + "syscall \n" /* syscall */ \ + "sync \n" \ + "bnez $7,err \n" \ + "nop \n" \ + "beqz $2,thread_start \n" \ + "nop \n" \ + "move %0,$2 \n" \ + "b end \n" \ + "err:break \n" \ + "thread_start: \n" \ + "ld $25,0($29) \n" \ + "ld $4,8($29) \n" \ + "jal $25 \n" \ + "nop \n" \ + "end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "m"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(clone_restore_fn), \ + "r"(&thread_args[i]) \ + :"$2","$4","$5","$6","$7","$8","$25","memory") + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) do { \ + pr_err("This architecture does not support clone3() with set_tid, yet!\n"); \ + ret = -1; \ +} while (0) +/* clang-format on */ + +#define kdat_compatible_cr() 0 +#define arch_map_vdso(map, compat) -1 + +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +int restore_gpregs(struct rt_sigframe *f, UserMipsRegsEntry *r); +int 
restore_nonsigframe_gpregs(UserMipsRegsEntry *r); + +#define ARCH_HAS_SHMAT_HOOK +unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size); + +#endif diff --git a/criu/arch/mips/include/asm/syscall32.h b/criu/arch/mips/include/asm/syscall32.h new file mode 100644 index 000000000..a6e298217 --- /dev/null +++ b/criu/arch/mips/include/asm/syscall32.h @@ -0,0 +1,17 @@ +#ifndef __CR_SYSCALL32_H__ +#define __CR_SYSCALL32_H__ + +extern long sys_socket(int domain, int type, int protocol); +extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen); +extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len); +extern long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len); +extern long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags); +extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags); +extern long sys_shutdown(int sockfd, int how); +extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen); +extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen); +extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen); +extern long sys_shmat(int shmid, void *shmaddr, int shmflag); +extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos); + +#endif /* __CR_SYSCALL32_H__ */ diff --git a/criu/arch/mips/include/asm/thread_pointer.h b/criu/arch/mips/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/mips/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/mips/include/asm/types.h b/criu/arch/mips/include/asm/types.h new file mode 100644 index 000000000..2c75b6a92 --- /dev/null +++ b/criu/arch/mips/include/asm/types.h @@ -0,0 +1,38 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +#include "images/core.pb-c.h" + +#define core_is_compat(core) false + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__MIPS + +#define CORE_THREAD_ARCH_INFO(core) core->ti_mips + +#define TI_IP(core) ((core)->ti_mips->gpregs->cp0_epc) + +typedef UserMipsRegsEntry UserRegsEntry; + +static inline u64 encode_pointer(void *p) +{ + return (u64)p; +} +static inline void *decode_pointer(u64 v) +{ + return (void *)v; +} + +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; +typedef unsigned long tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/mips/include/asm/vdso.h b/criu/arch/mips/include/asm/vdso.h new file mode 100644 index 000000000..294c369ca --- /dev/null +++ b/criu/arch/mips/include/asm/vdso.h @@ -0,0 +1,23 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + 
+#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 3 +#define VDSO_SYMBOL_GTOD 0 +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol2 = "__vdso_gettimeofday"; \ + const char *aarch_vdso_symbol3 = "__vdso_clock_getres"; +#define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, + +#endif /* __CR_ASM_VDSO_H__ */ diff --git a/criu/arch/mips/restorer.c b/criu/arch/mips/restorer.c new file mode 100644 index 000000000..45a0f0a64 --- /dev/null +++ b/criu/arch/mips/restorer.c @@ -0,0 +1,47 @@ +#include + +#include "types.h" +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include +#include +#include "log.h" +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserMipsRegsEntry *r) +{ + return 0; +} + +#define SHMLBA 0x40000 +unsigned long arch_shmat(int shmid, void *shmaddr, int shmflg, unsigned long size) +{ + unsigned long smap; + + /* SHMLBA-aligned, direct call shmat() */ + if (!((unsigned long)shmaddr & (SHMLBA - 1))) + return sys_shmat(shmid, shmaddr, shmflg); + + smap = sys_shmat(shmid, NULL, shmflg); + if (IS_ERR_VALUE(smap)) { + pr_err("shmat() with NULL shmaddr failed: %d\n", (int)smap); + return smap; + } + + /* We're lucky! 
*/ + if (smap == (unsigned long)shmaddr) + return smap; + + /* Warn ALOUD */ + pr_warn("Restoring shmem %p unaligned to SHMLBA.\n", shmaddr); + pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached \n"); + pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n"); + + smap = sys_mremap(smap, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr); + if (IS_ERR_VALUE(smap)) + pr_err("mremap() for shmem failed: %d\n", (int)smap); + return smap; +} diff --git a/criu/arch/mips/sigaction_compat.c b/criu/arch/mips/sigaction_compat.c new file mode 100644 index 000000000..b389b7b73 --- /dev/null +++ b/criu/arch/mips/sigaction_compat.c @@ -0,0 +1,18 @@ +#include "log.h" +#include "asm/restorer.h" +#include +#include "asm/compat.h" +#include + +#ifdef CR_NOGLIBC +#include +#endif + +#include "cpu.h" + +extern char restore_rt_sigaction; + +int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) +{ + return 0; +} diff --git a/criu/arch/mips/sigframe.c b/criu/arch/mips/sigframe.c new file mode 100644 index 000000000..18983ff13 --- /dev/null +++ b/criu/arch/mips/sigframe.c @@ -0,0 +1,12 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" +#include +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/mips/vdso-pie.c b/criu/arch/mips/vdso-pie.c new file mode 100644 index 000000000..3bb92d857 --- /dev/null +++ b/criu/arch/mips/vdso-pie.c @@ -0,0 +1,54 @@ +#include +#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " +static void insert_trampoline(uintptr_t from, uintptr_t to) +{ + struct { + uint32_t ldr_pc; + uint32_t imm32; + uint32_t guards; + } __packed jmp = { + .ldr_pc = 0x1000fffe, /* b -4 */ + .imm32 = to, + .guards = 
0x0000000d, /* break */ + }; + void *iflush_start = (void *)from; + void *iflush_end = iflush_start + sizeof(jmp); + + memcpy((void *)from, &jmp, sizeof(jmp)); + + sys_cacheflush(iflush_start, sizeof(jmp), 0); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + uintptr_t from, to; + + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + + pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + + insert_trampoline(from, to); + } + + return 0; +} diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index 4fcfb065a..b87230f40 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -1,4 +1,4 @@ -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " #include @@ -19,9 +19,9 @@ static compel_cpuinfo_t rt_cpuinfo; #ifdef __LITTLE_ENDIAN__ -#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANNESS__LITTLEENDIAN +#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANNESS__LITTLEENDIAN #else -#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANESS__BIGENDIAN +#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANESS__BIGENDIAN #endif int cpu_init(void) @@ -64,6 +64,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; @@ -83,21 +89,23 @@ int cpu_validate_cpuinfo(void) goto error; } -#define CHECK_FEATURE(s,f) do { \ - if ((cpu_ppc64_entry->hwcap[s] & f) && \ - !(rt_cpuinfo.hwcap[s] & f)) { \ - pr_err("CPU Feature %s required by image " \ - "is not supported on host.\n", #f); \ - goto error; \ - } \ - } while(0) +#define CHECK_FEATURE(s, f) \ + do { \ + 
if ((cpu_ppc64_entry->hwcap[s] & f) && !(rt_cpuinfo.hwcap[s] & f)) { \ + pr_err("CPU Feature %s required by image " \ + "is not supported on host.\n", \ + #f); \ + goto error; \ + } \ + } while (0) -#define REQUIRE_FEATURE(s,f) do { \ - if (!(cpu_ppc64_entry->hwcap[s] & f)) { \ +#define REQUIRE_FEATURE(s, f) \ + do { \ + if (!(cpu_ppc64_entry->hwcap[s] & f)) { \ pr_err("CPU Feature %s missing in image.\n", #f); \ - goto error; \ - } \ - } while(0) + goto error; \ + } \ + } while (0) REQUIRE_FEATURE(0, PPC_FEATURE_64); REQUIRE_FEATURE(0, PPC_FEATURE_HAS_FPU); diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c index 5a5966ad4..d57040008 100644 --- a/criu/arch/ppc64/crtools.c +++ b/criu/arch/ppc64/crtools.c @@ -17,7 +17,7 @@ #include "log.h" #include "util.h" #include "cpu.h" -#include +#include "compel/infect.h" #include "protobuf.h" #include "images/core.pb-c.h" @@ -53,7 +53,7 @@ static void put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe) size_t i; for (i = 0; i < fpe->n_fpregs; i++) - mcfp[i] = fpe->fpregs[i]; + mcfp[i] = fpe->fpregs[i]; } static UserPpc64VrstateEntry *copy_altivec_regs(__vector128 *vrregs) @@ -69,7 +69,7 @@ static UserPpc64VrstateEntry *copy_altivec_regs(__vector128 *vrregs) user_ppc64_vrstate_entry__init(vse); /* protocol buffer store only 64bit entries and we need 128bit */ - vse->n_vrregs = (NVRREG-1) * 2; + vse->n_vrregs = (NVRREG - 1) * 2; vse->vrregs = xmalloc(vse->n_vrregs * sizeof(vse->vrregs[0])); if (!vse->vrregs) { xfree(vse); @@ -77,13 +77,13 @@ static UserPpc64VrstateEntry *copy_altivec_regs(__vector128 *vrregs) } /* Vectors are 2*64bits entries */ - for (i = 0; i < (NVRREG-1); i++) { - p64 = (uint64_t*) &vrregs[i]; - vse->vrregs[i*2] = p64[0]; - vse->vrregs[i*2 + 1] = p64[1]; + for (i = 0; i < (NVRREG - 1); i++) { + p64 = (uint64_t *)&vrregs[i]; + vse->vrregs[i * 2] = p64[0]; + vse->vrregs[i * 2 + 1] = p64[1]; } - p32 = (uint32_t*) &vrregs[NVRREG-1]; + p32 = (uint32_t *)&vrregs[NVRREG - 1]; vse->vrsave = 
*p32; return vse; @@ -95,7 +95,7 @@ static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse) pr_debug("Restoring Altivec registers\n"); - if (vse->n_vrregs != (NVRREG-1)*2) { + if (vse->n_vrregs != (NVRREG - 1) * 2) { pr_err("Corrupted Altivec dump data\n"); return -1; } @@ -103,8 +103,7 @@ static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse) /* Note that this should only be done in the case MSR_VEC is set but * this is not a big deal to do that in all cases. */ - memcpy(&v_regs->vrregs[0][0], vse->vrregs, - sizeof(uint64_t) * 2 * (NVRREG-1)); + memcpy(&v_regs->vrregs[0][0], vse->vrregs, sizeof(uint64_t) * 2 * (NVRREG - 1)); /* vscr has been restored with the previous memcpy which copied 32 * 128bits registers + a 128bits field containing the vscr value in * the low part. @@ -116,7 +115,7 @@ static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse) return 0; } -static UserPpc64VsxstateEntry* copy_vsx_regs(uint64_t *vsregs) +static UserPpc64VsxstateEntry *copy_vsx_regs(uint64_t *vsregs) { UserPpc64VsxstateEntry *vse; int i; @@ -128,7 +127,7 @@ static UserPpc64VsxstateEntry* copy_vsx_regs(uint64_t *vsregs) user_ppc64_vsxstate_entry__init(vse); vse->n_vsxregs = NVSXREG; - vse->vsxregs = xmalloc(vse->n_vsxregs*sizeof(vse->vsxregs[0])); + vse->vsxregs = xmalloc(vse->n_vsxregs * sizeof(vse->vsxregs[0])); if (!vse->vsxregs) { xfree(vse); return NULL; @@ -153,25 +152,25 @@ static int put_vsx_regs(mcontext_t *mc, UserPpc64VsxstateEntry *vse) } /* point after the Altivec registers */ - buf = (uint64_t*) (mc->v_regs + 1); + buf = (uint64_t *)(mc->v_regs + 1); /* Copy the value saved by get_vsx_regs in the sigframe */ - for (i=0; i < vse->n_vsxregs; i++) + for (i = 0; i < vse->n_vsxregs; i++) buf[i] = vse->vsxregs[i]; return 0; } - static void copy_gp_regs(UserPpc64RegsEntry *dst, user_regs_struct_t *src) { int i; -#define assign_reg(e) do { \ - dst->e = (__typeof__(dst->e))src->e; \ -} while (0) +#define assign_reg(e) \ + do { \ 
+ dst->e = (__typeof__(dst->e))src->e; \ + } while (0) - for (i=0; i<32; i++) + for (i = 0; i < 32; i++) assign_reg(gpr[i]); assign_reg(nip); assign_reg(msr); @@ -189,17 +188,17 @@ static void restore_gp_regs(mcontext_t *dst, UserPpc64RegsEntry *src) int i; /* r0 to r31 */ - for (i=0; i<32; i++) - dst->gp_regs[i] = src->gpr[i]; + for (i = 0; i < 32; i++) + dst->gp_regs[i] = src->gpr[i]; - dst->gp_regs[PT_NIP] = src->nip; - dst->gp_regs[PT_MSR] = src->msr; - dst->gp_regs[PT_ORIG_R3] = src->orig_gpr3; - dst->gp_regs[PT_CTR] = src->ctr; - dst->gp_regs[PT_LNK] = src->link; - dst->gp_regs[PT_XER] = src->xer; - dst->gp_regs[PT_CCR] = src->ccr; - dst->gp_regs[PT_TRAP] = src->trap; + dst->gp_regs[PT_NIP] = src->nip; + dst->gp_regs[PT_MSR] = src->msr; + dst->gp_regs[PT_ORIG_R3] = src->orig_gpr3; + dst->gp_regs[PT_CTR] = src->ctr; + dst->gp_regs[PT_LNK] = src->link; + dst->gp_regs[PT_XER] = src->xer; + dst->gp_regs[PT_CCR] = src->ccr; + dst->gp_regs[PT_TRAP] = src->trap; } static UserPpc64RegsEntry *allocate_gp_regs(void) @@ -250,7 +249,7 @@ static void xfree_tm_state(UserPpc64TmRegsEntry *tme) static int put_tm_regs(struct rt_sigframe *f, UserPpc64TmRegsEntry *tme) { -/* + /* * WARNING: As stated in kernel's restore_tm_sigcontexts, TEXASR has to be * restored by the process itself : * TEXASR was set by the signal delivery reclaim, as was TFIAR. 
@@ -261,20 +260,17 @@ static int put_tm_regs(struct rt_sigframe *f, UserPpc64TmRegsEntry *tme) */ ucontext_t *tm_uc = &f->uc_transact; - pr_debug("Restoring TM registers FP:%d VR:%d VSX:%d\n", - !!(tme->fpstate), !!(tme->vrstate), !!(tme->vsxstate)); + pr_debug("Restoring TM registers FP:%d VR:%d VSX:%d\n", !!(tme->fpstate), !!(tme->vrstate), !!(tme->vsxstate)); restore_gp_regs(&tm_uc->uc_mcontext, tme->gpregs); if (tme->fpstate) put_fpu_regs(&tm_uc->uc_mcontext, tme->fpstate); - if (tme->vrstate && put_altivec_regs(&tm_uc->uc_mcontext, - tme->vrstate)) + if (tme->vrstate && put_altivec_regs(&tm_uc->uc_mcontext, tme->vrstate)) return -1; - if (tme->vsxstate && put_vsx_regs(&tm_uc->uc_mcontext, - tme->vsxstate)) + if (tme->vsxstate && put_vsx_regs(&tm_uc->uc_mcontext, tme->vsxstate)) return -1; f->uc.uc_link = tm_uc; @@ -282,8 +278,7 @@ static int put_tm_regs(struct rt_sigframe *f, UserPpc64TmRegsEntry *tme) } /****************************************************************************/ -static int copy_tm_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, - CoreEntry *core) +static int copy_tm_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, CoreEntry *core) { UserPpc64TmRegsEntry *tme; UserPpc64RegsEntry *gpregs = core->ti_ppc64->gpregs; @@ -299,13 +294,12 @@ static int copy_tm_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, if (!tme->gpregs) goto out_free; - gpregs->has_tfhar = true; - gpregs->tfhar = fpregs->tm.tm_spr_regs.tfhar; - gpregs->has_texasr = true; - gpregs->texasr = fpregs->tm.tm_spr_regs.texasr; - gpregs->has_tfiar = true; - gpregs->tfiar = fpregs->tm.tm_spr_regs.tfiar; - + gpregs->has_tfhar = true; + gpregs->tfhar = fpregs->tm.tm_spr_regs.tfhar; + gpregs->has_texasr = true; + gpregs->texasr = fpregs->tm.tm_spr_regs.texasr; + gpregs->has_tfiar = true; + gpregs->tfiar = fpregs->tm.tm_spr_regs.tfiar; /* This is the checkpointed state, we must save it in place of the * current state because the signal handler is made 
in this way. @@ -352,9 +346,7 @@ out_free: return -1; } -static int __copy_task_regs(user_regs_struct_t *regs, - user_fpregs_struct_t *fpregs, - CoreEntry *core) +static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, CoreEntry *core) { UserPpc64RegsEntry *gpregs; UserPpc64FpstateEntry **fpstate; @@ -374,8 +366,7 @@ static int __copy_task_regs(user_regs_struct_t *regs, fpstate = &(core->ti_ppc64->tmstate->fpstate); vrstate = &(core->ti_ppc64->tmstate->vrstate); vsxstate = &(core->ti_ppc64->tmstate->vsxstate); - } - else { + } else { gpregs = core->ti_ppc64->gpregs; fpstate = &(core->ti_ppc64->fpstate); vrstate = &(core->ti_ppc64->vrstate); @@ -413,7 +404,7 @@ static int __copy_task_regs(user_regs_struct_t *regs, return 0; } -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } @@ -424,7 +415,7 @@ int arch_alloc_thread_info(CoreEntry *core) ThreadInfoPpc64 *ti_ppc64; ti_ppc64 = xmalloc(sizeof(*ti_ppc64)); - if(!ti_ppc64) + if (!ti_ppc64) return -1; thread_info_ppc64__init(ti_ppc64); @@ -467,28 +458,24 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) int ret = 0; if (CORE_THREAD_ARCH_INFO(core)->fpstate) - put_fpu_regs(&sigframe->uc.uc_mcontext, - CORE_THREAD_ARCH_INFO(core)->fpstate); + put_fpu_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->fpstate); if (CORE_THREAD_ARCH_INFO(core)->vrstate) - ret = put_altivec_regs(&sigframe->uc.uc_mcontext, - CORE_THREAD_ARCH_INFO(core)->vrstate); + ret = put_altivec_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->vrstate); else if (core->ti_ppc64->gpregs->msr & MSR_VEC) { pr_err("Register's data mismatch, corrupted image ?\n"); ret = -1; } if (!ret && CORE_THREAD_ARCH_INFO(core)->vsxstate) - ret = put_vsx_regs(&sigframe->uc.uc_mcontext, - CORE_THREAD_ARCH_INFO(core)->vsxstate); + ret = 
put_vsx_regs(&sigframe->uc.uc_mcontext, CORE_THREAD_ARCH_INFO(core)->vsxstate); else if (core->ti_ppc64->gpregs->msr & MSR_VSX) { pr_err("VSX register's data mismatch, corrupted image ?\n"); ret = -1; } if (!ret && CORE_THREAD_ARCH_INFO(core)->tmstate) - ret = put_tm_regs(sigframe, - CORE_THREAD_ARCH_INFO(core)->tmstate); + ret = put_tm_regs(sigframe, CORE_THREAD_ARCH_INFO(core)->tmstate); else if (MSR_TM_ACTIVE(core->ti_ppc64->gpregs->msr)) { pr_err("TM register's data mismatch, corrupted image ?\n"); ret = -1; diff --git a/criu/arch/ppc64/include/asm/dump.h b/criu/arch/ppc64/include/asm/dump.h index a81ee02bd..7393654fa 100644 --- a/criu/arch/ppc64/include/asm/dump.h +++ b/criu/arch/ppc64/include/asm/dump.h @@ -1,11 +1,10 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); - #define core_put_tls(core, tls) #define get_task_futex_robust_list_compat(pid, info) -1 diff --git a/criu/arch/ppc64/include/asm/kerndat.h b/criu/arch/ppc64/include/asm/kerndat.h index 60956b573..bb70cf6cf 100644 --- a/criu/arch/ppc64/include/asm/kerndat.h +++ b/criu/arch/ppc64/include/asm/kerndat.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ -#define kdat_compatible_cr() 0 -#define kdat_can_map_vdso() 0 +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/ppc64/include/asm/parasite.h b/criu/arch/ppc64/include/asm/parasite.h index fdbc340b0..45cfc632d 100644 --- a/criu/arch/ppc64/include/asm/parasite.h +++ b/criu/arch/ppc64/include/asm/parasite.h @@ -2,6 +2,9 @@ #define __ASM_PARASITE_H__ /* TLS is accessed through r13, which is already processed */ -static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } +static 
inline void arch_get_tls(tls_t *ptls) +{ + (void)ptls; +} #endif diff --git a/criu/arch/ppc64/include/asm/restore.h b/criu/arch/ppc64/include/asm/restore.h index 8d4516090..8148015fd 100644 --- a/criu/arch/ppc64/include/asm/restore.h +++ b/criu/arch/ppc64/include/asm/restore.h @@ -9,6 +9,7 @@ * Set R2 to blob + 8000 which is the default value * Jump to restore_task_exec_start + 8 since R2 is already set (local call) */ +/* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ @@ -21,7 +22,8 @@ : "r"(new_sp), \ "r"((unsigned long)restore_task_exec_start), \ "r"(task_args) \ - : "1", "3", "12") + : "3", "12") +/* clang-format on */ /* There is nothing to do since TLS is accessed through r13 */ #define core_get_tls(pcore, ptls) diff --git a/criu/arch/ppc64/include/asm/restorer.h b/criu/arch/ppc64/include/asm/restorer.h index d48d833d6..1ca91140b 100644 --- a/criu/arch/ppc64/include/asm/restorer.h +++ b/criu/arch/ppc64/include/asm/restorer.h @@ -14,9 +14,10 @@ * * See glibc sysdeps/powerpc/powerpc64/sysdep.h for FRAME_MIN_SIZE defines */ -#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ - asm volatile( \ + asm volatile( \ "clone_emul: \n" \ "/* Save fn, args, stack across syscall. */ \n" \ "mr 14, %5 /* clone_restore_fn in r14 */ \n" \ @@ -48,21 +49,71 @@ "r"(&thread_args[i]) /* %6 */ \ : "memory","0","3","4","5","6","7","14","15") -#define arch_map_vdso(map, compat) -1 +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ +/* + * The clone3() function accepts following parameters: + * int clone3(struct clone_args *args, size_t size) + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. 
+ * + * For PPC64LE the first parameter (clone_args) is passed in r3 and + * the second parameter (size) is passed in r4. + * + * This clone3() wrapper is based on the clone() wrapper from above. + */ \ + asm volatile( \ + "clone3_emul: \n" \ + "/* Save fn, args across syscall. */ \n" \ + "mr 14, %3 /* clone_restore_fn in r14 */ \n" \ + "mr 15, %4 /* &thread_args[i] in r15 */ \n" \ + "mr 3, %1 /* clone_args */ \n" \ + "mr 4, %2 /* size */ \n" \ + "li 0,"__stringify(__NR_clone3)" \n" \ + "sc \n" \ + "/* Check for child process. */ \n" \ + "cmpdi cr1,3,0 \n" \ + "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \ + "bne- cr1,clone3_end \n" \ + "/* child */ \n" \ + "addi 14, 14, 8 /* jump over r2 fixup */ \n" \ + "mtctr 14 \n" \ + "mr 3,15 \n" \ + "bctr \n" \ + "clone3_end: \n" \ + "mr %0,3 \n" \ + : "=r"(ret) /* %0 */ \ + : "r"(&clone_args), /* %1 */ \ + "r"(size), /* %2 */ \ + "r"(clone_restore_fn), /* %3 */ \ + "r"(args) /* %4 */ \ + : "memory","0","3","4","5","14","15") +/* clang-format on */ + +#define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r); int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r); /* Nothing to do, TLS is accessed through r13 */ -static inline void restore_tls(tls_t *ptls) { (void)ptls; } +static inline void restore_tls(tls_t *ptls) +{ + (void)ptls; +} /* * Defined in arch/ppc64/syscall-common-ppc64.S */ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg); -static inline void *alloc_compat_syscall_stack(void) { return NULL; } -static inline void free_compat_syscall_stack(void *stack32) { } +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; diff --git a/criu/arch/ppc64/include/asm/thread_pointer.h b/criu/arch/ppc64/include/asm/thread_pointer.h new file mode 100644 index 000000000..304516fbe --- /dev/null +++ 
b/criu/arch/ppc64/include/asm/thread_pointer.h @@ -0,0 +1,33 @@ +/* __thread_pointer definition. powerpc version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +#ifdef __powerpc64__ +register void *__thread_register asm("r13"); +#else +register void *__thread_register asm("r2"); +#endif + +static inline void *__criu_thread_pointer(void) +{ + return __thread_register; +} + +#endif /* _SYS_THREAD_POINTER_H */ \ No newline at end of file diff --git a/criu/arch/ppc64/include/asm/types.h b/criu/arch/ppc64/include/asm/types.h index 8f3af86a9..d60aadde5 100644 --- a/criu/arch/ppc64/include/asm/types.h +++ b/criu/arch/ppc64/include/asm/types.h @@ -13,14 +13,22 @@ typedef UserPpc64RegsEntry UserRegsEntry; -#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64 +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64 -#define core_is_compat(core) false +#define core_is_compat(core) false #define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64 -static inline void *decode_pointer(uint64_t v) { return (void*)v; } -static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } +#define TI_IP(core) ((core)->ti_ppc64->gpregs->nip) + +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} +static inline uint64_t 
encode_pointer(void *p) +{ + return (uint64_t)p; +} /* * Copied from the following kernel header files : @@ -28,11 +36,11 @@ static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } * arch/powerpc/include/uapi/asm/auxvec.h * include/linux/mm_types.h */ -#define AT_VECTOR_SIZE_BASE 20 +#define AT_VECTOR_SIZE_BASE 20 #if !defined AT_VECTOR_SIZE_ARCH -#define AT_VECTOR_SIZE_ARCH 6 +#define AT_VECTOR_SIZE_ARCH 6 #endif -#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +#define AT_VECTOR_SIZE (2 * (AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) typedef uint64_t auxv_t; diff --git a/criu/arch/ppc64/include/asm/vdso.h b/criu/arch/ppc64/include/asm/vdso.h index 6c92348d6..b73927bb6 100644 --- a/criu/arch/ppc64/include/asm/vdso.h +++ b/criu/arch/ppc64/include/asm/vdso.h @@ -12,18 +12,22 @@ * Note that '__kernel_datapage_offset' is not a service but mostly a data * inside the text page which should not be used as is from user space. */ -#define VDSO_SYMBOL_MAX 10 -#define VDSO_SYMBOL_GTOD 5 -#define ARCH_VDSO_SYMBOLS \ - "__kernel_clock_getres", \ - "__kernel_clock_gettime", \ - "__kernel_get_syscall_map", \ - "__kernel_get_tbfreq", \ - "__kernel_getcpu", \ - "__kernel_gettimeofday", \ - "__kernel_sigtramp_rt64", \ - "__kernel_sync_dicache", \ - "__kernel_sync_dicache_p5", \ - "__kernel_time" +#define VDSO_SYMBOL_MAX 10 +#define VDSO_SYMBOL_GTOD 5 +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__kernel_clock_getres"; \ + const char *aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ + const char *aarch_vdso_symbol3 = "__kernel_get_syscall_map"; \ + const char *aarch_vdso_symbol4 = "__kernel_get_tbfreq"; \ + const char *aarch_vdso_symbol5 = "__kernel_getcpu"; \ + const char *aarch_vdso_symbol6 = "__kernel_gettimeofday"; \ + const char *aarch_vdso_symbol7 = "__kernel_sigtramp_rt64"; \ + const char *aarch_vdso_symbol8 = "__kernel_sync_dicache"; \ + const char *aarch_vdso_symbol9 = "__kernel_sync_dicache_p5"; \ + 
const char *aarch_vdso_symbol10 = "__kernel_time"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ + aarch_vdso_symbol6, aarch_vdso_symbol7, aarch_vdso_symbol8, aarch_vdso_symbol9, aarch_vdso_symbol10 #endif /* __CR_ASM_VDSO_H__ */ diff --git a/criu/arch/ppc64/restorer.c b/criu/arch/ppc64/restorer.c index 7172e44c3..56c09391e 100644 --- a/criu/arch/ppc64/restorer.c +++ b/criu/arch/ppc64/restorer.c @@ -9,35 +9,32 @@ int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r) { -#define SPRN_TFHAR 128 -#define SPRN_TFIAR 129 -#define SPRN_TEXASR 130 +#define SPRN_TFHAR 128 +#define SPRN_TFIAR 129 +#define SPRN_TEXASR 130 if (r->has_tfhar) { - asm __volatile__ ( - "ld 3, %[value] ;" - "mtspr %[sprn],3 ;" - : [value]"=m"(r->tfhar) - : [sprn]"i"(SPRN_TFHAR) - : "r3"); + asm __volatile__("ld 3, %[value] ;" + "mtspr %[sprn],3 ;" + : [value] "=m"(r->tfhar) + : [sprn] "i"(SPRN_TFHAR) + : "r3"); } if (r->has_tfiar) { - asm __volatile__ ( - "ld 3, %[value] ;" - "mtspr %[sprn],3 ;" - : [value]"=m"(r->tfiar) - : [sprn]"i"(SPRN_TFIAR) - : "r3"); + asm __volatile__("ld 3, %[value] ;" + "mtspr %[sprn],3 ;" + : [value] "=m"(r->tfiar) + : [sprn] "i"(SPRN_TFIAR) + : "r3"); } if (r->has_texasr) { - asm __volatile__ ( - "ld 3, %[value] ;" - "mtspr %[sprn],3 ;" - : [value]"=m"(r->texasr) - : [sprn]"i"(SPRN_TEXASR) - : "r3"); + asm __volatile__("ld 3, %[value] ;" + "mtspr %[sprn],3 ;" + : [value] "=m"(r->texasr) + : [sprn] "i"(SPRN_TEXASR) + : "r3"); } return 0; @@ -48,15 +45,14 @@ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) unsigned long raddr; int ret; - ret = sys_ipc(21 /*SHMAT */, - shmid, /* first */ - shmflg, /* second */ - (unsigned long)&raddr, /* third */ - shmaddr, /* ptr */ - 0 /* fifth not used */); + ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ + shmflg, /* second */ + (unsigned long)&raddr, /* third */ + shmaddr, /* ptr */ + 0 /* fifth not used */); if (ret) - raddr = 
(unsigned long) ret; + raddr = (unsigned long)ret; return raddr; } diff --git a/criu/arch/ppc64/sigframe.c b/criu/arch/ppc64/sigframe.c index 52fad2e9a..5a98eb8b9 100644 --- a/criu/arch/ppc64/sigframe.c +++ b/criu/arch/ppc64/sigframe.c @@ -18,14 +18,12 @@ static inline void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext) uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext; lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset); - pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", - (unsigned long long) lcontext->v_regs, - (unsigned long long) rcontext); + pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", (unsigned long long)lcontext->v_regs, + (unsigned long long)rcontext); } } -int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, - struct rt_sigframe *rframe) +int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, struct rt_sigframe *rframe) { uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR]; @@ -39,9 +37,8 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, /* Updating the transactional state address if any */ if (frame->uc.uc_link) { - update_vregs(&frame->uc_transact.uc_mcontext, - &rframe->uc_transact.uc_mcontext); - frame->uc.uc_link = &rframe->uc_transact; + update_vregs(&frame->uc_transact.uc_mcontext, &rframe->uc_transact.uc_mcontext); + frame->uc.uc_link = &rframe->uc_transact; } return 0; diff --git a/criu/arch/ppc64/vdso-pie.c b/criu/arch/ppc64/vdso-pie.c index 910c3d38b..a84ae776b 100644 --- a/criu/arch/ppc64/vdso-pie.c +++ b/criu/arch/ppc64/vdso-pie.c @@ -9,7 +9,7 @@ #include "common/bug.h" #ifdef LOG_PREFIX -# undef LOG_PREFIX +#undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " @@ -18,15 +18,15 @@ extern char *vdso_trampoline, *vdso_trampoline_end; static inline void invalidate_caches(unsigned long at) { - asm volatile("isync \n" \ - "li 3,0 \n" \ - "dcbf 3,%0 \n" \ - "sync \n" \ - "icbi 3,%0 \n" \ - "isync \n" \ - : /* no output */ \ - : "r"(at) \ - :"memory", "r3"); + asm volatile("isync \n" + "li 
3,0 \n" + "dcbf 3,%0 \n" + "sync \n" + "icbi 3,%0 \n" + "isync \n" + : /* no output */ + : "r"(at) + : "memory", "r3"); } /* This is the size of the trampoline call : @@ -34,7 +34,7 @@ static inline void invalidate_caches(unsigned long at) * bl trampoline * <64 bit address> */ -#define TRAMP_CALL_SIZE (2*sizeof(uint32_t) + sizeof(uint64_t)) +#define TRAMP_CALL_SIZE (2 * sizeof(uint32_t) + sizeof(uint64_t)) /* * put_trampoline does 2 things : @@ -53,39 +53,35 @@ static inline void invalidate_caches(unsigned long at) */ static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym) { - int i,j; + int i, j; unsigned long size; unsigned long trampoline = 0; /* First of all we have to find a place where to put the trampoline * code. */ - size = (unsigned long)&vdso_trampoline_end - - (unsigned long)&vdso_trampoline; + size = (unsigned long)&vdso_trampoline_end - (unsigned long)&vdso_trampoline; for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) { if (vdso_symbol_empty(&sym->symbols[i])) continue; - pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, - sym->symbols[i].offset); + pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, sym->symbols[i].offset); /* find the nearest following symbol we are interested in */ - for (j=0; j < ARRAY_SIZE(sym->symbols); j++) { - if (i==j || vdso_symbol_empty(&sym->symbols[j])) + for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) { + if (i == j || vdso_symbol_empty(&sym->symbols[j])) continue; if (sym->symbols[j].offset <= sym->symbols[i].offset) /* this symbol is above the current one */ continue; - if ((sym->symbols[i].offset+TRAMP_CALL_SIZE) > - sym->symbols[j].offset) { + if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) > sym->symbols[j].offset) { /* we have a major issue here since we cannot * even put the trampoline call for this symbol */ - pr_err("Can't handle small vDSO symbol %s\n", - sym->symbols[i].name); + pr_err("Can't handle small vDSO symbol %s\n", sym->symbols[i].name); return 0; } @@ -93,8 +89,7 @@ 
static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym) /* no need to put it twice */ continue; - if ((sym->symbols[j].offset - - (sym->symbols[i].offset+TRAMP_CALL_SIZE)) <= size) + if ((sym->symbols[j].offset - (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= size) /* not enough place */ continue; @@ -102,10 +97,8 @@ static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym) trampoline = at + sym->symbols[i].offset; trampoline += TRAMP_CALL_SIZE; - pr_debug("Putting vDSO trampoline in %s at %lx\n", - sym->symbols[i].name, trampoline); - memcpy((void *)trampoline, &vdso_trampoline, - size); + pr_debug("Putting vDSO trampoline in %s at %lx\n", sym->symbols[i].name, trampoline); + memcpy((void *)trampoline, &vdso_trampoline, size); invalidate_caches(trampoline); } } @@ -113,21 +106,19 @@ static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym) return trampoline; } -static inline void put_trampoline_call(unsigned long at, unsigned long to, - unsigned long tr) +static inline void put_trampoline_call(unsigned long at, unsigned long to, unsigned long tr) { - uint32_t *addr = (uint32_t *)at; + uint32_t *addr = (uint32_t *)at; - *addr++ = 0x7C0802a6; /* mflr r0 */ - *addr++ = 0x48000001 | ((long)(tr-at-4) & 0x3fffffc); /* bl tr */ - *(uint64_t *)addr = to; /* the address to read by the trampoline */ + *addr++ = 0x7C0802a6; /* mflr r0 */ + *addr++ = 0x48000001 | ((long)(tr - at - 4) & 0x3fffffc); /* bl tr */ + *(uint64_t *)addr = to; /* the address to read by the trampoline */ - invalidate_caches(at); + invalidate_caches(at); } -int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, - struct vdso_symtable *to, struct vdso_symtable *from, - bool __always_unused compat_vdso) +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, + struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; unsigned long trampoline; @@ -140,14 
+131,10 @@ int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, if (vdso_symbol_empty(&from->symbols[i])) continue; - pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", - base_from, from->symbols[i].offset, - base_to, to->symbols[i].offset, i, - from->symbols[i].name); + pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", base_from, from->symbols[i].offset, base_to, + to->symbols[i].offset, i, from->symbols[i].name); - put_trampoline_call(base_from + from->symbols[i].offset, - base_to + to->symbols[i].offset, - trampoline); + put_trampoline_call(base_from + from->symbols[i].offset, base_to + to->symbols[i].offset, trampoline); } return 0; diff --git a/criu/arch/riscv64/Makefile b/criu/arch/riscv64/Makefile new file mode 100644 index 000000000..d19895471 --- /dev/null +++ b/criu/arch/riscv64/Makefile @@ -0,0 +1,8 @@ +builtin-name := crtools.built-in.o + +ldflags-y += -r + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o +obj-y += vdso-lookup.o \ No newline at end of file diff --git a/criu/arch/riscv64/cpu.c b/criu/arch/riscv64/cpu.c new file mode 100644 index 000000000..97a883b8c --- /dev/null +++ b/criu/arch/riscv64/cpu.c @@ -0,0 +1,40 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +#include +#include "cpu.h" + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpu_dump_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpu_validate_image_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpuinfo_dump(void) +{ + return -ENOTSUP; +} + +int cpuinfo_check(void) +{ + return -ENOTSUP; +} diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c new file mode 100644 index 000000000..eea98d6de --- /dev/null +++ b/criu/arch/riscv64/crtools.c @@ -0,0 +1,171 @@ +#include +#include + +#include + +#include "types.h" +#include + +#include +#include "asm/restorer.h" +#include "common/compiler.h" +#include +#include "asm/dump.h" 
+#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" +#include "parasite-syscall.h" +#include "log.h" +#include "util.h" +#include "cpu.h" +#include "restorer.h" +#include "compel/infect.h" + +#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +{ + int i; + CoreEntry *core = x; + + // Save riscv64 gprs + assign_reg(core->ti_riscv64->gpregs, regs, pc); + assign_reg(core->ti_riscv64->gpregs, regs, ra); + assign_reg(core->ti_riscv64->gpregs, regs, sp); + assign_reg(core->ti_riscv64->gpregs, regs, gp); + assign_reg(core->ti_riscv64->gpregs, regs, tp); + assign_reg(core->ti_riscv64->gpregs, regs, t0); + assign_reg(core->ti_riscv64->gpregs, regs, t1); + assign_reg(core->ti_riscv64->gpregs, regs, t2); + assign_reg(core->ti_riscv64->gpregs, regs, s0); + assign_reg(core->ti_riscv64->gpregs, regs, s1); + assign_reg(core->ti_riscv64->gpregs, regs, a0); + assign_reg(core->ti_riscv64->gpregs, regs, a1); + assign_reg(core->ti_riscv64->gpregs, regs, a2); + assign_reg(core->ti_riscv64->gpregs, regs, a3); + assign_reg(core->ti_riscv64->gpregs, regs, a4); + assign_reg(core->ti_riscv64->gpregs, regs, a5); + assign_reg(core->ti_riscv64->gpregs, regs, a6); + assign_reg(core->ti_riscv64->gpregs, regs, a7); + assign_reg(core->ti_riscv64->gpregs, regs, s2); + assign_reg(core->ti_riscv64->gpregs, regs, s3); + assign_reg(core->ti_riscv64->gpregs, regs, s4); + assign_reg(core->ti_riscv64->gpregs, regs, s5); + assign_reg(core->ti_riscv64->gpregs, regs, s6); + assign_reg(core->ti_riscv64->gpregs, regs, s7); + assign_reg(core->ti_riscv64->gpregs, regs, s8); + assign_reg(core->ti_riscv64->gpregs, regs, s9); + assign_reg(core->ti_riscv64->gpregs, regs, s10); + assign_reg(core->ti_riscv64->gpregs, regs, s11); + assign_reg(core->ti_riscv64->gpregs, regs, t3); + assign_reg(core->ti_riscv64->gpregs, regs, t4); + assign_reg(core->ti_riscv64->gpregs, 
regs, t5); + assign_reg(core->ti_riscv64->gpregs, regs, t6); + + // Save riscv64 fprs + for (i = 0; i < 32; ++i) + assign_reg(core->ti_riscv64->fpsimd, fpsimd, f[i]); + assign_reg(core->ti_riscv64->fpsimd, fpsimd, fcsr); + + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoRiscv64 *ti_riscv64; + UserRiscv64RegsEntry *gpregs; + UserRiscv64DExtEntry *fpsimd; + + ti_riscv64 = xmalloc(sizeof(*ti_riscv64)); + if (!ti_riscv64) + goto err; + thread_info_riscv64__init(ti_riscv64); + core->ti_riscv64 = ti_riscv64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_riscv64_regs_entry__init(gpregs); + + ti_riscv64->gpregs = gpregs; + + fpsimd = xmalloc(sizeof(*fpsimd)); + if (!fpsimd) + goto err; + user_riscv64_d_ext_entry__init(fpsimd); + ti_riscv64->fpsimd = fpsimd; + fpsimd->f = xmalloc(32 * sizeof(fpsimd->f[0])); + fpsimd->n_f = 32; + if (!fpsimd->f) + goto err; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (core->ti_riscv64) { + if (core->ti_riscv64->fpsimd) { + xfree(core->ti_riscv64->fpsimd->f); + xfree(core->ti_riscv64->fpsimd); + } + xfree(core->ti_riscv64->gpregs); + xfree(core->ti_riscv64); + core->ti_riscv64 = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + int i; + UserRiscv64DExtEntry *fpsimd = core->ti_riscv64->fpsimd; + + if (fpsimd->n_f != 32) + return 1; + + for (i = 0; i < 32; ++i) + sigframe->uc.uc_mcontext.__fpregs.__d.__f[i] = fpsimd->f[i]; + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpsimd->fcsr; + + return 0; +} + +int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r) +{ + f->uc.uc_mcontext.__gregs[0] = r->pc; + f->uc.uc_mcontext.__gregs[1] = r->ra; + f->uc.uc_mcontext.__gregs[2] = r->sp; + f->uc.uc_mcontext.__gregs[3] = r->gp; + f->uc.uc_mcontext.__gregs[4] = r->tp; + f->uc.uc_mcontext.__gregs[5] = r->t0; + f->uc.uc_mcontext.__gregs[6] = r->t1; + f->uc.uc_mcontext.__gregs[7] = r->t2; + 
f->uc.uc_mcontext.__gregs[8] = r->s0; + f->uc.uc_mcontext.__gregs[9] = r->s1; + f->uc.uc_mcontext.__gregs[10] = r->a0; + f->uc.uc_mcontext.__gregs[11] = r->a1; + f->uc.uc_mcontext.__gregs[12] = r->a2; + f->uc.uc_mcontext.__gregs[13] = r->a3; + f->uc.uc_mcontext.__gregs[14] = r->a4; + f->uc.uc_mcontext.__gregs[15] = r->a5; + f->uc.uc_mcontext.__gregs[16] = r->a6; + f->uc.uc_mcontext.__gregs[17] = r->a7; + f->uc.uc_mcontext.__gregs[18] = r->s2; + f->uc.uc_mcontext.__gregs[19] = r->s3; + f->uc.uc_mcontext.__gregs[20] = r->s4; + f->uc.uc_mcontext.__gregs[21] = r->s5; + f->uc.uc_mcontext.__gregs[22] = r->s6; + f->uc.uc_mcontext.__gregs[23] = r->s7; + f->uc.uc_mcontext.__gregs[24] = r->s8; + f->uc.uc_mcontext.__gregs[25] = r->s9; + f->uc.uc_mcontext.__gregs[26] = r->s10; + f->uc.uc_mcontext.__gregs[27] = r->s11; + f->uc.uc_mcontext.__gregs[28] = r->t3; + f->uc.uc_mcontext.__gregs[29] = r->t4; + f->uc.uc_mcontext.__gregs[30] = r->t5; + f->uc.uc_mcontext.__gregs[31] = r->t6; + + return 0; +} diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h new file mode 100644 index 000000000..4f0a2d209 --- /dev/null +++ b/criu/arch/riscv64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_riscv64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/riscv64/include/asm/int.h b/criu/arch/riscv64/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/riscv64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git 
a/criu/arch/riscv64/include/asm/kerndat.h b/criu/arch/riscv64/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/riscv64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/riscv64/include/asm/parasite-syscall.h b/criu/arch/riscv64/include/asm/parasite-syscall.h new file mode 100644 index 000000000..6008c3792 --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/riscv64/include/asm/parasite.h b/criu/arch/riscv64/include/asm/parasite.h new file mode 100644 index 000000000..4798cfd8a --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite.h @@ -0,0 +1,16 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +/* + * This function is used to retrieve the value of the thread pointer (tp) + * in RISC-V architecture, which is typically used for thread-local storage (TLS). + * The value is then stored in the provided tls_t pointer. 
+ */ +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm("mv %0, tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/riscv64/include/asm/restore.h b/criu/arch/riscv64/include/asm/restore.h new file mode 100644 index 000000000..e4f25a57b --- /dev/null +++ b/criu/arch/riscv64/include/asm/restore.h @@ -0,0 +1,29 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" + +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ + task_args) \ + asm volatile( \ + "and sp, %0, ~15 \n" \ + "mv a0, %2 \n" \ + "jr %1 \n" \ + : \ + : "r"(new_sp), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "a0", "memory") +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_riscv64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/riscv64/include/asm/restorer.h b/criu/arch/riscv64/include/asm/restorer.h new file mode 100644 index 000000000..45fe847a9 --- /dev/null +++ b/criu/arch/riscv64/include/asm/restorer.h @@ -0,0 +1,150 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include + +#include "asm/types.h" +#include "images/core.pb-c.h" + +#include + +// kernel arg order for clone +// unsigned long clone_flags, +// unsigned long newsp, +// int __user * parent_tidptr, +// unsigned long tls, +// int __user * child_tidptr +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld a1, %2 \n" \ + "andi a1, a1, ~15 \n" \ + "addi a1, a1, -16 \n" \ + "sd %5, 0(a1) \n" \ + "sd %6, 8(a1) \n" \ + "mv a0, %1 \n" \ + "mv a2, %3 \n" \ + "mv a3, %4 \n" \ + "li a7, "__stringify(__NR_clone)" \n" \ + "ecall \n" \ + \ + "beqz a0, thread_run \n" \ + \ + "mv %0, a0 \n" \ + "j clone_end \n" \ + \ + "thread_run: \n" \ + 
"ld a1, 0(sp) \n" \ + "ld a0, 8(sp) \n" \ + "jr a1 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "m"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "a0", "a1", "a2", "a3", "a7", "memory") + +/* + * Based on sysdeps/unix/sysv/linux/riscv/clone.S + * + * int clone(int (*fn)(void *arg), x0 + * void *child_stack, x1 + * int flags, x2 + * void *arg, x3 + * pid_t *ptid, x4 + * struct user_desc *tls, x5 + * pid_t *ctid); x6 + * + * int clone3(struct clone_args *args, x0 + * size_t size); x1 + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. + * + */ +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* In contrast to the clone() wrapper above this does not put + * the thread function and its arguments on the child stack, + * but uses registers to pass these parameters to the child process. + * Based on the glibc clone() wrapper at + * sysdeps/unix/sysv/linux/riscv/clone.S. + */ \ + "clone3_emul: \n" \ + /* + * Based on the glibc clone() wrapper, which uses x10 and x11 + * to save the arguments for the child process, this does the same. + * x10 for the thread function and x11 for the thread arguments. 
/*
 * Last-resort failure path of the restorer: load the stack pointer from
 * the caller's `ret` variable, clear a0 and jump to the address held in
 * x0, which ends execution of the restorer with a fault.
 *
 * The stack pointer is deliberately NOT named in the clobber list: GCC
 * deprecates listing the stack pointer register as a clobber, and the
 * asm never returns to compiler-generated code anyway.
 */
#define ARCH_FAIL_CORE_RESTORE					\
	asm volatile(						\
		"mv sp, %0			\n"		\
		"li a0, 0			\n"		\
		"jr x0				\n"		\
		:						\
		: "r"(ret)					\
		: "a0", "memory")
/*
 * Turn a 64-bit value (as stored in an image) back into a native pointer.
 */
static inline void *decode_pointer(uint64_t v)
{
	uintptr_t addr = (uintptr_t)v;

	return (void *)addr;
}

/*
 * Represent a native pointer as a 64-bit integer value.
 */
static inline uint64_t encode_pointer(void *p)
{
	uintptr_t addr = (uintptr_t)p;

	return (uint64_t)addr;
}
b/criu/arch/riscv64/include/asm/vdso.h @@ -0,0 +1,28 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "common/compiler.h" +#include "asm-generic/vdso.h" + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_GTOD 2 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *rv64_vdso_symbol1 = "__vdso_clock_getres"; \ + const char *rv64_vdso_symbol2 = "__vdso_clock_gettime"; \ + const char *rv64_vdso_symbol3 = "__vdso_gettimeofday"; \ + const char *rv64_vdso_symbol4 = "__vdso_getcpu"; \ + const char *rv64_vdso_symbol5 = "__vdso_flush_icache"; \ + const char *rv64_vdso_symbol6 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + rv64_vdso_symbol1, rv64_vdso_symbol2, rv64_vdso_symbol3, rv64_vdso_symbol4, rv64_vdso_symbol5, rv64_vdso_symbol6 + +extern void write_intraprocedure_branch(unsigned long to, unsigned long from); + +#endif /* __CR_ASM_VDSO_H__ */ \ No newline at end of file diff --git a/criu/arch/riscv64/restorer.c b/criu/arch/riscv64/restorer.c new file mode 100644 index 000000000..d605f048d --- /dev/null +++ b/criu/arch/riscv64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" + +#include +#include "log.h" +#include +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r) +{ + return 0; +} diff --git a/criu/arch/riscv64/sigframe.c b/criu/arch/riscv64/sigframe.c new file mode 100644 index 000000000..8096fab66 --- /dev/null +++ b/criu/arch/riscv64/sigframe.c @@ -0,0 +1,8 @@ +#include "asm/types.h" +#include +#include "asm/sigframe.h" + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/riscv64/vdso-lookup.S b/criu/arch/riscv64/vdso-lookup.S new file mode 100644 index 000000000..50d4ecf08 --- /dev/null +++ b/criu/arch/riscv64/vdso-lookup.S @@ -0,0 +1,15 @@ +#include "common/asm/linkage.h" + +.section .text 
+ +/* Expects t0 to hold the index into the lookup table. */ +GLOBAL(riscv_vdso_lookup) + /* Get the beginning of the lookup table */ + la t1, riscv_vdso_lookup_end + /* Scale the index */ + slli t0, t0, 3 + add t1, t0, t1 + ld t2, 0(t1) + jr t2 + +GLOBAL(riscv_vdso_lookup_end) \ No newline at end of file diff --git a/criu/arch/riscv64/vdso-pie.c b/criu/arch/riscv64/vdso-pie.c new file mode 100644 index 000000000..aa9272fb5 --- /dev/null +++ b/criu/arch/riscv64/vdso-pie.c @@ -0,0 +1,159 @@ +#include + +#include "asm/types.h" + +#include +#include +#include +#include +#include "atomic.h" +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +/* These symbols are defined in vdso-lookup.S */ +extern char *riscv_vdso_lookup, *riscv_vdso_lookup_end; + +/* + * li t0, INDEX + * jal x0, riscv_vdso_lookup + */ +#define TRAMP_CALL_SIZE (2 * sizeof(uint32_t)) + +static inline void invalidate_caches(void) +{ + // We're supposed to use the VDSO as the officially sanctioned ABI. But oh well. + int ret; + __smp_mb(); + asm volatile("li a0, 0\n" + "li a1, 0\n" + "li a2, 1\n" /* SYS_RISCV_FLUSH_ICACHE_ALL */ + "li a7, 259\n" /* __NR_arch_specific_syscall */ + "ecall\n" + : "=r"(ret) + : + : "a7"); +} + +static inline size_t vdso_trampoline_size(void) +{ + return (size_t)&riscv_vdso_lookup_end - (size_t)&riscv_vdso_lookup; +} + +static uint64_t put_trampoline(uint64_t at, struct vdso_symtable *sym) +{ + int i, j; + uint64_t total_size, trampoline_size; + uint64_t trampoline = 0; + + /* First of all we have to find a place where to put the trampoline + * code. 
/*
 * Overwrite the entry point of one vDSO symbol with a two-instruction
 * stub and record its real target in the lookup table.
 *
 * @from:       address of the symbol being patched
 * @to:         address the symbol must end up calling
 * @trampoline: address of the lookup code placed by put_trampoline()
 * @idx:        slot of this symbol in the lookup table
 *
 * The stub is
 *	addi t0, x0, idx	(li t0, idx)
 *	jal  x0, trampoline
 * and the lookup table of 64-bit targets starts right behind the lookup
 * code, i.e. at trampoline + vdso_trampoline_size().
 */
static inline void put_trampoline_call(uint64_t from, uint64_t to, uint64_t trampoline, unsigned int idx)
{
	size_t trampoline_size = vdso_trampoline_size();
	uint64_t *lookup_table = NULL;
	uint32_t trampoline_call[2] = {
		0x00000293, /* addi t0, x0, 0 */
		0x0000006f, /* jal x0, 0 */
	};
	const size_t insts_len = ARRAY_SIZE(trampoline_call);
	uint32_t *call_addr = (uint32_t *)from;
	// Offset from the jal instruction to the lookup trampoline.
	ssize_t trampoline_offset = trampoline - (from + sizeof(uint32_t));

	/*
	 * The 12-bit immediate of an I-type instruction occupies bits 31:20.
	 * The index has to be placed there unscaled: the lookup code shifts
	 * t0 left by 3 itself to index the table of 8-byte entries written
	 * below.
	 */
	trampoline_call[0] = trampoline_call[0] | ((uint32_t)idx << 20);
	trampoline_call[1] = trampoline_call[1] | riscv_j_imm(trampoline_offset);

	for (unsigned int i = 0; i < insts_len; i++) {
		call_addr[i] = trampoline_call[i];
	}

	// Set the lookup table pointer for this vdso symbol.
	lookup_table = (uint64_t *)(trampoline + trampoline_size);
	lookup_table[idx] = to;
}
"HWCAP_S390_HPAGE", "HWCAP_S390_ETF3EH", "HWCAP_S390_HIGH_GPRS", + "HWCAP_S390_TE", "HWCAP_S390_VXRS", "HWCAP_S390_VXRS_BCD", "HWCAP_S390_VXRS_EXT", }; -static const char *hwcap_str2[64] = { }; +static const char *hwcap_str2[64] = {}; static const char **hwcap_str[2] = { hwcap_str1, hwcap_str2 }; @@ -73,7 +62,7 @@ int cpu_dump_cpuinfo(void) img = open_image(CR_FD_CPUINFO, O_DUMP); if (!img) - return -1; + return -1; cpu_info.s390_entry = &cpu_s390_info_ptr; cpu_info.n_s390_entry = 1; @@ -98,6 +87,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; @@ -124,11 +119,9 @@ int cpu_validate_cpuinfo(void) if (rt_cpuinfo.hwcap[nr] & (1 << cap)) continue; if (hwcap_str[nr][cap]) - pr_err("CPU Feature %s not supported on host\n", - hwcap_str[nr][cap]); + pr_err("CPU Feature %s not supported on host\n", hwcap_str[nr][cap]); else - pr_err("CPU Feature %d/%x not supported on host\n", - nr, 1 << cap); + pr_err("CPU Feature %d/%x not supported on host\n", nr, 1 << cap); ret = -1; } } diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 238035b76..e08c83878 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -17,7 +17,7 @@ #include "log.h" #include "util.h" #include "cpu.h" -#include +#include "compel/infect.h" #include "protobuf.h" #include "images/core.pb-c.h" @@ -26,12 +26,12 @@ #include "pstree.h" #include "image.h" -#define NT_PRFPREG 2 -#define NT_S390_VXRS_LOW 0x309 -#define NT_S390_VXRS_HIGH 0x30a -#define NT_S390_GS_CB 0x30b -#define NT_S390_GS_BC 0x30c -#define NT_S390_RI_CB 0x30d +#define NT_PRFPREG 2 +#define NT_S390_VXRS_LOW 0x309 +#define NT_S390_VXRS_HIGH 0x30a +#define NT_S390_GS_CB 0x30b +#define NT_S390_GS_BC 0x30c +#define NT_S390_RI_CB 0x30d /* * Print general purpose and access registers @@ -41,8 +41,7 @@ static void print_core_gpregs(const char *msg, 
UserS390RegsEntry *gpregs) int i; pr_debug("%s: General purpose registers\n", msg); - pr_debug(" psw %016lx %016lx\n", - gpregs->psw_mask, gpregs->psw_addr); + pr_debug(" psw %016lx %016lx\n", gpregs->psw_mask, gpregs->psw_addr); pr_debug(" orig_gpr2 %016lx\n", gpregs->orig_gpr2); for (i = 0; i < 16; i++) pr_debug(" g%02d %016lx\n", i, gpregs->gprs[i]); @@ -69,8 +68,7 @@ static void print_core_vx_regs(CoreEntry *core) for (i = 0; i < 16; i++) pr_debug(" vx_low%02d %016lx\n", i, vxrs_low->regs[i]); for (i = 0; i < 32; i += 2) - pr_debug(" vx_high%02d %016lx %016lx\n", i / 2, - vxrs_high->regs[i], vxrs_high->regs[i + 1]); + pr_debug(" vx_high%02d %016lx %016lx\n", i / 2, vxrs_high->regs[i], vxrs_high->regs[i + 1]); } /* @@ -144,6 +142,29 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Allocate VxrsLow registers */ @@ -236,7 +257,7 @@ fail_free_gs_cb: } /* - * Free Guareded Storage control blocks + * Free Guarded Storage control blocks */ static void free_gs_cb(UserS390GsCbEntry *gs_cb) { @@ -284,7 +305,7 @@ static void free_ri_cb(UserS390RiEntry *ri_cb) /* * Copy internal structures into Google Protocol Buffers */ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; UserS390VxrsLowEntry *vxrs_low = NULL; @@ -296,7 +317,13 @@ int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) CoreEntry *core = arg; gpregs = 
CORE_THREAD_ARCH_INFO(core)->gpregs; - fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + /* + * We delay allocating this until now because checkpointing can fail earlier. + * When it fails we need to know if we reached here or not so that the cleanup + * code doesn't restore FPRs that were never saved in the first place. + */ + fpregs = allocate_fp_regs(); + CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -395,44 +422,21 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) dst->fpregs.fpc = fpregs->fpc; memcpy(dst->fpregs.fprs, fpregs->fprs, sizeof(dst->fpregs.fprs)); if (vxrs_low) { - memcpy(&dst_ext->vxrs_low, vxrs_low->regs, - sizeof(dst_ext->vxrs_low)); - memcpy(&dst_ext->vxrs_high, vxrs_high->regs, - sizeof(dst_ext->vxrs_high)); + memcpy(&dst_ext->vxrs_low, vxrs_low->regs, sizeof(dst_ext->vxrs_low)); + memcpy(&dst_ext->vxrs_high, vxrs_high->regs, sizeof(dst_ext->vxrs_high)); } return 0; } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); + if (fpregs) { + xfree(fpregs->fprs); + xfree(fpregs); + } } /* @@ -491,15 +495,17 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - ti_s390->fpregs = allocate_fp_regs(); - if (!ti_s390->fpregs) - goto fail_free_gp_regs; + + /* + * Delay allocating space until needed. 
Checkpointing can fail before that + * and the cleanup code needs to be able to tell if FPRs were saved or not + * before trying to restore the register state. + */ + ti_s390->fpregs = NULL; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; -fail_free_gp_regs: - free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -682,14 +688,18 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* Floating point registers */ + /* + * Floating point registers + * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup + * so there's no guarantee that we saved FPRs for this thread. + */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (!cfpregs) - return -1; - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; + if (cfpregs) { + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + } /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { @@ -697,10 +707,8 @@ static int set_task_regs(pid_t pid, CoreEntry *core) if (!cvxrs_high) return -1; fpregs.flags |= USER_FPREGS_VXRS; - memcpy(&fpregs.vxrs_low, cvxrs_low->regs, - sizeof(fpregs.vxrs_low)); - memcpy(&fpregs.vxrs_high, cvxrs_high->regs, - sizeof(fpregs.vxrs_high)); + memcpy(&fpregs.vxrs_low, cvxrs_low->regs, sizeof(fpregs.vxrs_low)); + memcpy(&fpregs.vxrs_high, cvxrs_high->regs, sizeof(fpregs.vxrs_high)); if (set_vx_regs(pid, &fpregs) < 0) return -1; } @@ -720,19 +728,15 @@ int arch_set_thread_regs(struct pstree_item *item, bool with_threads) int i; for_each_pstree_item(item) { - if (item->pid->state == TASK_DEAD || - item->pid->state == TASK_ZOMBIE) + if (item->pid->state == TASK_DEAD || item->pid->state == TASK_ZOMBIE) continue; 
for (i = 0; i < item->nr_threads; i++) { - if (item->threads[i].state == TASK_DEAD || - item->threads[i].state == TASK_ZOMBIE) + if (item->threads[i].state == TASK_DEAD || item->threads[i].state == TASK_ZOMBIE) continue; if (!with_threads && i > 0) continue; - if (set_task_regs(item->threads[i].real, - item->core[i])) { - pr_perror("Not set registers for task %d", - item->threads[i].real); + if (set_task_regs(item->threads[i].real, item->core[i])) { + pr_perror("Not set registers for task %d", item->threads[i].real); return -1; } } diff --git a/criu/arch/s390/include/asm/dump.h b/criu/arch/s390/include/asm/dump.h index 53aaac9c4..5a24c5b3d 100644 --- a/criu/arch/s390/include/asm/dump.h +++ b/criu/arch/s390/include/asm/dump.h @@ -1,11 +1,13 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); -static inline void core_put_tls(CoreEntry *core, tls_t tls) { } +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ +} #define get_task_futex_robust_list_compat(pid, info) -1 diff --git a/criu/arch/s390/include/asm/kerndat.h b/criu/arch/s390/include/asm/kerndat.h index 60956b573..bb70cf6cf 100644 --- a/criu/arch/s390/include/asm/kerndat.h +++ b/criu/arch/s390/include/asm/kerndat.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_KERNDAT_H__ #define __CR_ASM_KERNDAT_H__ -#define kdat_compatible_cr() 0 -#define kdat_can_map_vdso() 0 +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/s390/include/asm/parasite.h b/criu/arch/s390/include/asm/parasite.h index 0b0268936..752753023 100644 --- a/criu/arch/s390/include/asm/parasite.h +++ b/criu/arch/s390/include/asm/parasite.h @@ -2,6 +2,9 @@ #define __ASM_PARASITE_H__ /* TLS is accessed through %a01, which is 
already processed */ -static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } +static inline void arch_get_tls(tls_t *ptls) +{ + (void)ptls; +} #endif diff --git a/criu/arch/s390/include/asm/restore.h b/criu/arch/s390/include/asm/restore.h index 6463d8e62..07ff49246 100644 --- a/criu/arch/s390/include/asm/restore.h +++ b/criu/arch/s390/include/asm/restore.h @@ -7,6 +7,7 @@ /* * Load stack to %r15, return address in %r14 and argument 1 into %r2 */ +/* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ @@ -18,7 +19,8 @@ : "d" (new_sp), \ "d"((unsigned long)restore_task_exec_start), \ "d" (task_args) \ - : "2", "14", "15", "memory") + : "2", "14", "memory") +/* clang-format on */ /* There is nothing to do since TLS is accessed through %a01 */ #define core_get_tls(pcore, ptls) diff --git a/criu/arch/s390/include/asm/restorer.h b/criu/arch/s390/include/asm/restorer.h index cfdefcab9..b8472afc8 100644 --- a/criu/arch/s390/include/asm/restorer.h +++ b/criu/arch/s390/include/asm/restorer.h @@ -11,20 +11,21 @@ /* * Clone trampoline - see glibc sysdeps/unix/sysv/linux/s390/s390-64/clone.S */ +/* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ "lgr %%r0,%6\n" /* Save thread_args in %r0 */ \ "lgr %%r1,%5\n" /* Save clone_restore_fn in %r1 */ \ - "lgr %%r2,%2\n" /* Parm 1: new_sp (child stack) */ \ - "lgr %%r3,%1\n" /* Parm 2: clone_flags */ \ - "lgr %%r4,%3\n" /* Parm 3: &parent_tid */ \ - "lgr %%r5,%4\n" /* Parm 4: &thread_args[i].pid */ \ - "lghi %%r6,0\n" /* Parm 5: tls = 0 */ \ + "lgr %%r2,%2\n" /* Parameter 1: new_sp (child stack) */ \ + "lgr %%r3,%1\n" /* Parameter 2: clone_flags */ \ + "lgr %%r4,%3\n" /* Parameter 3: &parent_tid */ \ + "lgr %%r5,%4\n" /* Parameter 4: &thread_args[i].pid */ \ + "lghi %%r6,0\n" /* Parameter 5: tls = 0 */ \ "svc "__stringify(__NR_clone)"\n" \ "ltgr %0,%%r2\n" /* Set and check "ret" */ 
\ "jnz 0f\n" /* ret != 0: Continue caller */ \ - "lgr %%r2,%%r0\n" /* Parm 1: &thread_args */ \ + "lgr %%r2,%%r0\n" /* Parameter 1: &thread_args */ \ "aghi %%r15,-160\n" /* Prepare stack frame */ \ "xc 0(8,%%r15),0(%%r15)\n" \ "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ @@ -39,19 +40,64 @@ "d"(&thread_args[i]) \ : "0", "1", "2", "3", "4", "5", "6", "cc", "memory") -#define arch_map_vdso(map, compat) -1 +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* + * clone3 only needs two arguments (r2, r3), this means + * we can use r4 and r5 for args and thread function. + * r4 and r5 are callee-saved and are not overwritten. + * No need to put these values on the child stack. + */ \ + "lgr %%r4,%4\n" /* Save args in %r4 */ \ + "lgr %%r5,%3\n" /* Save clone_restore_fn in %r5 */ \ + "lgr %%r2,%1\n" /* Parameter 1: clone_args */ \ + "lgr %%r3,%2\n" /* Parameter 2: size */ \ + /* + * On s390x a syscall is done sc . + * That only works for syscalls < 255. clone3 is 435, + * therefore it is necessary to load the syscall number + * into r1 and do 'svc 0'. + */ \ + "lghi %%r1,"__stringify(__NR_clone3)"\n" \ + "svc 0\n" \ + "ltgr %0,%%r2\n" /* Set and check "ret" */ \ + "jnz 0f\n" /* ret != 0: Continue caller */ \ + "lgr %%r2,%%r4\n" /* Thread arguments taken from r4. */ \ + "lgr %%r1,%%r5\n" /* Thread function taken from r5. 
*/ \ + "aghi %%r15,-160\n" /* Prepare stack frame */ \ + "xc 0(8,%%r15),0(%%r15)\n" \ + "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ + "j .+2\n" /* BUG(): Force PGM check */ \ +"0:\n" /* Continue caller */ \ + : "=d"(ret) \ + : "a"(&clone_args), \ + "d"(size), \ + "d"(clone_restore_fn), \ + "d"(args) \ + : "0", "1", "2", "3", "4", "5", "cc", "memory") +/* clang-format on */ + +#define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *r); int restore_nonsigframe_gpregs(UserS390RegsEntry *r); unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg); -unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, +unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset); -static inline void restore_tls(tls_t *ptls) { (void)ptls; } -static inline void *alloc_compat_syscall_stack(void) { return NULL; } -static inline void free_compat_syscall_stack(void *stack32) { } +static inline void restore_tls(tls_t *ptls) +{ + (void)ptls; +} +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; diff --git a/criu/arch/s390/include/asm/thread_pointer.h b/criu/arch/s390/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/s390/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/s390/include/asm/types.h b/criu/arch/s390/include/asm/types.h index 4f36c1309..abf12dec0 100644 --- a/criu/arch/s390/include/asm/types.h +++ b/criu/arch/s390/include/asm/types.h @@ -15,21 +15,29 @@ typedef UserS390RegsEntry UserRegsEntry; #define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__S390 -#define core_is_compat(core) false +#define core_is_compat(core) false #define CORE_THREAD_ARCH_INFO(core) core->ti_s390 -static inline u64 encode_pointer(void *p) { return (u64) p; } -static inline void *decode_pointer(u64 v) { return (void *) v; } +#define TI_IP(core) ((core)->ti_s390->gpregs->psw_addr) + +static inline u64 encode_pointer(void *p) +{ + return (u64)p; +} +static inline void *decode_pointer(u64 v) +{ + return (void *)v; +} /* * See also: * * arch/s390/include/uapi/asm/auxvec.h * * include/linux/auxvec.h */ -#define AT_VECTOR_SIZE_BASE 20 -#define AT_VECTOR_SIZE_ARCH 1 -#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +#define AT_VECTOR_SIZE_BASE 20 +#define AT_VECTOR_SIZE_ARCH 1 +#define AT_VECTOR_SIZE (2 * (AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) typedef uint64_t auxv_t; typedef 
uint64_t tls_t; diff --git a/criu/arch/s390/include/asm/vdso.h b/criu/arch/s390/include/asm/vdso.h index c54d848ad..72b3dca97 100644 --- a/criu/arch/s390/include/asm/vdso.h +++ b/criu/arch/s390/include/asm/vdso.h @@ -8,17 +8,18 @@ * This is a minimal amount of symbols * we should support at the moment. */ -#define VDSO_SYMBOL_MAX 4 -#define VDSO_SYMBOL_GTOD 0 +#define VDSO_SYMBOL_MAX 4 +#define VDSO_SYMBOL_GTOD 0 /* - * This definition is used in pie/util-vdso.c to initialize the vdso symbol + * These definitions are used in pie/util-vdso.c to initialize the vdso symbol * name string table 'vdso_symbols' */ -#define ARCH_VDSO_SYMBOLS \ - "__kernel_gettimeofday", \ - "__kernel_clock_gettime", \ - "__kernel_clock_getres", \ - "__kernel_getcpu" +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__kernel_gettimeofday"; \ + const char *aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ + const char *aarch_vdso_symbol3 = "__kernel_clock_getres"; \ + const char *aarch_vdso_symbol4 = "__kernel_getcpu"; +#define ARCH_VDSO_SYMBOLS aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4 #endif /* __CR_ASM_VDSO_H__ */ diff --git a/criu/arch/s390/restorer.c b/criu/arch/s390/restorer.c index 3823fda98..8b3bc44ba 100644 --- a/criu/arch/s390/restorer.c +++ b/criu/arch/s390/restorer.c @@ -23,15 +23,14 @@ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) unsigned long raddr; int ret; - ret = sys_ipc(21 /*SHMAT */, - shmid, /* first */ - shmflg, /* second */ - (unsigned long)&raddr, /* third */ - shmaddr, /* ptr */ - 0 /* fifth not used */); + ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ + shmflg, /* second */ + (unsigned long)&raddr, /* third */ + shmaddr, /* ptr */ + 0 /* fifth not used */); if (ret) - raddr = (unsigned long) ret; + raddr = (unsigned long)ret; return raddr; } diff --git a/criu/arch/s390/sigframe.c b/criu/arch/s390/sigframe.c index 03f206a8d..89a897260 100644 --- a/criu/arch/s390/sigframe.c +++ 
b/criu/arch/s390/sigframe.c @@ -13,8 +13,7 @@ * - sigframe : Pointer to local signal frame * - rsigframe: Pointer to remote signal frame of inferior */ -int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { return 0; } diff --git a/criu/arch/s390/vdso-pie.c b/criu/arch/s390/vdso-pie.c index 0667668ee..bf0366b0e 100644 --- a/criu/arch/s390/vdso-pie.c +++ b/criu/arch/s390/vdso-pie.c @@ -9,7 +9,7 @@ #include "common/bug.h" #ifdef LOG_PREFIX -# undef LOG_PREFIX +#undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " @@ -17,11 +17,11 @@ * Trampoline instruction sequence */ typedef struct { - u8 larl[6]; /* Load relative address of imm64 */ - u8 lg[6]; /* Load %r1 with imm64 */ - u8 br[2]; /* Branch to %r1 */ - u64 addr; /* Jump address */ - u32 guards; /* Guard bytes */ + u8 larl[6]; /* Load relative address of imm64 */ + u8 lg[6]; /* Load %r1 with imm64 */ + u8 br[2]; /* Branch to %r1 */ + u64 addr; /* Jump address */ + u32 guards; /* Guard bytes */ } __packed jmp_t; /* @@ -29,21 +29,20 @@ typedef struct { */ jmp_t jmp = { /* larl %r1,e (addr) */ - .larl = {0xc0, 0x10, 0x00, 0x00, 0x00, 0x07}, + .larl = { 0xc0, 0x10, 0x00, 0x00, 0x00, 0x07 }, /* lg %r1,0(%r1) */ - .lg = {0xe3, 0x10, 0x10, 0x00, 0x00, 0x04}, + .lg = { 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04 }, /* br %r1 */ - .br = {0x07, 0xf1}, - .guards = 0xcccccccc, + .br = { 0x07, 0xf1 }, + .guards = 0xcccccccc, }; /* * Insert trampoline code into old vdso entry points to * jump to new vdso functions. 
*/ -int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, - struct vdso_symtable *to, struct vdso_symtable *from, - bool __always_unused compat_vdso) +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, + struct vdso_symtable *from, bool __always_unused compat_vdso) { unsigned int i; @@ -51,14 +50,11 @@ int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, if (vdso_symbol_empty(&from->symbols[i])) continue; - pr_debug("jmp: %s: %lx/%lx -> %lx/%lx (index %d)\n", - from->symbols[i].name, base_from, - from->symbols[i].offset, - base_to, to->symbols[i].offset, i); + pr_debug("jmp: %s: %lx/%lx -> %lx/%lx (index %d)\n", from->symbols[i].name, base_from, + from->symbols[i].offset, base_to, to->symbols[i].offset, i); jmp.addr = base_to + to->symbols[i].offset; - memcpy((void *)(base_from + from->symbols[i].offset), &jmp, - sizeof(jmp)); + memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp)); } return 0; diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile index 618e85bb3..46f00e9e9 100644 --- a/criu/arch/x86/Makefile +++ b/criu/arch/x86/Makefile @@ -9,6 +9,7 @@ obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o +obj-y += shstk.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index 3808b9d33..2e1f2de9a 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -23,7 +23,7 @@ #include "protobuf.h" #include "images/cpuinfo.pb-c.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cpu: " static compel_cpuinfo_t rt_cpu_info; @@ -55,12 +55,9 @@ int cpu_init(void) } pr_debug("fpu:%d fxsr:%d xsave:%d xsaveopt:%d xsavec:%d xgetbv1:%d xsaves:%d\n", - !!compel_cpu_has_feature(X86_FEATURE_FPU), - !!compel_cpu_has_feature(X86_FEATURE_FXSR), - !!compel_cpu_has_feature(X86_FEATURE_OSXSAVE), - !!compel_cpu_has_feature(X86_FEATURE_XSAVEOPT), - 
!!compel_cpu_has_feature(X86_FEATURE_XSAVEC), - !!compel_cpu_has_feature(X86_FEATURE_XGETBV1), + !!compel_cpu_has_feature(X86_FEATURE_FPU), !!compel_cpu_has_feature(X86_FEATURE_FXSR), + !!compel_cpu_has_feature(X86_FEATURE_OSXSAVE), !!compel_cpu_has_feature(X86_FEATURE_XSAVEOPT), + !!compel_cpu_has_feature(X86_FEATURE_XSAVEC), !!compel_cpu_has_feature(X86_FEATURE_XGETBV1), !!compel_cpu_has_feature(X86_FEATURE_XSAVES)); return cpu_has_unsupported_features() ? -1 : 0; @@ -77,25 +74,24 @@ int cpu_dump_cpuinfo(void) if (!img) return -1; - cpu_info.x86_entry = &cpu_x86_info_ptr; - cpu_info.n_x86_entry = 1; + cpu_info.x86_entry = &cpu_x86_info_ptr; + cpu_info.n_x86_entry = 1; - cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? - CPUINFO_X86_ENTRY__VENDOR__INTEL : - CPUINFO_X86_ENTRY__VENDOR__AMD; + cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? CPUINFO_X86_ENTRY__VENDOR__INTEL : + CPUINFO_X86_ENTRY__VENDOR__AMD; - cpu_x86_info.cpu_family = rt_cpu_info.x86_family; - cpu_x86_info.model = rt_cpu_info.x86_model; - cpu_x86_info.stepping = rt_cpu_info.x86_mask; - cpu_x86_info.capability_ver = 2; - cpu_x86_info.n_capability = ARRAY_SIZE(rt_cpu_info.x86_capability); - cpu_x86_info.capability = (void *)rt_cpu_info.x86_capability; - cpu_x86_info.has_xfeatures_mask = true; - cpu_x86_info.xfeatures_mask = rt_cpu_info.xfeatures_mask; - cpu_x86_info.has_xsave_size = true; - cpu_x86_info.xsave_size = rt_cpu_info.xsave_size; - cpu_x86_info.has_xsave_size_max = true; - cpu_x86_info.xsave_size_max = rt_cpu_info.xsave_size_max; + cpu_x86_info.cpu_family = rt_cpu_info.x86_family; + cpu_x86_info.model = rt_cpu_info.x86_model; + cpu_x86_info.stepping = rt_cpu_info.x86_mask; + cpu_x86_info.capability_ver = 2; + cpu_x86_info.n_capability = ARRAY_SIZE(rt_cpu_info.x86_capability); + cpu_x86_info.capability = (void *)rt_cpu_info.x86_capability; + cpu_x86_info.has_xfeatures_mask = true; + cpu_x86_info.xfeatures_mask = rt_cpu_info.xfeatures_mask; + 
cpu_x86_info.has_xsave_size = true; + cpu_x86_info.xsave_size = rt_cpu_info.xsave_size; + cpu_x86_info.has_xsave_size_max = true; + cpu_x86_info.xsave_size_max = rt_cpu_info.xsave_size_max; if (rt_cpu_info.x86_model_id[0]) cpu_x86_info.model_id = rt_cpu_info.x86_model_id; @@ -109,103 +105,105 @@ int cpu_dump_cpuinfo(void) return 0; } -#define __ins_bit(__l, __v) (1u << ((__v) - 32u * (__l))) +#define __ins_bit(__l, __v) (1u << ((__v)-32u * (__l))) +// clang-format off static uint32_t x86_ins_capability_mask[NCAPINTS] = { [CPUID_1_EDX] = - __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), + __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), [CPUID_8000_0001_EDX] = - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_MMXEXT) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOW), + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_MMXEXT) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | + __ins_bit(CPUID_8000_0001_EDX, 
X86_FEATURE_3DNOW), [CPUID_LNX_1] = - __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | - __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), + __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | + __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), [CPUID_1_ECX] = - __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_PCLMULQDQ) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_SSSE3) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_PCLMULQDQ) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_SSSE3) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), [CPUID_8000_0001_ECX] = - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_MISALIGNSSE) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | - 
__ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_MISALIGNSSE) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), [CPUID_7_0_EBX] = - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ERMS) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512DQ) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_CLFLUSHOPT) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ERMS) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512DQ) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | + __ins_bit(CPUID_7_0_EBX, 
X86_FEATURE_CLFLUSHOPT) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), [CPUID_D_1_EAX] = - __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | - __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | - __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), [CPUID_7_0_ECX] = - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), [CPUID_8000_0008_EBX] = - __ins_bit(CPUID_8000_0008_EBX, X86_FEATURE_CLZERO), + __ins_bit(CPUID_8000_0008_EBX, X86_FEATURE_CLZERO), [CPUID_7_0_EDX] = - __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4VNNIW) | - __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), + __ins_bit(CPUID_7_0_EDX, 
X86_FEATURE_AVX512_4VNNIW) | + __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), }; +// clang-format on #undef __ins_bit @@ -236,49 +234,49 @@ static int cpu_validate_features(compel_cpuinfo_t *cpu_info) return -1; if (opts.cpu_cap & CPU_CAP_FPU) { + uint64_t m; /* * If we're requested to check FPU only ignore * any other bit. It's up to a user if the * rest of mismatches won't cause problems. */ -#define __mismatch_fpu_bit(__bit) \ - (test_bit(__bit, (void *)cpu_info->x86_capability) && \ - !compel_cpu_has_feature(__bit)) - if (__mismatch_fpu_bit(X86_FEATURE_FPU) || - __mismatch_fpu_bit(X86_FEATURE_FXSR) || - __mismatch_fpu_bit(X86_FEATURE_OSXSAVE) || - __mismatch_fpu_bit(X86_FEATURE_XSAVES)) { +#define __mismatch_fpu_bit(__bit) (test_bit(__bit, (void *)cpu_info->x86_capability) && !compel_cpu_has_feature(__bit)) + if (__mismatch_fpu_bit(X86_FEATURE_FPU) || __mismatch_fpu_bit(X86_FEATURE_FXSR) || + __mismatch_fpu_bit(X86_FEATURE_OSXSAVE) || __mismatch_fpu_bit(X86_FEATURE_XSAVES)) { pr_err("FPU feature required by image " "is not supported on host " "(fpu:%d fxsr:%d osxsave:%d xsaves:%d)\n", - __mismatch_fpu_bit(X86_FEATURE_FPU), - __mismatch_fpu_bit(X86_FEATURE_FXSR), - __mismatch_fpu_bit(X86_FEATURE_OSXSAVE), - __mismatch_fpu_bit(X86_FEATURE_XSAVES)); + __mismatch_fpu_bit(X86_FEATURE_FPU), __mismatch_fpu_bit(X86_FEATURE_FXSR), + __mismatch_fpu_bit(X86_FEATURE_OSXSAVE), __mismatch_fpu_bit(X86_FEATURE_XSAVES)); return -1; } #undef __mismatch_fpu_bit /* - * Make sure the xsave features are compatible. We already hit the - * issue with libc where we've checkpointed the container on old - * machine but restored on more modern one and libc fetched new - * xsave frame size directly by xsave instruction with greedy - * feature mask causing programs to misbehave. + * Make sure the xsave features are compatible. Check that on + * the destination there are all the features which were on the + * source. 
*/ - if (cpu_info->xfeatures_mask > rt_cpu_info.xfeatures_mask) { - uint64_t m = cpu_info->xfeatures_mask & ~rt_cpu_info.xfeatures_mask; - pr_err("CPU xfeatures has unsupported bits (%#llx)\n", - (unsigned long long)m); + if ((m = cpu_info->xfeatures_mask & ~rt_cpu_info.xfeatures_mask)) { + pr_err("CPU xfeatures has unsupported bits (%#" PRIx64 ")\n", m); return -1; - } else if (cpu_info->xsave_size != rt_cpu_info.xsave_size) { - pr_err("CPU xsave size mismatch (%u/%u)\n", - cpu_info->xsave_size, rt_cpu_info.xsave_size); + } + + /* + * Make sure the xsave sizes are compatible. We already hit the + * issue with libc where we've checkpointed the container on + * old machine but restored on more modern one and libc fetched + * new xsave frame size directly by xsave instruction with + * greedy feature mask causing programs to misbehave. + */ + if (cpu_info->xsave_size != rt_cpu_info.xsave_size) { + pr_err("CPU xsave size mismatch (%u/%u)\n", cpu_info->xsave_size, rt_cpu_info.xsave_size); return -1; - } else if (cpu_info->xsave_size_max != rt_cpu_info.xsave_size_max) { - pr_err("CPU xsave max size mismatch (%u/%u)\n", - cpu_info->xsave_size_max, rt_cpu_info.xsave_size_max); + } + if (cpu_info->xsave_size_max != rt_cpu_info.xsave_size_max) { + pr_err("CPU xsave max size mismatch (%u/%u)\n", cpu_info->xsave_size_max, + rt_cpu_info.xsave_size_max); return -1; } } @@ -295,8 +293,7 @@ static int cpu_validate_features(compel_cpuinfo_t *cpu_info) * Strict capability mode. Everything must match. 
*/ if (opts.cpu_cap & CPU_CAP_CPU) { - if (memcmp(cpu_info->x86_capability, rt_cpu_info.x86_capability, - sizeof(cpu_info->x86_capability))) { + if (memcmp(cpu_info->x86_capability, rt_cpu_info.x86_capability, sizeof(cpu_info->x86_capability))) { pr_err("CPU capabilities do not match run time\n"); return -1; } @@ -306,8 +303,8 @@ static int cpu_validate_features(compel_cpuinfo_t *cpu_info) } static const struct { - const uint32_t capability_ver; - const uint32_t ncapints; + const uint32_t capability_ver; + const uint32_t ncapints; } ncapints[] = { { .capability_ver = 1, .ncapints = NCAPINTS_V1 }, { .capability_ver = 2, .ncapints = NCAPINTS_V2 }, @@ -318,14 +315,12 @@ static compel_cpuinfo_t *img_to_cpuinfo(CpuinfoX86Entry *img_x86_entry) compel_cpuinfo_t *cpu_info; size_t size, i; - BUILD_BUG_ON(sizeof(img_x86_entry->capability[0]) != - sizeof(cpu_info->x86_capability[0])); + BUILD_BUG_ON(sizeof(img_x86_entry->capability[0]) != sizeof(cpu_info->x86_capability[0])); BUILD_BUG_ON(ARRAY_SIZE(rt_cpu_info.x86_capability) != NCAPINTS); if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL && img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) { - pr_err("Image carries unknown vendor %u\n", - (unsigned)img_x86_entry->vendor_id); + pr_err("Image carries unknown vendor %u\n", (unsigned)img_x86_entry->vendor_id); return NULL; } @@ -333,8 +328,7 @@ static compel_cpuinfo_t *img_to_cpuinfo(CpuinfoX86Entry *img_x86_entry) if (img_x86_entry->capability_ver == ncapints[i].capability_ver) { if (img_x86_entry->n_capability != ncapints[i].ncapints) { pr_err("Image carries %u words while %u expected\n", - (unsigned)img_x86_entry->n_capability, - (unsigned)ncapints[i].ncapints); + (unsigned)img_x86_entry->n_capability, (unsigned)ncapints[i].ncapints); return NULL; } break; @@ -342,8 +336,7 @@ static compel_cpuinfo_t *img_to_cpuinfo(CpuinfoX86Entry *img_x86_entry) } if (i >= ARRAY_SIZE(ncapints)) { - pr_err("Image carries unknown capability version %d\n", - 
(unsigned)img_x86_entry->capability_ver); + pr_err("Image carries unknown capability version %u\n", (unsigned)img_x86_entry->capability_ver); return NULL; } @@ -358,21 +351,20 @@ static compel_cpuinfo_t *img_to_cpuinfo(CpuinfoX86Entry *img_x86_entry) size = sizeof(img_x86_entry->capability[0]) * img_x86_entry->n_capability; memcpy(cpu_info->x86_capability, img_x86_entry->capability, size); if (img_x86_entry->capability_ver == 1) { - memcpy(&cpu_info->x86_capability[NCAPINTS_V1], - &rt_cpu_info.x86_capability[NCAPINTS_V1], + memcpy(&cpu_info->x86_capability[NCAPINTS_V1], &rt_cpu_info.x86_capability[NCAPINTS_V1], (NCAPINTS_V2 - NCAPINTS_V1) * sizeof(rt_cpu_info.x86_capability[0])); } if (img_x86_entry->vendor_id == CPUINFO_X86_ENTRY__VENDOR__INTEL) - cpu_info->x86_vendor = X86_VENDOR_INTEL; + cpu_info->x86_vendor = X86_VENDOR_INTEL; else - cpu_info->x86_vendor = X86_VENDOR_AMD; - cpu_info->x86_family = img_x86_entry->cpu_family; - cpu_info->x86_model = img_x86_entry->model; - cpu_info->x86_mask = img_x86_entry->stepping; - cpu_info->extended_cpuid_level = rt_cpu_info.extended_cpuid_level; - cpu_info->cpuid_level = rt_cpu_info.cpuid_level; - cpu_info->x86_power = rt_cpu_info.x86_power; + cpu_info->x86_vendor = X86_VENDOR_AMD; + cpu_info->x86_family = img_x86_entry->cpu_family; + cpu_info->x86_model = img_x86_entry->model; + cpu_info->x86_mask = img_x86_entry->stepping; + cpu_info->extended_cpuid_level = rt_cpu_info.extended_cpuid_level; + cpu_info->cpuid_level = rt_cpu_info.cpuid_level; + cpu_info->x86_power = rt_cpu_info.x86_power; memcpy(cpu_info->x86_vendor_id, rt_cpu_info.x86_model_id, sizeof(cpu_info->x86_vendor_id)); strncpy(cpu_info->x86_model_id, img_x86_entry->model_id, sizeof(cpu_info->x86_model_id) - 1); @@ -415,6 +407,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; diff --git
a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index efc23e5fe..1f4d0736b 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -1,5 +1,5 @@ #include "compel/asm/fpu.h" -#include "compel/compel.h" +#include "compel/infect.h" #include "compel/plugins/std/syscall-codes.h" #include "cpu.h" #include "cr_options.h" @@ -13,30 +13,31 @@ #undef LOG_PREFIX #define LOG_PREFIX "x86: " -#define XSAVE_PB_NELEMS(__s, __obj, __member) \ - (sizeof(__s) / sizeof(*(__obj)->__member)) +#define XSAVE_PB_NELEMS(__s, __obj, __member) (sizeof(__s) / sizeof(*(__obj)->__member)) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; - UserX86RegsEntry *gpregs = core->thread_info->gpregs; + UserX86RegsEntry *gpregs = core->thread_info->gpregs; -#define assign_reg(dst, src, e) do { dst->e = (__typeof__(dst->e))src.e; } while (0) -#define assign_array(dst, src, e) memcpy(dst->e, &src.e, sizeof(src.e)) -#define assign_xsave(feature, xsave, member, area) \ - do { \ - if (compel_fpu_has_feature(feature)) { \ - uint32_t off = compel_fpu_feature_offset(feature); \ - void *from = &area[off]; \ - size_t size = pb_repeated_size(xsave, member); \ - size_t xsize = (size_t)compel_fpu_feature_size(feature); \ - if (xsize != size) { \ - pr_err("%s reported %zu bytes (expecting %zu)\n", \ - # feature, xsize, size); \ - return -1; \ - } \ - memcpy(xsave->member, from, size); \ - } \ +#define assign_reg(dst, src, e) \ + do { \ + dst->e = (__typeof__(dst->e))src.e; \ + } while (0) +#define assign_array(dst, src, e) memcpy(dst->e, &src.e, sizeof(src.e)) +#define assign_xsave(feature, xsave, member, area) \ + do { \ + if (compel_fpu_has_feature(feature)) { \ + uint32_t off = compel_fpu_feature_offset(feature); \ + void *from = &area[off]; \ + size_t size = pb_repeated_size(xsave, member); \ + size_t xsize = 
(size_t)compel_fpu_feature_size(feature); \ + if (xsize != size) { \ + pr_err("%s reported %zu bytes (expecting %zu)\n", #feature, xsize, size); \ + return -1; \ + } \ + memcpy(xsave->member, from, size); \ + } \ } while (0) if (user_regs_native(regs)) { @@ -119,19 +120,27 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpre */ assign_reg(xsave, fpregs->xsave_hdr, xstate_bv); - assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); - assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); - assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); - assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); - assign_xsave(XFEATURE_ZMM_Hi256,xsave, zmm_upper, extended_state_area); - assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); - assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); + assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); + assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); + assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); + assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); + assign_xsave(XFEATURE_ZMM_Hi256, xsave, zmm_upper, extended_state_area); + assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); + assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); } #undef assign_reg #undef assign_array #undef assign_xsave + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + UserX86CetEntry *cet = core->thread_info->fpregs->xsave->cet; + struct cet_user_state *regs = &fpregs->cet; + + cet->cet = regs->cet; + cet->ssp = regs->ssp; + } + return 0; } @@ -139,7 +148,7 @@ static void alloc_tls(ThreadInfoX86 *ti, void **mempool) { int i; - ti->tls = xptr_pull_s(mempool, GDT_ENTRY_TLS_NUM*sizeof(UserDescT*)); + ti->tls = xptr_pull_s(mempool, GDT_ENTRY_TLS_NUM * sizeof(UserDescT *)); ti->n_tls = GDT_ENTRY_TLS_NUM; for (i = 0; i < 
GDT_ENTRY_TLS_NUM; i++) { ti->tls[i] = xptr_pull(mempool, UserDescT); @@ -150,54 +159,61 @@ static void alloc_tls(ThreadInfoX86 *ti, void **mempool) static int alloc_xsave_extends(UserX86XsaveEntry *xsave) { if (compel_fpu_has_feature(XFEATURE_YMM)) { - xsave->n_ymmh_space = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space); - xsave->ymmh_space = xzalloc(pb_repeated_size(xsave, ymmh_space)); + xsave->n_ymmh_space = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space); + xsave->ymmh_space = xzalloc(pb_repeated_size(xsave, ymmh_space)); if (!xsave->ymmh_space) goto err; } if (compel_fpu_has_feature(XFEATURE_BNDREGS)) { - xsave->n_bndreg_state = XSAVE_PB_NELEMS(struct mpx_bndreg_state, xsave, bndreg_state); - xsave->bndreg_state = xzalloc(pb_repeated_size(xsave, bndreg_state)); + xsave->n_bndreg_state = XSAVE_PB_NELEMS(struct mpx_bndreg_state, xsave, bndreg_state); + xsave->bndreg_state = xzalloc(pb_repeated_size(xsave, bndreg_state)); if (!xsave->bndreg_state) goto err; } if (compel_fpu_has_feature(XFEATURE_BNDCSR)) { - xsave->n_bndcsr_state = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state); - xsave->bndcsr_state = xzalloc(pb_repeated_size(xsave, bndcsr_state)); + xsave->n_bndcsr_state = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state); + xsave->bndcsr_state = xzalloc(pb_repeated_size(xsave, bndcsr_state)); if (!xsave->bndcsr_state) goto err; } if (compel_fpu_has_feature(XFEATURE_OPMASK)) { - xsave->n_opmask_reg = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg); - xsave->opmask_reg = xzalloc(pb_repeated_size(xsave, opmask_reg)); + xsave->n_opmask_reg = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg); + xsave->opmask_reg = xzalloc(pb_repeated_size(xsave, opmask_reg)); if (!xsave->opmask_reg) goto err; } if (compel_fpu_has_feature(XFEATURE_ZMM_Hi256)) { - xsave->n_zmm_upper = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper); - xsave->zmm_upper = xzalloc(pb_repeated_size(xsave, 
zmm_upper)); + xsave->n_zmm_upper = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper); + xsave->zmm_upper = xzalloc(pb_repeated_size(xsave, zmm_upper)); if (!xsave->zmm_upper) goto err; } if (compel_fpu_has_feature(XFEATURE_Hi16_ZMM)) { - xsave->n_hi16_zmm = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm); - xsave->hi16_zmm = xzalloc(pb_repeated_size(xsave, hi16_zmm)); + xsave->n_hi16_zmm = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm); + xsave->hi16_zmm = xzalloc(pb_repeated_size(xsave, hi16_zmm)); if (!xsave->hi16_zmm) goto err; } if (compel_fpu_has_feature(XFEATURE_PKRU)) { - xsave->n_pkru = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru); - xsave->pkru = xzalloc(pb_repeated_size(xsave, pkru)); + xsave->n_pkru = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru); + xsave->pkru = xzalloc(pb_repeated_size(xsave, pkru)); if (!xsave->pkru) goto err; } + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + xsave->cet = xzalloc(sizeof(UserX86CetEntry)); + if (!xsave->cet) + goto err; + user_x86_cet_entry__init(xsave->cet); + } + return 0; err: return -1; @@ -210,17 +226,17 @@ int arch_alloc_thread_info(CoreEntry *core) void *m; ThreadInfoX86 *ti = NULL; - with_fpu = compel_cpu_has_feature(X86_FEATURE_FPU); - sz = sizeof(ThreadInfoX86) + sizeof(UserX86RegsEntry) + - GDT_ENTRY_TLS_NUM*sizeof(UserDescT) + - GDT_ENTRY_TLS_NUM*sizeof(UserDescT*); + sz = sizeof(ThreadInfoX86) + sizeof(UserX86RegsEntry) + GDT_ENTRY_TLS_NUM * sizeof(UserDescT) + + GDT_ENTRY_TLS_NUM * sizeof(UserDescT *); if (with_fpu) { sz += sizeof(UserX86FpregsEntry); with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) + sz += sizeof(UserX86CetEntry); } m = xmalloc(sz); @@ -240,11 +256,11 @@ int arch_alloc_thread_info(CoreEntry *core) user_x86_fpregs_entry__init(fpregs); /* These are numbers from kernel */ - fpregs->n_st_space = 32; - fpregs->n_xmm_space = 64; + 
fpregs->n_st_space = 32; + fpregs->n_xmm_space = 64; - fpregs->st_space = xzalloc(pb_repeated_size(fpregs, st_space)); - fpregs->xmm_space = xzalloc(pb_repeated_size(fpregs, xmm_space)); + fpregs->st_space = xzalloc(pb_repeated_size(fpregs, st_space)); + fpregs->xmm_space = xzalloc(pb_repeated_size(fpregs, xmm_space)); if (!fpregs->st_space || !fpregs->xmm_space) goto err; @@ -293,16 +309,14 @@ static bool valid_xsave_frame(CoreEntry *core) if (core->thread_info->fpregs->n_st_space < ARRAY_SIZE(x->i387.st_space)) { pr_err("Corruption in FPU st_space area " "(got %li but %li expected)\n", - (long)core->thread_info->fpregs->n_st_space, - (long)ARRAY_SIZE(x->i387.st_space)); + (long)core->thread_info->fpregs->n_st_space, (long)ARRAY_SIZE(x->i387.st_space)); return false; } if (core->thread_info->fpregs->n_xmm_space < ARRAY_SIZE(x->i387.xmm_space)) { pr_err("Corruption in FPU xmm_space area " "(got %li but %li expected)\n", - (long)core->thread_info->fpregs->n_st_space, - (long)ARRAY_SIZE(x->i387.xmm_space)); + (long)core->thread_info->fpregs->n_xmm_space, (long)ARRAY_SIZE(x->i387.xmm_space)); return false; } @@ -310,46 +324,52 @@ static bool valid_xsave_frame(CoreEntry *core) if (xsave) { size_t i; struct { - const char *name; - size_t expected; - size_t obtained; - void *ptr; + const char *name; + size_t expected; + size_t obtained; + void *ptr; } features[] = { { - .name = __stringify_1(XFEATURE_YMM), - .expected = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space), - .obtained = xsave->n_ymmh_space, - .ptr = xsave->ymmh_space, - }, { - .name = __stringify_1(XFEATURE_BNDREGS), - .expected = XSAVE_PB_NELEMS(struct mpx_bndreg_state, xsave, bndreg_state), - .obtained = xsave->n_bndreg_state, - .ptr = xsave->bndreg_state, - }, { - .name = __stringify_1(XFEATURE_BNDCSR), - .expected = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state), - .obtained = xsave->n_bndcsr_state, - .ptr = xsave->bndcsr_state, - }, { - .name = __stringify_1(XFEATURE_OPMASK), -
.expected = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg), - .obtained = xsave->n_opmask_reg, - .ptr = xsave->opmask_reg, - }, { - .name = __stringify_1(XFEATURE_ZMM_Hi256), - .expected = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper), - .obtained = xsave->n_zmm_upper, - .ptr = xsave->zmm_upper, - }, { - .name = __stringify_1(XFEATURE_Hi16_ZMM), - .expected = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm), - .obtained = xsave->n_hi16_zmm, - .ptr = xsave->hi16_zmm, - }, { - .name = __stringify_1(XFEATURE_PKRU), - .expected = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru), - .obtained = xsave->n_pkru, - .ptr = xsave->pkru, + .name = __stringify_1(XFEATURE_YMM), + .expected = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space), + .obtained = xsave->n_ymmh_space, + .ptr = xsave->ymmh_space, + }, + { + .name = __stringify_1(XFEATURE_BNDREGS), + .expected = XSAVE_PB_NELEMS(struct mpx_bndreg_state, xsave, bndreg_state), + .obtained = xsave->n_bndreg_state, + .ptr = xsave->bndreg_state, + }, + { + .name = __stringify_1(XFEATURE_BNDCSR), + .expected = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state), + .obtained = xsave->n_bndcsr_state, + .ptr = xsave->bndcsr_state, + }, + { + .name = __stringify_1(XFEATURE_OPMASK), + .expected = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg), + .obtained = xsave->n_opmask_reg, + .ptr = xsave->opmask_reg, + }, + { + .name = __stringify_1(XFEATURE_ZMM_Hi256), + .expected = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper), + .obtained = xsave->n_zmm_upper, + .ptr = xsave->zmm_upper, + }, + { + .name = __stringify_1(XFEATURE_Hi16_ZMM), + .expected = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm), + .obtained = xsave->n_hi16_zmm, + .ptr = xsave->hi16_zmm, + }, + { + .name = __stringify_1(XFEATURE_PKRU), + .expected = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru), + .obtained = xsave->n_pkru, + .ptr = xsave->pkru, }, }; @@ 
-392,12 +412,11 @@ static void show_rt_xsave_frame(struct xsave_struct *x) pr_debug("xsave runtime structure\n"); pr_debug("-----------------------\n"); - pr_debug("cwd:%#x swd:%#x twd:%#x fop:%#x mxcsr:%#x mxcsr_mask:%#x\n", - (int)i387->cwd, (int)i387->swd, (int)i387->twd, - (int)i387->fop, (int)i387->mxcsr, (int)i387->mxcsr_mask); + pr_debug("cwd:%#x swd:%#x twd:%#x fop:%#x mxcsr:%#x mxcsr_mask:%#x\n", (int)i387->cwd, (int)i387->swd, + (int)i387->twd, (int)i387->fop, (int)i387->mxcsr, (int)i387->mxcsr_mask); - pr_debug("magic1:%#x extended_size:%u xstate_bv:%#lx xstate_size:%u\n", - fpx->magic1, fpx->extended_size, (long)fpx->xstate_bv, fpx->xstate_size); + pr_debug("magic1:%#x extended_size:%u xstate_bv:%#lx xstate_size:%u\n", fpx->magic1, fpx->extended_size, + (long)fpx->xstate_bv, fpx->xstate_size); pr_debug("xstate_bv: %#lx\n", (long)xsave_hdr->xstate_bv); pr_debug("-----------------------\n"); @@ -405,12 +424,9 @@ static void show_rt_xsave_frame(struct xsave_struct *x) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { - fpu_state_t *fpu_state = core_is_compat(core) ? - &sigframe->compat.fpu_state : - &sigframe->native.fpu_state; - struct xsave_struct *x = core_is_compat(core) ? - (void *)&fpu_state->fpu_state_ia32.xsave : - (void *)&fpu_state->fpu_state_64.xsave; + fpu_state_t *fpu_state = core_is_compat(core) ? &sigframe->compat.fpu_state : &sigframe->native.fpu_state; + struct xsave_struct *x = core_is_compat(core) ? 
(void *)&fpu_state->fpu_state_ia32.xsave : + (void *)&fpu_state->fpu_state_64.xsave; /* * If no FPU information provided -- we're restoring @@ -427,30 +443,33 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) fpu_state->has_fpu = true; -#define assign_reg(dst, src, e) do { dst.e = (__typeof__(dst.e))src->e; } while (0) -#define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) -#define assign_xsave(feature, xsave, member, area) \ - do { \ - if (compel_fpu_has_feature(feature)) { \ - uint32_t off = compel_fpu_feature_offset(feature); \ - void *to = &area[off]; \ - void *from = xsave->member; \ - size_t size = pb_repeated_size(xsave, member); \ - size_t xsize = (size_t)compel_fpu_feature_size(feature); \ - if (xsize != size) { \ - if (size) { \ - pr_err("%s reported %zu bytes (expecting %zu)\n",\ - # feature, xsize, size); \ - return -1; \ - } else { \ - pr_debug("%s is not present in image, ignore\n",\ - # feature); \ - } \ - } \ - xstate_bv |= (1UL << feature); \ - xstate_size += xsize; \ - memcpy(to, from, size); \ - } \ +#define assign_reg(dst, src, e) \ + do { \ + dst.e = (__typeof__(dst.e))src->e; \ + } while (0) +#define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) +#define assign_xsave(feature, xsave, member, area) \ + do { \ + if (compel_fpu_has_feature(feature) && (xsave->xstate_bv & (1UL << feature))) { \ + uint32_t off = compel_fpu_feature_offset(feature); \ + void *to = &area[off]; \ + void *from = xsave->member; \ + size_t size = pb_repeated_size(xsave, member); \ + size_t xsize = (size_t)compel_fpu_feature_size(feature); \ + size_t xstate_size_next = off + xsize; \ + if (xsize != size) { \ + if (size) { \ + pr_err("%s reported %zu bytes (expecting %zu)\n", #feature, xsize, size); \ + return -1; \ + } else { \ + pr_debug("%s is not present in image, ignore\n", #feature); \ + } \ + } \ + xstate_bv |= (1UL << feature); \ + BUG_ON(xstate_size > xstate_size_next); \ + xstate_size = xstate_size_next; \ + 
memcpy(to, from, size); \ + } \ } while (0) assign_reg(x->i387, core->thread_info->fpregs, cwd); @@ -485,21 +504,26 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) UserX86XsaveEntry *xsave = core->thread_info->fpregs->xsave; uint8_t *extended_state_area = (void *)x; - assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); - assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); - assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); - assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); - assign_xsave(XFEATURE_ZMM_Hi256,xsave, zmm_upper, extended_state_area); - assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); - assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); + /* + * Note the order does matter here and bound + * to the increasing offsets of XFEATURE_x + * inside memory layout (xstate_size calculation). + */ + assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); + assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); + assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); + assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); + assign_xsave(XFEATURE_ZMM_Hi256, xsave, zmm_upper, extended_state_area); + assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); + assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); } - x->xsave_hdr.xstate_bv = xstate_bv; + x->xsave_hdr.xstate_bv = xstate_bv; - fpx_sw->magic1 = FP_XSTATE_MAGIC1; - fpx_sw->xstate_bv = xstate_bv; - fpx_sw->xstate_size = xstate_size; - fpx_sw->extended_size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + fpx_sw->magic1 = FP_XSTATE_MAGIC1; + fpx_sw->xstate_bv = xstate_bv; + fpx_sw->xstate_size = xstate_size; + fpx_sw->extended_size = xstate_size + FP_XSTATE_MAGIC2_SIZE; /* * This should be at the end of xsave frame. 
@@ -517,7 +541,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) return 0; } -#define CPREG32(d) f->compat.uc.uc_mcontext.d = r->d +#define CPREG32(d) f->compat.uc.uc_mcontext.d = r->d static void restore_compat_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { CPREG32(gs); @@ -525,8 +549,15 @@ static void restore_compat_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) CPREG32(es); CPREG32(ds); - CPREG32(di); CPREG32(si); CPREG32(bp); CPREG32(sp); CPREG32(bx); - CPREG32(dx); CPREG32(cx); CPREG32(ip); CPREG32(ax); + CPREG32(di); + CPREG32(si); + CPREG32(bp); + CPREG32(sp); + CPREG32(bx); + CPREG32(dx); + CPREG32(cx); + CPREG32(ip); + CPREG32(ax); CPREG32(cs); CPREG32(ss); CPREG32(flags); @@ -535,7 +566,7 @@ static void restore_compat_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) } #undef CPREG32 -#define CPREG64(d, s) f->native.uc.uc_mcontext.d = r->s +#define CPREG64(d, s) f->native.uc.uc_mcontext.d = r->s static void restore_native_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { CPREG64(rdi, di); @@ -568,15 +599,15 @@ static void restore_native_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) { switch (r->mode) { - case USER_X86_REGS_MODE__NATIVE: - restore_native_gpregs(f, r); - break; - case USER_X86_REGS_MODE__COMPAT: - restore_compat_gpregs(f, r); - break; - default: - pr_err("Can't prepare rt_sigframe: registers mode corrupted (%d)\n", r->mode); - return -1; + case USER_X86_REGS_MODE__NATIVE: + restore_native_gpregs(f, r); + break; + case USER_X86_REGS_MODE__COMPAT: + restore_compat_gpregs(f, r); + break; + default: + pr_err("Can't prepare rt_sigframe: registers mode corrupted (%d)\n", r->mode); + return -1; } return 0; } @@ -584,26 +615,24 @@ int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len) { struct syscall_args32 s = { - .nr = __NR32_get_robust_list, - .arg0 = pid, - .arg1 = 
(uint32_t)head, - .arg2 = (uint32_t)len, + .nr = __NR32_get_robust_list, + .arg0 = pid, + .arg1 = (uint32_t)head, + .arg2 = (uint32_t)len, }; - do_full_int80(&s); - return (int)s.nr; + return do_full_int80(&s); } static int set_robust_list32(uint32_t head, uint32_t len) { struct syscall_args32 s = { - .nr = __NR32_set_robust_list, - .arg0 = head, - .arg1 = len, + .nr = __NR32_set_robust_list, + .arg0 = head, + .arg1 = len, }; - do_full_int80(&s); - return (int)s.nr; + return do_full_int80(&s); } int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) @@ -620,19 +649,18 @@ int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) if (ret == -ENOSYS) { /* Check native get_task_futex_robust_list() for details. */ if (set_robust_list32(0, 0) == (uint32_t)-ENOSYS) { - info->futex_rla = 0; - info->futex_rla_len = 0; + info->futex_rla = 0; + info->futex_rla_len = 0; ret = 0; } } else if (ret == 0) { - uint32_t *arg1 = (uint32_t*)mmap32; + uint32_t *arg1 = (uint32_t *)mmap32; - info->futex_rla = *arg1; - info->futex_rla_len = *(arg1 + 1); + info->futex_rla = *arg1; + info->futex_rla_len = *(arg1 + 1); ret = 0; } - free_compat_syscall_stack(mmap32); return ret; } diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h index cd1ae472d..4ca704fd7 100644 --- a/criu/arch/x86/include/asm/compat.h +++ b/criu/arch/x86/include/asm/compat.h @@ -2,19 +2,21 @@ #define __CR_ASM_COMPAT_H__ #ifdef CR_NOGLIBC -# include -# include +#include +#include #else -# define sys_mmap mmap -# define sys_munmap munmap +#define sys_mmap mmap +#define sys_munmap munmap #endif #include +#include "log.h" + static inline void *alloc_compat_syscall_stack(void) { - void *mem = (void*)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + void *mem = (void *)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, + -1, 0); if ((uintptr_t)mem % PAGE_SIZE) { int err = 
(~(uint32_t)(uintptr_t)mem) + 1; @@ -30,39 +32,56 @@ static inline void free_compat_syscall_stack(void *mem) long int ret = sys_munmap(mem, PAGE_SIZE); if (ret) - pr_err("munmap() of compat addr %p failed with %ld\n", - mem, ret); + pr_err("munmap() of compat addr %p failed with %ld\n", mem, ret); } struct syscall_args32 { uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5; }; -static inline void do_full_int80(struct syscall_args32 *args) +static inline uint32_t do_full_int80(struct syscall_args32 *args) { /* - * r8-r11 registers are cleared during returning to userspace - * from syscall - that's x86_64 ABI to avoid leaking kernel - * pointers. + * Kernel older than v4.4 do not preserve r8-r15 registers when + * invoking int80, so we need to preserve them. * - * Other than that - we can't use %rbp in clobbers as GCC's inline - * assembly doesn't allow to do so. So, here is explicitly saving - * %rbp before syscall and restoring it's value afterward. + * Additionally, %rbp is used as the 6th syscall argument, and we need + * to preserve its value when returning from the syscall to avoid + * upsetting GCC. However, we can't use %rbp in the GCC asm clobbers + * due to a GCC limitation. Instead, we explicitly save %rbp on the + * stack before invoking the syscall and restore its value afterward. + * + * Further, GCC may not adjust the %rsp pointer when allocating the + * args and ret variables because 1) do_full_int80() is a leaf + * function, and 2) the local variables (args and ret) are in the + * 128-byte red-zone as defined in the x86_64 ABI. To use the stack + * when preserving %rbp, we must either tell GCC to a) mark the + * function as non-leaf, or b) move away from the red-zone when using + * the stack. It seems that there is no easy way to do a), so we'll go + * with b). + * Note 1: Another workaround would have been to add %rsp in the list + * of clobbers, but this was deprecated in GCC 9. 
+ * Note 2: This red-zone bug only manifests when compiling CRIU with + * DEBUG=1. */ - asm volatile ("pushq %%rbp\n\t" - "mov %6, %%ebp\n\t" - "int $0x80\n\t" - "mov %%ebp, %6\n\t" - "popq %%rbp\n\t" - : "+a" (args->nr), - "+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2), - "+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5) - : : "r8", "r9", "r10", "r11"); + uint32_t ret; + + asm volatile("sub $128, %%rsp\n\t" + "pushq %%rbp\n\t" + "mov %7, %%ebp\n\t" + "int $0x80\n\t" + "popq %%rbp\n\t" + "add $128, %%rsp\n\t" + : "=a"(ret) + : "a"(args->nr), "b"(args->arg0), "c"(args->arg1), "d"(args->arg2), "S"(args->arg3), + "D"(args->arg4), "g"(args->arg5) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"); + return ret; } #ifndef CR_NOGLIBC -# undef sys_mmap -# undef sys_munmap +#undef sys_mmap +#undef sys_munmap #endif #endif diff --git a/criu/arch/x86/include/asm/dump.h b/criu/arch/x86/include/asm/dump.h index c79e0dfa9..925ea91ff 100644 --- a/criu/arch/x86/include/asm/dump.h +++ b/criu/arch/x86/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); @@ -11,8 +11,7 @@ static inline void core_put_tls(CoreEntry *core, tls_t tls) ThreadInfoX86 *ti = core->thread_info; int i; - for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) - { + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { user_desc_t *from = &tls.desc[i]; UserDescT *to = ti->tls[i]; @@ -26,7 +25,7 @@ static inline void core_put_tls(CoreEntry *core, tls_t tls) COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); - COPY_TLS(useable); + COPY_TLS(usable); #undef COPY_TLS } } diff --git 
a/criu/arch/x86/include/asm/kerndat.h b/criu/arch/x86/include/asm/kerndat.h index 903bc80f7..5c3717230 100644 --- a/criu/arch/x86/include/asm/kerndat.h +++ b/criu/arch/x86/include/asm/kerndat.h @@ -4,5 +4,6 @@ extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); +extern int kdat_has_shstk(void); #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/x86/include/asm/parasite.h b/criu/arch/x86/include/asm/parasite.h index 6b4d4ac59..7064f1df1 100644 --- a/criu/arch/x86/include/asm/parasite.h +++ b/criu/arch/x86/include/asm/parasite.h @@ -1,77 +1,13 @@ #ifndef __ASM_PARASITE_H__ #define __ASM_PARASITE_H__ -#include -#include -#include "asm/compat.h" - -static int arch_get_user_desc(user_desc_t *desc) +/* + * TLS is accessed through PTRACE_GET_THREAD_AREA, + * see compel_arch_fetch_thread_area(). + */ +static inline void arch_get_tls(tls_t *ptls) { - int ret = __NR32_get_thread_area; - /* - * For 64-bit applications, TLS (fs_base for Glibc) is - * in MSR, which are dumped with the help of arch_prctl(). - * - * But SET_FS_BASE will update GDT if base pointer fits in 4 bytes. - * Otherwise it will set only MSR, which allows for mixed 64/32-bit - * code to use: 2 MSRs as TLS base _and_ 3 GDT entries. - * Having in sum 5 TLS pointers, 3 of which are four bytes and - * other two bigger than four bytes: - * struct thread_struct { - * struct desc_struct tls_array[3]; - * ... - * #ifdef CONFIG_X86_64 - * unsigned long fsbase; - * unsigned long gsbase; - * #endif - * ... 
- * }; - */ - asm volatile ( - " mov %0,%%eax \n" - " mov %1,%%rbx \n" - " int $0x80 \n" - " mov %%eax,%0 \n" - : "+m"(ret) - : "m"(desc) - : "rax", "rbx", "r8", "r9", "r10", "r11", "memory"); - - if (ret) - pr_err("Failed to dump TLS descriptor #%d: %d\n", - desc->entry_number, ret); - return ret; -} - -static void arch_get_tls(tls_t *ptls) -{ - void *syscall_mem; - int i; - - syscall_mem = alloc_compat_syscall_stack(); - if (!syscall_mem) { - pr_err("Failed to allocate memory <4Gb for compat syscall\n"); - - for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { - user_desc_t *d = &ptls->desc[i]; - - d->seg_not_present = 1; - d->entry_number = GDT_ENTRY_TLS_MIN + i; - } - return; - } - - for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) - { - user_desc_t *d = syscall_mem; - - memset(d, 0, sizeof(user_desc_t)); - d->seg_not_present = 1; - d->entry_number = GDT_ENTRY_TLS_MIN + i; - arch_get_user_desc(d); - memcpy(&ptls->desc[i], d, sizeof(user_desc_t)); - } - - free_compat_syscall_stack(syscall_mem); + (void)ptls; } #endif diff --git a/criu/arch/x86/include/asm/restore.h b/criu/arch/x86/include/asm/restore.h index 21787a7c8..addf716a4 100644 --- a/criu/arch/x86/include/asm/restore.h +++ b/criu/arch/x86/include/asm/restore.h @@ -5,6 +5,7 @@ #include "images/core.pb-c.h" +/* clang-format off */ #define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ task_args) \ asm volatile( \ @@ -18,6 +19,7 @@ "g"(restore_task_exec_start), \ "g"(task_args) \ : "rdi", "rsi", "rbx", "rax", "memory") +/* clang-format on */ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { @@ -47,12 +49,11 @@ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); - COPY_TLS(useable); + COPY_TLS(usable); #undef COPY_TLS } } - int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); #endif diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index 25559b57c..3a673958d 100644 
--- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -3,18 +3,21 @@ #include "asm/types.h" #include +#include #include "images/core.pb-c.h" #include #include #include "asm/compat.h" +#include "asm/shstk.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); -extern int arch_compat_rt_sigaction(void *stack32, int sig, - rt_sigaction_t_compat *act); +extern int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act); extern int set_compat_robust_list(uint32_t head_ptr, uint32_t len); -#else /* CONFIG_COMPAT */ -static inline void restore_tls(tls_t *ptls) { } +#else /* CONFIG_COMPAT */ +static inline void restore_tls(tls_t *ptls) +{ +} static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) { return -1; @@ -25,6 +28,22 @@ static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) } #endif /* !CONFIG_COMPAT */ +/* + * Documentation copied from glibc sysdeps/unix/sysv/linux/x86_64/clone.S + * The kernel expects: + * rax: system call number + * rdi: flags + * rsi: child_stack + * rdx: TID field in parent + * r10: TID field in child + * r8: thread pointer + * + * int clone(unsigned long clone_flags, unsigned long newsp, + * int *parent_tidptr, int *child_tidptr, + * unsigned long tls); + */ + +/* clang-format off */ #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ @@ -63,6 +82,83 @@ static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) "g"(&thread_args[i]) \ : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") +/* int clone3(struct clone_args *args, size_t size) */ +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + /* + * Prepare stack pointer for child process. The kernel does + * stack + stack_size before passing the stack pointer to the + * child process. 
As we have to put the function and the + * arguments for the new process on that stack we have to handle + * the kernel's implicit stack + stack_size. + */ \ + "movq (%3), %%rsi /* new stack pointer */ \n" \ + /* Move the stack_size to %rax to use later as the offset */ \ + "movq %4, %%rax \n" \ + /* 16 bytes are needed on the stack for function and args */ \ + "subq $16, (%%rsi, %%rax) \n" \ + "movq %6, %%rdi /* thread args */ \n" \ + "movq %%rdi, 8(%%rsi, %%rax) \n" \ + "movq %5, %%rdi /* thread function */ \n" \ + "movq %%rdi, 0(%%rsi, %%rax) \n" \ + /* + * The stack address has been modified for the two + * elements above (child function, child arguments). + * This modified stack needs to be stored back into the + * clone_args structure. + */ \ + "movq (%%rsi), %3 \n" \ + /* + * Do the actual clone3() syscall. First argument (%rdi) is + * the clone_args structure, second argument is the size + * of clone_args. + */ \ + "movq %1, %%rdi /* clone_args */ \n" \ + "movq %2, %%rsi /* size */ \n" \ + "movl $"__stringify(__NR_clone3)", %%eax \n" \ + "syscall \n" \ + /* + * If clone3() was successful and if we are in the child + * '0' is returned. Jump to the child function handler. + */ \ + "testq %%rax,%%rax \n" \ + "jz thread3_run \n" \ + /* Return the PID to the parent process. */ \ + "movq %%rax, %0 \n" \ + "jmp clone3_end \n" \ + \ + "thread3_run: /* Child process */ \n" \ + /* Clear the frame pointer */ \ + "xorq %%rbp, %%rbp \n" \ + /* Pop the child function from the stack */ \ + "popq %%rax \n" \ + /* Pop the child function arguments from the stack */ \ + "popq %%rdi \n" \ + /* Run the child function */ \ + "callq *%%rax \n" \ + /* + * If the child function is expected to return, this + * would be the place to handle the return code. In CRIU's + * case the child function is expected to not return + * and do exit() itself. + */ \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + /* + * This uses the "r" modifier for all parameters + * as clang complained if using "g".
+ */ \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(&clone_args.stack), \ + "r"(clone_args.stack_size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") + #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "movq %0, %%rsp \n" \ @@ -71,30 +167,29 @@ static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) : \ : "r"(ret) \ : "memory") +/* clang-format on */ -static inline void -__setup_sas_compat(struct ucontext_ia32* uc, ThreadSasEntry *sas) +static inline void __setup_sas_compat(struct ucontext_ia32 *uc, ThreadSasEntry *sas) { - uc->uc_stack.ss_sp = (compat_uptr_t)(sas)->ss_sp; - uc->uc_stack.ss_flags = (int)(sas)->ss_flags; - uc->uc_stack.ss_size = (compat_size_t)(sas)->ss_size; + uc->uc_stack.ss_sp = (compat_uptr_t)(sas)->ss_sp; + uc->uc_stack.ss_flags = (int)(sas)->ss_flags; + uc->uc_stack.ss_size = (compat_size_t)(sas)->ss_size; } -static inline void -__setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas) +static inline void __setup_sas(struct rt_sigframe *sigframe, ThreadSasEntry *sas) { if (sigframe->is_native) { - struct rt_ucontext *uc = &sigframe->native.uc; + struct rt_ucontext *uc = &sigframe->native.uc; - uc->uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); - uc->uc_stack.ss_flags = (int)(sas)->ss_flags; - uc->uc_stack.ss_size = (size_t)(sas)->ss_size; + uc->uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); + uc->uc_stack.ss_flags = (int)(sas)->ss_flags; + uc->uc_stack.ss_size = (size_t)(sas)->ss_size; } else { __setup_sas_compat(&sigframe->compat.uc, sas); } } -static inline void _setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas) +static inline void _setup_sas(struct rt_sigframe *sigframe, ThreadSasEntry *sas) { if (sas) __setup_sas(sigframe, sas); diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000..d113fd8ab --- /dev/null +++ b/criu/arch/x86/include/asm/shstk.h @@ -0,0 +1,304 @@ 
+#ifndef __CR_ASM_SHSTK_H__ +#define __CR_ASM_SHSTK_H__ + +/* + * Shadow stack constants from Linux + */ +/* arch/x86/include/uapi/asm/mman.h */ +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +/* arch/x86/include/uapi/asm/prctl.h */ +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#define ARCH_HAS_SHSTK + +/* from arch/x86/kernel/shstk.c */ +#define SHSTK_DATA_BIT (1UL << 63) /* BIT(63) */ + +/* + * Shadow stack memory cannot be restored with memcpy/pread but only using + * a special instruction that can write to shadow stack. + * That instruction is only available when shadow stack is enabled, + * otherwise it causes #UD. + * + * Also, shadow stack VMAs cannot be mmap()ed or mremap()ed, they must be + * created using map_shadow_stack() system call. This pushes creation of + * shadow stack VMAs to the restorer blob after CRIU mappings are freed.
+ * + * And there is additional juggling with shadow stacks to ensure that we + * don't unmap an active shadow stack + * + * The overall sequence of restoring shadow stack is + * - Enable shadow stack early after clone()ing the task + * - Unlock shadow stack features using ptrace + * - In the restorer blob: + * - switch to a temporary shadow stack to be able to unmap shadow stack + * with the CRIU mappings + * - after memory mappings are restored, recreate shadow stack VMAs, + * populate them using wrss instruction and switch to the task shadow + * stack + * - lock shadow stack features + */ +struct rst_shstk_info { + unsigned long vma_start; /* start of shadow stack VMA */ + unsigned long vma_size; /* size of shadow stack VMA */ + unsigned long premmaped_addr; /* address of shadow stack copy in + the premmaped area */ + unsigned long tmp_shstk; /* address of temporary shadow stack */ + u64 ssp; /* shadow stack pointer */ + u64 cet; /* CET control state */ +}; +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_shstk_prepare + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); +#define arch_shstk_unlock arch_shstk_unlock + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline + +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + info->tmp_shstk = (unsigned long)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long __maybe_unused def) +{ + return !(info->cet &
ARCH_SHSTK_SHSTK) ? def : (4UL << 30); +} +#define shstk_min_mmap_addr shstk_min_mmap_addr + +#ifdef CR_NOGLIBC + +#include +#include +#include "vma.h" + +#define SHSTK_BUSY_BIT (1UL << 0) /* BIT(0) */ + +static inline int shstk_map(unsigned long addr, unsigned long size) +{ + long shstk = sys_map_shadow_stack(addr, size, SHADOW_STACK_SET_TOKEN); + + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", addr, shstk); + return -1; + } + + if (shstk != addr) { + pr_err("Shadow stack address mismatch: need %lx, got %lx\n", addr, shstk); + return -1; + } + + pr_info("Created shadow stack at %lx\n", shstk); + + return 0; +} + +/* clang-format off */ +static inline unsigned long get_ssp(void) +{ + unsigned long ssp; + + asm volatile("rdsspq %0" : "=r"(ssp) :: ); + + return ssp; +} + +static inline void wrssq(unsigned long addr, unsigned long val) +{ + asm volatile("wrssq %1, (%0)" :: "r"(addr), "r"(val) : "memory"); +} +/* clang-format on */ + +static always_inline void shstk_switch_ssp(unsigned long new_ssp) +{ + unsigned long old_ssp = get_ssp(); + + asm volatile("rstorssp (%0)\n" :: "r"(new_ssp)); + asm volatile("saveprevssp"); + + pr_debug("changed ssp from %lx to %lx\n", old_ssp, new_ssp); +} + +/* + * Disable writes to the shadow stack and lock its disable/enable control + */ +static inline int shstk_finalize(void) +{ + int ret = 0; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return ret; +} + +/* + * Create shadow stack vma and restore its content from premmapped anonymous (non-shstk) vma + */ +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + long shstk, i; + unsigned long *shstk_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + long ret; + +
shstk = sys_map_shadow_stack(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack: %ld\n", shstk); + return -1; + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + wrssq(shstk + i * 8, shstk_data[i]); + + ret = sys_munmap(shstk_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + /* + * From that point premapped vma is (shstk) and we need + * to mremap() it to the final location. Originally premapped + * (shstk_data) has been unmapped already. + */ + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore shstk_vma_restore + +/* + * Restore contents of the shadow stack and set shadow stack pointer + */ +static always_inline int shstk_restore(struct rst_shstk_info *cet) +{ + unsigned long ssp, val; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + /* + * Add tokens for sigreturn frame and for switch of the shadow stack. + * The sigreturn token will be checked by the kernel during + * processing of sigreturn + * The token for stack switch is required by rstorssp and + * saveprevssp semantics + */ + + /* token for sigreturn frame */ + ssp = cet->ssp - 8; + val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; + wrssq(ssp, val); + + /* shadow stack switch token */ + val = ssp | SHSTK_BUSY_BIT; + ssp -= 8; + wrssq(ssp, val); + + /* reset shadow stack pointer to the proper location */ + shstk_switch_ssp(ssp); + + return shstk_finalize(); +} +#define arch_shstk_restore shstk_restore + +/* + * Disable shadow stack + */ +static inline int shstk_disable(void) +{ + int ret; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); + if (ret) { + pr_err("Failed to disable shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + 
if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return 0; +} + +/* + * Switch to temporary shadow stack + */ +static always_inline int shstk_switch_to_restorer(struct rst_shstk_info *cet) +{ + unsigned long ssp; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + ret = sys_munmap((void *)cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap area for temporary shadow stack\n"); + return -1; + } + + ret = shstk_map(cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) + return -1; + + /* + * Switch shadow stack from the default created by the kernel to a + * temporary shadow stack allocated in the premmaped area + */ + ssp = cet->tmp_shstk + PAGE_SIZE - 8; + shstk_switch_ssp(ssp); + + ret = sys_arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to enable writes to shadow stack\n"); + return ret; + } + + return 0; +} +#define arch_shstk_switch_to_restorer shstk_switch_to_restorer + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/include/asm/thread_pointer.h b/criu/arch/x86/include/asm/thread_pointer.h new file mode 100644 index 000000000..08603aed4 --- /dev/null +++ b/criu/arch/x86/include/asm/thread_pointer.h @@ -0,0 +1,37 @@ +/* __thread_pointer definition. x86 version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ +#if __GNUC_PREREQ(11, 1) + return __builtin_thread_pointer(); +#else + void *__result; +#ifdef __x86_64__ + __asm__("mov %%fs:0, %0" : "=r"(__result)); +#else + __asm__("mov %%gs:0, %0" : "=r"(__result)); +#endif + return __result; +#endif /* !GCC 11 */ +} + +#endif /* _SYS_THREAD_POINTER_H */ \ No newline at end of file diff --git a/criu/arch/x86/include/asm/types.h b/criu/arch/x86/include/asm/types.h index 3ff7fc630..8919d0ae6 100644 --- a/criu/arch/x86/include/asm/types.h +++ b/criu/arch/x86/include/asm/types.h @@ -15,12 +15,12 @@ static inline int core_is_compat(CoreEntry *c) { switch (c->thread_info->gpregs->mode) { - case USER_X86_REGS_MODE__NATIVE: - return 0; - case USER_X86_REGS_MODE__COMPAT: - return 1; - default: - return -1; + case USER_X86_REGS_MODE__NATIVE: + return 0; + case USER_X86_REGS_MODE__COMPAT: + return 1; + default: + return -1; } } @@ -28,25 +28,20 @@ static inline int core_is_compat(CoreEntry *c) #define CORE_THREAD_ARCH_INFO(core) core->thread_info +#define TI_IP(core) ((core)->thread_info->gpregs->ip) + typedef UserX86RegsEntry UserRegsEntry; -static inline u64 encode_pointer(void *p) { return (u64)(long)p; } -static inline void *decode_pointer(u64 v) { return (void*)(long)v; } +static inline u64 encode_pointer(void *p) +{ + return (u64)(long)p; +} +static inline void *decode_pointer(u64 v) +{ + return (void *)(long)v; +} -#define AT_VECTOR_SIZE 44 +#define AT_VECTOR_SIZE 44 typedef uint64_t auxv_t; -/* - * Linux preserves three TLS segments in GDT. - * Offsets in GDT differ between 32-bit and 64-bit machines. - * For 64-bit x86 those GDT offsets are the same - * for native and compat tasks. 
- */ -#define GDT_ENTRY_TLS_MIN 12 -#define GDT_ENTRY_TLS_MAX 14 -#define GDT_ENTRY_TLS_NUM 3 -typedef struct { - user_desc_t desc[GDT_ENTRY_TLS_NUM]; -} tls_t; - #endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h index 28ae2d15a..ca46374a5 100644 --- a/criu/arch/x86/include/asm/vdso.h +++ b/criu/arch/x86/include/asm/vdso.h @@ -12,8 +12,8 @@ * This is a minimal amount of symbols * we should support at the moment. */ -#define VDSO_SYMBOL_MAX 6 -#define VDSO_SYMBOL_GTOD 2 +#define VDSO_SYMBOL_MAX 7 +#define VDSO_SYMBOL_GTOD 2 /* * XXX: we don't patch __kernel_vsyscall as it's too small: @@ -35,33 +35,36 @@ * vsyscall will be patched again when addressing: * https://github.com/checkpoint-restore/criu/issues/512 */ -#define ARCH_VDSO_SYMBOLS \ - "__vdso_clock_gettime", \ - "__vdso_getcpu", \ - "__vdso_gettimeofday", \ - "__vdso_time", \ - "__kernel_sigreturn", \ - "__kernel_rt_sigreturn" + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol2 = "__vdso_getcpu"; \ + const char *aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ + const char *aarch_vdso_symbol4 = "__vdso_time"; \ + const char *aarch_vdso_symbol5 = "__kernel_sigreturn"; \ + const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; \ + const char *aarch_vdso_symbol7 = "__vdso_clock_gettime64"; \ + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ + aarch_vdso_symbol6, aarch_vdso_symbol7 /* "__kernel_vsyscall", */ #ifndef ARCH_MAP_VDSO_32 -# define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_32 0x2002 #endif #ifndef ARCH_MAP_VDSO_64 -# define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_MAP_VDSO_64 0x2003 #endif #if defined(CONFIG_COMPAT) && !defined(__ASSEMBLY__) struct vdso_symtable; -extern int vdso_fill_symtable(uintptr_t mem, size_t size, - struct vdso_symtable *t); -extern int 
vdso_fill_symtable_compat(uintptr_t mem, size_t size, - struct vdso_symtable *t); +extern int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t); +extern int vdso_fill_symtable_compat(uintptr_t mem, size_t size, struct vdso_symtable *t); -static inline int __vdso_fill_symtable(uintptr_t mem, size_t size, - struct vdso_symtable *t, bool compat_vdso) +static inline int __vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t, bool compat_vdso) { if (compat_vdso) return vdso_fill_symtable_compat(mem, size, t); diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c index f7593251b..3a58bbea7 100644 --- a/criu/arch/x86/kerndat.c +++ b/criu/arch/x86/kerndat.c @@ -17,6 +17,7 @@ #include "asm/compat.h" #include "asm/dump.h" +#include "asm/shstk.h" int kdat_can_map_vdso(void) { @@ -58,26 +59,22 @@ int kdat_can_map_vdso(void) return -1; return WEXITSTATUS(stat); - } #ifdef CONFIG_COMPAT -void *mmap_ia32(void *addr, size_t len, int prot, - int flags, int fildes, off_t off) +void *mmap_ia32(void *addr, size_t len, int prot, int flags, int fildes, off_t off) { struct syscall_args32 s; - s.nr = __NR32_mmap2; - s.arg0 = (uint32_t)(uintptr_t)addr; - s.arg1 = (uint32_t)len; - s.arg2 = prot; - s.arg3 = flags; - s.arg4 = fildes; - s.arg5 = (uint32_t)off; + s.nr = __NR32_mmap2; + s.arg0 = (uint32_t)(uintptr_t)addr; + s.arg1 = (uint32_t)len; + s.arg2 = prot; + s.arg3 = flags; + s.arg4 = fildes; + s.arg5 = (uint32_t)off; - do_full_int80(&s); - - return (void *)(uintptr_t)s.nr; + return (void *)(uintptr_t)do_full_int80(&s); } /* @@ -103,7 +100,7 @@ static void mmap_bug_test(void) void *map1, *map2; int err; - map1 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + map1 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); /* 32-bit error, not sign-extended - can't use IS_ERR_VALUE() here */ err = (uintptr_t)map1 % PAGE_SIZE; if (err) { @@ -112,11 +109,11 @@ static void mmap_bug_test(void) } if 
(munmap(map1, PAGE_SIZE)) { - pr_err("Failed to unmap() 32-bit mapping: %m\n"); + pr_perror("Failed to unmap() 32-bit mapping"); exit(1); } - map2 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + map2 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); err = (uintptr_t)map2 % PAGE_SIZE; if (err) { pr_err("ia32 mmap() failed: %d\n", err); @@ -202,7 +199,7 @@ static int kdat_x86_has_ptrace_fpu_xsave_bug_child(void *arg) */ int kdat_x86_has_ptrace_fpu_xsave_bug(void) { - user_fpregs_struct_t xsave = { }; + user_fpregs_struct_t xsave = {}; struct iovec iov; char stack[PAGE_SIZE]; int flags = CLONE_VM | CLONE_FILES | CLONE_UNTRACED | SIGCHLD; @@ -214,8 +211,7 @@ int kdat_x86_has_ptrace_fpu_xsave_bug(void) if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) return 0; - child = clone(kdat_x86_has_ptrace_fpu_xsave_bug_child, - stack + ARRAY_SIZE(stack), flags, 0); + child = clone(kdat_x86_has_ptrace_fpu_xsave_bug_child, stack + ARRAY_SIZE(stack), flags, 0); if (child < 0) { pr_perror("%s(): failed to clone()", __func__); return -1; @@ -226,7 +222,7 @@ int kdat_x86_has_ptrace_fpu_xsave_bug(void) * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN, * and the child has stopped already. */ - pr_perror("Failed to wait for %s() test\n", __func__); + pr_perror("Failed to wait for %s() test", __func__); goto out_kill; } @@ -256,3 +252,29 @@ out_kill: return ret; } + +/* + * Unlike most kerndat knobs, this does not check for availability of the + * shadow stack in the kernel, but rather checks if criu runs with shadow + * stack enabled. + * + * This depends on hardware availability, kernel and glibc support, compiler + * options and glibc tunables. 
+ */ +int kdat_has_shstk(void) +{ + unsigned long features; + + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return 0; + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_STATUS, &features)) { + /* kernels that don't support shadow stack return -EINVAL */ + if (errno == EINVAL) + return 0; + pr_perror("Cannot get shadow stack status"); + return 1; + } + + return !!(features & ARCH_SHSTK_SHSTK); +} diff --git a/criu/arch/x86/restorer.c b/criu/arch/x86/restorer.c index 2d335d5e1..2ceb26fca 100644 --- a/criu/arch/x86/restorer.c +++ b/criu/arch/x86/restorer.c @@ -17,8 +17,7 @@ int arch_map_vdso(unsigned long map_at, bool compatible) { int vdso_type = compatible ? ARCH_MAP_VDSO_32 : ARCH_MAP_VDSO_64; - pr_debug("Mapping %s vDSO at %lx\n", - compatible ? "compatible" : "native", map_at); + pr_debug("Mapping %s vDSO at %lx\n", compatible ? "compatible" : "native", map_at); return sys_arch_prctl(vdso_type, map_at); } @@ -49,13 +48,12 @@ int restore_nonsigframe_gpregs(UserX86RegsEntry *r) int set_compat_robust_list(uint32_t head_ptr, uint32_t len) { struct syscall_args32 s = { - .nr = __NR32_set_robust_list, - .arg0 = head_ptr, - .arg1 = len, + .nr = __NR32_set_robust_list, + .arg0 = head_ptr, + .arg1 = len, }; - do_full_int80(&s); - return (int)s.nr; + return do_full_int80(&s); } static int prepare_stack32(void **stack32) @@ -96,18 +94,16 @@ void restore_tls(tls_t *ptls) return; memcpy(stack32, desc, sizeof(user_desc_t)); - asm volatile ( - " mov %1,%%eax \n" - " mov %2,%%ebx \n" - " int $0x80 \n" - " mov %%eax,%0 \n" - : "=g"(ret) - : "r"(__NR32_set_thread_area), "r"((uint32_t)(uintptr_t)stack32) - : "eax", "ebx", "r8", "r9", "r10", "r11", "memory"); + asm volatile(" mov %1,%%eax \n" + " mov %2,%%ebx \n" + " int $0x80 \n" + " mov %%eax,%0 \n" + : "=g"(ret) + : "r"(__NR32_set_thread_area), "r"((uint32_t)(uintptr_t)stack32) + : "eax", "ebx", "r8", "r9", "r10", "r11", "memory"); if (ret) - pr_err("Failed to restore TLS descriptor %u in GDT: %d\n", - desc->entry_number, ret); + 
pr_err("Failed to restore TLS descriptor %u in GDT: %d\n", desc->entry_number, ret); } if (stack32) diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c new file mode 100644 index 000000000..0810efac5 --- /dev/null +++ b/criu/arch/x86/shstk.c @@ -0,0 +1,222 @@ +#include +#include + +#include + +#include + +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +static bool task_needs_shstk(struct pstree_item *item, CoreEntry *core) +{ + UserX86FpregsEntry *fpregs; + + if (!task_alive(item)) + return false; + + fpregs = core->thread_info->fpregs; + if (fpregs->xsave && fpregs->xsave->cet) { + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + pr_warn_once("Restoring task with shadow stack on non-CET machine\n"); + return false; + } + + if (fpregs->xsave->cet->cet & ARCH_SHSTK_SHSTK) + return true; + } + + return false; +} + +static int shstk_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *shstk) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, shstk->ssp)) { + unsigned long premmaped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + shstk->vma_start = vma->e->start; + shstk->vma_size = size; + shstk->premmaped_addr = premmaped_addr; + + break; + } + } + + return 0; +} + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + UserX86FpregsEntry *fpregs = core->thread_info->fpregs; + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *shstk = &ta->shstk; + int i; + + if (!task_needs_shstk(item, core)) + return 0; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + 
struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + fpregs = core->thread_info->fpregs; + shstk = &thread_args->shstk; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + } + + return 0; +} + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid) +{ + unsigned long features; + int status; + int ret = -1; + + /* + * CRIU runs with no shadow stack and the task does not need one, + * nothing to do. + */ + if (!kdat.has_shstk && !task_needs_shstk(item, core)) + return 0; + + futex_wait_until(&rsti(item)->shstk_enable, 1); + + if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { + pr_perror("Cannot attach to %d", pid); + goto futex_wake; + } + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Cannot interrupt the %d task", pid); + goto detach; + } + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("waitpid(%d) failed", pid); + goto detach; + } + + features = ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS; + if (ptrace(PTRACE_ARCH_PRCTL, pid, features, ARCH_SHSTK_UNLOCK)) { + pr_perror("Cannot unlock CET for %d task", pid); + goto detach; + } + +detach: + if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { + pr_perror("Unable to detach %d", pid); + goto futex_wake; + } + + ret = 0; + +futex_wake: + futex_set_and_wake(&rsti(item)->shstk_unlock, 1); + + return ret; +} + +static void shstk_sync_unlock(struct pstree_item *item) +{ + /* notify parent that shadow stack is enabled ... */ + futex_set_and_wake(&rsti(item)->shstk_enable, 1); + + /* ... 
and wait until it unlocks its features with ptrace */ + futex_wait_until(&rsti(item)->shstk_unlock, 1); +} + +static void __arch_shstk_enable(struct pstree_item *item, + int (*func)(void *arg), void *arg) +{ + int ret; + + shstk_sync_unlock(item); + + /* return here would cause #CP, use exit() instead */ + ret = func(arg); + exit(ret); +} + +static int shstk_disable(struct pstree_item *item) +{ + shstk_sync_unlock(item); + + /* disable shadow stack, implicitly clears ARCH_SHSTK_WRSS */ + if (syscall(__NR_arch_prctl, ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) { + pr_perror("Failed to disable shadow stack"); + return -1; + } + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_LOCK, + ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS)) { + pr_perror("Failed to lock shadow stack controls"); + return -1; + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + unsigned long features = ARCH_SHSTK_SHSTK; + int code = ARCH_SHSTK_ENABLE; + + /* + * If task does not need shadow stack but CRIU runs with shadow + * stack enabled, we should disable it before continuing with + * restore + */ + if (!task_needs_shstk(item, core)) { + if (kdat.has_shstk && shstk_disable(item)) + return -1; + return func(arg); + } + + /* + * Calling sys_arch_prctl() means there will be use of retq + * instruction after shadow stack is enabled and this will cause + * Control Protectiond fault. Open code sys_arch_prctl() in + * assembly. + * + * code and addr should be in %rdi and %rsi and will be passed to + * the system call as is. 
+ */ + asm volatile("movq $"__stringify(__NR_arch_prctl)", %%rax \n" + "syscall \n" + "cmpq $0, %%rax \n" + "je 1f \n" + "retq \n" + "1: \n" + :: "D"(code), "S"(features)); + + __arch_shstk_enable(item, func, arg); + + /* never reached */ + return -1; +} diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c index b38ba8011..506a8d1bb 100644 --- a/criu/arch/x86/sigaction_compat.c +++ b/criu/arch/x86/sigaction_compat.c @@ -5,21 +5,21 @@ #include #ifdef CR_NOGLIBC -# include +#include #endif #include "cpu.h" -asm ( " .pushsection .text \n" - " .global restore_rt_sigaction \n" - " .code32 \n" - "restore_rt_sigaction: \n" - " mov %edx, %esi \n" - " mov $0, %edx \n" - " movl $"__stringify(__NR32_rt_sigaction)",%eax \n" - " int $0x80 \n" - " ret \n" - " .popsection \n" - " .code64"); +asm(" .pushsection .text \n" + " .global restore_rt_sigaction \n" + " .code32 \n" + "restore_rt_sigaction: \n" + " mov %edx, %esi \n" + " mov $0, %edx \n" + " movl $" __stringify(__NR32_rt_sigaction) ",%eax \n" + " int $0x80 \n" + " ret \n" + " .popsection \n" + " .code64"); extern char restore_rt_sigaction; /* @@ -28,7 +28,6 @@ extern char restore_rt_sigaction; */ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) { - int ret; struct syscall_args32 arg = {}; unsigned long act_stack = (unsigned long)stack32; @@ -43,14 +42,11 @@ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) * coping it on the bottom of the stack. 
*/ memcpy(stack32, act, sizeof(rt_sigaction_t_compat)); - arg.nr = __NR32_rt_sigaction; - arg.arg0 = sig; - arg.arg1 = (uint32_t)act_stack; /* act */ - arg.arg2 = 0; /* oldact */ - arg.arg3 = (uint32_t)sizeof(act->rt_sa_mask); /* sigsetsize */ + arg.nr = __NR32_rt_sigaction; + arg.arg0 = sig; + arg.arg1 = (uint32_t)act_stack; /* act */ + arg.arg2 = 0; /* oldact */ + arg.arg3 = (uint32_t)sizeof(act->rt_sa_mask); /* sigsetsize */ - do_full_int80(&arg); - asm volatile ("\t movl %%eax,%0\n" : "=r"(ret)); - return ret; + return do_full_int80(&arg); } - diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 11b0d640d..46612e70d 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -6,30 +6,30 @@ #include "log.h" -int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, - struct rt_sigframe *rsigframe) +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) { /* * Use local sigframe to check native/compat type, * but set address for rsigframe. */ - fpu_state_t *fpu_state = (sigframe->is_native) ? - &rsigframe->native.fpu_state : - &rsigframe->compat.fpu_state; + fpu_state_t *fpu_state = (sigframe->is_native) ? 
&rsigframe->native.fpu_state : &rsigframe->compat.fpu_state; if (sigframe->is_native) { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; if ((addr % 64ul)) { - pr_err("Unaligned address passed: %lx (native %d)\n", - addr, sigframe->is_native); + pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); return -1; } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; - } else if (!sigframe->is_native) { - sigframe->compat.uc.uc_mcontext.fpstate = - (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; + } else { + unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; + sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; + if ((addr % 64ul)) { + pr_err("Unaligned address passed: %lx (native %d)\n", addr, sigframe->is_native); + return -1; + } } return 0; diff --git a/criu/arch/x86/sys-exec-tbl.c b/criu/arch/x86/sys-exec-tbl.c index 608dc2510..cd7cda47c 100644 --- a/criu/arch/x86/sys-exec-tbl.c +++ b/criu/arch/x86/sys-exec-tbl.c @@ -1,21 +1,19 @@ -#include static struct syscall_exec_desc sc_exec_table_64[] = { #include "sys-exec-tbl-64.c" - { }, /* terminator */ + {}, /* terminator */ }; #ifdef CONFIG_COMPAT static struct syscall_exec_desc sc_exec_table_32[] = { #include "sys-exec-tbl-32.c" - { }, /* terminator */ + {}, /* terminator */ }; #endif struct syscall_exec_desc; -static inline struct syscall_exec_desc * -find_syscall_table(char *name, struct syscall_exec_desc *tbl) +static inline struct syscall_exec_desc *find_syscall_table(char *name, struct syscall_exec_desc *tbl) { int i; @@ -28,7 +26,7 @@ find_syscall_table(char *name, struct syscall_exec_desc *tbl) #define ARCH_HAS_FIND_SYSCALL /* overwrite default to search in two tables above */ #ifdef CONFIG_COMPAT -struct syscall_exec_desc * find_syscall(char *name, struct parasite_ctl *ctl) +struct syscall_exec_desc *find_syscall(char *name, struct parasite_ctl *ctl) { if 
(compel_mode_native(ctl)) return find_syscall_table(name, sc_exec_table_64); @@ -36,8 +34,7 @@ struct syscall_exec_desc * find_syscall(char *name, struct parasite_ctl *ctl) return find_syscall_table(name, sc_exec_table_32); } #else -struct syscall_exec_desc * -find_syscall(char *name, __always_unused struct parasite_ctl *ctl) +struct syscall_exec_desc *find_syscall(char *name, __always_unused struct parasite_ctl *ctl) { return find_syscall_table(name, sc_exec_table_64); } diff --git a/criu/arch/x86/vdso-pie.c b/criu/arch/x86/vdso-pie.c index 988cf0869..6b4b82b02 100644 --- a/criu/arch/x86/vdso-pie.c +++ b/criu/arch/x86/vdso-pie.c @@ -9,22 +9,22 @@ #include "common/bug.h" #ifdef LOG_PREFIX -# undef LOG_PREFIX +#undef LOG_PREFIX #endif #define LOG_PREFIX "vdso: " static void insert_trampoline32(uintptr_t from, uintptr_t to) { struct { - u8 movl; - u32 imm32; - u16 jmp_eax; - u32 guards; + u8 movl; + u32 imm32; + u16 jmp_eax; + u32 guards; } __packed jmp = { - .movl = 0xb8, - .imm32 = (uint32_t)to, - .jmp_eax = 0xe0ff, - .guards = 0xcccccccc, + .movl = 0xb8, + .imm32 = (uint32_t)to, + .jmp_eax = 0xe0ff, + .guards = 0xcccccccc, }; memcpy((void *)from, &jmp, sizeof(jmp)); @@ -33,23 +33,22 @@ static void insert_trampoline32(uintptr_t from, uintptr_t to) static void insert_trampoline64(uintptr_t from, uintptr_t to) { struct { - u16 movabs; - u64 imm64; - u16 jmp_rax; - u32 guards; + u16 movabs; + u64 imm64; + u16 jmp_rax; + u32 guards; } __packed jmp = { - .movabs = 0xb848, - .imm64 = to, - .jmp_rax = 0xe0ff, - .guards = 0xcccccccc, + .movabs = 0xb848, + .imm64 = to, + .jmp_rax = 0xe0ff, + .guards = 0xcccccccc, }; memcpy((void *)from, &jmp, sizeof(jmp)); } -int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, - struct vdso_symtable *sto, struct vdso_symtable *sfrom, - bool compat_vdso) +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) { unsigned int i; @@ 
-59,9 +58,8 @@ int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, if (vdso_symbol_empty(&sfrom->symbols[i])) continue; - pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", - base_from, sfrom->symbols[i].offset, - base_to, sto->symbols[i].offset, i); + pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); from = base_from + sfrom->symbols[i].offset; to = base_to + sto->symbols[i].offset; diff --git a/criu/autofs.c b/criu/autofs.c index a2dc60ffc..a1775cbc9 100644 --- a/criu/autofs.c +++ b/criu/autofs.c @@ -18,13 +18,13 @@ #include "images/autofs.pb-c.h" -#define AUTOFS_OPT_UNKNOWN INT_MIN +#define AUTOFS_OPT_UNKNOWN INT_MIN -#define AUTOFS_MODE_DIRECT 0 -#define AUTOFS_MODE_INDIRECT 1 -#define AUTOFS_MODE_OFFSET 2 +#define AUTOFS_MODE_DIRECT 0 +#define AUTOFS_MODE_INDIRECT 1 +#define AUTOFS_MODE_OFFSET 2 -#define AUTOFS_CATATONIC_FD -1 +#define AUTOFS_CATATONIC_FD -1 static int autofs_mnt_open(const char *mnt_path, dev_t devid); @@ -62,29 +62,56 @@ int autofs_parse(struct mount_info *pm) { long pipe_ino = AUTOFS_OPT_UNKNOWN; char **opts; - int nr_opts, i; + int nr_opts, i, ret; split(pm->options, ',', &opts, &nr_opts); if (!opts) return -1; + for (i = 0; i < nr_opts; i++) { if (!strncmp(opts[i], "pipe_ino=", strlen("pipe_ino="))) - pipe_ino = atoi(opts[i] + strlen("pipe_ino=")); + if (xatol(opts[i] + strlen("pipe_ino="), &pipe_ino)) { + pr_err("pipe_ino (%s) mount option parse failed\n", opts[i] + strlen("pipe_ino=")); + ret = -1; + goto free; + } + } + + /* + * We must inform user about bug if pipe_ino is greater than UINT32_MAX, + * because it means that something changed in Linux Kernel virtual fs + * inode numbers generation mechanism. What we have at the moment: + * 1. struct inode i_ino field (include/linux/fs.h in Linux kernel) + * has unsigned long type. + * 2. 
get_next_ino() function (fs/inode.c), that used for generating inode + * numbers on virtual filesystems (pipefs, debugfs for instance) + * has unsigned int as return type. + * So, it means that ATM it is safe to keep uint32 type for pipe_id field + * in pipe-data.proto. + */ + if (pipe_ino > UINT32_MAX) { + pr_err("overflow: pipe_ino > UINT32_MAX\n"); + ret = -1; + goto free; } - for (i = 0; i < nr_opts; i++) - xfree(opts[i]); - free(opts); if (pipe_ino == AUTOFS_OPT_UNKNOWN) { pr_warn("Failed to find pipe_ino option (old kernel?)\n"); - return 0; + ret = 0; + goto free; } - return autofs_gather_pipe(pipe_ino); + ret = autofs_gather_pipe(pipe_ino); + +free: + for (i = 0; i < nr_opts; i++) + xfree(opts[i]); + xfree(opts); + + return ret; } -static int autofs_check_fd_stat(struct stat *stat, int prgp, int fd, - long ino, int *mode) +static int autofs_check_fd_stat(struct stat *stat, int prgp, int fd, long ino, int *mode) { struct fdinfo_common fdinfo; @@ -115,6 +142,7 @@ static int autofs_kernel_pipe_alive(int pgrp, int fd, int ino) return 0; } pr_perror("Failed to stat %s", path); + xfree(path); return -1; } @@ -180,13 +208,15 @@ static int autofs_find_read_fd(int pgrp, long pipe_ino) /* We need to find read end and make sure, that it's empty */ if (autofs_find_pipe_read_end(pgrp, pipe_ino, &read_fd) < 0) { pr_err("Failed to find read pipe fd (ino %ld) " - "in process %d\n", pipe_ino, pgrp); + "in process %d\n", + pipe_ino, pgrp); return -1; } if (read_fd == -1) { pr_err("Master %d doesn't have a read end of the pipe with " - "inode %ld opened\n", pgrp, pipe_ino); + "inode %ld opened\n", + pgrp, pipe_ino); pr_err("Abandoned mount or control was delegated to child?\n"); return -ENOENT; } @@ -197,8 +227,7 @@ static int autofs_find_read_fd(int pgrp, long pipe_ino) return -1; if (fd_has_data(fd)) { - pr_err("Process %d autofs pipe fd %d is not empty.\n", pgrp, - read_fd); + pr_err("Process %d autofs pipe fd %d is not empty.\n", pgrp, read_fd); pr_err("Try again 
later.\n"); return -1; } @@ -301,11 +330,11 @@ static int parse_options(char *options, AutofsEntry *entry, long *pipe_ino) static int autofs_revisit_options(struct mount_info *pm) { FILE *f; - char *str; + char *buf; int ret = -ENOMEM; - str = xmalloc(1024); - if (!str) { + buf = xmalloc(1024); + if (!buf) { return -ENOMEM; } @@ -313,12 +342,13 @@ static int autofs_revisit_options(struct mount_info *pm) if (!f) goto free_str; - while (fgets(str, 1024, f)) { + while (fgets(buf, 1024, f)) { int mnt_id = -1; + char *str = buf; char *token; /* Removing '/n' */ - str[strlen(str)-1] = '\0'; + str[strlen(str) - 1] = '\0'; while ((token = strsep(&str, " ")) != NULL) { if (mnt_id == -1) { @@ -347,7 +377,7 @@ static int autofs_revisit_options(struct mount_info *pm) close_proc: fclose(f); free_str: - free(str); + free(buf); return ret; } @@ -358,10 +388,10 @@ free_str: */ static int access_autofs_mount(struct mount_info *pm) { - const char *mnt_path = pm->mountpoint + 1; + const char *mnt_path = service_mountpoint(pm) + 1; dev_t dev_id = pm->s_dev; int new_pid_ns = -1, old_pid_ns = -1; - int old_mnt_ns; + int old_mnt_ns, old_cwd_fd; int autofs_mnt; int err = -1; int pid, status; @@ -379,7 +409,7 @@ static int access_autofs_mount(struct mount_info *pm) if (old_pid_ns < 0) goto close_new_pid_ns; - if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &old_mnt_ns)) { + if (switch_mnt_ns(pm->nsid->ns_pid, &old_mnt_ns, &old_cwd_fd)) { pr_err("failed to switch to mount namespace\n"); goto close_old_pid_ns; } @@ -397,15 +427,14 @@ static int access_autofs_mount(struct mount_info *pm) pid = fork(); switch (pid) { - case -1: - pr_err("failed to fork\n"); - goto close_autofs_mnt; - case 0: - /* We don't care about results. 
- * All we need is to "touch" */ - openat(autofs_mnt, mnt_path, O_RDONLY|O_NONBLOCK|O_DIRECTORY); - _exit(0); - + case -1: + pr_err("failed to fork\n"); + goto close_autofs_mnt; + case 0: + /* We don't care about results, all we need is to "touch" */ + /* coverity[check_return] */ + openat(autofs_mnt, mnt_path, O_RDONLY | O_NONBLOCK | O_DIRECTORY); + _exit(0); } /* Here we also don't care about results */ waitpid(pid, &status, 0); @@ -421,7 +450,7 @@ restore_pid_ns: } old_pid_ns = -1; restore_mnt_ns: - if (restore_ns(old_mnt_ns, &mnt_ns_desc)) { + if (restore_mnt_ns(old_mnt_ns, &old_cwd_fd)) { pr_err("failed to restore mount namespace\n"); err = -1; } @@ -468,24 +497,21 @@ static int autofs_create_entry(struct mount_info *pm, AutofsEntry *entry) * options, then we can read them again and dump it. */ if (access_autofs_mount(pm)) { - pr_err("failed to access autofs %s\n", - pm->mountpoint + 1); + pr_err("failed to access autofs %s\n", service_mountpoint(pm) + 1); return -1; } if (parse_options(pm->options, entry, &pipe_ino)) return -1; if (entry->fd == AUTOFS_CATATONIC_FD) return 0; - pr_err("Autofs %d is alive, but unreachable.\n", - pm->mnt_id); + pr_err("Autofs %d is alive, but unreachable.\n", pm->mnt_id); return -1; } /* Let' check whether write end is still open */ found = autofs_kernel_pipe_alive(entry->pgrp, entry->fd, pipe_ino); if (found < 0) { - pr_err("Failed to check fd %d in process %d\n", - entry->fd, entry->pgrp); + pr_err("Failed to check fd %d in process %d\n", entry->fd, entry->pgrp); return -1; } /* Write end is absent. we need to carry read end to restore. 
*/ @@ -497,8 +523,7 @@ static int autofs_create_entry(struct mount_info *pm, AutofsEntry *entry) /* We need to get virtual pgrp to restore mount */ virt_pgrp = pid_to_virt(entry->pgrp); if (!virt_pgrp) { - pr_err("failed to find pstree item with pid %d\n", - entry->pgrp); + pr_err("failed to find pstree item with pid %d\n", entry->pgrp); pr_err("Non-catatonic mount without master?\n"); return -1; } @@ -520,7 +545,6 @@ static int autofs_dump_entry(struct mount_info *pm, AutofsEntry *entry) return ret; } - int autofs_dump(struct mount_info *pm) { AutofsEntry *entry; @@ -551,8 +575,7 @@ typedef struct autofs_info_s { struct pprep_head ph; } autofs_info_t; -static int dup_pipe_info(struct pipe_info *pi, int flags, - struct file_desc_ops *ops) +static int dup_pipe_info(struct pipe_info *pi, int flags, struct file_desc_ops *ops) { struct pipe_info *new; PipeEntry *pe; @@ -578,9 +601,7 @@ static int dup_pipe_info(struct pipe_info *pi, int flags, return 0; } -static int autofs_dup_pipe(struct pstree_item *task, - struct fdinfo_list_entry *ple, - int new_fd) +static int autofs_dup_pipe(struct pstree_item *task, struct fdinfo_list_entry *ple, int new_fd) { struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); unsigned flags = O_WRONLY; @@ -588,23 +609,19 @@ static int autofs_dup_pipe(struct pstree_item *task, new_fd = find_unused_fd(task, new_fd); if (dup_pipe_info(pi, flags, pi->d.ops) < 0) { - pr_err("Failed to dup pipe entry ID %#x PIPE_ID %#x\n", - pi->pe->id, pi->pe->pipe_id); + pr_err("Failed to dup pipe entry ID %#x PIPE_ID %#x\n", pi->pe->id, pi->pe->pipe_id); return -1; } if (dup_fle(task, ple, new_fd, flags) < 0) { - pr_err("Failed to add fd %d to process %d\n", - new_fd, vpid(task)); + pr_err("Failed to add fd %d to process %d\n", new_fd, vpid(task)); return -1; } - pr_info("autofs: added pipe fd %d, flags %#x to %d\n", - new_fd, flags, vpid(task)); + pr_info("autofs: added pipe fd %d, flags %#x to %d\n", new_fd, flags, vpid(task)); return new_fd; } 
- static int autofs_ioctl(const char *path, int fd, int cmd, const void *param) { int err; @@ -618,7 +635,7 @@ static int autofs_ioctl(const char *path, int fd, int cmd, const void *param) static int autofs_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { - char *path = "/dev/"AUTOFS_DEVICE_NAME; + char *path = "/dev/" AUTOFS_DEVICE_NAME; int fd, err; fd = open(path, O_RDONLY); @@ -639,10 +656,9 @@ static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_CATATONIC, NULL); } -static int autofs_mnt_set_timeout(time_t timeout, - const char *mnt_path, int mnt_fd) +static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { - pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); + pr_info("%s: set timeout %" PRId64 " for %s\n", __func__, (int64_t)timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } @@ -654,8 +670,7 @@ static int autofs_mnt_set_pipefd(const autofs_info_t *i, int mnt_fd) if (i->entry->fd == AUTOFS_CATATONIC_FD) return 0; - pr_info("%s: set pipe fd %d (pgrp %d) for mount %s\n", __func__, - i->entry->fd, getpgrp(), i->mnt_path); + pr_info("%s: set pipe fd %d (pgrp %d) for mount %s\n", __func__, i->entry->fd, getpgrp(), i->mnt_path); init_autofs_dev_ioctl(¶m); param.ioctlfd = mnt_fd; @@ -668,8 +683,7 @@ static int autofs_mnt_close(const char *mnt_path, int mnt_fd) { struct autofs_dev_ioctl param; - pr_info("%s: closing fd %d for mount %s\n", __func__, mnt_fd, - mnt_path); + pr_info("%s: closing fd %d for mount %s\n", __func__, mnt_fd, mnt_path); init_autofs_dev_ioctl(¶m); param.ioctlfd = mnt_fd; @@ -699,8 +713,7 @@ static int autofs_mnt_open(const char *mnt_path, dev_t devid) fd = param->ioctlfd; free(param); if (err < 0) { - pr_err("Failed to get %s fd (devid: %ld)\n", - mnt_path, (long)devid); + pr_err("Failed to get %s fd (devid: %ld)\n", mnt_path, (long)devid); return -1; } return fd; @@ -711,15 +724,19 @@ 
static int autofs_create_dentries(const struct mount_info *mi, char *mnt_path) struct mount_info *c; list_for_each_entry(c, &mi->children, siblings) { - char *path, *basename; + char *path, *rel_path; - basename = strrchr(c->mountpoint, '/'); - if (!basename) { - pr_info("%s: mount path \"%s\" doesn't have '/'\n", - __func__, c->mountpoint); + rel_path = get_relative_path(c->ns_mountpoint, mi->ns_mountpoint); + if (!rel_path) { + pr_err("Can't get path %s relative to %s\n", c->ns_mountpoint, mi->ns_mountpoint); return -1; } - path = xsprintf("%s%s", mnt_path, basename); + + /* Skip children-overmount */ + if (*rel_path == '\0') + continue; + + path = xsprintf("%s/%s", mnt_path, rel_path); if (!path) return -1; if (mkdir(path, 0555) < 0) { @@ -732,22 +749,19 @@ static int autofs_create_dentries(const struct mount_info *mi, char *mnt_path) return 0; } -static int autofs_populate_mount(const struct mount_info *mi, - const AutofsEntry *entry) +static int autofs_populate_mount(const struct mount_info *mi, const AutofsEntry *entry) { if (entry->mode != AUTOFS_MODE_INDIRECT) return 0; - return autofs_create_dentries(mi, mi->mountpoint); + return autofs_create_dentries(mi, service_mountpoint(mi)); } -static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, - time_t timeout) +static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout) { int mnt_fd; - pr_info("%s: set timeout for %s and make it catatonic\n", - __func__, mnt_path); + pr_info("%s: set timeout for %s and make it catatonic\n", __func__, mnt_path); mnt_fd = autofs_mnt_open(mnt_path, mnt_dev); if (mnt_fd < 0) { @@ -756,8 +770,7 @@ static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { - pr_err("Failed to set timeout %ld for %s\n", - timeout, mnt_path); + pr_err("Failed to set timeout %" PRId64 " for %s\n", (int64_t)timeout, mnt_path); return -1; } @@ -799,23 +812,19 @@ static int autofs_post_open(struct file_desc 
*d, int fd) return -1; } - pr_info("autofs mount %s owner restored: pgrp=%d, fd=%d\n", - i->mnt_path, getpgrp(), i->entry->fd); + pr_info("autofs mount %s owner restored: pgrp=%d, fd=%d\n", i->mnt_path, getpgrp(), i->entry->fd); if (i->entry->has_read_fd) { - pr_info("%s: pid %d, closing write end %d\n", __func__, - getpid(), i->entry->fd); + pr_info("%s: pid %d, closing write end %d\n", __func__, getpid(), i->entry->fd); close(i->entry->fd); } - pr_info("%s: pid %d, closing artificial pipe end %d\n", __func__, - getpid(), fd); + pr_info("%s: pid %d, closing artificial pipe end %d\n", __func__, getpid(), fd); close(fd); return 0; } -static autofs_info_t *autofs_create_info(const struct mount_info *mi, - const struct file_desc *desc, +static autofs_info_t *autofs_create_info(const struct mount_info *mi, const struct file_desc *desc, const autofs_info_t *info) { autofs_info_t *i; @@ -845,8 +854,7 @@ static autofs_info_t *autofs_create_info(const struct mount_info *mi, return i; } -static struct fdinfo_list_entry *autofs_pipe_le(struct pstree_item *master, - AutofsEntry *entry) +static struct fdinfo_list_entry *autofs_pipe_le(struct pstree_item *master, AutofsEntry *entry) { struct fdinfo_list_entry *ple; int pipe_fd = entry->fd; @@ -856,13 +864,11 @@ static struct fdinfo_list_entry *autofs_pipe_le(struct pstree_item *master, ple = find_used_fd(master, pipe_fd); if (!ple) { - pr_err("Failed to find pipe fd %d in process %d\n", - pipe_fd, vpid(master)); + pr_err("Failed to find pipe fd %d in process %d\n", pipe_fd, vpid(master)); return NULL; } if (ple->fe->type != FD_TYPES__PIPE) { - pr_err("Fd %d in process %d is not a pipe: %d\n", pipe_fd, - vpid(master), ple->fe->type); + pr_err("Fd %d in process %d is not a pipe: %d\n", pipe_fd, vpid(master), ple->fe->type); return NULL; } return ple; @@ -884,8 +890,7 @@ static int autofs_open_pipefd(struct file_desc *d, int *new_fd) return autofs_post_open(d, fle->fe->fd); } -static int autofs_create_pipe(struct pstree_item 
*task, autofs_info_t *i, - struct fdinfo_list_entry *ple) +static int autofs_create_pipe(struct pstree_item *task, autofs_info_t *i, struct fdinfo_list_entry *ple) { struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); int fd = -1; @@ -922,8 +927,7 @@ static int autofs_create_pipe(struct pstree_item *task, autofs_info_t *i, return -1; fe->type = FD_TYPES__AUTOFS_PIPE; - pr_info("autofs: adding pipe fd %d, flags %#x to %d (with post_open)\n", - fe->fd, fe->flags, vpid(task)); + pr_info("autofs: adding pipe fd %d, flags %#x to %d (with post_open)\n", fe->fd, fe->flags, vpid(task)); return collect_fd(vpid(task), fe, rsti(task), false); } @@ -954,8 +958,7 @@ static int autofs_add_mount_info(struct pprep_head *ph) entry->fd = autofs_dup_pipe(master, ple, entry->fd); if (entry->fd < 0) { - pr_err("Failed to find free fd in process %d\n", - vpid(master)); + pr_err("Failed to find free fd in process %d\n", vpid(master)); return -1; } } @@ -994,8 +997,7 @@ static int autofs_restore_entry(struct mount_info *mi, AutofsEntry **entry) return 0; } -int autofs_mount(struct mount_info *mi, const char *source, const - char *filesystemtype, unsigned long mountflags) +int autofs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags) { AutofsEntry *entry; autofs_info_t *info; @@ -1017,8 +1019,7 @@ int autofs_mount(struct mount_info *mi, const char *source, const if (entry->mode == AUTOFS_MODE_OFFSET) mode = "offset"; - opts = xsprintf("fd=%d,pgrp=%d,minproto=%d,maxproto=%d,%s", - control_pipe[1], getpgrp(), entry->minproto, + opts = xsprintf("fd=%d,pgrp=%d,minproto=%d,maxproto=%d,%s", control_pipe[1], getpgrp(), entry->minproto, entry->maxproto, mode); if (opts && entry->has_uid) opts = xstrcat(opts, ",uid=%d", entry->uid); @@ -1029,11 +1030,10 @@ int autofs_mount(struct mount_info *mi, const char *source, const goto close_pipe; } - pr_info("autofs: mounting to %s with options: \"%s\"\n", - mi->mountpoint, opts); + 
pr_info("autofs: mounting to %s with options: \"%s\"\n", service_mountpoint(mi), opts); - if (mount(source, mi->mountpoint, filesystemtype, mountflags, opts) < 0) { - pr_perror("Failed to mount autofs to %s", mi->mountpoint); + if (mount(source, service_mountpoint(mi), filesystemtype, mountflags, opts) < 0) { + pr_perror("Failed to mount autofs to %s", service_mountpoint(mi)); goto free_opts; } @@ -1048,8 +1048,8 @@ int autofs_mount(struct mount_info *mi, const char *source, const * data is not ready yet. So, let's put in on mi->private and copy to * shared data in autofs_add_mount_info(). */ - if (stat(mi->mountpoint, &buf) < 0) { - pr_perror("Failed to stat %s", mi->mountpoint); + if (stat(service_mountpoint(mi), &buf) < 0) { + pr_perror("Failed to stat %s", service_mountpoint(mi)); goto free_info; } info->mnt_dev = buf.st_dev; @@ -1060,7 +1060,7 @@ int autofs_mount(struct mount_info *mi, const char *source, const goto free_info; /* In case of catatonic mounts all we need as the function call below */ - ret = autofs_post_mount(mi->mountpoint, buf.st_dev, entry->timeout); + ret = autofs_post_mount(service_mountpoint(mi), buf.st_dev, entry->timeout); if (ret < 0) goto free_info; @@ -1083,8 +1083,7 @@ close_pipe: free_info: free(info); umount: - if (umount(mi->mountpoint) < 0) - pr_perror("Failed to umount %s", mi->mountpoint); + if (umount(service_mountpoint(mi)) < 0) + pr_perror("Failed to umount %s", service_mountpoint(mi)); goto close_pipe; } - diff --git a/criu/bfd.c b/criu/bfd.c index 05824551b..2c5f5b64a 100644 --- a/criu/bfd.c +++ b/criu/bfd.c @@ -16,14 +16,14 @@ #include "xmalloc.h" #include "page.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "bfd: " /* * Kernel doesn't produce more than one page of * date per one read call on proc files. 
*/ -#define BUFSIZE (PAGE_SIZE) +#define BUFSIZE (PAGE_SIZE) struct bfd_buf { char *mem; @@ -32,7 +32,7 @@ struct bfd_buf { static LIST_HEAD(bufs); -#define BUFBATCH (16) +#define BUFBATCH (16) static int buf_get(struct xbuf *xb) { @@ -42,8 +42,7 @@ static int buf_get(struct xbuf *xb) void *mem; int i; - mem = mmap(NULL, BUFBATCH * BUFSIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + mem = mmap(NULL, BUFBATCH * BUFSIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (mem == MAP_FAILED) { pr_perror("No buf"); return -1; @@ -145,7 +144,7 @@ static int brefill(struct bfd *f) memmove(b->mem, b->data, b->sz); b->data = b->mem; - ret = read(f->fd, b->mem + b->sz, BUFSIZE - b->sz); + ret = read_all(f->fd, b->mem + b->sz, BUFSIZE - b->sz); if (ret < 0) { pr_perror("Error reading file"); return -1; @@ -198,8 +197,7 @@ again: if (b->sz == BUFSIZE) { pr_err("The bfd buffer is too small\n"); - ERR_PTR(-EIO); - return NULL; + return ERR_PTR(-EIO); } /* * Last bytes may lack the \n at the @@ -242,7 +240,7 @@ static int bflush(struct bfd *bfd) if (!b->sz) return 0; - ret = write(bfd->fd, b->data, b->sz); + ret = write_all(bfd->fd, b->data, b->sz); if (ret != b->sz) return -1; @@ -262,7 +260,7 @@ static int __bwrite(struct bfd *bfd, const void *buf, int size) } if (size > BUFSIZE) - return write(bfd->fd, buf, size); + return write_all(bfd->fd, buf, size); memcpy(b->data + b->sz, buf, size); b->sz += size; @@ -272,7 +270,7 @@ static int __bwrite(struct bfd *bfd, const void *buf, int size) int bwrite(struct bfd *bfd, const void *buf, int size) { if (!bfd_buffered(bfd)) - return write(bfd->fd, buf, size); + return write_all(bfd->fd, buf, size); return __bwrite(bfd, buf, size); } @@ -281,8 +279,13 @@ int bwritev(struct bfd *bfd, const struct iovec *iov, int cnt) { int i, written = 0; - if (!bfd_buffered(bfd)) + if (!bfd_buffered(bfd)) { + /* + * FIXME writev() should be called again if writev() writes + * less bytes than requested. 
+ */ return writev(bfd->fd, iov, cnt); + } for (i = 0; i < cnt; i++) { int ret; @@ -305,7 +308,7 @@ int bread(struct bfd *bfd, void *buf, int size) int more = 1, filled = 0; if (!bfd_buffered(bfd)) - return read(bfd->fd, buf, size); + return read_all(bfd->fd, buf, size); while (more > 0) { int chunk; diff --git a/criu/bitmap.c b/criu/bitmap.c index a28a89d8d..d81c93409 100644 --- a/criu/bitmap.c +++ b/criu/bitmap.c @@ -1,17 +1,12 @@ #include "common/bitsperlong.h" -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define BITMAP_FIRST_WORD_MASK(start) (~0ul << ((start) % BITS_PER_LONG)) -#define BITMAP_LAST_WORD_MASK(nbits) \ -( \ - ((nbits) % BITS_PER_LONG) ? \ - (1ul << ((nbits) % BITS_PER_LONG)) - 1 : ~0ul \ -) +#define BITMAP_LAST_WORD_MASK(nbits) (((nbits) % BITS_PER_LONG) ? (1ul << ((nbits) % BITS_PER_LONG)) - 1 : ~0ul) -#define small_const_nbits(nbits) \ - (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) +#define small_const_nbits(nbits) (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) void bitmap_set(unsigned long *map, int start, int nr) { diff --git a/criu/bpfmap.c b/criu/bpfmap.c new file mode 100644 index 000000000..25098368d --- /dev/null +++ b/criu/bpfmap.c @@ -0,0 +1,365 @@ +#include +#include + +#include "common/compiler.h" +#include "imgset.h" +#include "bpfmap.h" +#include "fdinfo.h" +#include "image.h" +#include "util.h" +#include "log.h" + +#include "protobuf.h" + +#ifndef LIBBPF_OPTS +#define LIBBPF_OPTS DECLARE_LIBBPF_OPTS +#define LEGACY_LIBBPF /* Using libbpf < 0.7 */ +#endif + +int is_bpfmap_link(char *link) +{ + return is_anon_link_type(link, "bpf-map"); +} + +static void pr_info_bpfmap(char *action, BpfmapFileEntry *bpf) +{ + pr_info("%sbpfmap: id %#08x map_id %#08x map_type %d flags %" PRIx32 "\n", action, bpf->id, bpf->map_id, + bpf->map_type, bpf->map_flags); +} + +struct bpfmap_data_rst *bpfmap_data_hash_table[BPFMAP_DATA_TABLE_SIZE]; + +static int bpfmap_data_read(struct cr_img 
*img, struct bpfmap_data_rst *r) +{ + unsigned long bytes = r->bde->keys_bytes + r->bde->values_bytes; + if (!bytes) + return 0; + + r->data = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (r->data == MAP_FAILED) { + pr_perror("Can't map mem for bpfmap buffers"); + return -1; + } + + return read_img_buf(img, r->data, bytes); +} + +int do_collect_bpfmap_data(struct bpfmap_data_rst *r, ProtobufCMessage *msg, struct cr_img *img, + struct bpfmap_data_rst **bpf_hash_table) +{ + int ret; + int table_index; + + r->bde = pb_msg(msg, BpfmapDataEntry); + ret = bpfmap_data_read(img, r); + if (ret < 0) + return ret; + + table_index = r->bde->map_id & BPFMAP_DATA_HASH_MASK; + r->next = bpf_hash_table[table_index]; + bpf_hash_table[table_index] = r; + + pr_info("Collected bpfmap data for %#x\n", r->bde->map_id); + return 0; +} + +int restore_bpfmap_data(int map_fd, uint32_t map_id, struct bpfmap_data_rst **bpf_hash_table) +{ + struct bpfmap_data_rst *map_data; + BpfmapDataEntry *bde; + void *keys = NULL; + void *values = NULL; + unsigned int count; + LIBBPF_OPTS(bpf_map_batch_opts, opts); + + for (map_data = bpf_hash_table[map_id & BPFMAP_DATA_HASH_MASK]; map_data != NULL; map_data = map_data->next) { + if (map_data->bde->map_id == map_id) + break; + } + + if (!map_data || map_data->bde->count == 0) { + pr_info("No data for BPF map %#x\n", map_id); + return 0; + } + + bde = map_data->bde; + count = bde->count; + + keys = mmap(NULL, bde->keys_bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (keys == MAP_FAILED) { + pr_perror("Can't map memory for BPF map keys"); + goto err; + } + memcpy(keys, map_data->data, bde->keys_bytes); + + values = mmap(NULL, bde->values_bytes, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (values == MAP_FAILED) { + pr_perror("Can't map memory for BPF map values"); + goto err; + } + memcpy(values, map_data->data + bde->keys_bytes, bde->values_bytes); + + if 
(bpf_map_update_batch(map_fd, keys, values, &count, &opts)) { + pr_perror("Can't load key-value pairs to BPF map"); + goto err; + } + munmap(keys, bde->keys_bytes); + munmap(values, bde->values_bytes); + return 0; + +err: + munmap(keys, bde->keys_bytes); + munmap(values, bde->values_bytes); + return -1; +} + +static int collect_bpfmap_data(void *obj, ProtobufCMessage *msg, struct cr_img *img) +{ + return do_collect_bpfmap_data(obj, msg, img, bpfmap_data_hash_table); +} + +struct collect_image_info bpfmap_data_cinfo = { + .fd_type = CR_FD_BPFMAP_DATA, + .pb_type = PB_BPFMAP_DATA, + .priv_size = sizeof(struct bpfmap_data_rst), + .collect = collect_bpfmap_data, +}; + +int dump_one_bpfmap_data(BpfmapFileEntry *bpf, int lfd, const struct fd_parms *p) +{ + /* + * Linux kernel patch notes for bpf_map_*_batch(): + * + * in_batch/out_batch are opaque values use to communicate between + * user/kernel space, in_batch/out_batch must be of key_size length. + * To start iterating from the beginning in_batch must be null, + * count is the # of key/value elements to retrieve. Note that the 'keys' + * buffer must be a buffer of key_size * count size and the 'values' buffer + * must be value_size * count, where value_size must be aligned to 8 bytes + * by userspace if it's dealing with percpu maps. 'count' will contain the + * number of keys/values successfully retrieved. Note that 'count' is an + * input/output variable and it can contain a lower value after a call. + * + * If there's no more entries to retrieve, ENOENT will be returned. If error + * is ENOENT, count might be > 0 in case it copied some values but there were + * no more entries to retrieve. + * + * Note that if the return code is an error and not -EFAULT, + * count indicates the number of elements successfully processed. 
+ */ + + struct cr_img *img; + uint32_t key_size, value_size, max_entries, count; + void *keys = NULL, *values = NULL; + void *in_batch = NULL, *out_batch = NULL; + BpfmapDataEntry bde = BPFMAP_DATA_ENTRY__INIT; + LIBBPF_OPTS(bpf_map_batch_opts, opts); + int ret; + + key_size = bpf->key_size; + value_size = bpf->value_size; + max_entries = bpf->max_entries; + count = max_entries; + + keys = mmap(NULL, key_size * max_entries, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (keys == MAP_FAILED) { + pr_perror("Can't map memory for BPF map keys"); + goto err; + } + + values = mmap(NULL, value_size * max_entries, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (values == MAP_FAILED) { + pr_perror("Can't map memory for BPF map values"); + goto err; + } + + out_batch = mmap(NULL, key_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (out_batch == MAP_FAILED) { + pr_perror("Can't map memory for BPF map out_batch"); + goto err; + } + + ret = bpf_map_lookup_batch(lfd, in_batch, out_batch, keys, values, &count, &opts); + if (ret && errno != ENOENT) { + pr_perror("Can't perform a batch lookup on BPF map"); + goto err; + } + + img = img_from_set(glob_imgset, CR_FD_BPFMAP_DATA); + + bde.map_id = bpf->map_id; + bde.keys_bytes = (key_size * count); + bde.values_bytes = (value_size * count); + bde.count = count; + + if (pb_write_one(img, &bde, PB_BPFMAP_DATA)) + goto err; + + if (write(img_raw_fd(img), keys, key_size * count) != (key_size * count)) { + pr_perror("Can't write BPF map's keys"); + goto err; + } + if (write(img_raw_fd(img), values, value_size * count) != (value_size * count)) { + pr_perror("Can't write BPF map's values"); + goto err; + } + + munmap(keys, key_size * max_entries); + munmap(values, value_size * max_entries); + munmap(out_batch, key_size); + return 0; + +err: + munmap(keys, key_size * max_entries); + munmap(values, value_size * max_entries); + munmap(out_batch, key_size); + return -1; +} + +static int 
dump_one_bpfmap(int lfd, u32 id, const struct fd_parms *p) +{ + BpfmapFileEntry bpf = BPFMAP_FILE_ENTRY__INIT; + FileEntry fe = FILE_ENTRY__INIT; + int ret; + /* If we are using a bigger struct than the kernel knows of, + * ensure all the unknown bits are 0 - i.e. new user-space + * does not rely on any unknown kernel feature extensions. + * https://github.com/torvalds/linux/blob/a1994480/kernel/bpf/syscall.c#L70 + */ + struct bpf_map_info map_info = {}; + uint32_t info_len = sizeof(struct bpf_map_info); + + if (parse_fdinfo(lfd, FD_TYPES__BPFMAP, &bpf)) + return -1; + + ret = bpf_obj_get_info_by_fd(lfd, &map_info, &info_len); + if (ret) { + pr_perror("Could not get BPF map info"); + return -1; + } + + switch (bpf.map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_ARRAY: + bpf.id = id; + bpf.flags = p->flags; + bpf.fown = (FownEntry *)&p->fown; + bpf.map_name = xstrdup(map_info.name); + bpf.ifindex = map_info.ifindex; + + fe.type = FD_TYPES__BPFMAP; + fe.id = bpf.id; + fe.bpf = &bpf; + + pr_info_bpfmap("Dumping ", &bpf); + if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) + return -1; + pr_info_bpfmap("Dumping data for ", &bpf); + ret = dump_one_bpfmap_data(&bpf, lfd, p); + break; + + default: + pr_err("CRIU does not currently support dumping BPF map type %u!\n", bpf.map_type); + ret = -1; + } + + return ret; +} + +const struct fdtype_ops bpfmap_dump_ops = { + .type = FD_TYPES__BPFMAP, + .dump = dump_one_bpfmap, +}; + +static int bpfmap_open(struct file_desc *d, int *new_fd) +{ + struct bpfmap_file_info *info; + BpfmapFileEntry *bpfe; + int bpfmap_fd; +#ifdef LEGACY_LIBBPF + struct bpf_create_map_attr xattr; +#else + LIBBPF_OPTS(bpf_map_create_opts, bpfmap_opts); +#endif + + info = container_of(d, struct bpfmap_file_info, d); + bpfe = info->bpfe; + + pr_info_bpfmap("Creating and opening ", bpfe); + +#ifdef LEGACY_LIBBPF + xattr.name = xstrdup(bpfe->map_name); + xattr.map_type = bpfe->map_type; + xattr.map_flags = bpfe->map_flags; + 
xattr.key_size = bpfe->key_size; + xattr.value_size = bpfe->value_size; + xattr.max_entries = bpfe->max_entries; + xattr.numa_node = 0; + xattr.btf_fd = 0; + xattr.btf_key_type_id = 0; + xattr.btf_value_type_id = 0; + xattr.map_ifindex = bpfe->ifindex; + xattr.inner_map_fd = 0; + + bpfmap_fd = bpf_create_map_xattr(&xattr); +#else + bpfmap_opts.map_flags = bpfe->map_flags; + bpfmap_opts.map_ifindex = bpfe->ifindex; + if (bpfe->has_map_extra) + bpfmap_opts.map_extra = bpfe->map_extra; + + bpfmap_fd = bpf_map_create(bpfe->map_type, bpfe->map_name, bpfe->key_size, bpfe->value_size, bpfe->max_entries, + &bpfmap_opts); +#endif + + if (bpfmap_fd < 0) { + pr_perror("Can't create bpfmap %#08x", bpfe->id); + return -1; + } + + if (bpfe->has_map_extra && bpfe->map_extra) + pr_warn("bpfmap map_extra has non-zero value. This will not be restored.\n"); + + if (restore_bpfmap_data(bpfmap_fd, bpfe->map_id, bpfmap_data_hash_table)) + return -1; + + if (bpfe->frozen) { + if (bpf_map_freeze(bpfmap_fd)) { + pr_perror("Can't freeze bpfmap %#08x", bpfe->id); + goto err_close; + } + } + + if (rst_file_params(bpfmap_fd, bpfe->fown, bpfe->flags)) { + pr_perror("Can't restore params on bpfmap %#08x", bpfe->id); + goto err_close; + } + + *new_fd = bpfmap_fd; + return 0; + +err_close: + close(bpfmap_fd); + return -1; +} + +static struct file_desc_ops bpfmap_desc_ops = { + .type = FD_TYPES__BPFMAP, + .open = bpfmap_open, +}; + +static int collect_one_bpfmap(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct bpfmap_file_info *info = obj; + + info->bpfe = pb_msg(msg, BpfmapFileEntry); + pr_info_bpfmap("Collected ", info->bpfe); + return file_desc_add(&info->d, info->bpfe->id, &bpfmap_desc_ops); +} + +struct collect_image_info bpfmap_cinfo = { + .fd_type = CR_FD_BPFMAP_FILE, + .pb_type = PB_BPFMAP_FILE, + .priv_size = sizeof(struct bpfmap_file_info), + .collect = collect_one_bpfmap, +}; diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index ecd959352..1b85c5b5a 100644 --- 
a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -20,7 +20,7 @@ #include "log.h" #include "common/bug.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "cg-prop: " enum { @@ -35,15 +35,32 @@ static const char *____criu_global_props____[] = { "tasks", }; +/* cgroup2 global properties */ +// clang-format off +static const char *____criu_global_props_v2____[] = { + "cgroup.subtree_control", + "cgroup.max.descendants", + "cgroup.max.depth", + "cgroup.freeze", + "cgroup.type", +}; +// clang-format on + cgp_t cgp_global = { - .name = "____criu_global_props____", - .nr_props = ARRAY_SIZE(____criu_global_props____), - .props = ____criu_global_props____, + .name = "____criu_global_props____", + .nr_props = ARRAY_SIZE(____criu_global_props____), + .props = ____criu_global_props____, +}; + +cgp_t cgp_global_v2 = { + .name = "____criu_global_props_v2____", + .nr_props = ARRAY_SIZE(____criu_global_props_v2____), + .props = ____criu_global_props_v2____, }; typedef struct { - struct list_head list; - cgp_t cgp; + struct list_head list; + cgp_t cgp; } cgp_list_entry_t; static LIST_HEAD(cgp_list); @@ -91,9 +108,7 @@ static int cgp_handle_props(cgp_list_entry_t **p, int strategy) if (strcmp(t->cgp.name, s->cgp.name)) continue; - pr_debug("%s \"%s\" controller properties\n", - strategy == CGP_MERGE ? - "Merging" : "Replacing", + pr_debug("%s \"%s\" controller properties\n", strategy == CGP_MERGE ? 
"Merging" : "Replacing", s->cgp.name); if (strategy == CGP_MERGE) { @@ -258,21 +273,18 @@ static int cgp_parse_stream(char *stream, size_t len) } if (!eat_symbols(&stream, &len, ":\n - ", 5, true)) { - pr_err("Expected \':\\n - \' sequence controller's %s stream\n", - cgp_entry->cgp.name); + pr_err("Expected \':\\n - \' sequence controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_word(&stream, &len, "\"strategy\":", 11, true)) { - pr_err("Expected \'strategy:\' keyword in controller's %s stream\n", - cgp_entry->cgp.name); + pr_err("Expected \'strategy:\' keyword in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } p = get_quoted(&stream, &len, true); if (!p) { - pr_err("Expected strategy in controller's %s stream\n", - cgp_entry->cgp.name); + pr_err("Expected strategy in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; }; @@ -281,8 +293,7 @@ static int cgp_parse_stream(char *stream, size_t len) } else if (!strcmp(p, "replace")) { strategy = CGP_REPLACE; } else { - pr_err("Unknown strategy \"%s\" in controller's %s stream\n", - p, cgp_entry->cgp.name); + pr_err("Unknown strategy \"%s\" in controller's %s stream\n", p, cgp_entry->cgp.name); xfree(p); goto err_parse; } @@ -291,34 +302,29 @@ static int cgp_parse_stream(char *stream, size_t len) xfree(p); if (!eat_symbols(&stream, &len, "\n - ", 4, true)) { - pr_err("Expected \':\\n - \' sequence controller's %s stream\n", - cgp_entry->cgp.name); + pr_err("Expected \':\\n - \' sequence controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_word(&stream, &len, "\"properties\":", 13, true)) { - pr_err("Expected \"properties:\" keyword in controller's %s stream\n", - cgp_entry->cgp.name); + pr_err("Expected \"properties:\" keyword in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } if (!eat_symbol(&stream, &len, '[', true)) { - pr_err("Expected \'[\' sequence controller's %s properties stream\n", - cgp_entry->cgp.name); + 
pr_err("Expected \'[\' sequence controller's %s properties stream\n", cgp_entry->cgp.name); goto err_parse; } while ((p = get_quoted(&stream, &len, true))) { if (!p) { - pr_err("Expected property name for controller %s\n", - cgp_entry->cgp.name); + pr_err("Expected property name for controller %s\n", cgp_entry->cgp.name); goto err_parse; } - if (xrealloc_safe(&cgp_entry->cgp.props, - (cgp_entry->cgp.nr_props + 1) * sizeof(char *))) { - pr_err("Can't allocate property for controller %s\n", - cgp_entry->cgp.name); + if (xrealloc_safe(&cgp_entry->cgp.props, (cgp_entry->cgp.nr_props + 1) * sizeof(char *))) { + pr_err("Can't allocate property for controller %s\n", cgp_entry->cgp.name); + xfree(p); goto err_parse; } @@ -330,8 +336,7 @@ static int cgp_parse_stream(char *stream, size_t len) stream++, len--; break; } - pr_err("Expected ']' in controller's %s stream\n", - cgp_entry->cgp.name); + pr_err("Expected ']' in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } } @@ -342,8 +347,7 @@ static int cgp_parse_stream(char *stream, size_t len) } if (!eat_symbol(&stream, &len, '\n', true) && len) { - pr_err("Expected \'\\n\' symbol in controller's %s stream\n", - cgp_entry->cgp.name); + pr_err("Expected \'\\n\' symbol in controller's %s stream\n", cgp_entry->cgp.name); goto err_parse; } @@ -401,96 +405,94 @@ err: static int cgp_parse_builtins(void) { - static const char predefined_stream[] = - "\"cpu\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"cpu.shares\", " - "\"cpu.cfs_period_us\", " - "\"cpu.cfs_quota_us\", " - "\"cpu.rt_period_us\", " - "\"cpu.rt_runtime_us\" " - "]\n" - /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */ - "\"memory\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"memory.limit_in_bytes\", " - "\"memory.memsw.limit_in_bytes\", " - "\"memory.swappiness\", " - "\"memory.soft_limit_in_bytes\", " - "\"memory.move_charge_at_immigrate\", " - 
"\"memory.oom_control\", " - "\"memory.use_hierarchy\", " - "\"memory.kmem.limit_in_bytes\", " - "\"memory.kmem.tcp.limit_in_bytes\" " - "]\n" - /* + static const char predefined_stream[] = "\"cpu\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"cpu.shares\", " + "\"cpu.cfs_period_us\", " + "\"cpu.cfs_quota_us\", " + "\"cpu.rt_period_us\", " + "\"cpu.rt_runtime_us\" " + "]\n" + /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */ + "\"memory\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"memory.limit_in_bytes\", " + "\"memory.memsw.limit_in_bytes\", " + "\"memory.swappiness\", " + "\"memory.soft_limit_in_bytes\", " + "\"memory.move_charge_at_immigrate\", " + "\"memory.oom_control\", " + "\"memory.use_hierarchy\", " + "\"memory.kmem.limit_in_bytes\", " + "\"memory.kmem.tcp.limit_in_bytes\" " + "]\n" + /* * cpuset.cpus and cpuset.mems must be set before the process moves * into its cgroup; they are "initialized" below to whatever the root * values are in copy_special_cg_props so as not to cause ENOSPC when * values are restored via this code. 
*/ - "\"cpuset\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"cpuset.cpus\", " - "\"cpuset.mems\", " - "\"cpuset.memory_migrate\", " - "\"cpuset.cpu_exclusive\", " - "\"cpuset.mem_exclusive\", " - "\"cpuset.mem_hardwall\", " - "\"cpuset.memory_spread_page\", " - "\"cpuset.memory_spread_slab\", " - "\"cpuset.sched_load_balance\", " - "\"cpuset.sched_relax_domain_level\" " - "]\n" - "\"blkio\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"blkio.weight\" " - "]\n" - "\"freezer\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "]\n" - "\"perf_event\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "]\n" - "\"net_cls\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"net_cls.classid\" " - "]\n" - "\"net_prio\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"net_prio.ifpriomap\" " - "]\n" - "\"pids\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"pids.max\" " - "]\n" - "\"devices\":\n" - " - \"strategy\": \"replace\"\n" - " - \"properties\": " - "[ " - "\"devices.list\" " - "]\n"; + "\"cpuset\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"cpuset.cpus\", " + "\"cpuset.mems\", " + "\"cpuset.memory_migrate\", " + "\"cpuset.cpu_exclusive\", " + "\"cpuset.mem_exclusive\", " + "\"cpuset.mem_hardwall\", " + "\"cpuset.memory_spread_page\", " + "\"cpuset.memory_spread_slab\", " + "\"cpuset.sched_load_balance\", " + "\"cpuset.sched_relax_domain_level\" " + "]\n" + "\"blkio\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"blkio.weight\" " + "]\n" + "\"freezer\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "]\n" + "\"perf_event\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "]\n" + "\"net_cls\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"net_cls.classid\" " 
+ "]\n" + "\"net_prio\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"net_prio.ifpriomap\" " + "]\n" + "\"pids\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"pids.max\" " + "]\n" + "\"devices\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"devices.list\" " + "]\n"; - return cgp_parse_stream((void *)predefined_stream, - strlen(predefined_stream)); + return cgp_parse_stream((void *)predefined_stream, strlen(predefined_stream)); } int cgp_init(char *stream, size_t len, char *path) diff --git a/criu/cgroup.c b/criu/cgroup.c index 332c79fb9..9246be639 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1,4 +1,4 @@ -#define LOG_PREFIX "cg: " +#define LOG_PREFIX "cg: " #include #include #include @@ -8,6 +8,8 @@ #include #include #include +#include + #include "common/list.h" #include "xmalloc.h" #include "cgroup.h" @@ -24,6 +26,8 @@ #include "protobuf.h" #include "images/core.pb-c.h" #include "images/cgroup.pb-c.h" +#include "kerndat.h" +#include "linux/mount.h" /* * This structure describes set of controller groups @@ -33,10 +37,10 @@ */ struct cg_set { - u32 id; - struct list_head l; - unsigned int n_ctls; - struct list_head ctls; + u32 id; + struct list_head l; + unsigned int n_ctls; + struct list_head ctls; }; static LIST_HEAD(cg_sets); @@ -51,6 +55,7 @@ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; +static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { @@ -63,8 +68,8 @@ static CgSetEntry *find_rst_set_by_id(u32 id) return NULL; } -#define CGCMP_MATCH 1 /* check for exact match */ -#define CGCMP_ISSUB 2 /* check set is subset of ctls */ +#define CGCMP_MATCH 1 /* check for exact match */ +#define CGCMP_ISSUB 2 /* check set is subset of ctls */ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what) { @@ -78,7 +83,7 @@ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what) if 
(l2->next != ctls) c2 = list_first_entry(l2, struct cg_ctl, l); - if (!c1 || !c2) /* Nowhere to move next */ + if (!c1 || !c2) /* Nowhere to move next */ return !c1 && !c2; /* Both lists scanned -- match */ if (strcmp(c1->name, c2->name)) @@ -170,6 +175,7 @@ struct cg_controller *new_controller(const char *name) nc->n_controllers = 1; nc->n_heads = 0; + nc->is_threaded = false; INIT_LIST_HEAD(&nc->heads); return nc; @@ -186,11 +192,21 @@ int parse_cg_info(void) /* Check that co-mounted controllers from /proc/cgroups (e.g. cpu and cpuacct) * are contained in a comma separated string (e.g. from /proc/self/cgroup or * mount options). */ -static bool cgroup_contains(char **controllers, - unsigned int n_controllers, char *name, u64 *mask) +static bool cgroup_contains(char **controllers, unsigned int n_controllers, char *name, u64 *mask) { unsigned int i; bool all_match = true; + + /* Check whether this cgroup2 or not.*/ + if (n_controllers == 1 && controllers[0][0] == 0) { + bool match = name[0] == 0; + + if (mask && match) + *mask &= ~(1ULL); + + return match; + } + for (i = 0; i < n_controllers; i++) { bool found = false; const char *loc = name; @@ -216,12 +232,12 @@ static bool cgroup_contains(char **controllers, /* This is for use in add_cgroup() as additional arguments for the ftw() * callback */ -static struct cg_controller *current_controller; -static unsigned int path_pref_len; +static struct cg_controller *current_controller; +static unsigned int path_pref_len; -#define EXACT_MATCH 0 -#define PARENT_MATCH 1 -#define NO_MATCH 2 +#define EXACT_MATCH 0 +#define PARENT_MATCH 1 +#define NO_MATCH 2 static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir **rdir) { @@ -232,14 +248,13 @@ static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir return EXACT_MATCH; } - if (strstartswith(path, d->path)) { + if (issubpath(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; return 
PARENT_MATCH; } return ret; - } } @@ -303,7 +318,7 @@ static int read_cgroup_prop(struct cgroup_prop *property, const char *fullpath) ret = read(fd, buf, sizeof(buf) - 1); if (ret == -1) { - pr_err("Failed scanning %s\n", fullpath); + pr_perror("Failed scanning %s", fullpath); close(fd); return -1; } @@ -358,7 +373,8 @@ static void free_all_cgroup_props(struct cgroup_dir *ncd) ncd->n_properties = 0; } -static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp) +static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp, + struct cg_controller *controller) { int j; char buf[PATH_MAX]; @@ -409,6 +425,14 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; } + /* + * Set the is_threaded flag if cgroup.type's value is threaded + * or it is a cgroup v1 (it has a 'tasks' property). + * Ignore all other values. + */ + if ((!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) || !strcmp("tasks", prop->name)) + controller->is_threaded = true; + pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; @@ -417,20 +441,27 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const return 0; } -static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, - struct cg_controller *controller) +static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, struct cg_controller *controller) { int i; for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); - if (dump_cg_props_array(fpath, ncd, cgp) < 0) { + if (dump_cg_props_array(fpath, ncd, cgp, controller) < 0) { pr_err("dumping known properties failed\n"); return -1; } + } - if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { + /* cgroup v2 */ + if (controller->controllers[0][0] == 0) { + if 
(dump_cg_props_array(fpath, ncd, &cgp_global_v2, controller) < 0) { + pr_err("dumping global properties v2 failed\n"); + return -1; + } + } else { + if (dump_cg_props_array(fpath, ncd, &cgp_global, controller) < 0) { pr_err("dumping global properties failed\n"); return -1; } @@ -513,7 +544,7 @@ static int add_freezer_state(struct cg_controller *controller) { struct cgroup_dir *it; - /* There is one more case, that cgroup namespaces might + /* There is one more case, that cgroup namespaces might * generate "multiple" heads if nothing is actually in the * root freezer cgroup, e.g. --freeze-cgroup=/lxc/foo and all * tasks in either /lxc/foo/a or /lxc/foo/b. @@ -542,6 +573,90 @@ static int add_freezer_state(struct cg_controller *controller) return 0; } +static const char namestr[] = "name="; +static int __new_open_cgroupfs(struct cg_ctl *cc) +{ + const char *fstype = cc->name[0] == 0 ? "cgroup2" : "cgroup"; + int fsfd, fd; + char *name; + + fsfd = cr_fsopen(fstype, 0); + if (fsfd < 0) { + pr_perror("Unable to open the cgroup file system"); + return -1; + } + + if (strstartswith(cc->name, namestr)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + fsfd_dump_messages(fsfd); + pr_perror("Unable to configure the cgroup (%s) file system", cc->name); + goto err; + } + } else if (cc->name[0] != 0) { /* cgroup v1 */ + char *saveptr = NULL, *buf = strdupa(cc->name); + name = strtok_r(buf, ",", &saveptr); + while (name) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + fsfd_dump_messages(fsfd); + pr_perror("Unable to configure the cgroup (%s) file system", name); + goto err; + } + name = strtok_r(NULL, ",", &saveptr); + } + } + + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + fsfd_dump_messages(fsfd); + pr_perror("Unable to create the cgroup (%s) file system", cc->name); + goto err; + } + + fd = cr_fsmount(fsfd, 0, 0); + if (fd < 0) { + fsfd_dump_messages(fsfd); + pr_perror("Unable to mount the cgroup (%s) 
file system", cc->name); + } + close(fsfd); + + return fd; +err: + close(fsfd); + return -1; +} + +static int open_cgroupfs(struct cg_ctl *cc) +{ + const char *fstype = cc->name[0] == 0 ? "cgroup2" : "cgroup"; + char prefix[] = ".criu.cgmounts.XXXXXX"; + char mopts[1024]; + int fd; + + if (kdat.has_fsopen) + return __new_open_cgroupfs(cc); + + if (strstartswith(cc->name, namestr)) + snprintf(mopts, sizeof(mopts), "none,%s", cc->name); + else + snprintf(mopts, sizeof(mopts), "%s", cc->name); + + if (mkdtemp(prefix) == NULL) { + pr_perror("can't make dir for cg mounts"); + return -1; + } + + if (mount("none", prefix, fstype, 0, mopts[0] ? mopts : NULL) < 0) { + pr_perror("Unable to mount %s %s", fstype, mopts); + rmdir(prefix); + return -1; + } + + fd = open_detach_mount(prefix); + if (fd < 0) + return -1; + + return fd; +} + static int collect_cgroups(struct list_head *ctls) { struct cg_ctl *cc; @@ -549,8 +664,7 @@ static int collect_cgroups(struct list_head *ctls) int fd = -1; list_for_each_entry(cc, ctls, l) { - char path[PATH_MAX], mopts[1024], *root; - char prefix[] = ".criu.cgmounts.XXXXXX"; + char path[PATH_MAX], *root; struct cg_controller *cg; struct cg_root_opt *o; @@ -568,7 +682,7 @@ static int collect_cgroups(struct list_head *ctls) if (!current_controller) { /* only allow "fake" controllers to be created this way */ - if (!strstartswith(cc->name, "name=")) { + if (!strstartswith(cc->name, namestr)) { pr_err("controller %s not found\n", cc->name); return -1; } else { @@ -586,26 +700,29 @@ static int collect_cgroups(struct list_head *ctls) if (!opts.manage_cgroups) continue; - if (strstartswith(cc->name, "name=")) - snprintf(mopts, sizeof(mopts), "none,%s", cc->name); - else - snprintf(mopts, sizeof(mopts), "%s", cc->name); + if (opts.cgroup_yard) { + char dir_path[PATH_MAX]; + int off; - if (mkdtemp(prefix) == NULL) { - pr_perror("can't make dir for cg mounts"); - return -1; + off = snprintf(dir_path, PATH_MAX, "%s/", opts.cgroup_yard); + if 
(strstartswith(cc->name, namestr)) + snprintf(dir_path + off, PATH_MAX - off, "%s", cc->name + strlen(namestr)); + else if (cc->name[0] == 0) + snprintf(dir_path + off, PATH_MAX - off, "unified"); + else + snprintf(dir_path + off, PATH_MAX - off, "%s", cc->name); + + fd = open(dir_path, O_RDONLY | O_DIRECTORY, 0); + if (fd < 0) { + pr_perror("couldn't open %s", dir_path); + return -1; + } + } else { + fd = open_cgroupfs(cc); + if (fd < 0) + return -1; } - if (mount("none", prefix, "cgroup", 0, mopts) < 0) { - pr_perror("couldn't mount %s", mopts); - rmdir(prefix); - return -1; - } - - fd = open_detach_mount(prefix); - if (fd < 0) - return -1; - path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); root = cc->path; @@ -620,6 +737,7 @@ static int collect_cgroups(struct list_head *ctls) snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", root); ret = ftw(path, add_cgroup, 4); + if (ret < 0) pr_perror("failed walking %s for empty cgroups", path); @@ -628,28 +746,35 @@ static int collect_cgroups(struct list_head *ctls) if (ret < 0) return ret; - if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") && - add_freezer_state(current_controller)) + if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") && add_freezer_state(current_controller)) return -1; } return 0; } -int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) +int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args, int id) { - int pid; + int pid, tid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; + if (opts.unprivileged) + return 0; + if (item) pid = item->pid->real; else pid = getpid(); - pr_info("Dumping cgroups for %d\n", pid); - if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) + if (id < 0) + tid = pid; + else + tid = item->threads[id].real; + + pr_info("Dumping cgroups for thread %d\n", tid); + if (parse_thread_cgroup(pid, tid, args, &ctls, &n_ctls)) return -1; cs = 
get_cg_set(&ctls, n_ctls, item); @@ -662,9 +787,10 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { - BUG_ON(root_cgset); - root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); + if (!root_cgset) { + root_cgset = cs; + pr_info("Set %d is root one\n", cs->id); + } } else { struct cg_ctl *root, *stray; @@ -681,7 +807,8 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ continue; if (strlen(stray->path) < root->cgns_prefix) { - pr_err("cg %s shorter than path prefix %d?\n", stray->path, root->cgns_prefix); + pr_err("cg %s shorter than path prefix %d?\n", stray->path, + root->cgns_prefix); return -1; } @@ -695,8 +822,7 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ return 0; } -static int dump_cg_dir_props(struct list_head *props, size_t n_props, - CgroupPropEntry ***ents) +static int dump_cg_dir_props(struct list_head *props, size_t n_props, CgroupPropEntry ***ents) { struct cgroup_prop *prop_cur; CgroupPropEntry *cpe; @@ -770,7 +896,7 @@ static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ** cde->dir_perms->gid = cur->gid; cde->dir_name = cur->path + poff; - if (poff != 1) /* parent isn't "/" */ + if (poff != 1) /* parent isn't "/" */ cde->dir_name++; /* leading / */ cde->n_children = cur->n_children; if (cur->n_children > 0) @@ -781,8 +907,7 @@ static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ** cde->n_properties = cur->n_properties; if (cde->n_properties > 0) { - if (dump_cg_dir_props(&cur->properties, - cde->n_properties, &cde->properties) < 0) { + if (dump_cg_dir_props(&cur->properties, cde->n_properties, &cde->properties) < 0) { xfree(*ents); return -1; } @@ -812,6 +937,8 @@ static int dump_controllers(CgroupEntry *cg) list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); + ce->has_is_threaded = true; + 
ce->is_threaded = cur->is_threaded; ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; ce->n_dirs = cur->n_heads; @@ -836,7 +963,6 @@ static void free_sets(CgroupEntry *cg, unsigned nr) xfree(cg->sets); } - static int dump_sets(CgroupEntry *cg) { struct cg_set *set; @@ -900,6 +1026,9 @@ int dump_cgroups(void) CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; + if (opts.unprivileged) + return 0; + BUG_ON(!criu_cgset || !root_cgset); /* @@ -929,8 +1058,7 @@ err: return ret; } -static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, - char *opt, int os) +static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt, int os) { int i, doff = 0, ooff = 0; bool none_opt = false; @@ -947,7 +1075,10 @@ static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, } } - doff += snprintf(dir + doff, ds - doff, "%s,", n); + if (n[0] == 0) + doff += snprintf(dir + doff, ds - doff, "unified,"); + else + doff += snprintf(dir + doff, ds - doff, "%s,", n); if (opt) ooff += snprintf(opt + ooff, os - ooff, "%s,", ctl->cnames[i]); } @@ -971,6 +1102,7 @@ static const char *special_props[] = { "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", + "cgroup.type", NULL, }; @@ -1058,7 +1190,7 @@ static int prepare_cgns(CgSetEntry *se) ce->path[ce->cgns_prefix] = '\0'; pr_info("setting cgns prefix to %s\n", ce->path); - snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path); + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.procs", ce->path); ce->path[ce->cgns_prefix] = tmp; if (userns_call(userns_move, 0, aux, strlen(aux) + 1, -1) < 0) { pr_perror("couldn't set cgns prefix %s", aux); @@ -1067,7 +1199,6 @@ static int prepare_cgns(CgSetEntry *se) do_unshare = true; } - } if (do_unshare && unshare(CLONE_NEWCGROUP) < 0) { @@ -1078,17 +1209,12 @@ static int prepare_cgns(CgSetEntry *se) return 0; } -static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) +static int move_in_cgroup(CgSetEntry 
*se) { int i; pr_info("Move into %d\n", se->id); - if (setup_cgns && prepare_cgns(se) < 0) { - pr_err("failed preparing cgns\n"); - return -1; - } - for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; @@ -1116,7 +1242,7 @@ static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) * the root cgns, we still want to use the full path here when * we move into the cgroup. */ - snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path); + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.procs", ce->path); pr_debug(" `-> %s\n", aux); err = userns_call(userns_move, 0, aux, strlen(aux) + 1, -1); if (err < 0) { @@ -1128,16 +1254,61 @@ static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) return 0; } -int prepare_task_cgroup(struct pstree_item *me) +int prepare_cgroup_namespace(struct pstree_item *root_task) { CgSetEntry *se; + + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; + + if (root_task->parent) { + pr_err("Expecting root_task to restore cgroup namespace\n"); + return -1; + } + + /* + * If on dump all dumped tasks are in same cgset with criu we don't + * dump cgsets and thus cgroup namespaces and rely that on restore + * criu caller would prepare proper cgset/cgns for us. Also in case + * of --unprivileged we don't even have the root cgset here. 
+ */ + if (!rsti(root_task)->cg_set || rsti(root_task)->cg_set == root_cg_set) { + pr_info("Cgroup namespace inherited from parent\n"); + return 0; + } + + se = find_rst_set_by_id(rsti(root_task)->cg_set); + if (!se) { + pr_err("No set %d found\n", rsti(root_task)->cg_set); + return -1; + } + + if (prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + + return 0; +} + +int restore_task_cgroup(struct pstree_item *me) +{ + struct pstree_item *parent = me->parent; + CgSetEntry *se; u32 current_cgset; + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; + if (!rsti(me)->cg_set) return 0; - if (me->parent) - current_cgset = rsti(me->parent)->cg_set; + /* Zombies and helpers can have cg_set == 0 so we skip them */ + while (parent && !rsti(parent)->cg_set) + parent = parent->parent; + + if (parent) + current_cgset = rsti(parent)->cg_set; else current_cgset = root_cg_set; @@ -1152,13 +1323,7 @@ int prepare_task_cgroup(struct pstree_item *me) return -1; } - /* Since don't support nesting of cgroup namespaces, let's only set up - * the cgns (if it exists) in the init task. In the future, we should - * just check that the cgns prefix string matches for all the entries - * in the cgset, and only unshare if that's true. 
- */ - - return move_in_cgroup(se, !me->parent); + return move_in_cgroup(se); } void fini_cgroup(void) @@ -1167,48 +1332,88 @@ void fini_cgroup(void) return; close_service_fd(CGROUP_YARD); - if (umount2(cg_yard, MNT_DETACH)) - pr_perror("Unable to umount %s", cg_yard); - if (rmdir(cg_yard)) - pr_perror("Unable to remove %s", cg_yard); + if (!opts.cgroup_yard) { + if (umount2(cg_yard, MNT_DETACH)) + pr_perror("Unable to umount %s", cg_yard); + if (rmdir(cg_yard)) + pr_perror("Unable to remove %s", cg_yard); + } xfree(cg_yard); cg_yard = NULL; } -static int restore_perms(int fd, const char *path, CgroupPerms *perms) +static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) { - struct stat sb; + char *current, *next; + size_t len, off = 0; - if (perms) { - if (fstat(fd, &sb) < 0) { - pr_perror("stat of property %s failed", path); - return -1; - } + current = input; + do { + next = strchrnul(current, ' '); + len = next - current; - /* only chmod/chown if the perms are actually different: we aren't - * allowed to chmod some cgroup props (e.g. the read only ones), so we - * don't want to try if the perms already match. 
- */ - if (sb.st_mode != (mode_t) perms->mode && fchmod(fd, perms->mode) < 0) { - pr_perror("chmod of %s failed", path); - return -1; - } + output[off] = prefix; + off++; + memcpy(output + off, current, len); + off += len; + output[off] = ' '; + off++; - if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && - fchown(fd, perms->uid, perms->gid)) { - pr_perror("chown of %s failed", path); - return -1; - } + current = next + 1; + } while (*next != '\0'); + + return off; +} + +static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p, int fd) +{ + char buf[1024]; + char line[1024]; + int ret, off = 0; + + ret = read(fd, buf, sizeof(buf) - 1); + if (ret < 0) { + pr_perror("read from cgroup.subtree_control"); + return ret; + } + /* Remove the trailing newline */ + buf[ret] = '\0'; + + /* Remove all current subsys in subtree_control */ + if (buf[0] != '\0') + off = add_subtree_control_prop_prefix(buf, line, '-'); + + /* Add subsys need to be restored in subtree_control */ + if (cg_prop_entry_p->value[0] != '\0') + off += add_subtree_control_prop_prefix(cg_prop_entry_p->value, line + off, '+'); + + /* Remove the trailing space */ + if (off != 0) { + off--; + line[off] = '\0'; + } + + if (write(fd, line, off) != off) { + pr_perror("write to cgroup.subtree_control"); + return -1; } return 0; } -static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, - char *path, int off, bool split_lines, bool skip_fails) +/* + * Note: The path string can be modified in this function, + * the length of path string should be at least PATH_MAX. 
+ */ +static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, + bool skip_fails) { - int cg, fd, ret = -1; + int cg, fd, exit_code = -1, flag; CgroupPerms *perms = cg_prop_entry_p->perms; + int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control"); + + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; if (!cg_prop_entry_p->value) { pr_err("cg_prop_entry->value was empty when should have had a value\n"); @@ -1222,19 +1427,35 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); + if (is_subtree_control) + flag = O_RDWR; + else + flag = O_WRONLY; + cg = get_service_fd(CGROUP_YARD); - fd = openat(cg, path, O_WRONLY); + fd = openat(cg, path, flag); if (fd < 0) { pr_perror("bad cgroup path: %s", path); return -1; } - if (restore_perms(fd, path, perms) < 0) + if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) { - ret = 0; + exit_code = 0; + goto out; + } + + if (is_subtree_control) { + exit_code = restore_cgroup_subtree_control(cg_prop_entry_p, fd); + goto out; + } + + /* skip restoring cgroup.type if its value is not "threaded" */ + if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) { + exit_code = 0; goto out; } @@ -1253,24 +1474,31 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, goto out; } line = next_line + 1; - } while(*next_line != '\0'); + } while (*next_line != '\0'); } else { size_t len = strlen(cg_prop_entry_p->value); + int ret; - if (write(fd, cg_prop_entry_p->value, len) != len) { + ret = write(fd, cg_prop_entry_p->value, len); + /* memory.kmem.limit_in_bytes has been deprecated. 
Look at + * 58056f77502f3 ("memcg, kmem: further deprecate + * kmem.limit_in_bytes") for more details. */ + if (ret == -1 && errno == EOPNOTSUPP && + !strcmp(cg_prop_entry_p->name, "memory.kmem.limit_in_bytes")) + ret = len; + if (ret != len) { pr_perror("Failed writing %s to %s", cg_prop_entry_p->value, path); if (!skip_fails) goto out; } } - ret = 0; - + exit_code = 0; out: if (close(fd) != 0) pr_perror("Failed closing %s", path); - return ret; + return exit_code; } static CgroupPropEntry *freezer_state_entry; @@ -1284,8 +1512,7 @@ int restore_freezer_state(void) return 0; freezer_path_len = strlen(freezer_path); - return restore_cgroup_prop(freezer_state_entry, freezer_path, - freezer_path_len, false, false); + return restore_cgroup_prop(freezer_state_entry, freezer_path, freezer_path_len, false, false); } static void add_freezer_state_for_restore(CgroupPropEntry *entry, char *path, size_t path_len) @@ -1350,9 +1577,9 @@ static int filter_ifpriomap(char *out, char *line) strncpy(out, line, len + 1); out += len + 1; written = true; -next: + next: line = next_line + 1; - } while(*next_line != '\0'); + } while (*next_line != '\0'); if (written) *(out - 1) = '\0'; @@ -1381,8 +1608,7 @@ out: return ret; } -static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **ents, - unsigned int n_ents) +static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **ents, unsigned int n_ents) { unsigned int i, j; @@ -1423,7 +1649,7 @@ static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **e if (restore_cgroup_prop(p, path, off2, false, false) < 0) return -1; } -skip: + skip: if (prepare_cgroup_dir_properties(path, off2, e->children, e->n_children) < 0) return -1; } @@ -1537,7 +1763,7 @@ static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) { - int fd, ret; + int fd, ret = 0; fd = openat(cg, path, O_DIRECTORY); if (fd < 0) { @@ 
-1545,13 +1771,14 @@ static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) return -1; } - ret = restore_perms(fd, path, perms); + if (perms) + ret = cr_fchperm(fd, perms->uid, perms->gid, perms->mode); close(fd); return ret; } -static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux, size_t off, - CgroupDirEntry **ents, size_t n_ents) +static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux, size_t off, CgroupDirEntry **ents, + size_t n_ents) { size_t i, j; CgroupDirEntry *e; @@ -1584,13 +1811,9 @@ static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux return -1; for (j = 0; j < n_controllers; j++) { - if (!strcmp(controllers[j], "cpuset") - || !strcmp(controllers[j], "memory") - || !strcmp(controllers[j], "devices")) { - if (restore_special_props(paux, off2, e) < 0) { - pr_err("Restoring special cpuset props failed!\n"); - return -1; - } + if (restore_special_props(paux, off2, e) < 0) { + pr_err("Restoring special cpuset props failed!\n"); + return -1; } } } else { @@ -1610,13 +1833,11 @@ static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux } } - if (!(opts.manage_cgroups & CG_MODE_NONE) && - prepare_dir_perms(cg, paux, e->dir_perms) < 0) + if (!(opts.manage_cgroups & CG_MODE_NONE) && prepare_dir_perms(cg, paux, e->dir_perms) < 0) return -1; } - if (prepare_cgroup_dirs(controllers, n_controllers, paux, off2, - e->children, e->n_children) < 0) + if (prepare_cgroup_dirs(controllers, n_controllers, paux, off2, e->children, e->n_children) < 0) return -1; } @@ -1649,34 +1870,41 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) if (!opts.manage_cgroups) return 0; - pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n", - opts.manage_cgroups); + pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n", opts.manage_cgroups); - off = sprintf(paux, ".criu.cgyard.XXXXXX"); - if (mkdtemp(paux) == NULL) { - pr_perror("Can't make temp cgyard dir"); 
- return -1; + if (opts.cgroup_yard) { + off = sprintf(paux, "%s", opts.cgroup_yard); + + cg_yard = xstrdup(paux); + if (!cg_yard) + return -1; + } else { + off = sprintf(paux, ".criu.cgyard.XXXXXX"); + if (mkdtemp(paux) == NULL) { + pr_perror("Can't make temp cgyard dir"); + return -1; + } + + cg_yard = xstrdup(paux); + if (!cg_yard) { + rmdir(paux); + return -1; + } + + if (make_yard(cg_yard)) + return -1; } - cg_yard = xstrdup(paux); - if (!cg_yard) { - rmdir(paux); - return -1; - } - - if (make_yard(cg_yard)) - goto err; - pr_debug("Opening %s as cg yard\n", cg_yard); i = open(cg_yard, O_DIRECTORY); if (i < 0) { pr_perror("Can't open cgyard"); - goto err; + return -1; } ret = install_service_fd(CGROUP_YARD, i); if (ret < 0) - goto err; + return -1; paux[off++] = '/'; @@ -1687,21 +1915,24 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) if (ctrl->n_cnames < 1) { pr_err("Each cg_controller_entry must have at least 1 controller\n"); - goto err; + return -1; } - ctl_off += ctrl_dir_and_opt(ctrl, - paux + ctl_off, sizeof(paux) - ctl_off, - opt, sizeof(opt)); + ctl_off += ctrl_dir_and_opt(ctrl, paux + ctl_off, sizeof(paux) - ctl_off, opt, sizeof(opt)); /* Create controller if not yet present */ if (access(paux, F_OK)) { - pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); + char *fstype = "cgroup"; + + if (ctrl->cnames[0][0] == 0) + fstype = "cgroup2"; + + pr_debug("\tMaking controller dir %s (%s), type %s\n", paux, opt, fstype); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); return -1; } - if (mount("none", paux, "cgroup", 0, opt) < 0) { + if (mount("none", paux, fstype, 0, opt) < 0) { pr_perror("\tCan't mount controller dir %s", paux); return -1; } @@ -1713,20 +1944,169 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) yard = paux + strlen(cg_yard) + 1; yard_off = ctl_off - (strlen(cg_yard) + 1); if (opts.manage_cgroups && - prepare_cgroup_dirs(ctrl->cnames, ctrl->n_cnames, yard, yard_off, - ctrl->dirs, ctrl->n_dirs)) - goto 
err; + prepare_cgroup_dirs(ctrl->cnames, ctrl->n_cnames, yard, yard_off, ctrl->dirs, ctrl->n_dirs)) + return -1; } return 0; - -err: - fini_cgroup(); - return -1; } -static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, - char **dir_name, char *newroot) +static int cgroupd_unblock_sigterm(void) +{ + sigset_t unblockmask; + + sigemptyset(&unblockmask); + sigaddset(&unblockmask, SIGTERM); + + if (sigprocmask(SIG_UNBLOCK, &unblockmask, NULL)) { + pr_perror("cgroupd: can't unblock SIGTERM"); + return -1; + } + + return 0; +} + +/* + * If a thread is in a different cgroup set than the main thread of its process, + * it means it is in a threaded controller. This daemon receives the cg_set + * number from the restored thread and moves this thread to the correct + * cgroup controllers. + */ +static int cgroupd(int sk) +{ + /* + * This pairs with SIGTERM in stop_cgroupd(), and ensures that cgroupd + * will receive termination signal, regardless of which signal block + * mask was inherited. 
+ */ + if (cgroupd_unblock_sigterm()) + return -1; + + pr_info("cgroud: Daemon started\n"); + + while (1) { + struct unsc_msg um; + uns_call_t call; + pid_t tid; + int fd, cg_set, i; + CgSetEntry *cg_set_entry; + int ret; + + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); + ret = recvmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("cgroupd: recv req error"); + return -1; + } + + unsc_msg_pid_fd(&um, &tid, &fd); + pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set); + + cg_set_entry = find_rst_set_by_id(cg_set); + if (!cg_set_entry) { + pr_err("cgroupd: No set found %d\n", cg_set); + return -1; + } + + for (i = 0; i < cg_set_entry->n_ctls; i++) { + int j, aux_off; + CgMemberEntry *ce = cg_set_entry->ctls[i]; + char aux[PATH_MAX]; + CgControllerEntry *ctrl = NULL; + const char *format; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + /* + * This is not a threaded controller, all threads in this + * process must be in this controller. Main thread has been + * restored, so this thread is in this controller already. + */ + if (!ctrl->has_is_threaded || !ctrl->is_threaded) + continue; + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + format = ctrl->cnames[0][0] ? "/%s/tasks" : "/%s/cgroup.threads"; + snprintf(aux + aux_off, sizeof(aux) - aux_off, format, ce->path); + + /* + * Cgroupd runs outside of the namespaces so we don't + * need to use userns_call here + */ + if (userns_move(aux, 0, tid)) { + pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path); + return -1; + } + } + + /* + * We only want to send the cred which contains thread id back. + * The restored thread recvmsg(MSG_PEEK) until it gets its own + * thread id. 
+ */ + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid); + if (sendmsg(sk, &um.h, 0) <= 0) { + pr_perror("cgroupd: send req error"); + return -1; + } + } + + return 0; +} + +int stop_cgroupd(void) +{ + if (cgroupd_pid) { + sigset_t blockmask, oldmask; + + /* + * Block the SIGCHLD signal to avoid triggering + * sigchld_handler() + */ + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + kill(cgroupd_pid, SIGTERM); + waitpid(cgroupd_pid, NULL, 0); + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + } + + return 0; +} + +static int prepare_cgroup_thread_sfd(void) +{ + int sk; + + sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); + if (sk < 0) { + pr_err("failed to start cgroupd\n"); + return -1; + } + + if (install_service_fd(CGROUPD_SK, sk) < 0) { + kill(cgroupd_pid, SIGKILL); + waitpid(cgroupd_pid, NULL, 0); + return -1; + } + + return 0; +} + +static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot) { size_t dirlen = strlen(*dir_name); char *dir = *dir_name; @@ -1763,7 +2143,7 @@ static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controller * "/" is matching to be renamed. */ if (!(cgroup_contains(controllers, n_controllers, cg->name, NULL) && - strstartswith(cg->path + 1, dir))) + strstartswith(cg->path + 1, dir))) continue; if (cg->has_cgns_prefix && cg->cgns_prefix) { @@ -1772,6 +2152,7 @@ static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controller cg->path = xsprintf("%s%s", newroot, cg->path + cg->cgns_prefix); if (!cg->path) { cg->path = prev; + xfree(dirnew); return -ENOMEM; } xfree(prev); @@ -1790,10 +2171,10 @@ static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controller * root but make sure the rest of path is * untouched. 
*/ - cg->path = xsprintf("%s%s", newroot, - cg->path + dirlen + 1); + cg->path = xsprintf("%s%s", newroot, cg->path + dirlen + 1); if (!cg->path) { cg->path = prev; + xfree(dirnew); return -ENOMEM; } xfree(prev); @@ -1826,12 +2207,11 @@ static int rewrite_cgroup_roots(CgroupEntry *cge) list_for_each_entry(o, &opts.new_cgroup_roots, node) { unsigned old_mask = ctrl_mask; - cgroup_contains(ctrl->cnames, ctrl->n_cnames, - o->controller, &ctrl_mask); + /* coverity[check_return] */ + cgroup_contains(ctrl->cnames, ctrl->n_cnames, o->controller, &ctrl_mask); if (old_mask != ctrl_mask) { if (newroot && strcmp(newroot, o->newroot)) { - pr_err("CG paths mismatch: %s %s\n", - newroot, o->newroot); + pr_err("CG paths mismatch: %s %s\n", newroot, o->newroot); return -1; } newroot = o->newroot; @@ -1880,15 +2260,19 @@ int prepare_cgroup(void) n_controllers = ce->n_controllers; controllers = ce->controllers; - if (n_sets) + if (n_sets) { /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. 
*/ ret = prepare_cgroup_sfd(ce); - else + if (ret < 0) + return ret; + ret = prepare_cgroup_thread_sfd(); + } else { ret = 0; + } return ret; } @@ -1906,10 +2290,20 @@ int new_cg_root_add(char *controller, char *newroot) if (!o) return -1; - o->controller = controller; - o->newroot = newroot; + o->controller = xstrdup(controller); + if (!o->controller) + goto err_ctrl; + o->newroot = xstrdup(newroot); + if (!o->newroot) + goto err_newroot; list_add(&o->node, &opts.new_cgroup_roots); + return 0; +err_newroot: + xfree(o->controller); +err_ctrl: + xfree(o); + return -1; } struct ns_desc cgroup_ns_desc = NS_DESC_ENTRY(CLONE_NEWCGROUP, "cgroup"); diff --git a/criu/clone-noasan.c b/criu/clone-noasan.c index 5ca280eb8..d657ea2e8 100644 --- a/criu/clone-noasan.c +++ b/criu/clone-noasan.c @@ -1,4 +1,10 @@ +#include #include +#include + +#include + +#include "sched.h" #include "common/compiler.h" #include "log.h" #include "common/bug.h" @@ -18,10 +24,20 @@ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69863 * * So the only way is to put this wrapper in separate non-instrumented file + * + * WARNING: When calling clone_noasan make sure your not sitting in a later + * __restore__ phase where other tasks might be creating threads, otherwise + * all calls to clone_noasan should be guarder with + * + * lock_last_pid + * clone_noasan + * ... wait for process to finish ... + * unlock_last_pid */ int clone_noasan(int (*fn)(void *), int flags, void *arg) { void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16); + BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK)); /* * Reserve some bytes for clone() internal needs @@ -29,3 +45,40 @@ int clone_noasan(int (*fn)(void *), int flags, void *arg) */ return clone(fn, stack_ptr, flags, arg); } + +int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_signal, pid_t pid) +{ + struct _clone_args c_args = {}; + + BUG_ON(flags & CLONE_VM); + + /* + * Make sure no child signals are requested. 
clone3() uses + * exit_signal for that. + */ + BUG_ON(flags & 0xff); + + pr_debug("Creating process using clone3()\n"); + + /* + * clone3() explicitly blocks setting an exit_signal + * if CLONE_PARENT is specified. With clone() it also + * did not work, but there was no error message. The + * exit signal from the thread group leader is taken. + */ + if (!(flags & CLONE_PARENT)) { + if (exit_signal != SIGCHLD) { + pr_err("Exit signal not SIGCHLD\n"); + errno = EINVAL; + return -1; + } + c_args.exit_signal = exit_signal; + } + c_args.flags = flags; + c_args.set_tid = ptr_to_u64(&pid); + c_args.set_tid_size = 1; + pid = syscall(__NR_clone3, &c_args, sizeof(c_args)); + if (pid == 0) + exit(fn(arg)); + return pid; +} diff --git a/criu/config.c b/criu/config.c index 39aa071c9..d7ef3f8e8 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,8 +18,10 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" +#include "image.h" #include "irmap.h" #include "mount.h" +#include "mount-v2.h" #include "namespaces.h" #include "net.h" #include "sk-inet.h" @@ -30,6 +32,7 @@ #include "common/xmalloc.h" struct cr_options opts; +char *rpc_cfg_file; static int count_elements(char **to_count) { @@ -43,79 +46,215 @@ static int count_elements(char **to_count) /* Parse one statement in configuration file */ int parse_statement(int i, char *line, char **configuration) { + cleanup_free char *input = NULL; int offset = 0, len = 0; - bool was_newline = true; - char *tmp_string, *quoted, *quotedptr; + char *tmp_string; - while (1) { - /* Ignore white-space */ - while ((isspace(*(line + offset)) && (*(line + offset) != '\n'))) offset++; + /* + * A line from the configuration file can be: + * - empty + * - a boolean option (tcp-close) + * - an option with one parameter (verbosity 4) + * - a parameter can be in quotes (lsm-profile "selinux:something") + * - a parameter can contain escaped quotes + * + * Whenever a '#' is found we ignore everything after as a comment. 
+ * + * This function adds none, one (boolean option) or two entries + * in **configuration and returns i + (the number of entries). + */ - /* Read a single word. A word is everything - * that doesn't contain white-space characters. */ - if (sscanf(line + offset, "%m[^ \t\n]s", &configuration[i]) != 1) { - configuration[i] = NULL; - break; - } + if (strlen(line) == 0) + return i; - /* Ignore comments - everything between '#' and '\n' */ - if (configuration[i][0] == '#') { - configuration[i] = NULL; - break; - } + /* Ignore leading white-space */ + while ((isspace(*(line + offset)) && (*(line + offset) != '\n'))) + offset++; - if ((configuration[i][0] == '\"') && (strchr(line + offset + 1, '"'))) { - /* Handle empty strings which strtok ignores */ - if (!strcmp(configuration[i], "\"\"")) { - configuration[i] = ""; - offset += strlen("\"\""); - } else if ((configuration[i] = strtok_r(line + offset, "\"", "edptr))) { - /* Handle escaping of quotes in quoted string */ - while (configuration[i][strlen(configuration[i]) - 1] == '\\') { - offset++; - len = strlen(configuration[i]); - configuration[i][len - 1] = '"'; - if (*quotedptr == '"') { - quotedptr++; - break; - } - quoted = strtok_r(NULL, "\"", "edptr); - tmp_string = xmalloc(len + strlen(quoted) + 1); - if (tmp_string == NULL) - return -1; + /* Ignore empty line */ + if (line[offset] == '\n') + return i; - memmove(tmp_string, configuration[i], len); - memmove(tmp_string + len, quoted, strlen(quoted) + 1); - configuration[i] = tmp_string; - } - offset += 2; - } - } + /* Ignore line starting with a comment */ + if (line[offset] == '#') + return i; - offset += strlen(configuration[i]); + input = xstrdup(line + offset); + if (unlikely(!input)) + return -1; - if (was_newline) { - was_newline = false; - len = strlen(configuration[i]); - tmp_string = xrealloc(configuration[i], len + strlen("--") + 1); - if (tmp_string == NULL) - return -1; + offset = 0; - memmove(tmp_string + strlen("--"), tmp_string, len + 1); - 
memmove(tmp_string, "--", strlen("--")); - configuration[i] = tmp_string; - } + /* Remove trailing '\n' */ + if ((tmp_string = strchr(input, '\n'))) + tmp_string[0] = 0; + + if ((tmp_string = strchr(input, ' ')) || (tmp_string = strchr(input, '\t'))) { + configuration[i] = xzalloc(tmp_string - input + strlen("--") + 1); + if (unlikely(!configuration[i])) + return -1; + memcpy(configuration[i], "--", strlen("--")); + memcpy(configuration[i] + strlen("--"), input, tmp_string - input); + configuration[i][tmp_string - input + strlen("--")] = 0; + /* Go to the next character */ + offset += tmp_string - input + 1; i++; + } else { + if (unlikely(asprintf(&configuration[i], "--%s", input) == -1)) + return -1; + return i + 1; } - return i; + while ((isspace(*(input + offset)))) + offset++; + + /* Check if the next token is a comment */ + if (input[offset] == '#') + return i; + + if (input[offset] == '"') { + bool found_second_quote = false; + char *quote_start; + int quote_offset; + + /* Move by one to skip the leading quote. */ + offset++; + quote_start = input + offset; + quote_offset = offset; + + if (input[offset] == 0) { + /* The value for the parameter was a single quote, this is not supported. */ + xfree(configuration[i - 1]); + pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); + return -1; + } + + if (input[offset] == '"') { + /* We got "" as value */ + configuration[i] = xstrdup(""); + if (unlikely(!configuration[i])) { + xfree(configuration[i - 1]); + return -1; + } + offset = 0; + goto out; + } + + /* + * If it starts with a quote everything until the + * next unescaped quote needs to be looked at. + */ + while ((tmp_string = strchr(input + quote_offset + 1, '"'))) { + quote_offset = tmp_string - input; + /* Check if it is escaped */ + if (*(tmp_string - 1) == '\\') + continue; + + /* Not escaped. That is the end of the quoted string. 
*/ + found_second_quote = true; + configuration[i] = xzalloc(quote_offset - offset + 1); + if (unlikely(!configuration[i])) { + xfree(configuration[i - 1]); + return -1; + } + memcpy(configuration[i], quote_start, quote_offset - offset); + configuration[i][quote_offset - offset] = 0; + /* We skipped one additional quote */ + offset++; + /* Check for excessive parameters on the original line. */ + tmp_string++; + if (tmp_string != 0 && strchr(tmp_string, ' ')) { + int j; + len = strlen(tmp_string); + for (j = 0; j < len - 1; j++) { + if (tmp_string[j] == '#') + break; + if (!isspace(tmp_string[j])) { + pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); + xfree(configuration[i - 1]); + xfree(configuration[i]); + return -1; + } + } + } + break; + } + if (!found_second_quote) { + pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); + xfree(configuration[i - 1]); + return -1; + } + } else { + /* Does not start with a quote. */ + if (unlikely(asprintf(&configuration[i], "%s", input + offset) == -1)) { + xfree(configuration[i - 1]); + return -1; + } + + if ((tmp_string = strchr(input + offset, ' '))) + offset = tmp_string - (input + offset); + else + offset = 0; + } + + len = strlen(configuration[i]); + if (strstr(configuration[i], "\\\"")) { + /* We found an escaped quote. Skip the backslash. */ + cleanup_free char *tmp = NULL; + int skipped = 0; + int start = 0; + int dest = 0; + int j; + + tmp = xzalloc(len); + if (tmp == NULL) + return -1; + + for (j = start; j < len; j++) { + if (configuration[i][j] == '\\' && j + 1 < len && configuration[i][j + 1] == '"') { + skipped++; + continue; + } + tmp[dest++] = configuration[i][j]; + } + memcpy(configuration[i], tmp, strlen(tmp)); + configuration[i][strlen(tmp)] = 0; + + /* Account for skipped backslashes. 
*/ + offset += skipped + 1; + len -= skipped; + } + +out: + /* Remove potential comments at the end */ + if ((tmp_string = strstr(configuration[i], "#")) || (tmp_string = strstr(configuration[i], " #"))) + tmp_string[0] = 0; + + /* Check for unsupported configuration file entries */ + if (strchr(configuration[i] + offset, ' ')) { + int j; + len = strlen(configuration[i] + offset); + for (j = 0; j < len - 1; j++) { + if (!isspace(configuration[i][offset + j])) { + pr_err("Unsupported configuration file format. Please consult man page criu(8)\n"); + xfree(configuration[i - 1]); + xfree(configuration[i]); + return -1; + } + } + } + + if ((tmp_string = strchr(configuration[i] + offset, ' '))) + tmp_string[0] = 0; + + return i + 1; } /* Parse a configuration file */ -static char ** parse_config(char *filepath) +static char **parse_config(char *filepath) { -#define DEFAULT_CONFIG_SIZE 10 - FILE* configfile = fopen(filepath, "r"); +#define DEFAULT_CONFIG_SIZE 10 + FILE *configfile = fopen(filepath, "r"); int config_size = DEFAULT_CONFIG_SIZE; int i = 1; size_t line_size = 0; @@ -125,6 +264,8 @@ static char ** parse_config(char *filepath) if (!configfile) return NULL; + pr_debug("Parsing config file %s\n", filepath); + configuration = xmalloc(config_size * sizeof(char *)); if (configuration == NULL) { fclose(configfile); @@ -136,9 +277,22 @@ static char ** parse_config(char *filepath) configuration[0] = "criu"; while (getline(&line, &line_size, configfile) != -1) { + int spaces = 1; + int j; + /* + * The statement parser 'parse_statement()' needs as many + * elements in 'configuration' as spaces + 1, because it splits + * each line at a space to return a result that can used as + * input for getopt. So, let's count spaces to determine the + * memory requirements. 
+ */ + for (j = 0; j < strlen(line); j++) + if (line[j] == ' ') + spaces++; + /* Extend configuration buffer if necessary */ - if (i >= config_size - 1) { - config_size *= 2; + if (i + spaces >= config_size - 1) { + config_size += spaces; configuration = xrealloc(configuration, config_size * sizeof(char *)); if (configuration == NULL) { fclose(configfile); @@ -163,8 +317,7 @@ static char ** parse_config(char *filepath) return configuration; } -static int next_config(char **argv, char ***_argv, bool no_default_config, - int state, char *cfg_file) +static int next_config(char **argv, char ***_argv, bool no_default_config, int state, char *cfg_file) { char local_filepath[PATH_MAX + 1]; char *home_dir = NULL; @@ -173,52 +326,51 @@ static int next_config(char **argv, char ***_argv, bool no_default_config, if (state >= PARSING_LAST) return 0; - switch(state) { - case PARSING_GLOBAL_CONF: - if (no_default_config) - break; - *_argv = parse_config(GLOBAL_CONFIG_DIR DEFAULT_CONFIG_FILENAME); + switch (state) { + case PARSING_GLOBAL_CONF: + if (no_default_config) break; - case PARSING_USER_CONF: - if (no_default_config) - break; - home_dir = getenv("HOME"); - if (!home_dir) { - pr_info("Unable to get $HOME directory, local configuration file will not be used."); - } else { - snprintf(local_filepath, PATH_MAX, "%s/%s%s", - home_dir, USER_CONFIG_DIR, DEFAULT_CONFIG_FILENAME); - *_argv = parse_config(local_filepath); - } + *_argv = parse_config(GLOBAL_CONFIG_DIR DEFAULT_CONFIG_FILENAME); + break; + case PARSING_USER_CONF: + if (no_default_config) break; - case PARSING_ENV_CONF: - cfg_from_env = getenv("CRIU_CONFIG_FILE"); - if (!cfg_from_env) - break; - *_argv = parse_config(cfg_from_env); + home_dir = getenv("HOME"); + if (!home_dir) { + pr_info("Unable to get $HOME directory, local configuration file will not be used.\n"); + } else { + snprintf(local_filepath, PATH_MAX, "%s/%s%s", home_dir, USER_CONFIG_DIR, + DEFAULT_CONFIG_FILENAME); + *_argv = parse_config(local_filepath); + 
} + break; + case PARSING_ENV_CONF: + cfg_from_env = getenv("CRIU_CONFIG_FILE"); + if (!cfg_from_env) break; - case PARSING_CMDLINE_CONF: - if (!cfg_file) - break; - *_argv = parse_config(cfg_file); + *_argv = parse_config(cfg_from_env); + break; + case PARSING_CMDLINE_CONF: + if (!cfg_file) break; - case PARSING_ARGV: - *_argv = argv; - break; - case PARSING_RPC_CONF: - if (!rpc_cfg_file) - break; - *_argv = parse_config(rpc_cfg_file); - break; - default: + *_argv = parse_config(cfg_file); + break; + case PARSING_ARGV: + *_argv = argv; + break; + case PARSING_RPC_CONF: + if (!rpc_cfg_file) break; + *_argv = parse_config(rpc_cfg_file); + break; + default: + break; } return ++state; } -static int pre_parse(int argc, char **argv, bool *usage_error, bool *no_default_config, - char **cfg_file) +static int pre_parse(int argc, char **argv, bool *usage_error, bool *no_default_config, char **cfg_file) { int i; /* @@ -276,6 +428,10 @@ void init_opts(void) opts.empty_ns = 0; opts.status_fd = -1; opts.log_level = DEFAULT_LOGLEVEL; + opts.pre_dump_mode = PRE_DUMP_SPLICE; + opts.file_validation_method = FILE_VALIDATION_DEFAULT; + opts.network_lock_method = NETWORK_LOCK_DEFAULT; + opts.ghost_fiemap = FIEMAP_DEFAULT; } bool deprecated_ok(char *what) @@ -293,12 +449,12 @@ static int parse_cpu_cap(struct cr_options *opts, const char *optarg) { bool inverse = false; -#define ____cpu_set_cap(__opts, __cap, __inverse) \ - do { \ - if ((__inverse)) \ - (__opts)->cpu_cap &= ~(__cap); \ - else \ - (__opts)->cpu_cap |= (__cap); \ +#define ____cpu_set_cap(__opts, __cap, __inverse) \ + do { \ + if ((__inverse)) \ + (__opts)->cpu_cap &= ~(__cap); \ + else \ + (__opts)->cpu_cap |= (__cap); \ } while (0) if (!optarg) { @@ -396,8 +552,13 @@ static size_t parse_size(char *optarg) static int parse_join_ns(const char *ptr) { char *aux, *ns_file, *extra_opts = NULL; + cleanup_free char *ns = NULL; - aux = strchr(ptr, ':'); + ns = xstrdup(ptr); + if (ns == NULL) + return -1; + + aux = strchr(ns, 
':'); if (aux == NULL) return -1; *aux = '\0'; @@ -410,12 +571,28 @@ static int parse_join_ns(const char *ptr) } else { extra_opts = NULL; } - if (join_ns_add(ptr, ns_file, extra_opts)) + if (join_ns_add(ns, ns_file, extra_opts)) return -1; return 0; } +static int parse_file_validation_method(struct cr_options *opts, const char *optarg) +{ + if (!strcmp(optarg, "filesize")) + opts->file_validation_method = FILE_VALIDATION_FILE_SIZE; + else if (!strcmp(optarg, "buildid")) + opts->file_validation_method = FILE_VALIDATION_BUILD_ID; + else + goto Esyntax; + + return 0; + +Esyntax: + pr_err("Unknown file validation method `%s' selected\n", optarg); + return -1; +} + /* * parse_options() is the point where the getopt parsing happens. The CLI * parsing as well as the configuration file parsing happens here. @@ -424,8 +601,7 @@ static int parse_join_ns(const char *ptr) * correct, '1' if something failed and '2' if the CRIU help text should * be displayed. */ -int parse_options(int argc, char **argv, bool *usage_error, - bool *has_exec_cmd, int state) +int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, int state) { int ret; int opt = -1; @@ -434,95 +610,110 @@ int parse_options(int argc, char **argv, bool *usage_error, char *cfg_file = NULL; char **_argv = NULL; int _argc = 0; + bool has_network_lock_opt = false; - -#define BOOL_OPT(OPT_NAME, SAVE_TO) \ - {OPT_NAME, no_argument, SAVE_TO, true},\ - {"no-" OPT_NAME, no_argument, SAVE_TO, false} +#define BOOL_OPT(OPT_NAME, SAVE_TO) \ + { OPT_NAME, no_argument, SAVE_TO, true }, \ + { \ + "no-" OPT_NAME, no_argument, SAVE_TO, false \ + } static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:"; static struct option long_opts[] = { - { "tree", required_argument, 0, 't' }, - { "leave-stopped", no_argument, 0, 's' }, - { "leave-running", no_argument, 0, 'R' }, + { "tree", required_argument, 0, 't' }, + { "leave-stopped", no_argument, 0, 's' }, + { "leave-running", no_argument, 0, 'R' }, 
BOOL_OPT("restore-detached", &opts.restore_detach), BOOL_OPT("restore-sibling", &opts.restore_sibling), BOOL_OPT("daemon", &opts.restore_detach), - { "images-dir", required_argument, 0, 'D' }, - { "work-dir", required_argument, 0, 'W' }, - { "log-file", required_argument, 0, 'o' }, - { "join-ns", required_argument, 0, 'J' }, - { "root", required_argument, 0, 'r' }, - { USK_EXT_PARAM, optional_argument, 0, 'x' }, - { "help", no_argument, 0, 'h' }, + { "images-dir", required_argument, 0, 'D' }, + { "work-dir", required_argument, 0, 'W' }, + { "log-file", required_argument, 0, 'o' }, + { "join-ns", required_argument, 0, 'J' }, + { "root", required_argument, 0, 'r' }, + { USK_EXT_PARAM, optional_argument, 0, 'x' }, + { "help", no_argument, 0, 'h' }, BOOL_OPT(SK_EST_PARAM, &opts.tcp_established_ok), - { "close", required_argument, 0, 1043 }, + { "close", required_argument, 0, 1043 }, BOOL_OPT("log-pid", &opts.log_file_per_pid), - { "version", no_argument, 0, 'V' }, + { "version", no_argument, 0, 'V' }, BOOL_OPT("evasive-devices", &opts.evasive_devices), - { "pidfile", required_argument, 0, 1046 }, - { "veth-pair", required_argument, 0, 1047 }, - { "action-script", required_argument, 0, 1049 }, + { "pidfile", required_argument, 0, 1046 }, + { "veth-pair", required_argument, 0, 1047 }, + { "action-script", required_argument, 0, 1049 }, BOOL_OPT(LREMAP_PARAM, &opts.link_remap_ok), BOOL_OPT(OPT_SHELL_JOB, &opts.shell_job), BOOL_OPT(OPT_FILE_LOCKS, &opts.handle_file_locks), BOOL_OPT("page-server", &opts.use_page_server), - { "address", required_argument, 0, 1051 }, - { "port", required_argument, 0, 1052 }, - { "prev-images-dir", required_argument, 0, 1053 }, - { "ms", no_argument, 0, 1054 }, + { "address", required_argument, 0, 1051 }, + { "port", required_argument, 0, 1052 }, + { "prev-images-dir", required_argument, 0, 1053 }, + { "ms", no_argument, 0, 1054 }, BOOL_OPT("track-mem", &opts.track_mem), BOOL_OPT("auto-dedup", &opts.auto_dedup), - { "libdir", required_argument, 
0, 'L' }, - { "cpu-cap", optional_argument, 0, 1057 }, + { "libdir", required_argument, 0, 'L' }, + { "cpu-cap", optional_argument, 0, 1057 }, BOOL_OPT("force-irmap", &opts.force_irmap), - { "ext-mount-map", required_argument, 0, 'M' }, - { "exec-cmd", no_argument, 0, 1059 }, - { "manage-cgroups", optional_argument, 0, 1060 }, - { "cgroup-root", required_argument, 0, 1061 }, - { "inherit-fd", required_argument, 0, 1062 }, - { "feature", required_argument, 0, 1063 }, - { "skip-mnt", required_argument, 0, 1064 }, - { "enable-fs", required_argument, 0, 1065 }, - { "enable-external-sharing", no_argument, &opts.enable_external_sharing, true }, - { "enable-external-masters", no_argument, &opts.enable_external_masters, true }, - { "freeze-cgroup", required_argument, 0, 1068 }, - { "ghost-limit", required_argument, 0, 1069 }, - { "irmap-scan-path", required_argument, 0, 1070 }, - { "lsm-profile", required_argument, 0, 1071 }, - { "timeout", required_argument, 0, 1072 }, - { "external", required_argument, 0, 1073 }, - { "empty-ns", required_argument, 0, 1074 }, - { "lazy-pages", no_argument, 0, 1076 }, + { "ext-mount-map", required_argument, 0, 'M' }, + { "exec-cmd", no_argument, 0, 1059 }, + { "manage-cgroups", optional_argument, 0, 1060 }, + { "cgroup-root", required_argument, 0, 1061 }, + { "inherit-fd", required_argument, 0, 1062 }, + { "feature", required_argument, 0, 1063 }, + { "skip-mnt", required_argument, 0, 1064 }, + { "enable-fs", required_argument, 0, 1065 }, + { "enable-external-sharing", no_argument, &opts.enable_external_sharing, true }, + { "enable-external-masters", no_argument, &opts.enable_external_masters, true }, + { "freeze-cgroup", required_argument, 0, 1068 }, + { "ghost-limit", required_argument, 0, 1069 }, + { "irmap-scan-path", required_argument, 0, 1070 }, + { "lsm-profile", required_argument, 0, 1071 }, + { "timeout", required_argument, 0, 1072 }, + { "external", required_argument, 0, 1073 }, + { "empty-ns", required_argument, 0, 1074 }, + { 
"lazy-pages", no_argument, 0, 1076 }, BOOL_OPT("extra", &opts.check_extra_features), BOOL_OPT("experimental", &opts.check_experimental_features), - { "all", no_argument, 0, 1079 }, - { "cgroup-props", required_argument, 0, 1080 }, - { "cgroup-props-file", required_argument, 0, 1081 }, - { "cgroup-dump-controller", required_argument, 0, 1082 }, + { "all", no_argument, 0, 1079 }, + { "cgroup-props", required_argument, 0, 1080 }, + { "cgroup-props-file", required_argument, 0, 1081 }, + { "cgroup-dump-controller", required_argument, 0, 1082 }, BOOL_OPT(SK_INFLIGHT_PARAM, &opts.tcp_skip_in_flight), BOOL_OPT("deprecated", &opts.deprecated_ok), BOOL_OPT("display-stats", &opts.display_stats), BOOL_OPT("weak-sysctls", &opts.weak_sysctls), - { "status-fd", required_argument, 0, 1088 }, + { "status-fd", required_argument, 0, 1088 }, BOOL_OPT(SK_CLOSE_PARAM, &opts.tcp_close), - { "verbosity", optional_argument, 0, 'v' }, - { "ps-socket", required_argument, 0, 1091}, - { "config", required_argument, 0, 1089}, - { "no-default-config", no_argument, 0, 1090}, - { "tls-cacert", required_argument, 0, 1092}, - { "tls-cacrl", required_argument, 0, 1093}, - { "tls-cert", required_argument, 0, 1094}, - { "tls-key", required_argument, 0, 1095}, + { "verbosity", optional_argument, 0, 'v' }, + { "ps-socket", required_argument, 0, 1091 }, + BOOL_OPT("stream", &opts.stream), + { "config", required_argument, 0, 1089 }, + { "no-default-config", no_argument, 0, 1090 }, + { "tls-cacert", required_argument, 0, 1092 }, + { "tls-cacrl", required_argument, 0, 1093 }, + { "tls-cert", required_argument, 0, 1094 }, + { "tls-key", required_argument, 0, 1095 }, BOOL_OPT("tls", &opts.tls), - {"tls-no-cn-verify", no_argument, &opts.tls_no_cn_verify, true}, - { }, + { "tls-no-cn-verify", no_argument, &opts.tls_no_cn_verify, true }, + { "cgroup-yard", required_argument, 0, 1096 }, + { "pre-dump-mode", required_argument, 0, 1097 }, + { "file-validation", required_argument, 0, 1098 }, + 
BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), + { "lsm-mount-context", required_argument, 0, 1099 }, + { "network-lock", required_argument, 0, 1100 }, + BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), + BOOL_OPT("unprivileged", &opts.unprivileged), + BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), + {}, }; #undef BOOL_OPT - ret = pre_parse(argc, argv, usage_error, &no_default_config, - &cfg_file); + if (argv && argv[0]) + SET_CHAR_OPTS(argv_0, argv[0]); + + ret = pre_parse(argc, argv, usage_error, &no_default_config, &cfg_file); if (ret) return 2; @@ -534,7 +725,7 @@ int parse_options(int argc, char **argv, bool *usage_error, /* Do not free any memory if it points to argv */ if (state != PARSING_ARGV + 1) { int i; - for (i=1; i < _argc; i++) { + for (i = 1; i < _argc; i++) { free(_argv[i]); } free(_argv); @@ -580,8 +771,10 @@ int parse_options(int argc, char **argv, bool *usage_error, opts.final_state = TASK_ALIVE; break; case 'x': - if (optarg && unix_sk_ids_parse(optarg) < 0) + if (optarg && unix_sk_ids_parse(optarg) < 0) { + pr_err("Failed to parse unix socket inode from optarg: %s\n", optarg); return 1; + } opts.ext_unix_sk = true; break; case 't': @@ -632,22 +825,24 @@ int parse_options(int argc, char **argv, bool *usage_error, case 1046: SET_CHAR_OPTS(pidfile, optarg); break; - case 1047: - { - char *aux; + case 1047: { + char *aux; - aux = strchr(optarg, '='); - if (aux == NULL) - goto bad_arg; + aux = strchr(optarg, '='); + if (aux == NULL) + goto bad_arg; - *aux = '\0'; - if (veth_pair_add(optarg, aux + 1)) - return 1; - } - break; - case 1049: - if (add_script(optarg)) + *aux = '\0'; + if (veth_pair_add(optarg, aux + 1)) { + pr_err("Failed to add veth pair: %s, %s.\n", optarg, aux + 1); return 1; + } + } break; + case 1049: + if (add_script(optarg)) { + pr_err("Failed to add action-script: %s.\n", optarg); + return 1; + } break; case 1051: SET_CHAR_OPTS(addr, optarg); @@ -678,7 +873,6 
@@ int parse_options(int argc, char **argv, bool *usage_error, return 1; case 'L': SET_CHAR_OPTS(libdir, optarg); - opts.libdir = optarg; break; case 1059: *has_exec_cmd = true; @@ -687,42 +881,44 @@ int parse_options(int argc, char **argv, bool *usage_error, if (parse_manage_cgroups(&opts, optarg)) return 2; break; - case 1061: - { - char *path, *ctl; + case 1061: { + char *path, *ctl; - path = strchr(optarg, ':'); - if (path) { - *path = '\0'; - path++; - ctl = optarg; - } else { - path = optarg; - ctl = NULL; - } - - if (new_cg_root_add(ctl, path)) - return -1; + path = strchr(optarg, ':'); + if (path) { + *path = '\0'; + path++; + ctl = optarg; + } else { + path = optarg; + ctl = NULL; } - break; + + if (new_cg_root_add(ctl, path)) + return -1; + } break; case 1062: if (inherit_fd_parse(optarg) < 0) return 1; break; case 1063: ret = check_add_feature(optarg); - if (ret < 0) /* invalid kernel feature name */ + if (ret < 0) /* invalid kernel feature name */ return 1; - if (ret > 0) /* list kernel features and exit */ + if (ret > 0) /* list kernel features and exit */ return 0; break; case 1064: - if (!add_skip_mount(optarg)) + if (!add_skip_mount(optarg)) { + pr_err("Failed to add skip-mnt: %s\n", optarg); return 1; + } break; case 1065: - if (!add_fsname_auto(optarg)) + if (!add_fsname_auto(optarg)) { + pr_err("Failed while parsing --enable-fs option: %s\n", optarg); return 1; + } break; case 1068: SET_CHAR_OPTS(freeze_cgroup, optarg); @@ -731,8 +927,10 @@ int parse_options(int argc, char **argv, bool *usage_error, opts.ghost_limit = parse_size(optarg); break; case 1070: - if (irmap_scan_path_add(optarg)) + if (irmap_scan_path_add(optarg)) { + pr_err("Failed while parsing --irmap-scan-path option: %s\n", optarg); return -1; + } break; case 1071: SET_CHAR_OPTS(lsm_profile, optarg); @@ -744,34 +942,36 @@ int parse_options(int argc, char **argv, bool *usage_error, case 1076: opts.lazy_pages = true; break; - case 'M': - { - char *aux; + case 'M': { + char *aux; - if 
(strcmp(optarg, "auto") == 0) { - opts.autodetect_ext_mounts = true; - break; - } - - aux = strchr(optarg, ':'); - if (aux == NULL) - goto bad_arg; - - *aux = '\0'; - if (ext_mount_add(optarg, aux + 1)) - return 1; + if (strcmp(optarg, "auto") == 0) { + opts.autodetect_ext_mounts = true; + break; } - break; - case 1073: - if (add_external(optarg)) + + aux = strchr(optarg, ':'); + if (aux == NULL) + goto bad_arg; + + *aux = '\0'; + if (ext_mount_add(optarg, aux + 1)) { + pr_err("Could not add external mount when initializing config: %s, %s\n", optarg, + aux + 1); return 1; + } + } break; + case 1073: + if (add_external(optarg)) { + pr_err("Could not add external resource when initializing config: %s\n", optarg); + return 1; + } break; case 1074: if (!strcmp("net", optarg)) opts.empty_ns |= CLONE_NEWNET; else { - pr_err("Unsupported empty namespace: %s\n", - optarg); + pr_err("Unsupported empty namespace: %s\n", optarg); return 1; } break; @@ -814,6 +1014,37 @@ int parse_options(int argc, char **argv, bool *usage_error, case 1095: SET_CHAR_OPTS(tls_key, optarg); break; + case 1096: + SET_CHAR_OPTS(cgroup_yard, optarg); + break; + case 1097: + if (!strcmp("read", optarg)) { + opts.pre_dump_mode = PRE_DUMP_READ; + } else if (strcmp("splice", optarg)) { + pr_err("Unable to parse value of --pre-dump-mode\n"); + return 1; + } + break; + case 1098: + if (parse_file_validation_method(&opts, optarg)) + return 2; + break; + case 1099: + SET_CHAR_OPTS(lsm_mount_context, optarg); + break; + case 1100: + has_network_lock_opt = true; + if (!strcmp("iptables", optarg)) { + opts.network_lock_method = NETWORK_LOCK_IPTABLES; + } else if (!strcmp("nftables", optarg)) { + opts.network_lock_method = NETWORK_LOCK_NFTABLES; + } else if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { + opts.network_lock_method = NETWORK_LOCK_SKIP; + } else { + pr_err("Invalid value for --network-lock: %s\n", optarg); + return 1; + } + break; case 'V': pr_msg("Version: %s\n", CRIU_VERSION); if 
(strcmp(CRIU_GITID, "0")) @@ -827,19 +1058,22 @@ int parse_options(int argc, char **argv, bool *usage_error, } } + if (has_network_lock_opt && !strcmp(argv[optind], "restore")) { + pr_warn("--network-lock will be ignored in restore command\n"); + pr_info("Network lock method from dump will be used in restore\n"); + } + return 0; bad_arg: if (idx < 0) /* short option */ - pr_msg("Error: invalid argument for -%c: %s\n", - opt, optarg); + pr_err("invalid argument for -%c: %s\n", opt, optarg); else /* long option */ - pr_msg("Error: invalid argument for --%s: %s\n", - long_opts[idx].name, optarg); + pr_err("invalid argument for --%s: %s\n", long_opts[idx].name, optarg); return 1; } -int check_options() +int check_options(void) { if (opts.tcp_established_ok) pr_info("Will dump/restore TCP connections\n"); @@ -850,7 +1084,7 @@ int check_options() if (opts.link_remap_ok) pr_info("Will allow link remaps on FS\n"); if (opts.weak_sysctls) - pr_info("Will skip non-existant sysctls on restore\n"); + pr_info("Will skip non-existent sysctls on restore\n"); if (opts.deprecated_ok) pr_info("Turn deprecated stuff ON\n"); @@ -860,7 +1094,7 @@ int check_options() } if (!opts.restore_detach && opts.restore_sibling) { - pr_err("--restore-sibling only makes sense with --restore-detach\n"); + pr_err("--restore-sibling only makes sense with --restore-detached\n"); return 1; } @@ -870,7 +1104,7 @@ int check_options() "combination with --ps-socket is obsolete\n"); if (opts.ps_socket <= STDERR_FILENO && opts.daemon_mode) { pr_err("Standard file descriptors will be closed" - " in daemon mode\n"); + " in daemon mode\n"); return 1; } } @@ -882,6 +1116,21 @@ int check_options() } #endif + if (opts.mntns_compat_mode && opts.mode != CR_RESTORE) { + pr_err("Option --mntns-compat-mode is only valid on restore\n"); + return 1; + } else if (!opts.mntns_compat_mode && opts.mode == CR_RESTORE) { + if (check_mount_v2()) { + pr_debug("Mount engine fallback to --mntns-compat-mode mode\n"); + 
opts.mntns_compat_mode = true; + } + } + + if (opts.track_mem && !kdat.has_dirty_track) { + pr_err("Tracking memory is not available. Consider omitting --track-mem option.\n"); + return 1; + } + if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/cr-check.c b/criu/cr-check.c index 75a665cfb..7c3dc76dd 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,7 +21,8 @@ #include #include #include -#include +#include +#include #include "../soccr/soccr.h" @@ -30,7 +31,7 @@ #include "sockets.h" #include "crtools.h" #include "log.h" -#include "util-pie.h" +#include "util-caps.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" @@ -45,13 +46,19 @@ #include "tun.h" #include "namespaces.h" #include "pstree.h" +#include "lsm.h" +#include "apparmor.h" #include "cr_options.h" #include "libnetlink.h" #include "net.h" #include "restorer.h" #include "uffd.h" +#include "linux/aio_abi.h" +#include "mount-v2.h" -static char *feature_name(int (*func)()); +#include "images/inventory.pb-c.h" + +static char *feature_name(int (*func)(void)); static int check_tty(void) { @@ -62,10 +69,9 @@ static int check_tty(void) int ret = -1; if (ARRAY_SIZE(t.c_cc) < TERMIOS_NCC) { - pr_msg("struct termios has %d @c_cc while " - "at least %d expected.\n", - (int)ARRAY_SIZE(t.c_cc), - TERMIOS_NCC); + pr_err("struct termios has %d @c_cc while " + "at least %d expected.\n", + (int)ARRAY_SIZE(t.c_cc), TERMIOS_NCC); goto out; } @@ -99,6 +105,14 @@ out: return ret; } +static int check_apparmor_stacking(void) +{ + if (!kdat.apparmor_ns_dumping_enabled) + return -1; + + return 0; +} + static int check_map_files(void) { int ret; @@ -190,7 +204,7 @@ static int check_prctl_cat1(void) ret = prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0); if (ret < 0) { - pr_msg("prctl: PR_GET_TID_ADDRESS is not supported: %m"); + pr_perror("prctl: PR_GET_TID_ADDRESS is not supported"); return -1; } @@ -206,19 +220,19 @@ static int check_prctl_cat1(void) 
if (errno == EPERM) pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n"); else - pr_msg("prctl: PR_SET_MM_BRK is not supported: %m\n"); + pr_perror("prctl: PR_SET_MM_BRK is not supported"); return -1; } ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0); if (ret < 0 && errno != EBADF) { - pr_msg("prctl: PR_SET_MM_EXE_FILE is not supported: %m\n"); + pr_perror("prctl: PR_SET_MM_EXE_FILE is not supported"); return -1; } ret = prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0); if (ret < 0) { - pr_msg("prctl: PR_SET_MM_AUXV is not supported: %m\n"); + pr_perror("prctl: PR_SET_MM_AUXV is not supported"); return -1; } } @@ -293,8 +307,7 @@ static int check_fdinfo_eventfd(void) } if (fe.counter != cnt) { - pr_err("Counter mismatch (or not met) %d want %d\n", - (int)fe.counter, cnt); + pr_err("Counter mismatch (or not met) %d want %d\n", (int)fe.counter, cnt); return -1; } @@ -468,7 +481,7 @@ err: } #ifndef SO_GET_FILTER -#define SO_GET_FILTER SO_ATTACH_FILTER +#define SO_GET_FILTER SO_ATTACH_FILTER #endif static int check_so_gets(void) @@ -505,6 +518,14 @@ static int check_ipc(void) { int ret; + /* + * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however + * for non-root users access() runs with an empty set of caps and will therefore always + * fail. 
+ */ + if (opts.uid) + return 0; + ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; @@ -513,7 +534,7 @@ static int check_ipc(void) return -1; } -static int check_sigqueuinfo() +static int check_sigqueuinfo(void) { siginfo_t info = { .si_code = 1 }; @@ -527,61 +548,6 @@ static int check_sigqueuinfo() return 0; } -static pid_t fork_and_ptrace_attach(int (*child_setup)(void)) -{ - pid_t pid; - int sk_pair[2], sk; - char c = 0; - - if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { - pr_perror("socketpair"); - return -1; - } - - pid = fork(); - if (pid < 0) { - pr_perror("fork"); - return -1; - } else if (pid == 0) { - sk = sk_pair[1]; - close(sk_pair[0]); - - if (child_setup && child_setup() != 0) - exit(1); - - if (write(sk, &c, 1) != 1) { - pr_perror("write"); - exit(1); - } - - while (1) - sleep(1000); - exit(1); - } - - sk = sk_pair[0]; - close(sk_pair[1]); - - if (read(sk, &c, 1) != 1) { - close(sk); - kill(pid, SIGKILL); - pr_perror("read"); - return -1; - } - - close(sk); - - if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { - pr_perror("Unable to ptrace the child"); - kill(pid, SIGKILL); - return -1; - } - - waitpid(pid, NULL, 0); - - return pid; -} - static int check_ptrace_peeksiginfo(void) { struct ptrace_peeksiginfo_args arg; @@ -608,13 +574,14 @@ static int check_ptrace_peeksiginfo(void) } kill(pid, SIGKILL); + waitpid(pid, NULL, 0); return ret; } struct special_mapping { - const char *name; - void *addr; - size_t size; + const char *name; + void *addr; + size_t size; }; static int parse_special_maps(struct special_mapping *vmas, size_t nr) @@ -632,8 +599,7 @@ static int parse_special_maps(struct special_mapping *vmas, size_t nr) int r, tail; size_t i; - r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", - &start, &end, &tail); + r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", &start, &end, &tail); if (r != 2) { fclose(maps); pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail); @@ -674,8 +640,7 @@ static void 
dummy_sighandler(int sig) * And we definitely mremap() support by the fact that those special_mappings * are subjects for ASLR. (See #288 as a reference) */ -static void check_special_mapping_mremap_child(struct special_mapping *vmas, - size_t nr) +static void check_special_mapping_mremap_child(struct special_mapping *vmas, size_t nr) { size_t i, parking_size = 0; void *parking_lot; @@ -691,8 +656,7 @@ static void check_special_mapping_mremap_child(struct special_mapping *vmas, exit(1); } - parking_lot = mmap(NULL, parking_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + parking_lot = mmap(NULL, parking_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (parking_lot == MAP_FAILED) { pr_perror("mmap(%zu) failed", parking_size); exit(1); @@ -704,10 +668,8 @@ static void check_special_mapping_mremap_child(struct special_mapping *vmas, if (vmas[i].addr == MAP_FAILED) continue; - ret = syscall(__NR_mremap, (unsigned long)vmas[i].addr, - vmas[i].size, vmas[i].size, - MREMAP_FIXED | MREMAP_MAYMOVE, - (unsigned long)parking_lot); + ret = syscall(__NR_mremap, (unsigned long)vmas[i].addr, vmas[i].size, vmas[i].size, + MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)parking_lot); if (ret != (unsigned long)parking_lot) syscall(__NR_exit, 1); parking_lot += vmas[i].size; @@ -763,6 +725,7 @@ static int check_special_mapping_mremap(void) /* Probably, we're interrupted with a signal - cleanup */ pr_err("Failed to wait for a child %d\n", errno); kill(child, SIGKILL); + waitpid(child, NULL, 0); return -1; } @@ -801,25 +764,26 @@ static int check_ptrace_suspend_seccomp(void) } kill(pid, SIGKILL); + waitpid(pid, NULL, 0); return ret; } static int setup_seccomp_filter(void) { struct sock_filter filter[] = { - BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, nr)), /* Allow all syscalls except ptrace */ - BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_ptrace, 0, 1), - BPF_STMT(BPF_RET+BPF_K, 
SECCOMP_RET_KILL), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_ptrace, 0, 1), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog bpf_prog = { - .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])), .filter = filter, }; - if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long) &bpf_prog, 0, 0) < 0) + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long)&bpf_prog, 0, 0) < 0) return -1; return 0; @@ -841,9 +805,19 @@ static int check_ptrace_dump_seccomp_filters(void) } kill(pid, SIGKILL); + waitpid(pid, NULL, 0); return ret; } +static int check_ptrace_get_rseq_conf(void) +{ + if (!kdat.has_ptrace_get_rseq_conf) { + pr_warn("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported. C/R of processes which are using rseq() won't work.\n"); + return -1; + } + return 0; +} + static int check_mem_dirty_track(void) { if (!kdat.has_dirty_track) { @@ -908,11 +882,11 @@ static int check_aio_remap(void) int r; if (syscall(SYS_io_setup, 16, &ctx) < 0) { - pr_err("No AIO syscall: %m\n"); + pr_perror("No AIO syscall"); return -1; } - len = get_ring_len((unsigned long) ctx); + len = get_ring_len((unsigned long)ctx); if (!len) return -1; @@ -930,7 +904,7 @@ static int check_aio_remap(void) ctx = (aio_context_t)naddr; r = syscall(SYS_io_getevents, ctx, 0, 1, NULL, NULL); if (r < 0) { - pr_err("AIO remap doesn't work properly: %m\n"); + pr_perror("AIO remap doesn't work properly"); return -1; } @@ -956,11 +930,12 @@ struct clone_arg { char stack_ptr[0]; }; -static int clone_cb(void *_arg) { +static int clone_cb(void *_arg) +{ exit(0); } -static int check_clone_parent_vs_pid() +static int check_clone_parent_vs_pid(void) { struct clone_arg ca; pid_t pid; @@ -1016,8 +991,7 @@ static int check_autofs(void) ret = -1; - options = xsprintf("fd=%d,pgrp=%d,minproto=5,maxproto=5,direct", - pfd[1], getpgrp()); + options = 
xsprintf("fd=%d,pgrp=%d,minproto=5,maxproto=5,direct", pfd[1], getpgrp()); if (!options) { pr_err("failed to allocate autofs options\n"); goto close_pipe; @@ -1076,10 +1050,14 @@ static int check_tcp(void) } val = 1; - ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); - if (ret < 0) { - pr_perror("Can't turn TCP repair mode ON"); - goto out; + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; + } + } else { + pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); } optlen = sizeof(val); @@ -1110,6 +1088,8 @@ static int kerndat_tcp_repair_window(void) int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0 && errno == EAFNOSUPPORT) + sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; @@ -1129,7 +1109,7 @@ static int kerndat_tcp_repair_window(void) pr_perror("Unable to set TCP_REPAIR_WINDOW"); goto err; } -now: + now: val = 0; } else val = 1; @@ -1201,6 +1181,44 @@ static int check_compat_cr(void) return -1; } +static int check_nftables_cr(void) +{ +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + return 0; +#else + pr_warn("CRIU was built without nftables support - nftables rules will " + "not be preserved during C/R\n"); + return -1; +#endif +} + +static int check_ipt_legacy(void) +{ + char *ipt_legacy_bin; + char *ip6t_legacy_bin; + + ipt_legacy_bin = get_legacy_iptables_bin(false, false); + if (!ipt_legacy_bin) { + pr_warn("Couldn't find iptables version which is using iptables legacy API\n"); + return -1; + } + + pr_info("iptables cmd: %s\n", ipt_legacy_bin); + + if (!kdat.ipv6) + return 0; + + ip6t_legacy_bin = get_legacy_iptables_bin(true, false); + if (!ip6t_legacy_bin) { + pr_warn("Couldn't find ip6tables version which is using iptables legacy API\n"); + return -1; + } + + 
pr_info("ip6tables cmd: %s\n", ip6t_legacy_bin); + + return 0; +} + static int check_uffd(void) { if (!kdat.has_uffd) { @@ -1224,6 +1242,16 @@ static int check_uffd_noncoop(void) return 0; } +static int check_clone3_set_tid(void) +{ + if (!kdat.has_clone3_set_tid) { + pr_warn("clone3() with set_tid not supported\n"); + return -1; + } + + return 0; +} + static int check_can_map_vdso(void) { if (kdat_can_map_vdso() == 1) @@ -1256,11 +1284,326 @@ static int check_kcmp_epoll(void) return 0; } +static int check_time_namespace(void) +{ + if (!kdat.has_timens) { + pr_err("Time namespaces are not supported\n"); + return -1; + } + + return 0; +} + +static int check_newifindex(void) +{ + if (!kdat.has_newifindex) { + pr_err("IFLA_NEW_IFINDEX isn't supported\n"); + return -1; + } + + return 0; +} + static int check_net_diag_raw(void) { check_sock_diag(); - return (socket_test_collect_bit(AF_INET, IPPROTO_RAW) && - socket_test_collect_bit(AF_INET6, IPPROTO_RAW)) ? 0 : -1; + return (socket_test_collect_bit(AF_INET, IPPROTO_RAW) && socket_test_collect_bit(AF_INET6, IPPROTO_RAW)) ? 
0 : + -1; +} + +static int check_pidfd_store(void) +{ + if (!kdat.has_pidfd_open) { + pr_warn("Pidfd store requires pidfd_open syscall which is not supported\n"); + return -1; + } + + if (!kdat.has_pidfd_getfd) { + pr_warn("Pidfd store requires pidfd_getfd syscall which is not supported\n"); + return -1; + } + + return 0; +} + +static int check_ns_pid(void) +{ + if (!kdat.has_nspid) + return -1; + + return 0; +} + +static int check_memfd_hugetlb(void) +{ + if (!kdat.has_memfd_hugetlb) + return -1; + + return 0; +} + +static int check_network_lock_nftables(void) +{ + if (!kdat.has_nftables_concat) { + pr_warn("Nftables based locking requires libnftables and set concatenations support\n"); + return -1; + } + + return 0; +} + +static int check_sockopt_buf_lock(void) +{ + if (!kdat.has_sockopt_buf_lock) + return -1; + + return 0; +} + +static int check_move_mount_set_group(void) +{ + if (!kdat.has_move_mount_set_group) + return -1; + + return 0; +} + +static int check_openat2(void) +{ + if (!kdat.has_openat2) + return -1; + + return 0; +} + +static int check_ipv6_freebind(void) +{ + if (!kdat.has_ipv6_freebind) + return -1; + + return 0; +} + +static int check_pagemap_scan(void) +{ + if (!kdat.has_pagemap_scan) + return -1; + + return 0; +} + +static int check_timer_cr_ids(void) +{ + if (!kdat.has_timer_cr_ids) + return -1; + + return 0; +} + +/* musl doesn't have a statx wrapper... 
*/ +struct staty { + __u32 stx_dev_major; + __u32 stx_dev_minor; + __u64 stx_ino; +}; + +static long get_file_dev_and_inode(void *addr, struct staty *stx) +{ + char buf[4096]; + FILE *mapf; + + mapf = fopen("/proc/self/maps", "r"); + if (mapf == NULL) { + pr_perror("fopen(/proc/self/maps)"); + return -1; + } + + while (fgets(buf, sizeof(buf), mapf)) { + unsigned long start, end; + uint32_t maj, min; + __u64 ino; + + if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", + &start, &end, &maj, &min, &ino) != 5) { + pr_perror("Unable to parse: %s", buf); + return -1; + } + if (start == (unsigned long)addr) { + stx->stx_dev_major = maj; + stx->stx_dev_minor = min; + stx->stx_ino = ino; + return 0; + } + } + + pr_err("Unable to find the mapping\n"); + return -1; +} + +static int ovl_mount(void) +{ + int tmpfs, fsfd, ovl; + + fsfd = cr_fsopen("tmpfs", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen tmpfs"); + return -1; + } + + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create tmpfs mount"); + return -1; + } + + tmpfs = cr_fsmount(fsfd, 0, 0); + if (tmpfs == -1) { + pr_perror("Unable to mount tmpfs"); + return -1; + } + + close(fsfd); + + /* overlayfs can't be constructed on top of a detached mount. 
*/ + if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Unable to attach tmpfs mount"); + return -1; + } + close(tmpfs); + + if (chdir("/tmp")) { + pr_perror("Unable to change working directory"); + return -1; + } + + if (mkdir("/tmp/w", 0755) == -1 || + mkdir("/tmp/u", 0755) == -1 || + mkdir("/tmp/l", 0755) == -1) { + pr_perror("mkdir"); + return -1; + } + + fsfd = cr_fsopen("overlay", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen overlayfs"); + return -1; + } + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { + pr_perror("Unable to configure overlayfs"); + return -1; + } + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create overlayfs"); + return -1; + } + ovl = cr_fsmount(fsfd, 0, 0); + if (ovl == -1) { + pr_perror("Unable to mount overlayfs"); + return -1; + } + + return ovl; +} + +/* + * Check that the file device and inode shown in /proc/pid/maps match values + * returned by stat(2). + */ +static int do_check_overlayfs_maps(void) +{ + struct staty stx, mstx; + struct stat st; + int ovl, fd; + void *addr; + + /* Create a new mount namespace to not care about cleaning test mounts. 
*/ + if (unshare(CLONE_NEWNS) == -1) { + pr_warn("Unable to create a new mount namespace\n"); + return 0; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + pr_perror("Unable to remount / with MS_SLAVE"); + return -1; + } + + ovl = ovl_mount(); + if (ovl == -1) + return -1; + + fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); + if (fd == -1) { + pr_perror("Unable to open a test file"); + return -1; + } + + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Unable to map the test file"); + return -1; + } + + if (get_file_dev_and_inode(addr, &mstx)) + return -1; + if (fstat(fd, &st)) { + pr_perror("stat"); + return -1; + } + stx.stx_dev_major = major(st.st_dev); + stx.stx_dev_minor = minor(st.st_dev); + stx.stx_ino = st.st_ino; + + if (stx.stx_dev_major != mstx.stx_dev_major || + stx.stx_dev_minor != mstx.stx_dev_minor || + stx.stx_ino != mstx.stx_ino) { + pr_err("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", + mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, + stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); + return -1; + } + + return 0; +} + +static int check_overlayfs_maps(void) +{ + pid_t pid; + int status; + + pid = fork(); + if (pid == -1) { + pr_perror("Unable to fork a child"); + return -1; + } + if (pid == 0) { + if (do_check_overlayfs_maps()) + exit(1); + exit(0); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid"); + return -1; + } + return status == 0 ? 0 : -1; +} + +static int check_breakpoints(void) +{ + if (!kdat.has_breakpoints) { + pr_warn("Hardware breakpoints don't seem to work\n"); + return -1; + } + + return 0; +} + +static int check_pagemap_scan_guard_pages(void) +{ + kerndat_warn_about_madv_guards(); + + return kdat.has_pagemap_scan_guard_pages ? 
0 : -1; } static int (*chk_feature)(void); @@ -1277,25 +1620,25 @@ static int (*chk_feature)(void); * We fail if any feature in category 1 is missing but tolerate failures * in the other categories. Currently, there is nothing in category 3. */ -#define CHECK_GOOD "Looks good." -#define CHECK_BAD "Does not look good." -#define CHECK_MAYBE "Looks good but some kernel features are missing\n" \ - "which, depending on your process tree, may cause\n" \ - "dump or restore failure." -#define CHECK_CAT1(fn) do { \ - if ((ret = fn) != 0) { \ - print_on_level(DEFAULT_LOGLEVEL, "%s\n", CHECK_BAD); \ - return ret; \ - } \ - } while (0) +#define CHECK_GOOD "Looks good." +#define CHECK_BAD "Does not look good." +#define CHECK_MAYBE \ + "Looks good but some kernel features are missing\n" \ + "which, depending on your process tree, may cause\n" \ + "dump or restore failure." +#define CHECK_CAT1(fn) \ + do { \ + if ((ret = fn) != 0) { \ + pr_warn("%s\n", CHECK_BAD); \ + return ret; \ + } \ + } while (0) + int cr_check(void) { struct ns_id *ns; int ret = 0; - if (!is_root_user()) - return -1; - root_item = alloc_pstree_item(); if (root_item == NULL) return -1; @@ -1316,8 +1659,7 @@ int cr_check(void) if (chk_feature) { if (chk_feature()) return -1; - print_on_level(DEFAULT_LOGLEVEL, "%s is supported\n", - feature_name(chk_feature)); + pr_msg("%s is supported\n", feature_name(chk_feature)); return 0; } @@ -1373,6 +1715,25 @@ int cr_check(void) ret |= check_sk_netns(); ret |= check_kcmp_epoll(); ret |= check_net_diag_raw(); + ret |= check_clone3_set_tid(); + ret |= check_time_namespace(); + ret |= check_newifindex(); + ret |= check_pidfd_store(); + ret |= check_ns_pid(); + ret |= check_network_lock_nftables(); + ret |= check_sockopt_buf_lock(); + ret |= check_memfd_hugetlb(); + ret |= check_move_mount_set_group(); + ret |= check_openat2(); + ret |= check_ptrace_get_rseq_conf(); + ret |= check_ipv6_freebind(); + ret |= check_pagemap_scan(); + ret |= check_overlayfs_maps(); + ret |= 
check_timer_cr_ids(); + ret |= check_pagemap_scan_guard_pages(); + + if (kdat.lsm == LSMTYPE__APPARMOR) + ret |= check_apparmor_stacking(); } /* @@ -1382,8 +1743,12 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } + /* + * Category 4 - optional. + */ + check_breakpoints(); - print_on_level(DEFAULT_LOGLEVEL, "%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); + pr_msg("%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); return ret; } #undef CHECK_GOOD @@ -1447,7 +1812,7 @@ static int check_external_net_ns(void) struct feature_list { char *name; - int (*func)(); + int (*func)(void); }; static struct feature_list feature_list[] = { @@ -1468,14 +1833,34 @@ static struct feature_list feature_list[] = { { "compat_cr", check_compat_cr }, { "uffd", check_uffd }, { "uffd-noncoop", check_uffd_noncoop }, - { "can_map_vdso", check_can_map_vdso}, + { "can_map_vdso", check_can_map_vdso }, { "sk_ns", check_sk_netns }, { "sk_unix_file", check_sk_unix_file }, { "net_diag_raw", check_net_diag_raw }, { "nsid", check_nsid }, - { "link_nsid", check_link_nsid}, - { "kcmp_epoll", check_kcmp_epoll}, - { "external_net_ns", check_external_net_ns}, + { "link_nsid", check_link_nsid }, + { "kcmp_epoll", check_kcmp_epoll }, + { "timens", check_time_namespace }, + { "external_net_ns", check_external_net_ns }, + { "clone3_set_tid", check_clone3_set_tid }, + { "newifindex", check_newifindex }, + { "nftables", check_nftables_cr }, + { "has_ipt_legacy", check_ipt_legacy }, + { "pidfd_store", check_pidfd_store }, + { "ns_pid", check_ns_pid }, + { "apparmor_stacking", check_apparmor_stacking }, + { "network_lock_nftables", check_network_lock_nftables }, + { "sockopt_buf_lock", check_sockopt_buf_lock }, + { "memfd_hugetlb", check_memfd_hugetlb }, + { "move_mount_set_group", check_move_mount_set_group }, + { "openat2", check_openat2 }, + { "get_rseq_conf", check_ptrace_get_rseq_conf }, + { "ipv6_freebind", check_ipv6_freebind }, + { "pagemap_scan", check_pagemap_scan }, + { "timer_cr_ids", 
check_timer_cr_ids }, + { "overlayfs_maps", check_overlayfs_maps }, + { "breakpoints", check_breakpoints }, + { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; @@ -1493,10 +1878,10 @@ void pr_check_features(const char *offset, const char *sep, int width) pr_msg("\n%s", offset); pos = offset_len; } - pr_msg("%s", fl->name); + pr_msg("%s", fl->name); // no \n pos += len; - if ((fl + 1)->name) { // not the last item - pr_msg("%s", sep); + if ((fl + 1)->name) { // not the last item + pr_msg("%s", sep); // no \n pos += sep_len; } } @@ -1517,7 +1902,7 @@ int check_add_feature(char *feat) return -1; } -static char *feature_name(int (*func)()) +static char *feature_name(int (*func)(void)) { struct feature_list *fl; @@ -1527,3 +1912,54 @@ static char *feature_name(int (*func)()) } return NULL; } + +static int pr_set_dumpable(int value) +{ + int ret = prctl(PR_SET_DUMPABLE, value, 0, 0, 0); + if (ret < 0) + pr_perror("Unable to set PR_SET_DUMPABLE"); + return ret; +} + +int check_caps(void) +{ + /* Read out effective capabilities and store in opts.cap_eff. */ + if (set_opts_cap_eff()) + goto out; + + /* + * No matter if running as root or not. CRIU always needs + * at least these capabilities. + */ + if (!has_cap_checkpoint_restore(opts.cap_eff)) + goto out; + + /* For some things we need to know if we are running as root. */ + opts.uid = geteuid(); + + if (!opts.uid) { + /* CRIU is running as root. No further checks are necessary. */ + return 0; + } + + if (!opts.unprivileged) { + pr_msg("Running as non-root requires '--unprivileged'\n"); + pr_msg("Please consult the documentation for limitations when running as non-root\n"); + return -1; + } + + /* + * At his point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. 
+ */ + if (pr_set_dumpable(1)) + return -1; + + return 0; +out: + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + + return -1; +} diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index 71b7a9cc3..feeb9ebb0 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -14,12 +14,12 @@ int cr_dedup(void) { int close_ret, ret = 0; unsigned long img_id; - DIR * dirp; + DIR *dirp; struct dirent *ent; dirp = opendir(CR_PARENT_LINK); if (dirp == NULL) { - pr_perror("Can't enter previous snapshot folder, error=%d", errno); + pr_perror("Can't enter previous snapshot folder"); ret = -1; goto err; } @@ -29,7 +29,7 @@ int cr_dedup(void) ent = readdir(dirp); if (ent == NULL) { if (errno) { - pr_perror("Failed readdir, error=%d", errno); + pr_perror("Failed readdir"); ret = -1; goto err; } @@ -71,7 +71,7 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) { int ret; struct page_read pr; - struct page_read * prp; + struct page_read *prp; flags |= PR_MOD; ret = open_page_read(img_id, &pr, flags); @@ -87,11 +87,10 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec base=%"PRIx64", len=%lu\n", - pr.pe->vaddr, pagemap_len(pr.pe)); + pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", + pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { - ret = dedup_one_iovec(prp, pr.pe->vaddr, - pagemap_len(pr.pe)); + ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) goto exit; } diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 9273fc0a5..a58aaf34a 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -15,7 +15,6 @@ #include #include - #include #include @@ -46,6 +45,7 @@ #include "proc_parse.h" #include "parasite.h" #include "parasite-syscall.h" +#include "compel/ptrace.h" #include "files.h" #include "files-reg.h" #include "shmem.h" @@ -80,6 +80,14 @@ #include 
"fault-injection.h" #include "dump.h" #include "eventpoll.h" +#include "memfd.h" +#include "timens.h" +#include "img-streamer.h" +#include "pidfd-store.h" +#include "apparmor.h" +#include "asm/dump.h" +#include "timer.h" +#include "sigact.h" /* * Architectures can overwrite this function to restore register sets that @@ -89,13 +97,12 @@ * with_threads = true : The register sets of the tasks with all their threads * are restored */ -int __attribute__((weak)) arch_set_thread_regs(struct pstree_item *item, - bool with_threads) +int __attribute__((weak)) arch_set_thread_regs(struct pstree_item *item, bool with_threads) { return 0; } -#define PERSONALITY_LENGTH 9 +#define PERSONALITY_LENGTH 9 static char loc_buf[PERSONALITY_LENGTH]; void free_mappings(struct vm_area_list *vma_area_list) @@ -111,8 +118,7 @@ void free_mappings(struct vm_area_list *vma_area_list) vm_area_list_init(vma_area_list); } -int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, - dump_filemap_t dump_file) +int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_file) { int ret = -1; @@ -124,8 +130,24 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, if (ret < 0) goto err; - pr_info("Collected, longest area occupies %lu pages\n", - vma_area_list->nr_priv_pages_longest); + /* + * In addition to real process VMAs we should keep an info about + * madvise(MADV_GUARD_INSTALL) pages. While these are not represented + * as a struct vm_area_struct in the kernel, it is convenient to treat + * them as mappings in CRIU and reuse the same VMA images but with only + * VMA_AREA_GUARD flag set. + * + * Also, we don't need to dump them during pre-dump. 
+ */ + if (dump_file) { + ret = collect_madv_guards(pid, vma_area_list); + if (ret < 0) { + pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); + goto err; + } + } + + pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); pr_info("----------------------------------------\n"); @@ -154,6 +176,11 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_policy = true; tc->sched_policy = ret; + /* The reset-on-fork flag might be used in combination + * with SCHED_FIFO or SCHED_RR to reset the scheduling + * policy/priority in child processes. + */ + ret &= ~SCHED_RESET_ON_FORK; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { @@ -185,6 +212,25 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) return 0; } +static int check_thread_rseq(pid_t tid, const struct parasite_check_rseq *ti_rseq) +{ + if (!kdat.has_rseq || kdat.has_ptrace_get_rseq_conf) + return 0; + + pr_debug("%d has rseq_inited = %d\n", tid, ti_rseq->rseq_inited); + + /* + * We have no kdat.has_ptrace_get_rseq_conf and user + * process has rseq() used, let's fail dump. 
+ */ + if (ti_rseq->rseq_inited) { + pr_err("%d has rseq but kernel lacks get_rseq_conf feature\n", tid); + return -1; + } + + return 0; +} + struct cr_imgset *glob_imgset; static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) @@ -212,8 +258,10 @@ static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) size += PAGE_SIZE; t = xrealloc(*dfds, size); - if (!t) + if (!t) { + closedir(fd_dir); return -1; + } *dfds = t; } @@ -319,8 +367,7 @@ static int dump_task_fs(pid_t pid, struct parasite_dump_misc *misc, struct cr_im close(fd); - pr_info("Dumping task cwd id %#x root id %#x\n", - fe.cwd_id, fe.root_id); + pr_info("Dumping task cwd id %#x root id %#x\n", fe.cwd_id, fe.root_id); return pb_write_one(img_from_set(imgset, CR_FD_FS), &fe, PB_FS); } @@ -334,7 +381,7 @@ static int dump_task_rlimits(int pid, TaskRlimitsEntry *rls) { int res; - for (res = 0; res n_rlimits ; res++) { + for (res = 0; res < rls->n_rlimits; res++) { struct rlimit64 lim; if (syscall(__NR_prlimit64, pid, res, NULL, &lim)) { @@ -406,15 +453,17 @@ static int dump_filemap(struct vma_area *vma_area, int fd) if (vma_area->aufs_rpath) { struct fd_link aufs_link; - strlcpy(aufs_link.name, vma_area->aufs_rpath, - sizeof(aufs_link.name)); + __strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); aufs_link.len = strlen(aufs_link.name); p.link = &aufs_link; } /* Flags will be set during restore in open_filmap() */ - ret = dump_one_reg_file_cond(fd, &id, &p); + if (vma->status & VMA_AREA_MEMFD) + ret = dump_one_memfd_cond(fd, &id, &p); + else + ret = dump_one_reg_file_cond(fd, &id, &p); vma->shmid = id; return ret; @@ -425,8 +474,7 @@ static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma) if (root_ns_mask & CLONE_NEWIPC) return 0; - pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n", - pid, vma->start); + pr_err("Task %d with SysVIPC shmem map @%" PRIx64 " doesn't live in IPC ns\n", pid, vma->start); return -1; } @@ -458,10 +506,8 @@ err: 
return ret; } -static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, - const struct parasite_dump_misc *misc, - const struct vm_area_list *vma_area_list, - const struct cr_imgset *imgset) +static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, const struct parasite_dump_misc *misc, + const struct vm_area_list *vma_area_list, const struct cr_imgset *imgset) { MmEntry mme = MM_ENTRY__INIT; struct vma_area *vma_area; @@ -569,8 +615,8 @@ static int get_task_futex_robust_list(pid_t pid, ThreadCoreEntry *info) goto err; } - info->futex_rla = encode_pointer(head); - info->futex_rla_len = (u32)len; + info->futex_rla = encode_pointer(head); + info->futex_rla_len = (u32)len; return 0; @@ -613,7 +659,7 @@ static int dump_task_kobj_ids(struct pstree_item *item) TaskKobjIdsEntry *ids = item->ids; elem.pid = pid; - elem.idx = 0; /* really 0 for all */ + elem.idx = 0; /* really 0 for all */ elem.genid = 0; /* FIXME optimize */ new = 0; @@ -687,25 +733,21 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread int ret; ThreadCoreEntry *tc = core->thread_core; - ret = collect_lsm_profile(pid, tc->creds); - if (!ret) { - /* - * XXX: It's possible to set two: 32-bit and 64-bit - * futex list's heads. That makes about no sense, but - * it's possible. Until we meet such application, dump - * only one: native or compat futex's list pointer. - */ - if (!core_is_compat(core)) - ret = get_task_futex_robust_list(pid, tc); - else - ret = get_task_futex_robust_list_compat(pid, tc); - } + /* + * XXX: It's possible to set two: 32-bit and 64-bit + * futex list's heads. That makes about no sense, but + * it's possible. Until we meet such application, dump + * only one: native or compat futex's list pointer. 
+ */ + if (!core_is_compat(core)) + ret = get_task_futex_robust_list(pid, tc); + else + ret = get_task_futex_robust_list_compat(pid, tc); if (!ret) ret = dump_sched_info(pid, tc); if (!ret) { core_put_tls(core, ti->tls); - CORE_THREAD_ARCH_INFO(core)->clear_tid_addr = - encode_pointer(ti->tid_addr); + CORE_THREAD_ARCH_INFO(core)->clear_tid_addr = encode_pointer(ti->tid_addr); BUG_ON(!tc->sas); copy_sas(tc->sas, &ti->sas); if (ti->pdeath_sig) { @@ -719,20 +761,29 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread if (!ret) ret = seccomp_dump_thread(pid, tc); + /* + * We are dumping rseq() in the dump_thread_rseq() function, + * *before* processes gets infected (because of ptrace requests + * API restriction). At this point, if the kernel lacks + * kdat.has_ptrace_get_rseq_conf support we have to ensure + * that dumpable processes haven't initialized rseq() or + * fail dump if rseq() was used. + */ + if (!ret) + ret = check_thread_rseq(pid, &ti->rseq); + return ret; } -static int dump_task_core_all(struct parasite_ctl *ctl, - struct pstree_item *item, - const struct proc_pid_stat *stat, - const struct cr_imgset *cr_imgset, - const struct parasite_dump_misc *misc) +static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item, const struct proc_pid_stat *stat, + const struct cr_imgset *cr_imgset, const struct parasite_dump_misc *misc) { struct cr_img *img; CoreEntry *core = item->core[0]; pid_t pid = item->pid->real; int ret = -1; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; + u32 *cg_set; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); @@ -743,15 +794,28 @@ static int dump_task_core_all(struct parasite_ctl *ctl, core->tc->child_subreaper = misc->child_subreaper; core->tc->has_child_subreaper = true; + if (misc->membarrier_registration_mask) { + core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; + core->tc->has_membarrier_registration_mask = true; + } + ret = 
get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; - strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); + __strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); core->tc->flags = stat->flags; core->tc->task_state = item->pid->state; core->tc->exit_code = 0; + core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[0]->profile; + core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + + if (core->tc->task_state == TASK_STOPPED) { + core->tc->has_stop_signo = true; + core->tc->stop_signo = item->pid->stop_signo; + } + ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; @@ -770,20 +834,20 @@ static int dump_task_core_all(struct parasite_ctl *ctl, */ if (item->ids->has_cgroup_ns_id && !item->parent) { info = &cgroup_args; + strcpy(cgroup_args.thread_cgrp, "self/cgroup"); ret = parasite_dump_cgroup(ctl, &cgroup_args); if (ret) goto err; } - core->tc->has_cg_set = true; - ret = dump_task_cgroup(item, &core->tc->cg_set, info); + core->thread_core->has_cg_set = true; + cg_set = &core->thread_core->cg_set; + ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) goto err; img = img_from_set(cr_imgset, CR_FD_CORE); ret = pb_write_one(img, core, PB_CORE); - if (ret < 0) - goto err; err: pr_info("----------------------------------------\n"); @@ -798,7 +862,9 @@ static int collect_pstree_ids_predump(void) struct { struct pstree_item i; struct dmp_info d; - } crt = { .i.pid = &pid, }; + } crt = { + .i.pid = &pid, + }; /* * This thing is normally done inside @@ -838,8 +904,73 @@ static int collect_file_locks(void) return parse_file_locks(); } -static int dump_task_thread(struct parasite_ctl *parasite_ctl, - const struct pstree_item *item, int id) +static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) +{ + return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; +} + +static int fixup_thread_rseq(const struct pstree_item *item, 
int i) +{ + CoreEntry *core = item->core[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + + if (!kdat.has_ptrace_get_rseq_conf) + return 0; + + /* equivalent to (struct rseq)->rseq_cs is NULL */ + if (!rseq_cs->start_ip) + return 0; + + pr_debug( + "fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", + tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, + rseq_cs->version, (unsigned long)TI_IP(core)); + + if (rseq_cs->version != 0) { + pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); + return -1; + } + + if (task_in_rseq(rseq_cs, TI_IP(core))) { + struct pid *tid = &item->threads[i]; + + /* + * We need to fixup task instruction pointer from + * the original one (which lays inside rseq critical section) + * to rseq abort handler address. But we need to look on rseq_cs->flags + * (please refer to struct rseq -> flags field description). + * Naive idea of flags support may be like... let's change instruction pointer (IP) + * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). + * But unfortunately, it doesn't work properly, because the kernel does + * clean up of rseq_cs field in the struct rseq (modifies userspace memory). + * So, we need to preserve original value of (struct rseq)->rseq_cs field in the + * image and restore it's value before releasing threads (see restore_rseq_cs()). + * + * It's worth to mention that we need to fixup IP in CoreEntry + * (used when full dump/restore is performed) and also in + * the parasite regs storage (used if --leave-running option is used, + * or if dump error occurred and process execution is resumed). + */ + + if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) { + pr_warn("The %d task is in rseq critical section. 
IP will be set to rseq abort handler addr\n", + tid->real); + + TI_IP(core) = rseq_cs->abort_ip; + + if (item->pid->real == tid->real) { + compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); + } else { + compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); + } + } + } + + return 0; +} + +static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstree_item *item, int id) { struct parasite_thread_ctl *tctl = dmpi(item)->thread_ctls[id]; struct pid *tid = &item->threads[id]; @@ -859,6 +990,15 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, } pstree_insert_pid(tid); + core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[id]->profile; + core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + + ret = fixup_thread_rseq(item, id); + if (ret) { + pr_err("Can't fixup rseq for pid %d\n", pid); + goto err; + } + img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt); if (!img) goto err; @@ -867,12 +1007,12 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, close_image(img); err: + compel_release_thread(tctl); pr_info("----------------------------------------\n"); return ret; } -static int dump_one_zombie(const struct pstree_item *item, - const struct proc_pid_stat *pps) +static int dump_one_zombie(const struct pstree_item *item, const struct proc_pid_stat *pps) { CoreEntry *core; int ret = -1; @@ -882,7 +1022,7 @@ static int dump_one_zombie(const struct pstree_item *item, if (!core) return -1; - strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); + __strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); core->tc->task_state = TASK_DEAD; core->tc->exit_code = pps->exit_code; @@ -897,7 +1037,7 @@ err: return ret; } -#define SI_BATCH 32 +#define SI_BATCH 32 static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group) { @@ -930,8 +1070,10 @@ static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group) } nr = ret = 
ptrace(PTRACE_PEEKSIGINFO, tid, &arg, si); - if (ret == 0) + if (ret == 0) { + xfree(si); break; /* Finished */ + } if (ret < 0) { if (errno == EIO) { @@ -940,6 +1082,7 @@ static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group) } else pr_perror("ptrace"); + xfree(si); break; } @@ -947,11 +1090,11 @@ static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group) queue->signals = xrealloc(queue->signals, sizeof(*queue->signals) * queue->n_signals); if (!queue->signals) { ret = -1; + xfree(si); break; } - for (si_pos = queue->n_signals - nr; - si_pos < queue->n_signals; si_pos++) { + for (si_pos = queue->n_signals - nr; si_pos < queue->n_signals; si_pos++) { SiginfoEntry *se; se = xmalloc(sizeof(*se)); @@ -1001,12 +1144,152 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } -static struct proc_pid_stat pps_buf; +static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct criu_rseq_cs *rseq_cs, + struct criu_rseq *rseq) +{ + int ret; -static int dump_task_threads(struct parasite_ctl *parasite_ctl, - const struct pstree_item *item) + /* rseq is not registered */ + if (!rseqc->rseq_abi_pointer) + return 0; + + /* + * We need to cover the case when victim process was inside rseq critical section + * at the moment when CRIU came and seized it. We need to determine the borders + * of rseq critical section at first. To achieve that we need to access thread + * memory and read pointer to struct rseq_cs. + * + * We have two ways to access thread memory: from the parasite and using ptrace(). + * But in this case we can't use parasite, because if victim process returns to the + * execution, on the kernel side __rseq_handle_notify_resume hook will be called, + * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq + * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA). 
+ */ + ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer), sizeof(struct criu_rseq)); + if (ret) { + pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq, + (unsigned long)(rseqc->rseq_abi_pointer), (unsigned long)sizeof(struct criu_rseq)); + return -1; + } + + if (!rseq->rseq_cs) + return 0; + + ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct criu_rseq_cs)); + if (ret) { + pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, + (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, + (unsigned long)sizeof(struct criu_rseq_cs)); + return -1; + } + + return 0; +} + +static int dump_thread_rseq(struct pstree_item *item, int i) +{ + struct __ptrace_rseq_configuration rseqc; + RseqEntry *rseqe = NULL; + int ret; + CoreEntry *core = item->core[i]; + RseqEntry **rseqep = &core->thread_core->rseq_entry; + struct criu_rseq rseq = {}; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + + /* + * If we are here it means that rseq() syscall is supported, + * but ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported, + * we can just fail dump here. But this is a bad idea, IMHO. + * + * So, we will try to detect if the victim process has used rseq(). + * See check_rseq() and check_thread_rseq() functions. 
+ */ + if (!kdat.has_ptrace_get_rseq_conf) + return 0; + + ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc); + if (ret != sizeof(rseqc)) { + pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); + return -1; + } + + if (rseqc.flags != 0) { + pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, + rseqc.flags); + return -1; + } + + pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer, + rseqc.signature); + + rseqe = xmalloc(sizeof(*rseqe)); + if (!rseqe) + return -1; + + rseq_entry__init(rseqe); + + rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer; + rseqe->rseq_abi_size = rseqc.rseq_abi_size; + rseqe->signature = rseqc.signature; + + if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq)) + goto err; + + /* we won't save rseq_cs to the image (only pointer), + * so let's combine flags from both struct rseq and struct rseq_cs + * (kernel does the same when interpreting RSEQ_CS_FLAG_*) + */ + rseq_cs->flags |= rseq.flags; + + if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) { + rseqe->has_rseq_cs_pointer = true; + rseqe->rseq_cs_pointer = rseq.rseq_cs; + } + + /* save rseq entry to the image */ + *rseqep = rseqe; + + return 0; + +err: + xfree(rseqe); + return -1; +} + +static int dump_task_rseq(pid_t pid, struct pstree_item *item) { int i; + struct criu_rseq_cs *thread_rseq_cs; + + /* if rseq() syscall isn't supported then nothing to dump */ + if (!kdat.has_rseq) + return 0; + + thread_rseq_cs = xzalloc(sizeof(*thread_rseq_cs) * item->nr_threads); + if (!thread_rseq_cs) + return -1; + + dmpi(item)->thread_rseq_cs = thread_rseq_cs; + + for (i = 0; i < item->nr_threads; i++) { + if (dump_thread_rseq(item, i)) + goto free_rseq; + } + + return 0; + +free_rseq: + xfree(thread_rseq_cs); + dmpi(item)->thread_rseq_cs = NULL; + return -1; +} + +static struct proc_pid_stat pps_buf; + +static int dump_task_threads(struct parasite_ctl *parasite_ctl, const 
struct pstree_item *item) +{ + int i, ret = 0; for (i = 0; i < item->nr_threads; i++) { /* Leader is already dumped */ @@ -1014,18 +1297,21 @@ static int dump_task_threads(struct parasite_ctl *parasite_ctl, item->threads[i].ns[0].virt = vpid(item); continue; } - if (dump_task_thread(parasite_ctl, item, i)) - return -1; + ret = dump_task_thread(parasite_ctl, item, i); + if (ret) + break; } - return 0; + xfree(dmpi(item)->thread_rseq_cs); + dmpi(item)->thread_rseq_cs = NULL; + return ret; } /* * What this routine does is just reads pid-s of dead * tasks in item's children list from item's ns proc. * - * It does *not* find wihch real pid corresponds to + * It does *not* find which real pid corresponds to * which virtual one, but it's not required -- all we * need to dump for zombie can be found in the same * ns proc. @@ -1089,8 +1375,16 @@ static int dump_zombies(void) int ret = -1; int pidns = root_ns_mask & CLONE_NEWPID; - if (pidns && set_proc_fd(get_service_fd(CR_PROC_FD_OFF))) - return -1; + if (pidns) { + int fd; + + fd = get_service_fd(CR_PROC_FD_OFF); + if (fd < 0) + return -1; + + if (set_proc_fd(fd)) + return -1; + } /* * We dump zombies separately because for pid-ns case @@ -1119,7 +1413,14 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(!list_empty(&item->children)); + BUG_ON(has_children(item)); + + if (!item->sid) { + pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", + item->pid->real, vpid(item)); + goto err; + } + if (dump_one_zombie(item, &pps_buf) < 0) goto err; } @@ -1132,6 +1433,39 @@ err: return ret; } +static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) +{ + struct parasite_dump_cgroup_args cgroup_args, *info; + int i; + + BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); + for (i = 0; i < item->nr_threads; i++) { + CoreEntry *core = item->core[i]; + + /* Leader is already dumped */ + if (item->pid->real == 
item->threads[i].real) + continue; + + /* For now, we only need to dump the root task's cgroup ns, because we + * know all the tasks are in the same cgroup namespace because we don't + * allow nesting. + */ + info = NULL; + if (item->ids->has_cgroup_ns_id && !item->parent) { + info = &cgroup_args; + sprintf(cgroup_args.thread_cgrp, "self/task/%d/cgroup", item->threads[i].ns[0].virt); + if (parasite_dump_cgroup(parasite_ctl, &cgroup_args)) + return -1; + } + + core->thread_core->has_cg_set = true; + if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i)) + return -1; + } + + return 0; +} + static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) { pid_t pid = item->pid->real; @@ -1144,9 +1478,18 @@ static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Pre-dumping task (pid: %d)\n", pid); + pr_info("Pre-dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); + /* + * Add pidfd of task to pidfd_store if it is initialized. + * This pidfd will be used in the next pre-dump/dump iteration + * in detect_pid_reuse(). 
+ */ + ret = pidfd_store_add(pid); + if (ret) + goto err; + if (item->pid->state == TASK_STOPPED) { pr_warn("Stopped tasks are not supported\n"); return 0; @@ -1225,7 +1568,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Dumping task (pid: %d)\n", pid); + pr_info("Dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); if (item->pid->state == TASK_DEAD) @@ -1273,12 +1616,24 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; } + ret = dump_task_rseq(pid, item); + if (ret) { + pr_err("Dump %d rseq failed %d\n", pid, ret); + goto err; + } + parasite_ctl = parasite_infect_seized(pid, item, &vmas); if (!parasite_ctl) { pr_err("Can't infect (pid: %d) with parasite\n", pid); goto err; } + ret = fixup_thread_rseq(item, 0); + if (ret) { + pr_err("Fixup rseq for %d failed %d\n", pid, ret); + goto err; + } + if (fault_injected(FI_DUMP_EARLY)) { pr_info("fault: CRIU sudden detach\n"); kill(getpid(), SIGKILL); @@ -1290,29 +1645,29 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) pfd = parasite_get_proc_fd_seized(parasite_ctl); if (pfd < 0) { pr_err("Can't get proc fd (pid: %d)\n", pid); - goto err_cure_imgset; + goto err_cure; } if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0) - goto err_cure_imgset; + goto err_cure; } ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); if (ret) { pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); - goto err_cure_imgset; + goto err_cure; } ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ if (ret) { pr_err("Failed to check aio rings (pid: %d)\n", pid); - goto err_cure_imgset; + goto err_cure; } ret = parasite_dump_misc_seized(parasite_ctl, &misc); if (ret) { pr_err("Can't dump misc (pid: %d)\n", pid); - goto err_cure_imgset; + goto err_cure; } 
item->pid->ns[0].virt = misc.pid; @@ -1320,12 +1675,10 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) item->sid = misc.sid; item->pgid = misc.pgid; - pr_info("sid=%d pgid=%d pid=%d\n", - item->sid, item->pgid, vpid(item)); + pr_info("sid=%d pgid=%d pid=%d\n", item->sid, item->pgid, vpid(item)); if (item->sid == 0) { - pr_err("A session leader of %d(%d) is outside of its pid namespace\n", - item->pid->real, vpid(item)); + pr_err("A session leader of %d(%d) is outside of its pid namespace\n", item->pid->real, vpid(item)); goto err_cure; } @@ -1385,18 +1738,28 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; } + ret = dump_task_cgroup(parasite_ctl, item); + if (ret) { + pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + ret = compel_stop_daemon(parasite_ctl); if (ret) { - pr_err("Can't cure (pid: %d) from parasite\n", pid); - goto err; + pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); + goto err_cure; } ret = dump_task_threads(parasite_ctl, item); if (ret) { pr_err("Can't dump threads\n"); - goto err; + goto err_cure; } + /* + * On failure local map will be cured in cr_dump_finish() + * for lazy pages. + */ if (opts.lazy_pages) ret = compel_cure_remote(parasite_ctl); else @@ -1418,45 +1781,44 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; } - close_cr_imgset(&cr_imgset); exit_code = 0; err: + close_cr_imgset(&cr_imgset); close_pid_proc(); free_mappings(&vmas); xfree(dfds); return exit_code; err_cure: - close_cr_imgset(&cr_imgset); -err_cure_imgset: - compel_cure(parasite_ctl); + ret = compel_cure(parasite_ctl); + if (ret) + pr_err("Can't cure (pid: %d) from parasite\n", pid); goto err; } static int alarm_attempts = 0; -bool alarm_timeouted() { +bool alarm_timeouted(void) +{ return alarm_attempts > 0; } static void alarm_handler(int signo) { - pr_err("Timeout reached. 
Try to interrupt: %d\n", alarm_attempts); if (alarm_attempts++ < 5) { alarm(1); - /* A curren syscall will be exited with EINTR */ + /* A current syscall will be exited with EINTR */ return; } pr_err("FATAL: Unable to interrupt the current operation\n"); BUG(); } -static int setup_alarm_handler() +static int setup_alarm_handler(void) { struct sigaction sa = { - .sa_handler = alarm_handler, - .sa_flags = 0, /* Don't restart syscalls */ + .sa_handler = alarm_handler, .sa_flags = 0, /* Don't restart syscalls */ }; sigemptyset(&sa.sa_mask); @@ -1487,6 +1849,9 @@ static int cr_pre_dump_finish(int status) if (ret) goto err; + he.has_pre_dump_mode = true; + he.pre_dump_mode = opts.pre_dump_mode; + pstree_switch_state(root_item, TASK_ALIVE); timing_stop(TIME_FROZEN); @@ -1512,7 +1877,13 @@ static int cr_pre_dump_finish(int status) goto err; mem_pp = dmpi(item)->mem_pp; - ret = page_xfer_dump_pages(&xfer, mem_pp); + + if (opts.pre_dump_mode == PRE_DUMP_READ) { + timing_stop(TIME_MEMWRITE); + ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp); + } else { + ret = page_xfer_dump_pages(&xfer, mem_pp); + } xfer.close(&xfer); @@ -1522,7 +1893,8 @@ static int cr_pre_dump_finish(int status) timing_stop(TIME_MEMWRITE); destroy_page_pipe(mem_pp); - compel_cure_local(ctl); + if (compel_cure_local(ctl)) + pr_err("Can't cure local: something happened with mapping?\n"); } free_pstree(root_item); @@ -1534,6 +1906,9 @@ static int cr_pre_dump_finish(int status) } err: + if (unsuspend_lsm()) + ret = -1; + if (disconnect_from_page_server()) ret = -1; @@ -1611,6 +1986,9 @@ int cr_pre_dump_tasks(pid_t pid) if (collect_namespaces(false) < 0) goto err; + if (collect_and_suspend_lsm() < 0) + goto err; + /* Errors handled later in detect_pid_reuse */ parent_ie = get_parent_inventory(); @@ -1649,7 +2027,8 @@ static int cr_lazy_mem_dump(void) for_each_pstree_item(item) { if (item->pid->state != TASK_DEAD) { destroy_page_pipe(dmpi(item)->mem_pp); - 
compel_cure_local(dmpi(item)->parasite_ctl); + if (compel_cure_local(dmpi(item)->parasite_ctl)) + pr_err("Can't cure local: something happened with mapping?\n"); } } @@ -1673,7 +2052,6 @@ static int cr_dump_finish(int ret) if (bfd_flush_images()) ret = -1; - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { @@ -1716,6 +2094,7 @@ static int cr_dump_finish(int ret) * start rollback procedure and cleanup everything. */ if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { + unsuspend_lsm(); network_unlock(); delete_link_remaps(); clean_cr_time_mounts(); @@ -1726,9 +2105,10 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; - pstree_switch_state(root_item, - (ret || post_dump_ret) ? - TASK_ALIVE : opts.final_state); + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + + pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); seccomp_free_entries(); @@ -1738,14 +2118,19 @@ static int cr_dump_finish(int ret) free_userns_maps(); close_service_fd(CR_PROC_FD_OFF); + close_image_dir(); - if (ret) { + if (ret || post_dump_ret) { + if (fault_injected(FI_DUMP_CRASH)) { + pr_info("fault: CRIU dump crashed!\n"); + abort(); + } pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); pr_info("Dumping finished successfully\n"); } - return post_dump_ret ? 
: (ret != 0); + return post_dump_ret ?: (ret != 0); } int cr_dump_tasks(pid_t pid) @@ -1753,11 +2138,13 @@ int cr_dump_tasks(pid_t pid) InventoryEntry he = INVENTORY_ENTRY__INIT; InventoryEntry *parent_ie = NULL; struct pstree_item *item; - int pre_dump_ret = 0; - int ret = -1; + int ret; + int exit_code = -1; + + kerndat_warn_about_madv_guards(); pr_info("========================================\n"); - pr_info("Dumping processes (pid: %d)\n", pid); + pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); /* @@ -1772,9 +2159,9 @@ int cr_dump_tasks(pid_t pid) goto err; root_item->pid->real = pid; - pre_dump_ret = run_scripts(ACT_PRE_DUMP); - if (pre_dump_ret != 0) { - pr_err("Pre dump script failed with %d!\n", pre_dump_ret); + ret = run_scripts(ACT_PRE_DUMP); + if (ret != 0) { + pr_err("Pre dump script failed with %d!\n", ret); goto err; } if (init_stats(DUMP_STATS)) @@ -1795,10 +2182,7 @@ int cr_dump_tasks(pid_t pid) if (vdso_init_dump()) goto err; - if (cgp_init(opts.cgroup_props, - opts.cgroup_props ? - strlen(opts.cgroup_props) : 0, - opts.cgroup_props_file)) + if (cgp_init(opts.cgroup_props, opts.cgroup_props ? 
strlen(opts.cgroup_props) : 0, opts.cgroup_props_file)) goto err; if (parse_cg_info()) @@ -1827,12 +2211,18 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree()) goto err; + if (checkpoint_devices()) + goto err; + if (collect_pstree_ids()) goto err; if (network_lock()) goto err; + if (rpc_query_external_files()) + goto err; + if (collect_file_locks()) goto err; @@ -1849,11 +2239,18 @@ int cr_dump_tasks(pid_t pid) /* Errors handled later in detect_pid_reuse */ parent_ie = get_parent_inventory(); + if (collect_and_suspend_lsm() < 0) + goto err; + for_each_pstree_item(item) { if (dump_one_task(item, parent_ie)) goto err; } + ret = run_plugins(DUMP_DEVICES_LATE, pid); + if (ret && ret != -ENOTSUP) + goto err; + if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; @@ -1890,38 +2287,44 @@ int cr_dump_tasks(pid_t pid) * ipc shared memory, but an ipc namespace is dumped in a child * process. */ - ret = cr_dump_shmem(); - if (ret) + if (cr_dump_shmem()) goto err; if (root_ns_mask) { - ret = dump_namespaces(root_item, root_ns_mask); - if (ret) + if (dump_namespaces(root_item, root_ns_mask)) goto err; } - ret = dump_cgroups(); - if (ret) + if ((root_ns_mask & CLONE_NEWTIME) == 0) { + if (dump_time_ns(0)) + goto err; + } + + if (dump_aa_namespaces() < 0) goto err; - ret = fix_external_unix_sockets(); - if (ret) + if (dump_cgroups()) goto err; - ret = tty_post_actions(); - if (ret) + if (fix_external_unix_sockets()) goto err; - ret = inventory_save_uptime(&he); - if (ret) + if (tty_post_actions()) goto err; - ret = write_img_inventory(&he); - if (ret) + if (inventory_save_uptime(&he)) goto err; + + he.has_pre_dump_mode = false; + if (found_uprobes_vma()) { + he.has_allow_uprobes = true; + he.allow_uprobes = true; + } + + exit_code = write_img_inventory(&he); err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); - return cr_dump_finish(ret); + return cr_dump_finish(exit_code); } diff --git a/criu/cr-restore.c b/criu/cr-restore.c 
index b4530f8e5..b92b92715 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -17,16 +17,19 @@ #include #include #include - +#include #include "types.h" #include #include "common/compiler.h" +#include "linux/rseq.h" + #include "clone-noasan.h" #include "cr_options.h" #include "servicefd.h" #include "image.h" +#include "img-streamer.h" #include "util.h" #include "util-pie.h" #include "criu-log.h" @@ -65,7 +68,6 @@ #include "timerfd.h" #include "action-scripts.h" #include "shmem.h" -#include #include "aio.h" #include "lsm.h" #include "seccomp.h" @@ -73,12 +75,20 @@ #include "sk-queue.h" #include "sigframe.h" #include "fdstore.h" +#include "string.h" +#include "memfd.h" +#include "timens.h" +#include "bpfmap.h" +#include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" #include #include "compel/include/asm/syscall.h" +#include "linux/mount.h" + #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" @@ -90,20 +100,20 @@ #include "restore.h" #include "cr-errno.h" - -#include "pie/pie-relocs.h" +#include "timer.h" +#include "sigact.h" #ifndef arch_export_restore_thread -#define arch_export_restore_thread __export_restore_thread +#define arch_export_restore_thread __export_restore_thread #endif #ifndef arch_export_restore_task -#define arch_export_restore_task __export_restore_task +#define arch_export_restore_task __export_restore_task #endif #ifndef arch_export_unmap -#define arch_export_unmap __export_unmap -#define arch_export_unmap_compat __export_unmap_compat +#define arch_export_unmap __export_unmap +#define arch_export_unmap_compat __export_unmap_compat #endif struct pstree_item *current; @@ -112,7 +122,6 @@ static int restore_task_with_children(void *); static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); -static int 
prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); /* @@ -180,13 +189,13 @@ static int __restore_wait_inprogress_tasks(int participants) return 0; } -static int restore_wait_inprogress_tasks() +static int restore_wait_inprogress_tasks(void) { return __restore_wait_inprogress_tasks(0); } /* Wait all tasks except the current one */ -static int restore_wait_other_tasks() +static int restore_wait_other_tasks(void) { int participants, stage; @@ -198,16 +207,14 @@ static int restore_wait_other_tasks() static inline void __restore_switch_stage_nw(int next_stage) { - futex_set(&task_entries->nr_in_progress, - stage_participants(next_stage)); + futex_set(&task_entries->nr_in_progress, stage_participants(next_stage)); futex_set(&task_entries->start, next_stage); } static inline void __restore_switch_stage(int next_stage) { if (next_stage != CR_STATE_COMPLETE) - futex_set(&task_entries->nr_in_progress, - stage_participants(next_stage)); + futex_set(&task_entries->nr_in_progress, stage_participants(next_stage)); futex_set_and_wake(&task_entries->start, next_stage); } @@ -229,6 +236,9 @@ static int restore_finish_ns_stage(int from, int to) static int crtools_prepare_shared(void) { + if (prepare_memfd_inodes()) + return -1; + if (prepare_files()) return -1; @@ -246,7 +256,7 @@ static int crtools_prepare_shared(void) if (tty_prep_fds()) return -1; - if (prepare_cgroup()) + if (prepare_apparmor_namespaces()) return -1; return 0; @@ -262,30 +272,17 @@ static int crtools_prepare_shared(void) */ static struct collect_image_info *cinfos[] = { - &file_locks_cinfo, - &pipe_data_cinfo, - &fifo_data_cinfo, - &sk_queues_cinfo, + &file_locks_cinfo, &pipe_data_cinfo, &fifo_data_cinfo, &sk_queues_cinfo, +#ifdef CONFIG_HAS_LIBBPF + &bpfmap_data_cinfo, +#endif }; static struct collect_image_info *cinfos_files[] = { - &unix_sk_cinfo, - &fifo_cinfo, - &pipe_cinfo, - &nsfile_cinfo, - 
&packet_sk_cinfo, - &netlink_sk_cinfo, - &eventfd_cinfo, - &epoll_cinfo, - &epoll_tfd_cinfo, - &signalfd_cinfo, - &tunfile_cinfo, - &timerfd_cinfo, - &inotify_cinfo, - &inotify_mark_cinfo, - &fanotify_cinfo, - &fanotify_mark_cinfo, - &ext_file_cinfo, + &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, + &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, + &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ @@ -329,8 +326,7 @@ static int root_prepare_shared(void) if (collect_images(cinfos, ARRAY_SIZE(cinfos))) return -1; - if (!files_collected() && - collect_images(cinfos_files, ARRAY_SIZE(cinfos_files))) + if (!files_collected() && collect_images(cinfos_files, ARRAY_SIZE(cinfos_files))) return -1; for_each_pstree_item(pi) { @@ -359,6 +355,10 @@ static int root_prepare_shared(void) if (ret) goto err; + ret = add_fake_unix_queuers(); + if (ret) + goto err; + /* * This should be called with all packets collected AND all * fdescs and fles prepared BUT post-prep-s not run. @@ -375,10 +375,6 @@ static int root_prepare_shared(void) if (ret) goto err; - ret = add_fake_unix_queuers(); - if (ret) - goto err; - show_saved_files(); err: return ret; @@ -407,268 +403,13 @@ static int populate_pid_proc(void) pr_err("Can't open PROC_SELF\n"); return -1; } + if (open_pid_proc(PROC_SELF) < 0) { + pr_err("Can't open PROC_SELF\n"); + return -1; + } return 0; } -static rt_sigaction_t sigchld_act; -/* - * If parent's sigaction has blocked SIGKILL (which is non-sense), - * this parent action is non-valid and shouldn't be inherited. - * Used to mark parent_act* no more valid. 
- */ -static rt_sigaction_t parent_act[SIGMAX]; -#ifdef CONFIG_COMPAT -static rt_sigaction_t_compat parent_act_compat[SIGMAX]; -#endif - -static bool sa_inherited(int sig, rt_sigaction_t *sa) -{ - rt_sigaction_t *pa; - int i; - - if (current == root_item) - return false; /* XXX -- inherit from CRIU? */ - - pa = &parent_act[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && - pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_native_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); - - if (sig == SIGCHLD) { - sigchld_act = act; - return 0; - } - - if (sa_inherited(sig - 1, &act)) - return 1; - - /* - * A pure syscall is used, because glibc - * sigaction overwrites se_restorer. 
- */ - ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); - if (ret < 0) { - pr_perror("Can't restore sigaction"); - return ret; - } - - parent_act[sig - 1] = act; - /* Mark SIGKILL blocked which makes compat sigaction non-valid */ -#ifdef CONFIG_COMPAT - parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; -#endif - - return 1; -} - -static void *stack32; - -#ifdef CONFIG_COMPAT -static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) -{ - rt_sigaction_t_compat *pa; - int i; - - if (current == root_item) - return false; - - pa = &parent_act_compat[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && - pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t_compat act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); - - if (sig == SIGCHLD) { - memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); - return 0; - } - - if (sa_compat_inherited(sig - 1, &act)) - return 1; - - if (!stack32) { - stack32 = alloc_compat_syscall_stack(); - if (!stack32) - return -1; - } - - ret = arch_compat_rt_sigaction(stack32, sig, &act); - if (ret < 0) { - pr_err("Can't restore compat sigaction: %d\n", ret); - return ret; - } - - parent_act_compat[sig - 1] = act; - /* Mark SIGKILL blocked which makes native sigaction non-valid */ - parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; - - return 1; -} -#else -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - return 
-1; -} -#endif - -static int prepare_sigactions_from_core(TaskCoreEntry *tc) -{ - int sig, i; - - if (tc->n_sigactions != SIGMAX - 2) { - pr_err("Bad number of sigactions in the image (%d, want %d)\n", - (int)tc->n_sigactions, SIGMAX - 2); - return -1; - } - - pr_info("Restore on-core sigactions for %d\n", vpid(current)); - - for (sig = 1, i = 0; sig <= SIGMAX; sig++) { - int ret; - SaEntry *e; - bool sigaction_is_compat; - - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - e = tc->sigactions[i++]; - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - if (ret < 0) - return ret; - } - - return 0; -} - -/* Returns number of restored signals, -1 or negative errno on fail */ -static int restore_one_sigaction(int sig, struct cr_img *img, int pid) -{ - bool sigaction_is_compat; - SaEntry *e; - int ret = 0; - - BUG_ON(sig == SIGKILL || sig == SIGSTOP); - - ret = pb_read_one_eof(img, &e, PB_SIGACT); - if (ret == 0) { - if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ - pr_err("Unexpected EOF %d\n", sig); - return -1; - } - pr_warn("This format of sigacts-%d.img is deprecated\n", pid); - return -1; - } - if (ret < 0) - return ret; - - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - sa_entry__free_unpacked(e, NULL); - - return ret; -} - -static int prepare_sigactions_from_image(void) -{ - int pid = vpid(current); - struct cr_img *img; - int sig, rst = 0; - int ret = 0; - - pr_info("Restore sigacts for %d\n", pid); - - img = open_image(CR_FD_SIGACT, O_RSTR, pid); - if (!img) - return -1; - - for (sig = 1; sig <= SIGMAX; sig++) { - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - ret = restore_one_sigaction(sig, img, pid); - if (ret < 0) - break; - if (ret) - rst++; - } - - 
pr_info("Restored %d/%d sigacts\n", rst, - SIGMAX - 3 /* KILL, STOP and CHLD */); - - close_image(img); - return ret; -} - -static int prepare_sigactions(CoreEntry *core) -{ - int ret; - - if (!task_alive(current)) - return 0; - - if (core->tc->n_sigactions != 0) - ret = prepare_sigactions_from_core(core->tc); - else - ret = prepare_sigactions_from_image(); - - if (stack32) { - free_compat_syscall_stack(stack32); - stack32 = NULL; - } - - return ret; -} - static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) { struct pstree_item *pi; @@ -703,8 +444,7 @@ static int collect_child_pids(int state, unsigned int *n) if (current == root_item) { for_each_pstree_item(pi) { - if (pi->pid->state != TASK_HELPER && - pi->pid->state != TASK_DEAD) + if (pi->pid->state != TASK_HELPER && pi->pid->state != TASK_DEAD) continue; if (__collect_child_pids(pi, state, n)) return -1; @@ -782,7 +522,7 @@ static int open_cores(int pid, CoreEntry *leader_core) int i, tpid; CoreEntry **cores = NULL; - cores = xmalloc(sizeof(*cores)*current->nr_threads); + cores = xmalloc(sizeof(*cores) * current->nr_threads); if (!cores) goto err; @@ -814,6 +554,23 @@ static int open_cores(int pid, CoreEntry *leader_core) } } + for (i = 0; i < current->nr_threads; i++) { + ThreadCoreEntry *tc = cores[i]->thread_core; + struct rst_rseq *rseqs = rsti(current)->rseqe; + RseqEntry *rseqe = tc->rseq_entry; + + /* compatibility with older CRIU versions */ + if (!rseqe) + continue; + + /* rseq cs had no RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL */ + if (!rseqe->has_rseq_cs_pointer) + continue; + + rseqs[i].rseq_abi_pointer = rseqe->rseq_abi_pointer; + rseqs[i].rseq_cs_pointer = rseqe->rseq_cs_pointer; + } + return 0; err: xfree(cores); @@ -847,12 +604,16 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a if (tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; + if (tc->has_membarrier_registration_mask) + args->membarrier_registration_mask = 
tc->membarrier_registration_mask; + /* loginuid value is critical to restore */ - if (kdat.luid == LUID_FULL && tc->has_loginuid && - tc->loginuid != INVALID_UID) { - ret = prepare_loginuid(tc->loginuid, LOG_ERROR); - if (ret < 0) + if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { + ret = prepare_loginuid(tc->loginuid); + if (ret < 0) { + pr_err("Setting loginuid for %d task failed\n", pid); return ret; + } } /* oom_score_adj is not critical: only log errors */ @@ -862,7 +623,6 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a return 0; } -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) @@ -873,8 +633,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core) rst_mem_switch_to_private(); - args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * - current->nr_threads, page_size()); + args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size()); ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); if (!ta) return -1; @@ -956,6 +715,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (setup_uffd(pid, ta)) return -1; + if (arch_shstk_prepare(current, core, ta)) + return -1; + return sigreturn_restore(pid, ta, args_len, core); } @@ -975,33 +737,12 @@ static void zombie_prepare_signals(void) sigaction(sig, &act, NULL); } -#define SIG_FATAL_MASK ( \ - (1 << SIGHUP) |\ - (1 << SIGINT) |\ - (1 << SIGQUIT) |\ - (1 << SIGILL) |\ - (1 << SIGTRAP) |\ - (1 << SIGABRT) |\ - (1 << SIGIOT) |\ - (1 << SIGBUS) |\ - (1 << SIGFPE) |\ - (1 << SIGKILL) |\ - (1 << SIGUSR1) |\ - (1 << SIGSEGV) |\ - (1 << SIGUSR2) |\ - (1 << SIGPIPE) |\ - (1 << SIGALRM) |\ - (1 << SIGTERM) |\ - (1 << SIGXCPU) |\ - (1 << SIGXFSZ) |\ - (1 << SIGVTALRM)|\ - (1 << SIGPROF) 
|\ - (1 << SIGPOLL) |\ - (1 << SIGIO) |\ - (1 << SIGSYS) |\ - (1 << SIGSTKFLT)|\ - (1 << SIGPWR) \ - ) +#define SIG_FATAL_MASK \ + ((1 << SIGHUP) | (1 << SIGINT) | (1 << SIGQUIT) | (1 << SIGILL) | (1 << SIGTRAP) | (1 << SIGABRT) | \ + (1 << SIGIOT) | (1 << SIGBUS) | (1 << SIGFPE) | (1 << SIGKILL) | (1 << SIGUSR1) | (1 << SIGSEGV) | \ + (1 << SIGUSR2) | (1 << SIGPIPE) | (1 << SIGALRM) | (1 << SIGTERM) | (1 << SIGXCPU) | (1 << SIGXFSZ) | \ + (1 << SIGVTALRM) | (1 << SIGPROF) | (1 << SIGPOLL) | (1 << SIGIO) | (1 << SIGSYS) | (1 << SIGSTKFLT) | \ + (1 << SIGPWR)) static inline int sig_fatal(int sig) { @@ -1090,8 +831,7 @@ static int setup_newborn_fds(struct pstree_item *me) if (clone_service_fd(me)) return -1; - if (!me->parent || - (rsti(me->parent)->fdt && !(rsti(me)->clone_flags & CLONE_FILES))) { + if (!me->parent || (rsti(me->parent)->fdt && !(rsti(me)->clone_flags & CLONE_FILES))) { /* * When our parent has shared fd table, some of the table owners * may be already created. Files, they open, will be inherited @@ -1200,7 +940,7 @@ static int wait_exiting_children(void) futex_dec_and_wake(&task_entries->nr_in_progress); if (waitid(P_ALL, 0, &info, WEXITED | WNOWAIT)) { - pr_perror("Failed to wait\n"); + pr_perror("Failed to wait"); return -1; } @@ -1274,8 +1014,7 @@ struct cr_clone_arg { CoreEntry *core; }; -static void maybe_clone_parent(struct pstree_item *item, - struct cr_clone_arg *ca) +static void maybe_clone_parent(struct pstree_item *item, struct cr_clone_arg *ca) { /* * zdtm runs in kernel 3.11, which has the problem described below. 
We @@ -1319,9 +1058,32 @@ static bool needs_prep_creds(struct pstree_item *item) return (!item->parent && ((root_ns_mask & CLONE_NEWUSER) || getuid())); } +static int set_next_pid(void *arg) +{ + char buf[32]; + pid_t *pid = arg; + int len; + int fd; + + fd = open_proc_rw(PROC_GEN, LAST_PID_PATH); + if (fd < 0) + return -1; + + len = snprintf(buf, sizeof(buf), "%d", *pid - 1); + if (write(fd, buf, len) != len) { + pr_perror("Failed to write %s to /proc/%s", buf, LAST_PID_PATH); + close(fd); + return -1; + } + close(fd); + return 0; +} + static inline int fork_with_pid(struct pstree_item *item) { struct cr_clone_arg ca; + struct ns_id *pid_ns = NULL; + bool external_pidns = false; int ret = -1; pid_t pid = vpid(item); @@ -1333,7 +1095,22 @@ static inline int fork_with_pid(struct pstree_item *item) return -1; item->pid->state = ca.core->tc->task_state; - rsti(item)->cg_set = ca.core->tc->cg_set; + + /* + * Zombie tasks' cgroup is not dumped/restored. + * cg_set == 0 is skipped in prepare_task_cgroup() + */ + if (item->pid->state == TASK_DEAD) { + rsti(item)->cg_set = 0; + } else { + if (ca.core->thread_core->has_cg_set) + rsti(item)->cg_set = ca.core->thread_core->cg_set; + else + rsti(item)->cg_set = ca.core->tc->cg_set; + } + + if (ca.core->tc->has_stop_signo) + item->pid->stop_signo = ca.core->tc->stop_signo; if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); @@ -1360,7 +1137,36 @@ static inline int fork_with_pid(struct pstree_item *item) ca.core = NULL; } - ret = -1; + if (item->ids) + pid_ns = lookup_ns_by_id(item->ids->pid_ns_id, &pid_ns_desc); + + if (!current && pid_ns && pid_ns->ext_key) + external_pidns = true; + + if (external_pidns) { + int fd; + + /* Not possible to restore into an empty PID namespace. 
*/ + if (pid == INIT_PID) { + pr_err("Unable to restore into an empty PID namespace\n"); + return -1; + } + + fd = inherit_fd_lookup_id(pid_ns->ext_key); + if (fd < 0) { + pr_err("Unable to find an external pidns: %s\n", pid_ns->ext_key); + return -1; + } + + ret = switch_ns_by_fd(fd, &pid_ns_desc, NULL); + close(fd); + if (ret) { + pr_err("Unable to enter existing PID namespace\n"); + return -1; + } + + pr_info("Inheriting external pidns %s for %d\n", pid_ns->ext_key, pid); + } ca.item = item; ca.clone_flags = rsti(item)->clone_flags; @@ -1370,92 +1176,126 @@ static inline int fork_with_pid(struct pstree_item *item) pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags); if (!(ca.clone_flags & CLONE_NEWPID)) { - char buf[32]; - int len; - int fd; - - fd = open_proc_rw(PROC_GEN, LAST_PID_PATH); - if (fd < 0) - goto err; - lock_last_pid(); - len = snprintf(buf, sizeof(buf), "%d", pid - 1); - if (write(fd, buf, len) != len) { - pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH); - close(fd); - goto err_unlock; + if (!kdat.has_clone3_set_tid) { + if (external_pidns) { + /* + * Restoring into another namespace requires a helper + * to write to LAST_PID_PATH. Using clone3() this is + * so much easier and simpler. As long as CRIU supports + * clone() this is needed. 
+ */ + ret = call_in_child_process(set_next_pid, (void *)&pid); + } else { + ret = set_next_pid((void *)&pid); + } + if (ret != 0) { + pr_err("Setting PID failed\n"); + goto err_unlock; + } } - close(fd); } else { - BUG_ON(pid != INIT_PID); + if (!external_pidns) { + if (pid != INIT_PID) { + pr_err("First PID in a PID namespace needs to be %d and not %d\n", pid, INIT_PID); + return -1; + } + } + } + + if (kdat.has_clone3_set_tid) { + ret = clone3_with_pid_noasan(restore_task_with_children, &ca, + (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)), + SIGCHLD, pid); + } else { + /* + * Some kernel modules, such as network packet generator + * run kernel thread upon net-namespace creation taking + * the @pid we've been requesting via LAST_PID_PATH interface + * so that we can't restore a take with pid needed. + * + * Here is an idea -- unshare net namespace in callee instead. + */ + /* + * The cgroup namespace is also unshared explicitly in the + * move_in_cgroup(), so drop this flag here as well. + */ + close_pid_proc(); + ret = clone_noasan(restore_task_with_children, + (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)) | SIGCHLD, &ca); } - /* - * Some kernel modules, such as network packet generator - * run kernel thread upon net-namespace creattion taking - * the @pid we've been requeting via LAST_PID_PATH interface - * so that we can't restore a take with pid needed. - * - * Here is an idea -- unhare net namespace in callee instead. - */ - /* - * The cgroup namespace is also unshared explicitly in the - * move_in_cgroup(), so drop this flag here as well. 
- */ - close_pid_proc(); - ret = clone_noasan(restore_task_with_children, - (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD, &ca); if (ret < 0) { pr_perror("Can't fork for %d", pid); + if (errno == EEXIST) + set_cr_errno(EEXIST); goto err_unlock; } - if (item == root_item) { item->pid->real = ret; - pr_debug("PID: real %d virt %d\n", - item->pid->real, vpid(item)); + pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); } + arch_shstk_unlock(item, ca.core, ret); + err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) unlock_last_pid(); -err: + if (ca.core) core_entry__free_unpacked(ca.core, NULL); return ret; } +/* Returns 0 if restore can be continued */ +static int sigchld_process(int status, pid_t pid) +{ + int sig; + + if (WIFEXITED(status)) { + pr_err("%d exited, status=%d\n", pid, WEXITSTATUS(status)); + return -1; + } else if (WIFSIGNALED(status)) { + sig = WTERMSIG(status); + pr_err("%d killed by signal %d: %s\n", pid, sig, strsignal(sig)); + return -1; + } else if (WIFSTOPPED(status)) { + sig = WSTOPSIG(status); + /* The root task is ptraced. Allow it to handle SIGCHLD */ + if (sig == SIGCHLD && !current) { + if (ptrace(PTRACE_CONT, pid, 0, SIGCHLD)) { + pr_perror("Unable to resume %d", pid); + return -1; + } + return 0; + } + pr_err("%d stopped by signal %d: %s\n", pid, sig, strsignal(sig)); + return -1; + } else if (WIFCONTINUED(status)) { + pr_err("%d unexpectedly continued\n", pid); + return -1; + } + pr_err("wait for %d resulted in %x status\n", pid, status); + return -1; +} + static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) { - int status, pid, exit; - while (1) { + int status; + pid_t pid; + pid = waitpid(-1, &status, WNOHANG); if (pid <= 0) return; - if (!current && WIFSTOPPED(status) && - WSTOPSIG(status) == SIGCHLD) { - /* The root task is ptraced. 
Allow it to handle SIGCHLD */ - if (ptrace(PTRACE_CONT, pid, 0, SIGCHLD)) - pr_perror("Unable to resume %d", pid); - return; - } - - exit = WIFEXITED(status); - status = exit ? WEXITSTATUS(status) : WTERMSIG(status); - - break; + if (sigchld_process(status, pid) < 0) + goto err_abort; } - if (exit) - pr_err("%d exited, status=%d\n", pid, status); - else - pr_err("%d killed by signal %d: %s\n", - pid, status, strsignal(status)); - +err_abort: futex_abort_and_wake(&task_entries->nr_in_progress); } @@ -1532,8 +1372,7 @@ static void restore_sid(void) /* Skip the root task if it's not init */ if (current == root_item && vpid(root_item) != INIT_PID) return; - pr_err("Requested sid %d doesn't match inherited %d\n", - current->sid, sid); + pr_err("Requested sid %d doesn't match inherited %d\n", current->sid, sid); exit(1); } } @@ -1585,27 +1424,39 @@ static void restore_pgid(void) futex_set_and_wake(&rsti(current)->pgrp_set, 1); } +static int __legacy_mount_proc(void) +{ + char proc_mountpoint[] = "/tmp/crtools-proc.XXXXXX"; + int fd; + + if (mkdtemp(proc_mountpoint) == NULL) { + pr_perror("mkdtemp failed %s", proc_mountpoint); + return -1; + } + + pr_info("Mount procfs in %s\n", proc_mountpoint); + if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { + pr_perror("mount failed"); + if (rmdir(proc_mountpoint)) + pr_perror("Unable to remove %s", proc_mountpoint); + return -1; + } + + fd = open_detach_mount(proc_mountpoint); + return fd; +} + static int mount_proc(void) { int fd, ret; - char proc_mountpoint[] = "crtools-proc.XXXXXX"; if (root_ns_mask == 0) fd = ret = open("/proc", O_DIRECTORY); else { - if (mkdtemp(proc_mountpoint) == NULL) { - pr_perror("mkdtemp failed %s", proc_mountpoint); - return -1; - } - - pr_info("Mount procfs in %s\n", proc_mountpoint); - if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { - pr_perror("mount failed"); - rmdir(proc_mountpoint); - return -1; - 
} - - ret = fd = open_detach_mount(proc_mountpoint); + if (kdat.has_fsopen) + fd = ret = mount_detached_fs("proc"); + else + fd = ret = __legacy_mount_proc(); } if (fd >= 0) { @@ -1655,7 +1506,7 @@ static int create_children_and_session(void) return 0; } -static int restore_task_with_children(void *_arg) +static int __restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; @@ -1680,8 +1531,7 @@ static int restore_task_with_children(void *_arg) buf[ret] = '\0'; current->pid->real = atoi(buf); - pr_debug("PID: real %d virt %d\n", - current->pid->real, vpid(current)); + pr_debug("PID: real %d virt %d\n", current->pid->real, vpid(current)); } pid = getpid(); @@ -1692,7 +1542,7 @@ static int restore_task_with_children(void *_arg) } if (log_init_by_pid(vpid(current))) - return -1; + goto err; if (current->parent == NULL) { /* @@ -1711,9 +1561,27 @@ static int restore_task_with_children(void *_arg) } } + if (root_ns_mask & CLONE_NEWTIME) { + if (prepare_timens(current->ids->time_ns_id)) + goto err; + } else if (kdat.has_timens) { + if (prepare_timens(0)) + goto err; + } + + if (set_opts_cap_eff()) + goto err; + /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; + + /* + * Since we don't support nesting of cgroup namespaces, let's + * only set up the cgns (if it exists) in the init task. + */ + if (prepare_cgroup_namespace(current) < 0) + goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) @@ -1725,7 +1593,7 @@ static int restore_task_with_children(void *_arg) * we will only move the root one there, others will * just have it inherited. 
*/ - if (prepare_task_cgroup(current) < 0) + if (restore_task_cgroup(current) < 0) goto err; /* Restore root task */ @@ -1830,6 +1698,19 @@ err: exit(1); } +static int restore_task_with_children(void *_arg) +{ + struct cr_clone_arg *arg = _arg; + struct pstree_item *item = arg->item; + CoreEntry *core = arg->core; + + return arch_shstk_trampoline(item, core, __restore_task_with_children, + arg); +} + +int __attribute((weak)) arch_ptrace_restore(int pid, struct pstree_item *item); +int arch_ptrace_restore(int pid, struct pstree_item *item) { return 0; } + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -1861,12 +1742,17 @@ static int attach_to_tasks(bool root_seized) return -1; } - if (wait4(pid, &status, __WALL, NULL) != pid) { pr_perror("waitpid(%d) failed", pid); return -1; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } + if (arch_ptrace_restore(pid, item)) + return -1; /* * Suspend seccomp if necessary. 
We need to do this because * although seccomp is restored at the very end of the @@ -1877,7 +1763,7 @@ static int attach_to_tasks(bool root_seized) if (rsti(item)->has_seccomp && ptrace_suspend_seccomp(pid) < 0) pr_err("failed to suspend seccomp, restore will probably fail...\n"); - if (ptrace(PTRACE_CONT, pid, NULL, NULL) ) { + if (ptrace(PTRACE_CONT, pid, NULL, NULL)) { pr_perror("Unable to resume %d", pid); return -1; } @@ -1887,10 +1773,55 @@ static int attach_to_tasks(bool root_seized) return 0; } -static int catch_tasks(bool root_seized, enum trace_flags *flag) +static int restore_rseq_cs(void) { struct pstree_item *item; + for_each_pstree_item(item) { + int i; + + if (!task_alive(item)) + continue; + + if (item->nr_threads == 1) { + item->threads[0].real = item->pid->real; + } else { + if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) { + pr_err("restore_rseq_cs: parse_threads failed\n"); + return -1; + } + } + + for (i = 0; i < item->nr_threads; i++) { + pid_t pid = item->threads[i].real; + struct rst_rseq *rseqe = rsti(item)->rseqe; + + if (!rseqe) { + pr_err("restore_rseq_cs: rsti(item)->rseqe is NULL\n"); + return -1; + } + + if (!rseqe[i].rseq_cs_pointer || !rseqe[i].rseq_abi_pointer) + continue; + + if (ptrace_poke_area( + pid, &rseqe[i].rseq_cs_pointer, + decode_pointer(rseqe[i].rseq_abi_pointer + offsetof(struct criu_rseq, rseq_cs)), + sizeof(uint64_t))) { + pr_err("Can't restore rseq_cs pointer (pid: %d)\n", pid); + return -1; + } + } + } + + return 0; +} + +static int catch_tasks(bool root_seized) +{ + struct pstree_item *item; + bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; + for_each_pstree_item(item) { int status, i, ret; @@ -1917,8 +1848,7 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, - flag, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); if (ret < 0) return -1; } 
@@ -1927,24 +1857,6 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return 0; } -static int clear_breakpoints() -{ - struct pstree_item *item; - int ret = 0, i; - - if (fault_injected(FI_NO_BREAKPOINTS)) - return 0; - - for_each_pstree_item(item) { - if (!task_alive(item)) - continue; - for (i = 0; i < item->nr_threads; i++) - ret |= ptrace_flush_breakpoints(item->threads[i].real); - } - - return ret; -} - static void finalize_restore(void) { struct pstree_item *item; @@ -1952,6 +1864,7 @@ static void finalize_restore(void) for_each_pstree_item(item) { pid_t pid = item->pid->real; struct parasite_ctl *ctl; + unsigned long restorer_addr; if (!task_alive(item)) continue; @@ -1961,17 +1874,24 @@ static void finalize_restore(void) if (ctl == NULL) continue; - compel_unmap(ctl, (unsigned long)rsti(item)->munmap_restorer); + restorer_addr = (unsigned long)rsti(item)->munmap_restorer; + if (compel_unmap(ctl, restorer_addr)) + pr_err("Failed to unmap restorer from %d\n", pid); xfree(ctl); - if ((item->pid->state == TASK_STOPPED) || - (opts.final_state == TASK_STOPPED)) + if (opts.final_state == TASK_STOPPED) kill(item->pid->real, SIGSTOP); + else if (item->pid->state == TASK_STOPPED) { + if (item->pid->stop_signo > 0) + kill(item->pid->real, item->pid->stop_signo); + else + kill(item->pid->real, SIGSTOP); + } } } -static void finalize_restore_detach(int status) +static int finalize_restore_detach(void) { struct pstree_item *item; @@ -1985,16 +1905,21 @@ static void finalize_restore_detach(int status) for (i = 0; i < item->nr_threads; i++) { pid = item->threads[i].real; if (pid < 0) { - BUG_ON(status >= 0); - break; + pr_err("pstree item has invalid pid %d\n", pid); + continue; } - if (arch_set_thread_regs_nosigrt(&item->threads[i])) + if (arch_set_thread_regs_nosigrt(&item->threads[i])) { pr_perror("Restoring regs for %d failed", pid); - if (ptrace(PTRACE_DETACH, pid, NULL, 0)) - pr_perror("Unable to execute %d", pid); + return -1; + } + if 
(ptrace(PTRACE_DETACH, pid, NULL, 0)) { + pr_perror("Unable to detach %d", pid); + return -1; + } } } + return 0; } static void ignore_kids(void) @@ -2024,7 +1949,7 @@ static int prepare_userns_hook(void) if (ret < 0) return -1; - if (prepare_loginuid(INVALID_UID, LOG_ERROR) < 0) { + if (prepare_loginuid(INVALID_UID) < 0) { pr_err("Setting loginuid for CT init task failed, CAP_AUDIT_CONTROL?\n"); return -1; } @@ -2037,7 +1962,7 @@ static void restore_origin_ns_hook(void) return; /* not critical: it does not affect CT in any way */ - if (prepare_loginuid(saved_loginuid, LOG_ERROR) < 0) + if (prepare_loginuid(saved_loginuid) < 0) pr_err("Restore original /proc/self/loginuid failed\n"); } @@ -2058,9 +1983,20 @@ static int write_restored_pid(void) return 0; } +static void reap_zombies(void) +{ + while (1) { + pid_t pid = wait(NULL); + if (pid == -1) { + if (errno != ECHILD) + pr_perror("Error while waiting for pids"); + return; + } + } +} + static int restore_root_task(struct pstree_item *init) { - enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item; @@ -2088,25 +2024,34 @@ static int restore_root_task(struct pstree_item *init) * this later. 
*/ - if (vpid(init) == INIT_PID) { - if (!(root_ns_mask & CLONE_NEWPID)) { - pr_err("This process tree can only be restored " - "in a new pid namespace.\n" - "criu should be re-executed with the " - "\"--namespace pid\" option.\n"); - return -1; - } - } else if (root_ns_mask & CLONE_NEWPID) { - pr_err("Can't restore pid namespace without the process init\n"); - return -1; - } - if (prepare_userns_hook()) return -1; if (prepare_namespace_before_tasks()) return -1; + if (vpid(init) == INIT_PID) { + if (!(root_ns_mask & CLONE_NEWPID)) { + pr_err("This process tree can only be restored " + "in a new pid namespace.\n" + "criu should be re-executed with the " + "\"--namespace pid\" option.\n"); + return -1; + } + } else if (root_ns_mask & CLONE_NEWPID) { + struct ns_id *ns; + /* + * Restoring into an existing PID namespace. This disables + * the check to require a PID 1 when restoring a process + * which used to be in a PID namespace. + */ + ns = lookup_ns_by_id(init->ids->pid_ns_id, &pid_ns_desc); + if (!ns || !ns->ext_key) { + pr_err("Can't restore pid namespace without the process init\n"); + return -1; + } + } + __restore_switch_stage_nw(CR_STATE_ROOT_TASK); ret = fork_with_pid(init); @@ -2175,7 +2120,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. */ - ret = network_lock_internal(); + ret = network_lock_internal(/* restore = */ true); if (ret) goto out_kill; } @@ -2187,11 +2132,18 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: + ret = run_plugins(POST_FORKING); + if (ret < 0 && ret != -ENOTSUP) + goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) goto out_kill; + ret = apply_memfd_seals(); + if (ret < 0) + goto out_kill; + /* * Zombies die after CR_STATE_RESTORE which is switched * by root task, not by us. 
See comment before CR_STATE_FORKING @@ -2210,6 +2162,10 @@ skip_ns_bouncing: if (ret < 0) goto out_kill; + ret = stop_cgroupd(); + if (ret < 0) + goto out_kill; + ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; @@ -2252,32 +2208,63 @@ skip_ns_bouncing: /* * ------------------------------------------------------------- - * Below this line nothing should fail, because network is unlocked + * Network is unlocked. If something fails below - we lose data + * or a connection. */ attach_to_tasks(root_seized); - ret = restore_switch_stage(CR_STATE_RESTORE_CREDS); - BUG_ON(ret); + if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) + goto out_kill_network_unlocked; timing_stop(TIME_RESTORE); - ret = catch_tasks(root_seized, &flag); + if (catch_tasks(root_seized)) { + pr_err("Can't catch all tasks\n"); + goto out_kill_network_unlocked; + } if (lazy_pages_finish_restore()) - goto out_kill; + goto out_kill_network_unlocked; - pr_info("Restore finished successfully. Resuming tasks.\n"); __restore_switch_stage(CR_STATE_COMPLETE); - if (ret == 0) - ret = compel_stop_on_syscall(task_entries->nr_threads, - __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1)); + if (ret) { + pr_err("Can't stop all tasks on rt_sigreturn\n"); + goto out_kill_network_unlocked; + } - if (clear_breakpoints()) - pr_err("Unable to flush breakpoints\n"); + finalize_restore(); - if (ret == 0) - finalize_restore(); + /* just before releasing threads we have to restore rseq_cs */ + if (restore_rseq_cs()) + pr_err("Unable to restore rseq_cs state\n"); + + /* + * Some external devices such as GPUs might need a very late + * trigger to kick-off some events, memory notifiers and for + * restarting the previously restored queues during criu restore + * stage. 
This is needed since criu pie code may shuffle VMAs + * around so things such as registering MMU notifiers (for GPU + * mapped memory) could be done sanely once the pie code hands + * over the control to master process. + */ + pr_info("Run late stage hook from criu master for external devices\n"); + for_each_pstree_item(item) { + if (!task_alive(item)) + continue; + ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); + /* + * This may not really be an error. Only certain plugin hooks + * (if available) will return success such as amdgpu_plugin that + * validates the pid of the resuming tasks in the kernel mode. + * Most of the times, it'll be -ENOTSUP and in few cases, it + * might actually be a true error code but that would be also + * captured in the plugin so no need to print the error here. + */ + if (ret < 0 && ret != -ENOTSUP) + pr_debug("restore late stage hook for external plugin failed\n"); + } ret = run_scripts(ACT_PRE_RESUME); if (ret) @@ -2286,28 +2273,34 @@ skip_ns_bouncing: if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); - fini_cgroup(); - /* Detaches from processes and they continue run through sigreturn. */ - finalize_restore_detach(ret); + if (finalize_restore_detach()) + goto out_kill_network_unlocked; + pr_info("Restore finished successfully. Tasks resumed.\n"); write_stats(RESTORE_STATS); + /* This has the effect of dismissing the image streamer */ + close_image_dir(); + ret = run_scripts(ACT_POST_RESUME); if (ret != 0) pr_err("Post-resume script ret code %d\n", ret); - if (!opts.restore_detach && !opts.exec_cmd) - wait(NULL); + if (!opts.restore_detach && !opts.exec_cmd) { + reap_zombies(); + } return 0; +out_kill_network_unlocked: + pr_err("Killing processes because of failure on restore.\nThe Network was unlocked so some data or a connection may have been lost.\n"); out_kill: /* * The processes can be killed only when all of them have been created, * otherwise an external processes can be killed. 
*/ - if (root_ns_mask & CLONE_NEWPID) { + if (vpid(root_item) == INIT_PID) { int status; /* Kill init */ @@ -2315,8 +2308,7 @@ out_kill: kill(root_item->pid->real, SIGKILL); if (waitpid(root_item->pid->real, &status, 0) < 0) - pr_warn("Unable to wait %d: %s\n", - root_item->pid->real, strerror(errno)); + pr_warn("Unable to wait %d: %s\n", root_item->pid->real, strerror(errno)); } else { struct pstree_item *pi; @@ -2326,7 +2318,6 @@ out_kill: } out: - fini_cgroup(); depopulate_roots_yard(mnt_ns_fd, true); stop_usernsd(); __restore_switch_stage(CR_STATE_FAIL); @@ -2348,6 +2339,7 @@ int prepare_task_entries(void) task_entries->nr_helpers = 0; futex_set(&task_entries->start, CR_STATE_FAIL); mutex_init(&task_entries->userns_sync_lock); + mutex_init(&task_entries->cgroupd_sync_lock); mutex_init(&task_entries->last_pid_mutex); return 0; @@ -2373,42 +2365,48 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + if (check_img_inventory(/* restore = */ true) < 0) return -1; - if (check_img_inventory() < 0) - goto err; - if (init_stats(RESTORE_STATS)) - goto err; + return -1; if (lsm_check_opts()) - goto err; + return -1; timing_start(TIME_RESTORE); if (cpu_init() < 0) - goto err; + return -1; if (vdso_init_restore()) - goto err; + return -1; if (tty_init_restore()) - goto err; + return -1; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - goto err; + return -1; } if (prepare_task_entries() < 0) - goto err; + return -1; if (prepare_pstree() < 0) - goto err; + return -1; if (fdstore_init()) - goto err; + return -1; + + /* + * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store + * its socket file descriptor. This allows the main process and the target process to + * communicate with each other through this file descriptor. Therefore, cr_plugin_init + * must be initialized after fdstore_init. 
+ */ + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; if (inherit_fd_move_to_fdstore()) goto err; @@ -2416,36 +2414,41 @@ int cr_restore_tasks(void) if (crtools_prepare_shared() < 0) goto err; + if (prepare_cgroup()) + goto clean_cgroup; + if (criu_signals_setup() < 0) - goto err; + goto clean_cgroup; if (prepare_lazy_pages_socket() < 0) - goto err; + goto clean_cgroup; ret = restore_root_task(root_item); +clean_cgroup: + fini_cgroup(); err: cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); return ret; } -static long restorer_get_vma_hint(struct list_head *tgt_vma_list, - struct list_head *self_vma_list, long vma_len) +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long min_addr, long vma_len) { struct vma_area *t_vma, *s_vma; - long prev_vma_end = 0; + long prev_vma_end = min_addr; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; - prev_vma_end = kdat.mmap_min_addr; + INIT_LIST_HEAD(&end_vma.list); s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if (s_vma->list.next == self_vma_list) { + if ((s_vma->list.next == self_vma_list) || + vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { s_vma = &end_vma; continue; } @@ -2458,7 +2461,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, } if (prev_vma_end + vma_len > t_vma->e->start) { - if (t_vma->list.next == tgt_vma_list) { + if ((t_vma->list.next == tgt_vma_list) || + vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { t_vma = &end_vma; continue; } @@ -2476,254 +2480,6 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, return -1; } -static inline int timeval_valid(struct timeval *tv) -{ - return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); -} - -static inline int decode_itimer(char *n, ItimerEntry *ie, struct 
itimerval *val) -{ - if (ie->isec == 0 && ie->iusec == 0) { - memzero_p(val); - return 0; - } - - val->it_interval.tv_sec = ie->isec; - val->it_interval.tv_usec = ie->iusec; - - if (!timeval_valid(&val->it_interval)) { - pr_err("Invalid timer interval\n"); - return -1; - } - - if (ie->vsec == 0 && ie->vusec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - val->it_value.tv_sec = ie->isec; - val->it_value.tv_usec = ie->iusec; - } else { - val->it_value.tv_sec = ie->vsec; - val->it_value.tv_usec = ie->vusec; - } - - if (!timeval_valid(&val->it_value)) { - pr_err("Invalid timer value\n"); - return -1; - } - - pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, - val->it_value.tv_sec, val->it_value.tv_usec, - val->it_interval.tv_sec, val->it_interval.tv_usec); - - return 0; -} - -/* - * Legacy itimers restore from CR_FD_ITIMERS - */ - -static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) -{ - int ret = -1; - struct cr_img *img; - ItimerEntry *ie; - - if (!deprecated_ok("Itimers")) - return -1; - - img = open_image(CR_FD_ITIMERS, O_RSTR, pid); - if (!img) - return -1; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("real", ie, &args->itimers[0]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("virt", ie, &args->itimers[1]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("prof", ie, &args->itimers[2]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; -out: - close_image(img); - return ret; -} - -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) -{ - int ret = 0; - TaskTimersEntry *tte = core->tc->timers; - - if (!tte) - return prepare_itimers_from_fd(pid, args); - 
- ret |= decode_itimer("real", tte->real, &args->itimers[0]); - ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); - ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); - - return ret; -} - -static inline int timespec_valid(struct timespec *ts) -{ - return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); -} - -static inline int decode_posix_timer(PosixTimerEntry *pte, - struct restore_posix_timer *pt) -{ - pt->val.it_interval.tv_sec = pte->isec; - pt->val.it_interval.tv_nsec = pte->insec; - - if (!timespec_valid(&pt->val.it_interval)) { - pr_err("Invalid timer interval(posix)\n"); - return -1; - } - - if (pte->vsec == 0 && pte->vnsec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - pt->val.it_value.tv_sec = pte->isec; - pt->val.it_value.tv_nsec = pte->insec; - } else { - pt->val.it_value.tv_sec = pte->vsec; - pt->val.it_value.tv_nsec = pte->vnsec; - } - - if (!timespec_valid(&pt->val.it_value)) { - pr_err("Invalid timer value(posix)\n"); - return -1; - } - - pt->spt.it_id = pte->it_id; - pt->spt.clock_id = pte->clock_id; - pt->spt.si_signo = pte->si_signo; - pt->spt.it_sigev_notify = pte->it_sigev_notify; - pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); - pt->overrun = pte->overrun; - - return 0; -} - -static int cmp_posix_timer_proc_id(const void *p1, const void *p2) -{ - return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; -} - -static void sort_posix_timers(struct task_restore_args *ta) -{ - void *tmem; - - /* - * This is required for restorer's create_posix_timers(), - * it will probe them one-by-one for the desired ID, since - * kernel doesn't provide another API for timer creation - * with given ID. 
- */ - - if (ta->posix_timers_n > 0) { - tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); - qsort(tmem, ta->posix_timers_n, - sizeof(struct restore_posix_timer), - cmp_posix_timer_proc_id); - } -} - -/* - * Legacy posix timers restoration from CR_FD_POSIX_TIMERS - */ - -static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) -{ - struct cr_img *img; - int ret = -1; - struct restore_posix_timer *t; - - if (!deprecated_ok("Posix timers")) - return -1; - - img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); - if (!img) - return -1; - - ta->posix_timers_n = 0; - while (1) { - PosixTimerEntry *pte; - - ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); - if (ret <= 0) - break; - - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - break; - - ret = decode_posix_timer(pte, t); - if (ret < 0) - break; - - posix_timer_entry__free_unpacked(pte, NULL); - ta->posix_timers_n++; - } - - close_image(img); - if (!ret) - sort_posix_timers(ta); - - return ret; -} - -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) -{ - int i, ret = -1; - TaskTimersEntry *tte = core->tc->timers; - struct restore_posix_timer *t; - - ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); - - if (!tte) - return prepare_posix_timers_from_fd(pid, ta); - - ta->posix_timers_n = tte->n_posix; - for (i = 0; i < ta->posix_timers_n; i++) { - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - goto out; - - if (decode_posix_timer(tte->posix[i], t)) - goto out; - } - - ret = 0; - sort_posix_timers(ta); -out: - return ret; -} - -static inline int verify_cap_size(CredsEntry *ce) -{ - return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && - (ce->n_cap_prm == CR_CAP_SIZE) && (ce->n_cap_bnd == CR_CAP_SIZE)); -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; @@ -2738,7 +2494,7 @@ static int 
prepare_mm(pid_t pid, struct task_restore_args *args) goto out; } - args->mm_saved_auxv_size = mm->n_mm_saved_auxv*sizeof(auxv_t); + args->mm_saved_auxv_size = mm->n_mm_saved_auxv * sizeof(auxv_t); for (i = 0; i < mm->n_mm_saved_auxv; ++i) { args->mm_saved_auxv[i] = (auxv_t)mm->mm_saved_auxv[i]; } @@ -2749,7 +2505,7 @@ static int prepare_mm(pid_t pid, struct task_restore_args *args) args->fd_exe_link = exe_fd; - args->has_thp_enabled = rsti(current)->has_thp_enabled; + args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled; ret = 0; out: @@ -2767,32 +2523,64 @@ static int prepare_restorer_blob(void) * in turn will lead to set-exe-file prctl to fail with EBUSY. */ - restorer_len = pie_size(restorer); - restorer = mmap(NULL, restorer_len, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + struct parasite_blob_desc pbd; + + /* + * We pass native=true, which is then used to set the value of + * pbd.parasite_ip_off. We don't use parasite_ip_off, so the value we + * pass as native argument is not relevant. + */ + restorer_setup_c_header_desc(&pbd, true); + + /* + * args_off is the offset where the binary blob with its GOT table + * ends. As we don't do RPC, parasite sections after args_off can be + * ignored. See compel_infect() for a description of the parasite + * memory layout. 
+ */ + restorer_len = round_up(pbd.hdr.args_off, page_size()); + + restorer = mmap(NULL, restorer_len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (restorer == MAP_FAILED) { pr_perror("Can't map restorer code"); return -1; } - memcpy(restorer, &restorer_blob, sizeof(restorer_blob)); + memcpy(restorer, pbd.hdr.mem, pbd.hdr.bsize); + return 0; } static int remap_restorer_blob(void *addr) { + struct parasite_blob_desc pbd; void *mem; - mem = mremap(restorer, restorer_len, restorer_len, - MREMAP_FIXED | MREMAP_MAYMOVE, addr); + mem = mremap(restorer, restorer_len, restorer_len, MREMAP_FIXED | MREMAP_MAYMOVE, addr); if (mem != addr) { pr_perror("Can't remap restorer blob"); return -1; } - compel_relocs_apply(addr, addr, sizeof(restorer_blob), - restorer_relocs, ARRAY_SIZE(restorer_relocs)); + /* + * Pass native=true, which is then used to set the value of + * pbd.parasite_ip_off. parasite_ip_off is unused in restorer + * as compat (ia32) tasks are restored from native (x86_64) + * mode, so the value we pass as native argument is not relevant. + */ + restorer_setup_c_header_desc(&pbd, true); + compel_relocs_apply(addr, addr, &pbd); + + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. 
+ */ + __builtin___clear_cache(addr, addr + pbd.hdr.bsize); return 0; } @@ -2802,7 +2590,7 @@ static int validate_sched_parm(struct rst_sched_param *sp) if ((sp->nice < -20) || (sp->nice > 19)) return 0; - switch (sp->policy) { + switch (sp->policy & ~SCHED_RESET_ON_FORK) { case SCHED_RR: case SCHED_FIFO: return ((sp->prio > 0) && (sp->prio < 100)); @@ -2828,14 +2616,62 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc) sp->prio = tc->sched_prio; if (!validate_sched_parm(sp)) { - pr_err("Inconsistent sched params received (%d.%d.%d)\n", - sp->policy, sp->nice, sp->prio); + pr_err("Inconsistent sched params received (%d.%d.%d)\n", sp->policy, sp->nice, sp->prio); return -1; } return 0; } +static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc) +{ + /* compatibility with older CRIU versions */ + if (!tc->rseq_entry) + return 0; + + rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer; + rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size; + rseq->signature = tc->rseq_entry->signature; + + if (rseq->rseq_abi_pointer && !kdat.has_rseq) { + pr_err("rseq: can't restore as kernel doesn't support it\n"); + return -1; + } + + return 0; +} + +static void prep_libc_rseq_info(struct rst_rseq_param *rseq) +{ + if (!kdat.has_rseq) { + rseq->rseq_abi_pointer = 0; + return; + } + + if (!kdat.has_ptrace_get_rseq_conf) { +#if defined(__GLIBC__) && defined(RSEQ_SIG) + rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + /* + * Current glibc reports the feature/active size in + * __rseq_size, not the size passed to the kernel. + * This could be 20, but older kernels expect 32 for + * the size argument even if only 20 bytes are used. 
+ */ + rseq->rseq_abi_size = __rseq_size; + if (rseq->rseq_abi_size < 32) + rseq->rseq_abi_size = 32; + rseq->signature = RSEQ_SIG; +#else + rseq->rseq_abi_pointer = 0; +#endif + return; + } + + rseq->rseq_abi_pointer = kdat.libc_rseq_conf.rseq_abi_pointer; + rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; + rseq->signature = kdat.libc_rseq_conf.signature; +} + static rlim_t decode_rlim(rlim_t ival) { return ival == -1 ? RLIM_INFINITY : ival; @@ -2871,16 +2707,14 @@ static int prepare_rlimits_from_fd(int pid, struct task_restore_args *ta) r = rst_mem_alloc(sizeof(*r), RM_PRIVATE); if (!r) { - pr_err("Can't allocate memory for resource %d\n", - ta->rlims_n); + pr_err("Can't allocate memory for resource %d\n", ta->rlims_n); return -1; } r->rlim_cur = decode_rlim(re->cur); r->rlim_max = decode_rlim(re->max); if (r->rlim_cur > r->rlim_max) { - pr_err("Can't restore cur > max for %d.%d\n", - pid, ta->rlims_n); + pr_err("Can't restore cur > max for %d.%d\n", pid, ta->rlims_n); r->rlim_cur = r->rlim_max; } @@ -2925,11 +2759,11 @@ static int prepare_rlimits(int pid, struct task_restore_args *ta, CoreEntry *cor return 0; } -static int signal_to_mem(SiginfoEntry *sie) +static int signal_to_mem(SiginfoEntry *se) { siginfo_t *info, *t; - info = (siginfo_t *) sie->siginfo.data; + info = (siginfo_t *)se->siginfo.data; t = rst_mem_alloc(sizeof(siginfo_t), RM_PRIVATE); if (!t) return -1; @@ -2950,29 +2784,29 @@ static int open_signal_image(int type, pid_t pid, unsigned int *nr) *nr = 0; while (1) { - SiginfoEntry *sie; + SiginfoEntry *se; - ret = pb_read_one_eof(img, &sie, PB_SIGINFO); + ret = pb_read_one_eof(img, &se, PB_SIGINFO); if (ret <= 0) break; - if (sie->siginfo.len != sizeof(siginfo_t)) { + if (se->siginfo.len != sizeof(siginfo_t)) { pr_err("Unknown image format\n"); ret = -1; break; } - ret = signal_to_mem(sie); + ret = signal_to_mem(se); if (ret) break; (*nr)++; - siginfo_entry__free_unpacked(sie, NULL); + siginfo_entry__free_unpacked(se, NULL); } 
close_image(img); - return ret ? : 0; + return ret ?: 0; } static int prepare_one_signal_queue(SignalQueueEntry *sqe, unsigned int *nr) @@ -3000,7 +2834,7 @@ static int prepare_signals(int pid, struct task_restore_args *ta, CoreEntry *lea goto out; /* Prepare shared signals */ - if (!leader_core->tc->signals_s)/*backward compatibility*/ + if (!leader_core->tc->signals_s) /*backward compatibility*/ ret = open_signal_image(CR_FD_SIGNAL, pid, &ta->siginfo_n); else ret = prepare_one_signal_queue(leader_core->tc->signals_s, &ta->siginfo_n); @@ -3009,12 +2843,10 @@ static int prepare_signals(int pid, struct task_restore_args *ta, CoreEntry *lea goto out; for (i = 0; i < current->nr_threads; i++) { - if (!current->core[i]->thread_core->signals_p)/*backward compatibility*/ - ret = open_signal_image(CR_FD_PSIGNAL, - current->threads[i].ns[0].virt, &siginfo_priv_nr[i]); + if (!current->core[i]->thread_core->signals_p) /*backward compatibility*/ + ret = open_signal_image(CR_FD_PSIGNAL, current->threads[i].ns[0].virt, &siginfo_priv_nr[i]); else - ret = prepare_one_signal_queue(current->core[i]->thread_core->signals_p, - &siginfo_priv_nr[i]); + ret = prepare_one_signal_queue(current->core[i]->thread_core->signals_p, &siginfo_priv_nr[i]); if (ret < 0) goto out; } @@ -3023,10 +2855,11 @@ out: } extern void __gcov_flush(void) __attribute__((weak)); -void __gcov_flush(void) {} +void __gcov_flush(void) +{ +} -static void rst_reloc_creds(struct thread_restore_args *thread_args, - unsigned long *creds_pos_next) +static void rst_reloc_creds(struct thread_restore_args *thread_args, unsigned long *creds_pos_next) { struct thread_creds_args *args; @@ -3046,19 +2879,65 @@ static void rst_reloc_creds(struct thread_restore_args *thread_args, thread_args->creds_args = args; } -static struct thread_creds_args * -rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) +static bool groups_match(gid_t *groups, int n_groups) +{ + int n, len; + bool ret; + gid_t *gids; + + n = getgroups(0, 
NULL); + if (n == -1) { + pr_perror("Failed to get number of supplementary groups"); + return false; + } + if (n != n_groups) + return false; + if (n == 0) + return true; + + len = n * sizeof(gid_t); + gids = xmalloc(len); + if (gids == NULL) + return false; + + n = getgroups(n, gids); + if (n == -1) { + pr_perror("Failed to get supplementary groups"); + ret = false; + } else { + /* getgroups sorts gids, so it is safe to memcmp gid arrays */ + ret = !memcmp(gids, groups, len); + } + + xfree(gids); + return ret; +} + +static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words) +{ + int i, cap_end; + + for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) { + if (~in_caps[i / 32] & (1 << (i % 32))) + continue; + + pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap); + /* extra caps will be cleared below */ + } + + n_words = min(n_words, (kdat.last_cap + 31) / 32); + cap_end = (kdat.last_cap & 31) + 1; + memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words); + if ((cap_end & 31) && n_words) + out_caps[n_words - 1] &= (1 << cap_end) - 1; + memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words)); +} + +static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args *args; - if (!verify_cap_size(ce)) { - pr_err("Caps size mismatch %d %d %d %d\n", - (int)ce->n_cap_inh, (int)ce->n_cap_eff, - (int)ce->n_cap_prm, (int)ce->n_cap_bnd); - return ERR_PTR(-EINVAL); - } - this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); @@ -3072,8 +2951,6 @@ rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) char *rendered = NULL, *profile; profile = ce->lsm_profile; - if (opts.lsm_supplied) - profile = opts.lsm_profile; if (validate_lsm(profile) < 0) return ERR_PTR(-EINVAL); @@ -3096,7 +2973,7 @@ rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_profile = 
lsm_profile; - strncpy(args->lsm_profile, rendered, lsm_profile_len); + __strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); xfree(rendered); } } else { @@ -3130,7 +3007,7 @@ rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_sockcreate = lsm_sockcreate; - strncpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len); + __strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); xfree(rendered); } } else { @@ -3145,15 +3022,17 @@ rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) args->creds.cap_eff = NULL; args->creds.cap_prm = NULL; args->creds.cap_bnd = NULL; + args->creds.cap_amb = NULL; args->creds.groups = NULL; args->creds.lsm_profile = NULL; - memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); - memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); - memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); - memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); + copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh); + copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); + copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); + copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); + copy_caps(args->cap_amb, ce->cap_amb, ce->n_cap_amb); - if (ce->n_groups) { + if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; args->mem_groups_pos = rst_mem_align_cpos(RM_PRIVATE); @@ -3254,6 +3133,9 @@ static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) return restorer_sym(restorer_blob, arch_export_unmap); } +void arch_rsti_init(struct pstree_item *p) __attribute__((weak)); +void arch_rsti_init(struct pstree_item *p) {} + static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) { void *mem = MAP_FAILED; @@ -3298,8 +3180,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns /* Wait when all tasks restored all files */ if 
(restore_wait_other_tasks()) goto err_nv; - if (root_ns_mask & CLONE_NEWNS && - remount_readonly_mounts()) + if (root_ns_mask & CLONE_NEWNS && remount_readonly_mounts()) goto err_nv; } @@ -3315,10 +3196,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); - task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size + shstk_restorer_stack_size(); BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); - pr_info("%d threads require %ldK of memory\n", - current->nr_threads, KBYTES(task_args->bootstrap_len)); + pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); if (core_is_compat(core)) vdso_maps_rt = vdso_maps_compat; @@ -3326,10 +3206,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns vdso_maps_rt = vdso_maps; /* * Figure out how much memory runtime vdso and vvar will need. + * Check if vDSO or VVAR is not provided by kernel. 
*/ - vdso_rt_size = vdso_maps_rt.sym.vdso_size; - if (vdso_rt_size && vdso_maps_rt.sym.vvar_size) - vdso_rt_size += ALIGN(vdso_maps_rt.sym.vvar_size, PAGE_SIZE); + if (vdso_maps_rt.sym.vdso_size != VDSO_BAD_SIZE) { + vdso_rt_size = vdso_maps_rt.sym.vdso_size; + if (vdso_maps_rt.sym.vvar_size != VVAR_BAD_SIZE) + vdso_rt_size += vdso_maps_rt.sym.vvar_size; + } task_args->bootstrap_len += vdso_rt_size; /* @@ -3344,15 +3227,14 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns */ mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, - task_args->bootstrap_len); + shstk_min_mmap_addr(&task_args->shstk, kdat.mmap_min_addr), + task_args->bootstrap_len); if (mem == (void *)-1) { - pr_err("No suitable area for task_restore bootstrap (%ldK)\n", - task_args->bootstrap_len); + pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; } - pr_info("Found bootstrap VMA hint at: %p (needs ~%ldK)\n", - mem, KBYTES(task_args->bootstrap_len)); + pr_info("Found bootstrap VMA hint at: %p (needs ~%ldK)\n", mem, KBYTES(task_args->bootstrap_len)); ret = remap_restorer_blob(mem); if (ret < 0) @@ -3362,17 +3244,16 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * Prepare a memory map for restorer. Note a thread space * might be completely unused so it's here just for convenience. 
*/ - task_args->clone_restore_fn = restorer_sym(mem, arch_export_restore_thread); - restore_task_exec_start = restorer_sym(mem, arch_export_restore_task); - rsti(current)->munmap_restorer = restorer_munmap_addr(core, mem); + task_args->clone_restore_fn = restorer_sym(mem, arch_export_restore_thread); + restore_task_exec_start = restorer_sym(mem, arch_export_restore_task); + rsti(current)->munmap_restorer = restorer_munmap_addr(core, mem); task_args->bootstrap_start = mem; mem += restorer_len; /* VMA we need for stacks and sigframes for threads */ - if (mmap(mem, memzone_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0, 0) != mem) { - pr_err("Can't mmap section for restore code\n"); + if (mmap(mem, memzone_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0, 0) != mem) { + pr_perror("Can't mmap section for restore code"); goto err; } @@ -3381,7 +3262,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns mem += memzone_size; /* New home for task_restore_args and thread_restore_args */ - task_args = mremap(task_args, alen, alen, MREMAP_MAYMOVE|MREMAP_FIXED, mem); + task_args = mremap(task_args, alen, alen, MREMAP_MAYMOVE | MREMAP_FIXED, mem); if (task_args != mem) { pr_perror("Can't move task args"); goto err; @@ -3457,24 +3338,35 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns BUG_ON(core->mtype != CORE_ENTRY__MARCH); - task_args->logfd = log_get_fd(); - task_args->loglevel = log_get_loglevel(); + task_args->logfd = log_get_fd(); + task_args->loglevel = log_get_loglevel(); log_get_logstart(&task_args->logstart); - task_args->sigchld_act = sigchld_act; + task_args->sigchld_act = sigchld_act; strncpy(task_args->comm, core->tc->comm, TASK_COMM_LEN - 1); task_args->comm[TASK_COMM_LEN - 1] = 0; + prep_libc_rseq_info(&task_args->libc_rseq); + + task_args->uid = opts.uid; + for (i = 0; i < CR_CAP_SIZE; i++) + task_args->cap_eff[i] = opts.cap_eff[i]; + /* * Fill 
up per-thread data. */ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; + arch_rsti_init(current); for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; +#ifdef CONFIG_MIPS + k_rtsigset_t mips_blkset; +#else k_rtsigset_t *blkset = NULL; +#endif thread_args[i].pid = current->threads[i].ns[0].virt; thread_args[i].siginfo_n = siginfo_priv_nr[i]; thread_args[i].siginfo = task_args->siginfo; @@ -3485,35 +3377,55 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns if (thread_args[i].pid == pid) { task_args->t = thread_args + i; tcore = core; +#ifdef CONFIG_MIPS + mips_blkset.sig[0] = tcore->tc->blk_sigset; + mips_blkset.sig[1] = tcore->tc->blk_sigset_extended; +#else blkset = (void *)&tcore->tc->blk_sigset; +#endif } else { tcore = current->core[i]; - if (tcore->thread_core->has_blk_sigset) + if (tcore->thread_core->has_blk_sigset) { +#ifdef CONFIG_MIPS + mips_blkset.sig[0] = tcore->thread_core->blk_sigset; + mips_blkset.sig[1] = tcore->thread_core->blk_sigset_extended; +#else blkset = (void *)&tcore->thread_core->blk_sigset; +#endif + } } if ((tcore->tc || tcore->ids) && thread_args[i].pid != pid) { - pr_err("Thread has optional fields present %d\n", - thread_args[i].pid); + pr_err("Thread has optional fields present %d\n", thread_args[i].pid); ret = -1; } if (ret < 0) { - pr_err("Can't read core data for thread %d\n", - thread_args[i].pid); + pr_err("Can't read core data for thread %d\n", thread_args[i].pid); goto err; } - thread_args[i].ta = task_args; - thread_args[i].gpregs = *CORE_THREAD_ARCH_INFO(tcore)->gpregs; - thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; + thread_args[i].ta = task_args; + thread_args[i].gpregs = *CORE_THREAD_ARCH_INFO(tcore)->gpregs; + thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); + if (tcore->thread_core->has_cg_set && rsti(current)->cg_set != 
tcore->thread_core->cg_set) { + thread_args[i].cg_set = tcore->thread_core->cg_set; + thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); + } else { + thread_args[i].cg_set = -1; + } + + ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); + if (ret) + goto err; + rst_reloc_creds(&thread_args[i], &creds_pos_next); - thread_args[i].futex_rla = tcore->thread_core->futex_rla; - thread_args[i].futex_rla_len = tcore->thread_core->futex_rla_len; - thread_args[i].pdeath_sig = tcore->thread_core->pdeath_sig; + thread_args[i].futex_rla = tcore->thread_core->futex_rla; + thread_args[i].futex_rla_len = tcore->thread_core->futex_rla_len; + thread_args[i].pdeath_sig = tcore->thread_core->pdeath_sig; if (tcore->thread_core->pdeath_sig > _KNSIG) { pr_err("Pdeath signal is too big\n"); goto err; @@ -3529,7 +3441,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].mz = mz + i; sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe; +#ifdef CONFIG_MIPS + if (construct_sigframe(sigframe, sigframe, &mips_blkset, tcore)) +#else if (construct_sigframe(sigframe, sigframe, blkset, tcore)) +#endif goto err; if (tcore->thread_core->comm) @@ -3541,9 +3457,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns if (thread_args[i].pid != pid) core_entry__free_unpacked(tcore, NULL); - pr_info("Thread %4d stack %8p rt_sigframe %8p\n", - i, mz[i].stack, mz[i].rt_sigframe); - + pr_info("Thread %4d stack %8p rt_sigframe %8p\n", i, mz[i].stack, mz[i].rt_sigframe); } /* @@ -3553,10 +3467,15 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * self-vmas are unmaped. 
*/ mem += rst_mem_size; + + shstk_set_restorer_stack(&task_args->shstk, mem); + mem += shstk_restorer_stack_size(); + task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; task_args->can_map_vdso = kdat.can_map_vdso; + task_args->has_clone3_set_tid = kdat.has_clone3_set_tid; new_sp = restorer_stack(task_args->t->mz); @@ -3567,17 +3486,17 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns /* * Now prepare run-time data for threads restore. */ - task_args->nr_threads = current->nr_threads; - task_args->thread_args = thread_args; + task_args->nr_threads = current->nr_threads; + task_args->thread_args = thread_args; - task_args->auto_dedup = opts.auto_dedup; + task_args->auto_dedup = opts.auto_dedup; /* * In the restorer we need to know if it is SELinux or not. For SELinux * we must change the process context before creating threads. For * Apparmor we can change each thread after they have been created. 
*/ - task_args->lsm_type = kdat.lsm; + task_args->lsm_type = kdat.lsm; /* * Make root and cwd restore _that_ late not to break any @@ -3596,6 +3515,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); + close_service_fd(CGROUPD_SK); __gcov_flush(); @@ -3604,9 +3524,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns "task_args->nr_threads: %d\n" "task_args->clone_restore_fn: %p\n" "task_args->thread_args: %p\n", - task_args, task_args->t->pid, - task_args->nr_threads, - task_args->clone_restore_fn, + task_args, task_args->t->pid, task_args->nr_threads, task_args->clone_restore_fn, task_args->thread_args); /* diff --git a/criu/cr-service.c b/criu/cr-service.c index 0938db02b..dccf4ef38 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "version.h" #include "crtools.h" @@ -27,6 +28,7 @@ #include "cr-service.h" #include "cr-service-const.h" #include "page-xfer.h" +#include "protobuf.h" #include "net.h" #include "mount.h" #include "filesystems.h" @@ -39,6 +41,7 @@ #include "proc_parse.h" #include "common/scm.h" #include "uffd.h" +#include "pidfd-store.h" #include "setproctitle.h" @@ -49,18 +52,21 @@ unsigned int service_sk_ino = -1; static int recv_criu_msg(int socket_fd, CriuReq **req) { - unsigned char *buf; - int len; + u8 local[PB_PKOBJ_LOCAL_SIZE]; + void *buf = (void *)&local; + int len, exit_code = -1; len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK); if (len == -1) { pr_perror("Can't read request"); - return -1; + goto err; } - buf = xmalloc(len); - if (!buf) - return -ENOMEM; + if (len > sizeof(local)) { + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + } len = recv(socket_fd, buf, len, MSG_TRUNC); if (len == -1) { @@ -80,43 +86,47 @@ static int recv_criu_msg(int socket_fd, CriuReq **req) goto err; } - xfree(buf); - return 0; + 
exit_code = 0; err: - xfree(buf); - return -1; + if (buf != (void *)&local) + xfree(buf); + return exit_code; } static int send_criu_msg_with_fd(int socket_fd, CriuResp *msg, int fd) { - unsigned char *buf; - int len, ret; + u8 local[PB_PKOBJ_LOCAL_SIZE]; + void *buf = (void *)&local; + int len, exit_code = -1; len = criu_resp__get_packed_size(msg); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; + if (len > sizeof(local)) { + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + } if (criu_resp__pack(msg, buf) != len) { pr_perror("Failed packing response"); goto err; } - if (fd >= 0) { - ret = send_fds(socket_fd, NULL, 0, &fd, 1, buf, len); - } else - ret = write(socket_fd, buf, len); - if (ret < 0) { + if (fd >= 0) + exit_code = send_fds(socket_fd, NULL, 0, &fd, 1, buf, len); + else + exit_code = write(socket_fd, buf, len); + + if (exit_code < 0) { pr_perror("Can't send response"); goto err; } - xfree(buf); - return 0; + exit_code = 0; err: - xfree(buf); - return -1; + if (buf != (void *)&local) + xfree(buf); + return exit_code; } static int send_criu_msg(int socket_fd, CriuResp *msg) @@ -160,11 +170,11 @@ int send_criu_dump_resp(int socket_fd, bool success, bool restored) return send_criu_msg(socket_fd, &msg); } -static int send_criu_pre_dump_resp(int socket_fd, bool success) +static int send_criu_pre_dump_resp(int socket_fd, bool success, bool single) { CriuResp msg = CRIU_RESP__INIT; - msg.type = CRIU_REQ_TYPE__PRE_DUMP; + msg.type = single ? 
CRIU_REQ_TYPE__SINGLE_PRE_DUMP : CRIU_REQ_TYPE__PRE_DUMP; msg.success = success; set_resp_err(&msg); @@ -230,15 +240,165 @@ int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) return 0; } -static char images_dir[PATH_MAX]; +int exec_rpc_query_external_files(char *name, int sk) +{ + int i, ret; + CriuNotify cn = CRIU_NOTIFY__INIT; + CriuResp msg = CRIU_RESP__INIT; + CriuReq *req; + + cn.script = name; + + msg.type = CRIU_REQ_TYPE__NOTIFY; + msg.success = true; + msg.notify = &cn; + + ret = send_criu_msg_with_fd(sk, &msg, -1); + if (ret < 0) + return ret; + + ret = recv_criu_msg(sk, &req); + if (ret < 0) + return ret; + + if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { + pr_err("RPC client reported script error\n"); + return -1; + } + + ret = 0; + if (req->opts) + for (i = 0; i < req->opts->n_external; i++) { + char *key = req->opts->external[i]; + pr_info("Adding external object: %s\n", key); + if (add_external(key)) { + pr_err("Failed to add external object: %s\n", key); + ret = -1; + } + } + else + pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); + + criu_req__free_unpacked(req, NULL); + return ret; +} + +static int resolve_images_dir_path(char *images_dir_path, + bool imgs_changed_by_rpc_conf, + const CriuOpts *req, + pid_t peer_pid) +{ + /* + * images_dir_fd is a required RPC parameter with -1 as default value. + * + * This assumes that if opts.imgs_dir is set, we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else if (req->images_dir_fd != -1) { + snprintf(images_dir_path, PATH_MAX, "/proc/%d/fd/%d", peer_pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else { + /* + * Since images dir is not required in CHECK mode, we need to + * check for work_dir_fd in setup_images_and_workdir() + */ + if (opts.mode == CR_CHECK) + return 0; + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + return -1; + } + + return 0; +} + +static int setup_images_and_workdir(const char *images_dir_path, + bool work_changed_by_rpc_conf, + CriuOpts *req, + pid_t peer_pid) +{ + char work_dir_path[PATH_MAX] = ""; + + /* We don't need to open images dir in CHECK mode. */ + if (opts.mode != CR_CHECK) { + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (images_dir_path[0] != '\0') + strcpy(work_dir_path, images_dir_path); + + if (work_dir_path[0] == '\0') { + pr_err("images-dir or work-dir is required when using log file\n"); + return -1; + } + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + return -1; + } + + return 0; +} + +static int setup_logging_from_req(CriuOpts *req, bool output_changed_by_rpc_conf) +{ + if (req->log_file && !output_changed_by_rpc_conf) { + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + return -1; + } + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; /* log_init(NULL) writes to stderr */ + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + opts.log_level = req->log_level; + log_set_loglevel(opts.log_level); + if (log_init(opts.output)) { + pr_perror("Can't initiate log"); + return -1; + } + + return 0; +} static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); - char images_dir_path[PATH_MAX]; - char work_dir_path[PATH_MAX]; + char images_dir_path[PATH_MAX] = ""; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -251,6 +411,23 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } + /* + * The options relevant in CHECK mode are: log_file, log_to_stderr, and log_level. 
+ * When logging to a file, we also need to resolve images_dir and work_dir. + */ + if (opts.mode == CR_CHECK) { + if (!req) + return 0; /* nothing to do */ + + /* + * A log file is needed only if: + * - log_file is explicitly set, or + * - log_to_stderr is NOT requested (i.e., using DEFAULT_LOG_FILENAME) + */ + if (!req->log_file || (req->has_log_to_stderr && req->log_to_stderr)) + return 0; /* no log file, don't require images_dir or work_dir */ + } + if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; @@ -259,142 +436,8 @@ static int setup_opts_from_req(int sk, CriuOpts *req) BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; - /* - * Evaluate an additional configuration file if specified. - * This needs to happen twice, because it is needed early to detect - * things like work_dir, imgs_dir and logfile. The second parsing - * of the optional RPC configuration file happens at the end and - * overwrites all options set via RPC. - */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - char *tmp_imgs = opts.imgs_dir; - - opts.output = NULL; - opts.work_dir = NULL; - opts.imgs_dir = NULL; - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - goto err; - } - /* If this is non-NULL, the RPC configuration file had a value, use it.*/ - if (opts.output) - output_changed_by_rpc_conf = true; - /* If this is NULL, use the old value if it was set. */ - if (!opts.output && tmp_output) { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) - work_changed_by_rpc_conf = true; - if (!opts.work_dir && tmp_work) { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. 
- * Either it is set in the RPC configuration file or it is set - * via RPC. - */ - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - } - - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. - * The idea is that only the RPC configuration file is able to - * overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); - - if (open_image_dir(images_dir_path) < 0) { - pr_perror("Can't open images directory"); - goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - /* chdir to work dir */ - if (work_changed_by_rpc_conf) - /* Use the value from the RPC configuration file first. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - /* Use the value set via RPC. */ - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - /* Use the value from one of the other configuration files. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - /* Use the images directory a work directory. 
*/ - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } - - /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. - */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); - goto err; - } - - if (req->config_file) { - pr_debug("Overwriting RPC settings with values from %s\n", req->config_file); - } - - if (kerndat_init()) - return 1; + if (req->has_unprivileged) + opts.unprivileged = req->unprivileged; if (log_keep_err()) { pr_perror("Can't tune log"); @@ -405,6 +448,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; + if (req->has_leave_stopped && req->leave_stopped) + opts.final_state = TASK_STOPPED; + if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; @@ -448,6 +494,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_shell_job) opts.shell_job = req->shell_job; + if (req->has_skip_file_rwx_check) + opts.skip_file_rwx_check = req->skip_file_rwx_check; + if (req->has_file_locks) opts.handle_file_locks = req->file_locks; @@ -473,6 +522,35 @@ static int setup_opts_from_req(int sk, CriuOpts *req) opts.lazy_pages = req->lazy_pages; } + if (req->has_pre_dump_mode) { + switch (req->pre_dump_mode) { + case CRIU_PRE_DUMP_MODE__SPLICE: + opts.pre_dump_mode = PRE_DUMP_SPLICE; + break; + case CRIU_PRE_DUMP_MODE__VM_READ: + opts.pre_dump_mode = PRE_DUMP_READ; + 
break; + default: + goto err; + } + } + + if (req->has_network_lock) { + switch (req->network_lock) { + case CRIU_NETWORK_LOCK_METHOD__IPTABLES: + opts.network_lock_method = NETWORK_LOCK_IPTABLES; + break; + case CRIU_NETWORK_LOCK_METHOD__NFTABLES: + opts.network_lock_method = NETWORK_LOCK_NFTABLES; + break; + case CRIU_NETWORK_LOCK_METHOD__SKIP: + opts.network_lock_method = NETWORK_LOCK_SKIP; + break; + default: + goto err; + } + } + if (req->ps) { opts.port = (short)req->ps->port; @@ -524,8 +602,7 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; for (i = 0; i < req->n_cg_root; i++) { - if (new_cg_root_add(req->cg_root[i]->ctrl, - req->cg_root[i]->path)) + if (new_cg_root_add(req->cg_root[i]->ctrl, req->cg_root[i]->path)) goto err; } @@ -594,6 +671,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) SET_CHAR_OPTS(lsm_profile, req->lsm_profile); } + if (req->lsm_mount_context) + SET_CHAR_OPTS(lsm_mount_context, req->lsm_mount_context); + if (req->has_timeout) opts.timeout = req->timeout; @@ -608,6 +688,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } + if (req->cgroup_yard) + SET_CHAR_OPTS(cgroup_yard, req->cgroup_yard); + if (req->tls_cacert) SET_CHAR_OPTS(tls_cacert, req->tls_cacert); if (req->tls_cacrl) @@ -638,6 +721,81 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } + if (req->has_status_fd) { + pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); + + sprintf(status_fd, "/proc/%d/fd/%d", ids.pid, req->status_fd); + opts.status_fd = open(status_fd, O_WRONLY); + if (opts.status_fd < 0) { + pr_perror("Can't reopen status fd %s", status_fd); + goto err; + } + } + + if (req->orphan_pts_master) + opts.orphan_pts_master = true; + + if (req->has_display_stats) + opts.display_stats = req->display_stats; + + /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. 
*/ + if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + + opts.output = NULL; + opts.work_dir = NULL; + + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(opts.imgs_dir); + opts.imgs_dir = NULL; + + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + + rpc_cfg_file = req->config_file; + i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); + if (i) { + xfree(tmp_output); + xfree(tmp_work); + goto err; + } + + /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ + /* If opts.{output,work_dir} is NULL, use the old value if it was set. */ + if (opts.output) { + output_changed_by_rpc_conf = true; + } else { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) { + work_changed_by_rpc_conf = true; + } else { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + + xfree(tmp_output); + xfree(tmp_work); + } + + if (resolve_images_dir_path(images_dir_path, imgs_changed_by_rpc_conf, req, ids.pid) < 0) + goto err; + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + if (setup_images_and_workdir(images_dir_path, work_changed_by_rpc_conf, req, ids.pid)) + goto err; if (req->n_irmap_scan_paths) { for (i = 0; i < req->n_irmap_scan_paths; i++) { @@ -646,27 +804,23 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } } - if (req->has_status_fd) { - sprintf(status_fd, "/proc/%d/fd/%d", ids.pid, req->status_fd); - opts.status_fd = open(status_fd, O_WRONLY); - if (opts.status_fd < 0) - goto err; - } + /* initiate log file in work dir */ + if (setup_logging_from_req(req, output_changed_by_rpc_conf)) + goto err; - if (req->orphan_pts_master) - opts.orphan_pts_master = true; + if (check_caps()) 
+ goto err; + if (kerndat_init()) + goto err; - /* Evaluate additional configuration file a second time to overwrite - * all RPC settings. */ - if (req->config_file) { - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) - goto err; - } + /* init_pidfd_store_sk must be called after kerndat_init. */ + if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) + goto err; + + if (req->mntns_compat_mode) + opts.mntns_compat_mode = true; - log_set_loglevel(opts.log_level); if (check_options()) goto err; @@ -682,11 +836,14 @@ static int dump_using_req(int sk, CriuOpts *req) bool success = false; bool self_dump = !req->pid; + opts.mode = CR_DUMP; if (setup_opts_from_req(sk, req)) goto exit; - setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("dump --rpc -t %d", req->pid); + if (init_pidfd_store_hash()) + goto pidfd_store_err; /* * FIXME -- cr_dump_tasks() may return code from custom * scripts, that can be positive. However, right now we @@ -698,7 +855,9 @@ static int dump_using_req(int sk, CriuOpts *req) success = true; exit: - if (req->leave_running || !self_dump || !success) { + free_pidfd_store(); +pidfd_store_err: + if (req->leave_running || !self_dump || !success) { if (send_criu_dump_resp(sk, success, false) == -1) { pr_perror("Can't send response"); success = false; @@ -720,18 +879,18 @@ static int restore_using_req(int sk, CriuOpts *req) opts.restore_detach = true; + opts.mode = CR_RESTORE; if (setup_opts_from_req(sk, req)) goto exit; - setproctitle("restore --rpc -D %s", images_dir); + __setproctitle("restore --rpc"); if (cr_restore_tasks()) goto exit; success = true; exit: - if (send_criu_restore_resp(sk, success, - root_item ? root_item->pid->real : -1) == -1) { + if (send_criu_restore_resp(sk, success, root_item ? 
root_item->pid->real : -1) == -1) { pr_perror("Can't send response"); success = false; } @@ -763,6 +922,11 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -770,8 +934,9 @@ static int check(int sk, CriuOpts *req) } if (pid == 0) { - setproctitle("check --rpc"); + __setproctitle("check --rpc"); + opts.mode = CR_CHECK; if (setup_opts_from_req(sk, req)) exit(1); @@ -786,14 +951,20 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: + set_resp_err(&resp); return send_criu_msg(sk, &resp); } -static int pre_dump_using_req(int sk, CriuOpts *req) +static int pre_dump_using_req(int sk, CriuOpts *req, bool single) { int pid, status; bool success = false; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -803,16 +974,22 @@ static int pre_dump_using_req(int sk, CriuOpts *req) if (pid == 0) { int ret = 1; + opts.mode = CR_PRE_DUMP; if (setup_opts_from_req(sk, req)) goto cout; - setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("pre-dump --rpc -t %d", req->pid); + + if (init_pidfd_store_hash()) + goto pidfd_store_err; if (cr_pre_dump_tasks(req->pid)) goto cout; ret = 0; -cout: + cout: + free_pidfd_store(); + pidfd_store_err: exit(ret); } @@ -825,7 +1002,7 @@ cout: success = true; out: - if (send_criu_pre_dump_resp(sk, success) == -1) { + if (send_criu_pre_dump_resp(sk, success, single) == -1) { pr_perror("Can't send pre-dump resp"); success = false; } @@ -838,7 +1015,7 @@ static int pre_dump_loop(int sk, CriuReq *msg) int ret; do { - ret = pre_dump_using_req(sk, msg->opts); + ret = pre_dump_using_req(sk, msg->opts, false); if (ret < 0) return ret; @@ -866,6 +1043,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; 
struct ps_info info; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -875,10 +1057,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) if (pid == 0) { close(start_pipe[0]); + opts.mode = CR_PAGE_SERVER; if (setup_opts_from_req(sk, req)) goto out_ch; - setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); + __setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); pr_debug("Starting page server\n"); @@ -896,7 +1079,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) } ret = 0; -out_ch: + out_ch: if (daemon_mode && ret < 0 && pid > 0) kill(pid, SIGKILL); close(start_pipe[1]); @@ -938,6 +1121,7 @@ out_ch: out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -961,8 +1145,7 @@ static int chk_keepopen_req(CriuReq *msg) if (msg->type == CRIU_REQ_TYPE__PAGE_SERVER_CHLD) /* This just fork()-s so no leaks */ return 0; - else if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP || - msg->type == CRIU_REQ_TYPE__CPUINFO_CHECK) + else if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP || msg->type == CRIU_REQ_TYPE__CPUINFO_CHECK) return 0; else if (msg->type == CRIU_REQ_TYPE__FEATURE_CHECK) return 0; @@ -976,7 +1159,7 @@ static int chk_keepopen_req(CriuReq *msg) * Return the version information, depending on the information * available in version.h */ -static int handle_version(int sk, CriuReq * msg) +static int handle_version(int sk, CriuReq *msg) { CriuResp resp = CRIU_RESP__INIT; CriuVersion version = CRIU_VERSION__INIT; @@ -1014,7 +1197,7 @@ static int handle_version(int sk, CriuReq * msg) * For each feature which has been requested in msg->features * the corresponding parameter will be set in resp.features. 
*/ -static int handle_feature_check(int sk, CriuReq * msg) +static int handle_feature_check(int sk, CriuReq *msg) { CriuResp resp = CRIU_RESP__INIT; CriuFeatures feat = CRIU_FEATURES__INIT; @@ -1026,6 +1209,8 @@ static int handle_feature_check(int sk, CriuReq * msg) feat.mem_track = false; feat.has_lazy_pages = 1; feat.lazy_pages = false; + feat.has_pidfd_store = 1; + feat.pidfd_store = false; pid = fork(); if (pid < 0) { @@ -1034,20 +1219,20 @@ static int handle_feature_check(int sk, CriuReq * msg) } if (pid == 0) { - /* kerndat_init() is called from setup_opts_from_req() */ - if (setup_opts_from_req(sk, msg->opts)) + if (kerndat_init()) exit(1); - setproctitle("feature-check --rpc"); + __setproctitle("feature-check --rpc"); - if ((msg->features->has_mem_track == 1) && - (msg->features->mem_track == true)) + if ((msg->features->has_mem_track == 1) && (msg->features->mem_track == true)) feat.mem_track = kdat.has_dirty_track; - if ((msg->features->has_lazy_pages == 1) && - (msg->features->lazy_pages == true)) + if ((msg->features->has_lazy_pages == 1) && (msg->features->lazy_pages == true)) feat.lazy_pages = kdat.has_uffd && uffd_noncooperative(); + if ((msg->features->has_pidfd_store == 1) && (msg->features->pidfd_store == true)) + feat.pidfd_store = kdat.has_pidfd_getfd && kdat.has_pidfd_open; + resp.features = &feat; resp.type = msg->type; /* The feature check is working, actual results are in resp.features */ @@ -1069,6 +1254,8 @@ static int handle_feature_check(int sk, CriuReq * msg) if (status != 0) goto out; + return 0; + /* * The child process was not able to send an answer. Tell * the RPC client that something did not work as expected. 
@@ -1109,6 +1296,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1118,19 +1310,17 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; + opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; - setproctitle("cpuinfo %s --rpc -D %s", - msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? - "dump" : "check", - images_dir); + __setproctitle("cpuinfo %s --rpc", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check"); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); else ret = cpuinfo_check(); -cout: + cout: exit(ret); } @@ -1159,7 +1349,7 @@ cout: out: resp.type = msg->type; resp.success = success; - + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1168,7 +1358,17 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with a unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace.
+ */ + util_init(); + more: + opts.mode = CR_SWRK; + if (recv_criu_msg(sk, &msg) != 0) { pr_perror("Can't recv request"); goto err; @@ -1197,7 +1397,7 @@ more: ret = start_page_server_req(sk, msg->opts, false); break; case CRIU_REQ_TYPE__WAIT_PID: - ret = handle_wait_pid(sk, msg->pid); + ret = handle_wait_pid(sk, msg->pid); break; case CRIU_REQ_TYPE__CPUINFO_DUMP: case CRIU_REQ_TYPE__CPUINFO_CHECK: @@ -1209,6 +1409,9 @@ more: case CRIU_REQ_TYPE__VERSION: ret = handle_version(sk, msg); break; + case CRIU_REQ_TYPE__SINGLE_PRE_DUMP: + ret = pre_dump_using_req(sk, msg->opts, true); + break; default: send_criu_err(sk, "Invalid req"); @@ -1246,22 +1449,21 @@ static void reap_worker(int signo) } if (WIFEXITED(status)) - pr_info("Worker(pid %d) exited with %d\n", - pid, WEXITSTATUS(status)); + pr_info("Worker(pid %d) exited with %d\n", pid, WEXITSTATUS(status)); else if (WIFSIGNALED(status)) - pr_info("Worker(pid %d) was killed by %d: %s\n", pid, - WTERMSIG(status), strsignal(WTERMSIG(status))); + pr_info("Worker(pid %d) was killed by %d: %s\n", pid, WTERMSIG(status), + strsignal(WTERMSIG(status))); } } -static int setup_sigchld_handler() +static int setup_sigchld_handler(void) { struct sigaction action; sigemptyset(&action.sa_mask); sigaddset(&action.sa_mask, SIGCHLD); - action.sa_handler = reap_worker; - action.sa_flags = SA_RESTART; + action.sa_handler = reap_worker; + action.sa_flags = SA_RESTART; if (sigaction(SIGCHLD, &action, NULL)) { pr_perror("Can't setup SIGCHLD handler"); @@ -1271,14 +1473,14 @@ static int setup_sigchld_handler() return 0; } -static int restore_sigchld_handler() +static int restore_sigchld_handler(void) { struct sigaction action; sigemptyset(&action.sa_mask); sigaddset(&action.sa_mask, SIGCHLD); - action.sa_handler = SIG_DFL; - action.sa_flags = SA_RESTART; + action.sa_handler = SIG_DFL; + action.sa_flags = SA_RESTART; if (sigaction(SIGCHLD, &action, NULL)) { pr_perror("Can't restore SIGCHLD handler"); @@ -1315,17 +1517,14 @@ int cr_service(bool 
daemon_mode) SET_CHAR_OPTS(addr, CR_DEFAULT_SERVICE_ADDRESS); } - strncpy(server_addr.sun_path, opts.addr, - sizeof(server_addr.sun_path) - 1); + strncpy(server_addr.sun_path, opts.addr, sizeof(server_addr.sun_path) - 1); - server_addr_len = strlen(server_addr.sun_path) - + sizeof(server_addr.sun_family); + server_addr_len = strlen(server_addr.sun_path) + sizeof(server_addr.sun_family); client_addr_len = sizeof(client_addr); unlink(server_addr.sun_path); - if (bind(server_fd, (struct sockaddr *) &server_addr, - server_addr_len) == -1) { + if (bind(server_fd, (struct sockaddr *)&server_addr, server_addr_len) == -1) { pr_perror("Can't bind"); goto err; } @@ -1361,7 +1560,7 @@ int cr_service(bool daemon_mode) if (setup_sigchld_handler()) goto err; - if (close_status_fd()) + if (status_ready()) goto err; while (1) { diff --git a/criu/crtools.c b/criu/crtools.c index a94875684..4dc55a065 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -47,6 +47,84 @@ #include "setproctitle.h" #include "sysctl.h" +void flush_early_log_to_stderr(void) __attribute__((destructor)); + +void flush_early_log_to_stderr(void) +{ + flush_early_log_buffer(STDERR_FILENO); +} + +static int image_dir_mode(void) +{ + switch (opts.mode) { + case CR_DUMP: + /* fallthrough */ + case CR_CPUINFO_DUMP: + /* fallthrough */ + case CR_PRE_DUMP: + return O_DUMP; + case CR_RESTORE: + return O_RSTR; + default: + return -1; + } + + /* never reached */ + BUG(); + return -1; +} + +struct { + char *cmd; + int mode; +} commands[] = { + { "dump", CR_DUMP }, + { "pre-dump", CR_PRE_DUMP }, + { "restore", CR_RESTORE }, + { "lazy-pages", CR_LAZY_PAGES }, + { "check", CR_CHECK }, + { "page-server", CR_PAGE_SERVER }, + { "service", CR_SERVICE }, + { "swrk", CR_SWRK }, + { "dedup", CR_DEDUP }, + { "exec", CR_EXEC_DEPRECATED }, + { "show", CR_SHOW_DEPRECATED }, +}; + +static int parse_criu_mode(int argc, char **argv, int *optind) +{ + char *cmd = argv[*optind]; + bool has_sub_command = (argc - *optind) > 1; + char 
*subcommand = has_sub_command ? argv[*optind + 1] : NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(cmd, commands[i].cmd)) + continue; + opts.mode = commands[i].mode; + return 0; + } + + if (!strcmp(cmd, "cpuinfo")) { + if (subcommand == NULL) { + pr_err("cpuinfo requires an action: dump or check\n"); + return -1; + } + if (!strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; + else { + pr_err("unknown cpuinfo sub-command: %s\n", subcommand); + return -1; + } + (*optind)++; + return 0; + } + pr_err("unknown command: %s\n", argv[*optind]); + return -1; +} + int main(int argc, char *argv[], char *envp[]) { int ret = -1; @@ -54,37 +132,74 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; + char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); /* We use it for fd overlap handling in clone_service_fd() */ - BUG_ON(get_service_fd(SERVICE_FD_MIN+1) < - get_service_fd(SERVICE_FD_MAX-1)); + BUG_ON(get_service_fd(SERVICE_FD_MIN + 1) < get_service_fd(SERVICE_FD_MAX - 1)); - if (fault_injection_init()) + if (fault_injection_init()) { + pr_err("Failed to initialize fault injection when initializing crtools.\n"); return 1; + } cr_pb_init(); - setproctitle_init(argc, argv, envp); + __setproctitle_init(argc, argv, envp); if (argc < 2) goto usage; init_opts(); - ret = parse_options(argc, argv, &usage_error, &has_exec_cmd, state); if (ret == 1) return 1; if (ret == 2) goto usage; + if (optind >= argc) { + pr_err("command is required\n"); + goto usage; + } log_set_loglevel(opts.log_level); - if (!strcmp(argv[1], "swrk")) { - if (argc < 3) - goto usage; + /* + * The kernel might send us lethal signals in the following cases: + * 1) Writing to a pipe whose reader has disappeared.
+ * 2) Writing to a socket of type SOCK_STREAM which is no longer connected. + * We deal with write()/send() failures on our own, and prefer not to get killed. + * So we ignore SIGPIPEs. + * + * Pipes are used in various places: + * 1) Receiving application page data + * 2) Transmitting data to the image streamer + * 3) Emitting logs (potentially to a pipe). + * Sockets are mainly used in transmitting memory data. + */ + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { + pr_perror("Failed to set a SIGPIPE signal ignore."); + return 1; + } + + cmd = argv[optind]; + ret = parse_criu_mode(argc, argv, &optind); + if (ret) + goto usage; + + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with a unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); + if (opts.mode == CR_SWRK) { + if (argc != optind + 2) { + fprintf(stderr, "Usage: criu swrk \n"); + return 1; + } /* * This is to start criu service worker from libcriu calls. * The usage is "criu swrk " and is not for CLI/scripts. @@ -92,13 +207,11 @@ int main(int argc, char *argv[], char *envp[]) * corresponding lib call change.
*/ opts.swrk_restore = true; - return cr_service_work(atoi(argv[2])); + return cr_service_work(atoi(argv[optind + 1])); } - if (check_options()) { - flush_early_log_buffer(STDERR_FILENO); + if (check_caps()) return 1; - } if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); @@ -106,26 +219,21 @@ int main(int argc, char *argv[], char *envp[]) if (opts.work_dir == NULL) SET_CHAR_OPTS(work_dir, opts.imgs_dir); - if (optind >= argc) { - pr_msg("Error: command is required\n"); - goto usage; - } - has_sub_command = (argc - optind) > 1; if (has_exec_cmd) { if (!has_sub_command) { - pr_msg("Error: --exec-cmd requires a command\n"); + pr_err("--exec-cmd requires a command\n"); goto usage; } - if (strcmp(argv[optind], "restore")) { - pr_msg("Error: --exec-cmd is available for the restore command only\n"); + if (opts.mode != CR_RESTORE) { + pr_err("--exec-cmd is available for the restore command only\n"); goto usage; } if (opts.restore_detach) { - pr_msg("Error: --restore-detached and --exec-cmd cannot be used together\n"); + pr_err("--restore-detached and --exec-cmd cannot be used together\n"); goto usage; } @@ -134,30 +242,30 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else { - /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (strcmp(argv[optind], "cpuinfo") && has_sub_command) { - pr_msg("Error: excessive parameter%s for command %s\n", - (argc - optind) > 2 ? "s" : "", argv[optind]); - goto usage; - } + } else if (has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", cmd); + goto usage; + } + + if (opts.stream && image_dir_mode() == -1) { + pr_err("--stream cannot be used with the %s command\n", cmd); + goto usage; } /* We must not open imgs dir, if service is called */ - if (strcmp(argv[optind], "service")) { - ret = open_image_dir(opts.imgs_dir); - if (ret < 0) + if (opts.mode != CR_SERVICE) { + ret = open_image_dir(opts.imgs_dir, image_dir_mode()); + if (ret < 0) { + pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; + } } /* * When a process group becomes an orphan, * its processes are sent a SIGHUP signal */ - if (!strcmp(argv[optind], "restore") && - opts.restore_detach && - opts.final_state == TASK_STOPPED && - opts.shell_job) + if (opts.mode == CR_RESTORE && opts.restore_detach && opts.final_state == TASK_STOPPED && opts.shell_job) pr_warn("Stopped and detached shell job will get SIGHUP from OS.\n"); if (chdir(opts.work_dir)) { @@ -168,14 +276,19 @@ int main(int argc, char *argv[], char *envp[]) if (log_init(opts.output)) return 1; - if (kerndat_init()) + if (kerndat_init()) { + pr_err("Could not initialize kernel features detection.\n"); + return 1; + } + + if (check_options()) return 1; - if (opts.deprecated_ok) - pr_debug("DEPRECATED ON\n"); + if (fault_injected(FI_CANNOT_MAP_VDSO)) + kdat.can_map_vdso = 0; if (!list_empty(&opts.inherit_fds)) { - if (strcmp(argv[optind], "restore")) { + if (opts.mode != CR_RESTORE) { pr_err("--inherit-fd is restore-only option\n"); return 1; } @@ -186,13 +299,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (!strcmp(argv[optind], "dump")) { + switch (opts.mode) { + case CR_DUMP: if (!opts.tree_id) goto opt_pid_missing; - return cr_dump_tasks(opts.tree_id); - } - if (!strcmp(argv[optind], "pre-dump")) { + return cr_dump_tasks(opts.tree_id); + case CR_PRE_DUMP: if (!opts.tree_id) goto opt_pid_missing; @@ -202,9 +315,7 @@ int main(int argc, char *argv[], char *envp[]) } return 
cr_pre_dump_tasks(opts.tree_id) != 0; - } - - if (!strcmp(argv[optind], "restore")) { + case CR_RESTORE: if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -217,68 +328,62 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; - } - if (!strcmp(argv[optind], "lazy-pages")) + case CR_LAZY_PAGES: return cr_lazy_pages(opts.daemon_mode) != 0; - if (!strcmp(argv[optind], "check")) + case CR_CHECK: return cr_check() != 0; - if (!strcmp(argv[optind], "page-server")) + case CR_PAGE_SERVER: return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (!strcmp(argv[optind], "service")) + case CR_SERVICE: return cr_service(opts.daemon_mode); - if (!strcmp(argv[optind], "dedup")) + case CR_DEDUP: return cr_dedup() != 0; - if (!strcmp(argv[optind], "cpuinfo")) { - if (!argv[optind + 1]) { - pr_msg("Error: cpuinfo requires an action: dump or check\n"); - goto usage; - } - if (!strcmp(argv[optind + 1], "dump")) - return cpuinfo_dump(); - else if (!strcmp(argv[optind + 1], "check")) - return cpuinfo_check(); - } + case CR_CPUINFO_DUMP: + return cpuinfo_dump(); - if (!strcmp(argv[optind], "exec")) { - pr_msg("The \"exec\" action is deprecated by the Compel library.\n"); + case CR_CPUINFO_CHECK: + return cpuinfo_check(); + + case CR_EXEC_DEPRECATED: + pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; - } - if (!strcmp(argv[optind], "show")) { - pr_msg("The \"show\" action is deprecated by the CRIT utility.\n"); - pr_msg("To view an image use the \"crit decode -i $name --pretty\" command.\n"); + case CR_SHOW_DEPRECATED: + pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); + pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - } - pr_msg("Error: unknown command: %s\n", argv[optind]); + case CR_UNSET: + default: + pr_err("unknown command: %s\n", cmd); + } usage: pr_msg("\n" -"Usage:\n" -" criu dump|pre-dump -t PID []\n" -" criu restore []\n" -" criu check 
[--feature FEAT]\n" -" criu page-server\n" -" criu service []\n" -" criu dedup\n" -" criu lazy-pages -D DIR []\n" -"\n" -"Commands:\n" -" dump checkpoint a process/tree identified by pid\n" -" pre-dump pre-dump task(s) minimizing their frozen time\n" -" restore restore a process/tree\n" -" check checks whether the kernel support is up-to-date\n" -" page-server launch page server\n" -" service launch service\n" -" dedup remove duplicates in memory dump\n" -" cpuinfo dump writes cpu information into image file\n" -" cpuinfo check validates cpu information read from image file\n" - ); + "Usage:\n" + " criu dump|pre-dump -t PID []\n" + " criu restore []\n" + " criu check [--feature FEAT]\n" + " criu page-server\n" + " criu service []\n" + " criu dedup\n" + " criu lazy-pages -D DIR []\n" + "\n" + "Commands:\n" + " dump checkpoint a process/tree identified by pid\n" + " pre-dump pre-dump task(s) minimizing their frozen time\n" + " restore restore a process/tree\n" + " check checks whether the kernel support is up-to-date\n" + " page-server launch page server\n" + " service launch service\n" + " dedup remove duplicates in memory dump\n" + " cpuinfo dump writes cpu information into image file\n" + " cpuinfo check validates cpu information read from image file\n"); if (usage_error) { pr_msg("\nTry -h|--help for more info\n"); @@ -287,165 +392,194 @@ usage: pr_msg("\n" -"Most of the true / false long options (the ones without arguments) can be\n" -"prefixed with --no- to negate the option (example: --display-stats and\n" -"--no-display-stats).\n" -"\n" -"Dump/Restore options:\n" -"\n" -"* Generic:\n" -" -t|--tree PID checkpoint a process tree identified by PID\n" -" -d|--restore-detached detach after restore\n" -" -S|--restore-sibling restore root task as sibling\n" -" -s|--leave-stopped leave tasks in stopped state after checkpoint\n" -" -R|--leave-running leave tasks in running state after checkpoint\n" -" -D|--images-dir DIR directory for image files\n" -" --pidfile FILE 
write root task, service or page-server pid to FILE\n" -" -W|--work-dir DIR directory to cd and write logs/pidfiles/stats to\n" -" (if not specified, value of --images-dir is used)\n" -" --cpu-cap [CAP] CPU capabilities to write/check. CAP is comma-separated\n" -" list of: cpu, fpu, all, ins, none. To disable\n" -" a capability, use ^CAP. Empty argument implies all\n" -" --exec-cmd execute the command specified after '--' on successful\n" -" restore making it the parent of the restored process\n" -" --freeze-cgroup use cgroup freezer to collect processes\n" -" --weak-sysctls skip restoring sysctls that are not available\n" -" --lazy-pages restore pages on demand\n" -" this requires running a second instance of criu\n" -" in lazy-pages mode: 'criu lazy-pages -D DIR'\n" -" --lazy-pages and lazy-pages mode require userfaultfd\n" -"\n" -"* External resources support:\n" -" --external RES dump objects from this list as external resources:\n" -" Formats of RES on dump:\n" -" tty[rdev:dev]\n" -" file[mnt_id:inode]\n" -" dev[major/minor]:NAME\n" -" unix[ino]\n" -" mnt[MOUNTPOINT]:COOKIE\n" -" mnt[]{:AUTO_OPTIONS}\n" -" Formats of RES on restore:\n" -" dev[NAME]:DEVPATH\n" -" veth[IFNAME]:OUTNAME{@BRIDGE}\n" -" macvlan[IFNAME]:OUTNAME\n" -" mnt[COOKIE]:ROOT\n" -"\n" -"* Special resources support:\n" -" --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" -" --" SK_INFLIGHT_PARAM " skip (ignore) in-flight TCP connections\n" -" --" SK_CLOSE_PARAM " restore connected TCP sockets in closed state\n" -" -r|--root PATH change the root filesystem (when run in mount namespace)\n" -" --evasive-devices use any path to a device file if the original one\n" -" is inaccessible\n" -" --link-remap allow one to link unlinked files back when possible\n" -" --ghost-limit size limit max size of deleted file contents inside image\n" -" --action-script FILE add an external action script\n" -" -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" -" -l|--" 
OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" -" -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n" -" --force-irmap force resolving names for inotify/fsnotify watches\n" -" --irmap-scan-path FILE\n" -" add a path the irmap hints to scan\n" -" --manage-cgroups [m] dump/restore process' cgroups; argument can be one of\n" -" 'none', 'props', 'soft' (default), 'full', 'strict'\n" -" or 'ignore'\n" -" --cgroup-root [controller:]/newroot\n" -" on dump: change the root for the controller that will\n" -" be dumped. By default, only the paths with tasks in\n" -" them and below will be dumped.\n" -" on restore: change the root cgroup the controller will\n" -" be installed into. No controller means that root is the\n" -" default for all controllers not specified\n" -" --cgroup-props STRING\n" -" define cgroup controllers and properties\n" -" to be checkpointed, which are described\n" -" via STRING using simplified YAML format\n" -" --cgroup-props-file FILE\n" -" same as --cgroup-props, but taking description\n" -" from the path specified\n" -" --cgroup-dump-controller NAME\n" -" define cgroup controller to be dumped\n" -" and skip anything else present in system\n" -" --lsm-profile TYPE:NAME\n" -" Specify an LSM profile to be used during restore.\n" -" The type can be either 'apparmor' or 'selinux'.\n" -" --skip-mnt PATH ignore this mountpoint when dumping the mount namespace\n" -" --enable-fs FSNAMES a comma separated list of filesystem names or \"all\"\n" -" force criu to (try to) dump/restore these filesystem's\n" -" mountpoints even if fs is not supported\n" -" --inherit-fd fd[NUM]:RES\n" -" Inherit file descriptors, treating fd NUM as being\n" -" already opened via an existing RES, which can be:\n" -" tty[rdev:dev]\n" -" pipe[inode]\n" -" socket[inode]\n" -" file[mnt_id:inode]\n" -" path/to/file\n" -" --empty-ns net Create a namespace, but don't restore its properties\n" -" (assuming it will be restored by action 
scripts)\n" -" -J|--join-ns NS:{PID|NS_FILE}[,OPTIONS]\n" -" Join existing namespace and restore process in it.\n" -" Namespace can be specified as either pid or file path.\n" -" OPTIONS can be used to specify parameters for userns:\n" -" user:PID,UID,GID\n" -"\n" -"Check options:\n" -" Without options, \"criu check\" checks availability of absolutely required\n" -" kernel features, critical for performing dump and restore.\n" -" --extra add check for extra kernel features\n" -" --experimental add check for experimental kernel features\n" -" --all same as --extra --experimental\n" -" --feature FEAT only check a particular feature, one of:" - ); + "Most of the true / false long options (the ones without arguments) can be\n" + "prefixed with --no- to negate the option (example: --display-stats and\n" + "--no-display-stats).\n" + "\n" + "Dump/Restore options:\n" + "\n" + "* Generic:\n" + " -t|--tree PID checkpoint a process tree identified by PID\n" + " -d|--restore-detached detach after restore\n" + " -S|--restore-sibling restore root task as sibling\n" + " -s|--leave-stopped leave tasks in stopped state after checkpoint\n" + " -R|--leave-running leave tasks in running state after checkpoint\n" + " -D|--images-dir DIR directory for image files\n" + " --pidfile FILE write root task, service or page-server pid to FILE\n" + " -W|--work-dir DIR directory to cd and write logs/pidfiles/stats to\n" + " (if not specified, value of --images-dir is used)\n" + " --cpu-cap [CAP] CPU capabilities to write/check. CAP is comma-separated\n" + " list of: cpu, fpu, all, ins, none. To disable\n" + " a capability, use ^CAP. 
Empty argument implies all\n" + " --exec-cmd execute the command specified after '--' on successful\n" + " restore making it the parent of the restored process\n" + " --freeze-cgroup use cgroup freezer to collect processes\n" + " --weak-sysctls skip restoring sysctls that are not available\n" + " --lazy-pages restore pages on demand\n" + " this requires running a second instance of criu\n" + " in lazy-pages mode: 'criu lazy-pages -D DIR'\n" + " --lazy-pages and lazy-pages mode require userfaultfd\n" + " --stream dump/restore images using criu-image-streamer\n" + " --mntns-compat-mode Use mount engine in compatibility mode. By default criu\n" + " tries to use mount-v2 mode with more reliable algorithm\n" + " based on MOVE_MOUNT_SET_GROUP kernel feature\n" + " --network-lock METHOD network locking/unlocking method; argument\n" + " can be 'nftables' or 'iptables' (default).\n" + " --unprivileged accept limitations when running as non-root\n" + " --allow-uprobes allow dump/restore with uprobes vma\n" + "\n" + "* External resources support:\n" + " --external RES dump objects from this list as external resources:\n" + " Formats of RES on dump:\n" + " tty[rdev:dev]\n" + " file[mnt_id:inode]\n" + " dev[major/minor]:NAME\n" + " unix[ino]\n" + " mnt[MOUNTPOINT]:COOKIE\n" + " mnt[]{:AUTO_OPTIONS}\n" + " Formats of RES on restore:\n" + " dev[NAME]:DEVPATH\n" + " veth[IFNAME]:OUTNAME{@BRIDGE}\n" + " macvlan[IFNAME]:OUTNAME\n" + " mnt[COOKIE]:ROOT\n" + " netdev[IFNAME]:ORIGNAME\n" + "\n" + "* Special resources support:\n" + " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" + " --" SK_INFLIGHT_PARAM " skip (ignore) in-flight TCP connections\n" + " --" SK_CLOSE_PARAM " don't dump the state of, or block, established tcp\n" + " connections, and restore them in closed state.\n" + " -r|--root PATH change the root filesystem (when run in mount namespace)\n" + " --evasive-devices use any path to a device file if the original one\n" + " is inaccessible\n" + " 
--link-remap allow one to link unlinked files back when possible\n" + " --ghost-limit size limit max size of deleted file contents inside image\n" + " --ghost-fiemap enable dumping of deleted files using fiemap\n" + " --action-script FILE add an external action script\n" + " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" + " -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" + " -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n" + " --timeout NUM a timeout (in seconds) on collecting tasks during dump\n" + " (default 10 seconds)\n" + " --force-irmap force resolving names for inotify/fsnotify watches\n" + " --irmap-scan-path FILE\n" + " add a path the irmap hints to scan\n" + " --manage-cgroups [m] dump/restore process' cgroups; argument can be one of\n" + " 'none', 'props', 'soft' (default), 'full', 'strict'\n" + " or 'ignore'\n" + " --cgroup-root [controller:]/newroot\n" + " on dump: change the root for the controller that will\n" + " be dumped. By default, only the paths with tasks in\n" + " them and below will be dumped.\n" + " on restore: change the root cgroup the controller will\n" + " be installed into. 
No controller means that root is the\n" + " default for all controllers not specified\n" + " --cgroup-props STRING\n" + " define cgroup controllers and properties\n" + " to be checkpointed, which are described\n" + " via STRING using simplified YAML format\n" + " --cgroup-props-file FILE\n" + " same as --cgroup-props, but taking description\n" + " from the path specified\n" + " --cgroup-dump-controller NAME\n" + " define cgroup controller to be dumped\n" + " and skip anything else present in system\n" + " --cgroup-yard PATH\n" + " instead of trying to mount cgroups in CRIU, provide\n" + " a path to a directory with already created cgroup yard.\n" + " Useful if you don't want to grant CAP_SYS_ADMIN to CRIU\n" + " --lsm-profile TYPE:NAME\n" + " Specify an LSM profile to be used during restore.\n" + " The type can be either 'apparmor' or 'selinux'.\n" + " --lsm-mount-context CTX\n" + " Specify a mount context to be used during restore.\n" + " Only mounts with an existing context will have their\n" + " mount context replaced with CTX.\n" + " --skip-mnt PATH ignore this mountpoint when dumping the mount namespace\n" + " --enable-fs FSNAMES a comma separated list of filesystem names or \"all\"\n" + " force criu to (try to) dump/restore these filesystem's\n" + " mountpoints even if fs is not supported\n" + " --inherit-fd fd[NUM]:RES\n" + " Inherit file descriptors, treating fd NUM as being\n" + " already opened via an existing RES, which can be:\n" + " tty[rdev:dev]\n" + " pipe:[inode]\n" + " socket:[inode]\n" + " file[mnt_id:inode]\n" + " /memfd:name\n" + " path/to/file\n" + " --empty-ns net Create a namespace, but don't restore its properties\n" + " (assuming it will be restored by action scripts)\n" + " -J|--join-ns NS:{PID|NS_FILE}[,OPTIONS]\n" + " Join existing namespace and restore process in it.\n" + " Namespace can be specified as either pid or file path.\n" + " OPTIONS can be used to specify parameters for userns:\n" + " user:PID,UID,GID\n" + " --file-validation 
METHOD\n" + " pass the validation method to be used; argument\n" + " can be 'filesize' or 'buildid' (default).\n" + " --skip-file-rwx-check\n" + " Skip checking file permissions\n" + " (r/w/x for u/g/o) on restore.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" + " kernel features, critical for performing dump and restore.\n" + " --extra add check for extra kernel features\n" + " --experimental add check for experimental kernel features\n" + " --all same as --extra --experimental\n" + " --feature FEAT only check a particular feature, one of:"); pr_check_features(" ", ", ", 80); - pr_msg( -"\n" -"* Logging:\n" -" -o|--log-file FILE log file name\n" -" --log-pid enable per-process logging to separate FILE.pid files\n" -" -v[v...]|--verbosity increase verbosity (can use multiple v)\n" -" -vNUM|--verbosity=NUM set verbosity to NUM (higher level means more output):\n" -" -v1 - only errors and messages\n" -" -v2 - also warnings (default level)\n" -" -v3 - also information messages and timestamps\n" -" -v4 - lots of debug\n" -" --display-stats print out dump/restore stats\n" -"\n" -"* Memory dumping options:\n" -" --track-mem turn on memory changes tracker in kernel\n" -" --prev-images-dir DIR path to images from previous dump (relative to -D)\n" -" --page-server send pages to page server (see options below as well)\n" -" --auto-dedup when used on dump it will deduplicate \"old\" data in\n" -" pages images of previous dump\n" -" when used on restore, as soon as page is restored, it\n" -" will be punched from the image\n" -"\n" -"Page/Service server options:\n" -" --address ADDR address of server or service\n" -" --port PORT port of page server\n" -" --ps-socket FD use specified FD as page server socket\n" -" -d|--daemon run in the background after creating socket\n" -" --status-fd FD write \\0 to the FD and close it once process is ready\n" -" to handle requests\n" -" --tls-cacert FILE trust certificates 
signed only by this CA\n" -" --tls-cacrl FILE path to CA certificate revocation list file\n" -" --tls-cert FILE path to TLS certificate file\n" -" --tls-key FILE path to TLS private key file\n" -" --tls use TLS to secure remote connection\n" -" --tls-no-cn-verify do not verify common name in server certificate\n" -"\n" -"Configuration file options:\n" -" --config FILEPATH pass a specific configuration file\n" -" --no-default-config forbid usage of default configuration files\n" -"\n" -"Other options:\n" -" -h|--help show this text\n" -" -V|--version show version\n" - ); + pr_msg("\n" + "* Logging:\n" + " -o|--log-file FILE log file name\n" + " --log-pid enable per-process logging to separate FILE.pid files\n" + " -v[v...]|--verbosity increase verbosity (can use multiple v)\n" + " -vNUM|--verbosity=NUM set verbosity to NUM (higher level means more output):\n" + " -v1 - only errors and messages\n" + " -v2 - also warnings (default level)\n" + " -v3 - also information messages and timestamps\n" + " -v4 - lots of debug\n" + " --display-stats print out dump/restore stats\n" + "\n" + "* Memory dumping options:\n" + " --track-mem turn on memory changes tracker in kernel\n" + " --prev-images-dir DIR path to images from previous dump (relative to -D)\n" + " --page-server send pages to page server (see options below as well)\n" + " --auto-dedup when used on dump it will deduplicate \"old\" data in\n" + " pages images of previous dump\n" + " when used on restore, as soon as page is restored, it\n" + " will be punched from the image\n" + " --pre-dump-mode splice - parasite based pre-dumping (default)\n" + " read - process_vm_readv syscall based pre-dumping\n" + "\n" + "Page/Service server options:\n" + " --address ADDR address of server or service\n" + " --port PORT port of page server\n" + " --ps-socket FD use specified FD as page server socket\n" + " -d|--daemon run in the background after creating socket\n" + " --status-fd FD write \\0 to the FD and close it once process is 
ready\n" + " to handle requests\n" +#ifdef CONFIG_GNUTLS + " --tls-cacert FILE trust certificates signed only by this CA\n" + " --tls-cacrl FILE path to CA certificate revocation list file\n" + " --tls-cert FILE path to TLS certificate file\n" + " --tls-key FILE path to TLS private key file\n" + " --tls use TLS to secure remote connection\n" + " --tls-no-cn-verify do not verify common name in server certificate\n" +#endif + "\n" + "Configuration file options:\n" + " --config FILEPATH pass a specific configuration file\n" + " --no-default-config forbid usage of default configuration files\n" + "\n" + "Other options:\n" + " -h|--help show this text\n" + " -V|--version show version\n"); return 0; opt_pid_missing: - pr_msg("Error: pid not specified\n"); + pr_err("pid not specified\n"); return 1; } diff --git a/criu/eventfd.c b/criu/eventfd.c index da31ce9f5..a5d51a189 100644 --- a/criu/eventfd.c +++ b/criu/eventfd.c @@ -22,12 +22,12 @@ #include "protobuf.h" #include "images/eventfd.pb-c.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "eventfd: " struct eventfd_file_info { - EventfdFileEntry *efe; - struct file_desc d; + EventfdFileEntry *efe; + struct file_desc d; }; /* Checks if file descriptor @lfd is eventfd */ @@ -38,8 +38,7 @@ int is_eventfd_link(char *link) static void pr_info_eventfd(char *action, EventfdFileEntry *efe) { - pr_info("%s: id %#08x flags %#04x counter %#016"PRIx64"\n", - action, efe->id, efe->flags, efe->counter); + pr_info("%s: id %#08x flags %#04x counter %#016" PRIx64 "\n", action, efe->id, efe->flags, efe->counter); } static int dump_one_eventfd(int lfd, u32 id, const struct fd_parms *p) @@ -63,8 +62,8 @@ static int dump_one_eventfd(int lfd, u32 id, const struct fd_parms *p) } const struct fdtype_ops eventfd_dump_ops = { - .type = FD_TYPES__EVENTFD, - .dump = dump_one_eventfd, + .type = FD_TYPES__EVENTFD, + .dump = dump_one_eventfd, }; static int eventfd_open(struct file_desc *d, int *new_fd) @@ -76,14 +75,12 @@ static int 
eventfd_open(struct file_desc *d, int *new_fd) tmp = eventfd(info->efe->counter, 0); if (tmp < 0) { - pr_perror("Can't create eventfd %#08x", - info->efe->id); + pr_perror("Can't create eventfd %#08x", info->efe->id); return -1; } if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) { - pr_perror("Can't restore params on eventfd %#08x", - info->efe->id); + pr_perror("Can't restore params on eventfd %#08x", info->efe->id); goto err_close; } diff --git a/criu/eventpoll.c b/criu/eventpoll.c index e1384fa4b..ca5ee9c59 100644 --- a/criu/eventpoll.c +++ b/criu/eventpoll.c @@ -30,33 +30,33 @@ #include "protobuf.h" #include "images/eventpoll.pb-c.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "epoll: " static LIST_HEAD(dinfo_list); typedef struct { - uint32_t tfd; - uint32_t off; - uint32_t idx; + uint32_t tfd; + uint32_t off; + uint32_t idx; } toff_t; struct eventpoll_dinfo { - struct list_head list; + struct list_head list; - FileEntry *fe; - EventpollFileEntry *e; + FileEntry *fe; + EventpollFileEntry *e; - toff_t *toff; - FownEntry fown; + toff_t *toff; + FownEntry fown; - pid_t pid; - int efd; + pid_t pid; + int efd; }; struct eventpoll_file_info { - EventpollFileEntry *efe; - struct file_desc d; + EventpollFileEntry *efe; + struct file_desc d; }; /* Checks if file descriptor @lfd is eventfd */ @@ -67,8 +67,8 @@ int is_eventpoll_link(char *link) static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) { - pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64"\n", - action, id, e->tfd, e->events, e->data); + pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016" PRIx64 "\n", action, id, e->tfd, e->events, + e->data); } static void pr_info_eventpoll(char *action, EventpollFileEntry *e) @@ -90,16 +90,16 @@ static int queue_dinfo(FileEntry **fe, EventpollFileEntry **e, toff_t **toff, co INIT_LIST_HEAD(&dinfo->list); - dinfo->fe = *fe; - dinfo->e = *e; - dinfo->toff = *toff; - dinfo->e->fown = 
&dinfo->fown; - dinfo->pid = p->pid; - dinfo->efd = p->fd; + dinfo->fe = *fe; + dinfo->e = *e; + dinfo->toff = *toff; + dinfo->e->fown = &dinfo->fown; + dinfo->pid = p->pid; + dinfo->efd = p->fd; - *fe = NULL; - *e = NULL; - *toff = NULL; + *fe = NULL; + *e = NULL; + *toff = NULL; list_add_tail(&dinfo->list, &dinfo_list); return 0; @@ -133,31 +133,29 @@ int flush_eventpoll_dinfo_queue(void) for (i = 0; i < e->n_tfd; i++) { EventpollTfdEntry *tfde = e->tfd[i]; struct kid_elem ke = { - .pid = dinfo->pid, - .genid = make_gen_id(tfde->dev, - tfde->inode, - tfde->pos), - .idx = tfde->tfd, + .pid = dinfo->pid, + .genid = make_gen_id(tfde->dev, tfde->inode, tfde->pos), + .idx = tfde->tfd, }; kcmp_epoll_slot_t slot = { - .efd = dinfo->efd, - .tfd = tfde->tfd, - .toff = dinfo->toff[i].off, + .efd = dinfo->efd, + .tfd = tfde->tfd, + .toff = dinfo->toff[i].off, }; struct kid_elem *t = kid_lookup_epoll_tfd(&fd_tree, &ke, &slot); if (!t) { - pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", - dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); + pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", dinfo->pid, + dinfo->efd, tfde->tfd, dinfo->toff[i].off); goto err; } - pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", - dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off, t->idx); + pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", dinfo->pid, + dinfo->efd, tfde->tfd, dinfo->toff[i].off, t->idx); /* Make sure the pid matches */ if (t->pid != dinfo->pid) { - pr_debug("kid_lookup_epoll: pid mismatch %d %d efd %d tfd %d toff %u\n", - dinfo->pid, t->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); + pr_debug("kid_lookup_epoll: pid mismatch %d %d efd %d tfd %d toff %u\n", dinfo->pid, + t->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); goto err; } @@ -205,17 +203,25 @@ static int toff_cmp(const void *a, const void *b) return 0; } +static int toff_cmp_idx(const void *a, const 
void *b) +{ + if (((toff_t *)a)[0].idx > ((toff_t *)b)[0].idx) + return 1; + if (((toff_t *)a)[0].idx < ((toff_t *)b)[0].idx) + return -1; + return 0; +} + /* * fds in fd_parms are sorted so we can use binary search * for better performance. */ -static int find_tfd_bsearch(pid_t pid, int efd, int fds[], size_t nr_fds, - int tfd, unsigned int toff) +static int find_tfd_bsearch(pid_t pid, int efd, int fds[], size_t nr_fds, int tfd, unsigned int toff) { kcmp_epoll_slot_t slot = { - .efd = efd, - .tfd = tfd, - .toff = toff, + .efd = efd, + .tfd = tfd, + .toff = toff, }; int *tfd_found; @@ -234,20 +240,19 @@ static int find_tfd_bsearch(pid_t pid, int efd, int fds[], size_t nr_fds, return tfd; } } else { - pr_debug("find_tfd_bsearch (kcmp-no): bsearch match pid %d efd %d tfd %d toff %u\n", - pid, efd, tfd, toff); + pr_debug("find_tfd_bsearch (kcmp-no): bsearch match pid %d efd %d tfd %d toff %u\n", pid, efd, + tfd, toff); return tfd; } } - pr_debug("find_tfd_bsearch: no match pid %d efd %d tfd %d toff %u\n", - pid, efd, tfd, toff); + pr_debug("find_tfd_bsearch: no match pid %d efd %d tfd %d toff %u\n", pid, efd, tfd, toff); return -1; } static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p) { - toff_t *toff_base, *toff = NULL; + toff_t *toff = NULL; EventpollFileEntry *e = NULL; FileEntry *fe = NULL; int ret = -1; @@ -263,16 +268,16 @@ static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p) goto out; file_entry__init(fe); - e->id = id; - e->flags = p->flags; - e->fown = (FownEntry *)&p->fown; + e->id = id; + e->flags = p->flags; + e->fown = (FownEntry *)&p->fown; if (parse_fdinfo(lfd, FD_TYPES__EVENTPOLL, e)) goto out; - fe->type = FD_TYPES__EVENTPOLL; - fe->id = e->id; - fe->epfd = e; + fe->type = FD_TYPES__EVENTPOLL; + fe->id = e->id; + fe->epfd = e; /* * In regular case there is no so many dup'ed @@ -284,22 +289,18 @@ static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p) if (!toff) goto out; for (i = 0; i < 
e->n_tfd; i++) { - toff[i].idx = i; - toff[i].tfd = e->tfd[i]->tfd; - toff[i].off = 0; + toff[i].idx = i; + toff[i].tfd = e->tfd[i]->tfd; + toff[i].off = 0; } qsort(toff, e->n_tfd, sizeof(*toff), toff_cmp); - toff_base = NULL; - for (i = 1; i < e->n_tfd; i++) { - if (toff[i].tfd == toff[i - 1].tfd) { - if (!toff_base) - toff_base = &toff[i - 1]; - toff[i].off = toff[i].idx - toff_base->idx; - } else - toff_base = NULL; - } + for (i = 1; i < e->n_tfd; i++) + if (toff[i].tfd == toff[i - 1].tfd) + toff[i].off = toff[i - 1].off + 1; + + qsort(toff, e->n_tfd, sizeof(*toff), toff_cmp_idx); } /* @@ -307,19 +308,18 @@ static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p) * files is tricky: we need to use kcmp * to find out where file came from. Until * it's implemented lets use simpler approach - * just check the targets are blonging to the + * just check the targets are belonging to the * pid's file set. */ if (p->dfds) { for (i = 0; i < e->n_tfd; i++) { - int tfd = find_tfd_bsearch(p->pid, p->fd, p->dfds->fds, - p->dfds->nr_fds, e->tfd[i]->tfd, toff[i].off); + int tfd = find_tfd_bsearch(p->pid, p->fd, p->dfds->fds, p->dfds->nr_fds, e->tfd[i]->tfd, + toff[i].off); if (tfd == -1) { if (kdat.has_kcmp_epoll_tfd) { ret = queue_dinfo(&fe, &e, &toff, p); } else { - pr_err("Escaped/closed fd descriptor %d on pid %d\n", - e->tfd[i]->tfd, p->pid); + pr_err("Escaped/closed fd descriptor %d on pid %d\n", e->tfd[i]->tfd, p->pid); } goto out; } @@ -347,8 +347,8 @@ out: } const struct fdtype_ops eventpoll_dump_ops = { - .type = FD_TYPES__EVENTPOLL, - .dump = dump_one_eventpoll, + .type = FD_TYPES__EVENTPOLL, + .dump = dump_one_eventpoll, }; static int eventpoll_post_open(struct file_desc *d, int fd); @@ -368,14 +368,12 @@ static int eventpoll_open(struct file_desc *d, int *new_fd) tmp = epoll_create(1); if (tmp < 0) { - pr_perror("Can't create epoll %#08x", - info->efe->id); + pr_perror("Can't create epoll %#08x", info->efe->id); return -1; } if (rst_file_params(tmp, 
info->efe->fown, info->efe->flags)) { - pr_perror("Can't restore file params on epoll %#08x", - info->efe->id); + pr_perror("Can't restore file params on epoll %#08x", info->efe->id); goto err_close; } @@ -413,8 +411,8 @@ static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe) pr_info_eventpoll_tfd("Restore ", id, tdefe); - event.events = tdefe->events; - event.data.u64 = tdefe->data; + event.events = tdefe->events; + event.data.u64 = tdefe->data; if (epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &event)) { pr_perror("Can't add event on %#08x", id); return -1; @@ -443,8 +441,8 @@ static int eventpoll_post_open(struct file_desc *d, int fd) } static struct file_desc_ops desc_ops = { - .type = FD_TYPES__EVENTPOLL, - .open = eventpoll_open, + .type = FD_TYPES__EVENTPOLL, + .open = eventpoll_open, }; static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg, struct cr_img *i) @@ -479,10 +477,10 @@ static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg, struct cr_img * } struct collect_image_info epoll_tfd_cinfo = { - .fd_type = CR_FD_EVENTPOLL_TFD, - .pb_type = PB_EVENTPOLL_TFD, - .collect = collect_one_epoll_tfd, - .flags = COLLECT_NOFREE, + .fd_type = CR_FD_EVENTPOLL_TFD, + .pb_type = PB_EVENTPOLL_TFD, + .collect = collect_one_epoll_tfd, + .flags = COLLECT_NOFREE, }; static int collect_one_epoll(void *o, ProtobufCMessage *msg, struct cr_img *i) @@ -495,8 +493,8 @@ static int collect_one_epoll(void *o, ProtobufCMessage *msg, struct cr_img *i) } struct collect_image_info epoll_cinfo = { - .fd_type = CR_FD_EVENTPOLL_FILE, - .pb_type = PB_EVENTPOLL_FILE, - .priv_size = sizeof(struct eventpoll_file_info), - .collect = collect_one_epoll, + .fd_type = CR_FD_EVENTPOLL_FILE, + .pb_type = PB_EVENTPOLL_FILE, + .priv_size = sizeof(struct eventpoll_file_info), + .collect = collect_one_epoll, }; diff --git a/criu/external.c b/criu/external.c index 96e676849..bbbcd17cb 100644 --- a/criu/external.c +++ b/criu/external.c @@ -12,24 +12,28 @@ int 
add_external(char *key) { struct external *ext; + if (strstartswith(key, "mnt[]")) + return ext_mount_parse_auto(key + 5); + ext = xmalloc(sizeof(*ext)); if (!ext) return -1; - ext->id = key; - if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) { - xfree(ext); - return -1; - } + ext->id = xstrdup(key); + if (!ext->id) + goto err_id; - if (strstartswith(key, "mnt[]")) { - xfree(ext); - return ext_mount_parse_auto(key + 5); - } + if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) + goto err; list_add(&ext->node, &opts.external); return 0; +err: + xfree(ext->id); +err_id: + xfree(ext); + return -1; } bool external_lookup_id(char *id) diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 4128814d5..5dd9acf60 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -1,9 +1,11 @@ #include +#include "criu-log.h" #include "fault-injection.h" +#include "seize.h" enum faults fi_strategy; -int fault_injection_init() +int fault_injection_init(void) { char *val; int start; @@ -14,9 +16,19 @@ int fault_injection_init() start = atoi(val); - if (start <= 0 || start >= FI_MAX) + if (start <= 0 || start >= FI_MAX) { + pr_err("CRIU_FAULT out of bounds.\n"); return -1; + } fi_strategy = start; + + switch (fi_strategy) { + case FI_COMPEL_INTERRUPT_ONLY_MODE: + set_compel_interrupt_only_mode(); + break; + default: + break; + }; return 0; } diff --git a/criu/fdstore.c b/criu/fdstore.c index a4583fdf4..6ac639c55 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -12,11 +12,17 @@ #include "xmalloc.h" #include "rst-malloc.h" #include "log.h" +#include "util.h" +#include "cr_options.h" +#include "util-caps.h" +#include "sockets.h" +/* clang-format off */ static struct fdstore_desc { int next_id; mutex_t lock; /* to protect a peek offset */ } *desc; +/* clang-format on */ int fdstore_init(void) { @@ -46,15 +52,14 @@ int fdstore_init(void) return -1; } - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - 
setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(sk, buf)) { close(sk); return -1; } addr.sun_family = AF_UNIX; - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%"PRIx64, st.st_ino); + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%s", st.st_ino, + criu_run_id); addrlen += sizeof(addr.sun_family); addr.sun_path[0] = 0; @@ -66,12 +71,12 @@ int fdstore_init(void) * a queue and remember its sequence number. Then we can set SO_PEEK_OFF * to get a file descriptor without dequeuing it. */ - if (bind(sk, (struct sockaddr *) &addr, addrlen)) { + if (bind(sk, (struct sockaddr *)&addr, addrlen)) { pr_perror("Unable to bind a socket"); close(sk); return -1; } - if (connect(sk, (struct sockaddr *) &addr, addrlen)) { + if (connect(sk, (struct sockaddr *)&addr, addrlen)) { pr_perror("Unable to connect a socket"); close(sk); return -1; @@ -93,7 +98,7 @@ int fdstore_add(int fd) ret = send_fd(sk, NULL, 0, fd); if (ret) { - pr_perror("Can't send fd %d into store\n", fd); + pr_perror("Can't send fd %d into store", fd); mutex_unlock(&desc->lock); return -1; } @@ -107,8 +112,13 @@ int fdstore_add(int fd) int fdstore_get(int id) { - int sk = get_service_fd(FDSTORE_SK_OFF); - int fd; + int sk, fd; + + sk = get_service_fd(FDSTORE_SK_OFF); + if (sk < 0) { + pr_err("Cannot get FDSTORE_SK_OFF fd\n"); + return -1; + } mutex_lock(&desc->lock); if (setsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &id, sizeof(id))) { diff --git a/criu/fifo.c b/criu/fifo.c index a26934319..bb291c14e 100644 --- a/criu/fifo.c +++ b/criu/fifo.c @@ -30,14 +30,16 @@ */ struct fifo_info { - struct list_head list; - struct file_desc d; - FifoEntry *fe; - bool restore_data; + struct list_head list; + struct file_desc d; + FifoEntry *fe; + bool restore_data; }; static LIST_HEAD(fifo_head); -static struct pipe_data_dump pd_fifo = { .img_type = CR_FD_FIFO_DATA, }; 
+static struct pipe_data_dump pd_fifo = { + .img_type = CR_FD_FIFO_DATA, +}; static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p) { @@ -56,13 +58,12 @@ static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p) if (dump_one_reg_file(lfd, rf_id, p)) return -1; - pr_info("Dumping fifo %d with id %#x pipe_id %#x\n", - lfd, id, pipe_id(p)); + pr_info("Dumping fifo %d with id %#x pipe_id %#x\n", lfd, id, pipe_id(p)); - e.id = id; - e.pipe_id = pipe_id(p); - e.has_regf_id = true; - e.regf_id = rf_id; + e.id = id; + e.pipe_id = pipe_id(p); + e.has_regf_id = true; + e.regf_id = rf_id; fe.type = FD_TYPES__FIFO; fe.id = e.id; @@ -75,8 +76,8 @@ static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p) } const struct fdtype_ops fifo_dump_ops = { - .type = FD_TYPES__FIFO, - .dump = dump_one_fifo, + .type = FD_TYPES__FIFO, + .dump = dump_one_fifo, }; static struct pipe_data_rst *pd_hash_fifo[PIPE_DATA_HASH_SIZE]; @@ -105,8 +106,7 @@ static int do_open_fifo(int ns_root_fd, struct reg_file_info *rfi, void *arg) } if (info->restore_data) - if (restore_pipe_data(CR_FD_FIFO_DATA, fake_fifo, - info->fe->pipe_id, pd_hash_fifo)) { + if (restore_pipe_data(CR_FD_FIFO_DATA, fake_fifo, info->fe->pipe_id, pd_hash_fifo)) { close(new_fifo); new_fifo = -1; } @@ -122,8 +122,7 @@ static int open_fifo_fd(struct file_desc *d, int *new_fd) struct file_desc *reg_d; int fd; - reg_d = collect_special_file(info->fe->has_regf_id ? - info->fe->regf_id : info->fe->id); + reg_d = collect_special_file(info->fe->has_regf_id ? 
info->fe->regf_id : info->fe->id); if (!reg_d) return -1; @@ -135,8 +134,8 @@ static int open_fifo_fd(struct file_desc *d, int *new_fd) } static struct file_desc_ops fifo_desc_ops = { - .type = FD_TYPES__FIFO, - .open = open_fifo_fd, + .type = FD_TYPES__FIFO, + .open = open_fifo_fd, }; static int collect_one_fifo(void *o, ProtobufCMessage *base, struct cr_img *i) @@ -144,8 +143,7 @@ static int collect_one_fifo(void *o, ProtobufCMessage *base, struct cr_img *i) struct fifo_info *info = o, *f; info->fe = pb_msg(base, FifoEntry); - pr_info("Collected fifo entry ID %#x PIPE ID %#x\n", - info->fe->id, info->fe->pipe_id); + pr_info("Collected fifo entry ID %#x PIPE ID %#x\n", info->fe->id, info->fe->pipe_id); /* check who will restore the fifo data */ list_for_each_entry(f, &fifo_head, list) @@ -161,7 +159,6 @@ static int collect_one_fifo(void *o, ProtobufCMessage *base, struct cr_img *i) } return file_desc_add(&info->d, info->fe->id, &fifo_desc_ops); - } struct collect_image_info fifo_cinfo = { diff --git a/criu/file-ids.c b/criu/file-ids.c index 006e47d64..772bd92cf 100644 --- a/criu/file-ids.c +++ b/criu/file-ids.c @@ -21,9 +21,9 @@ DECLARE_KCMP_TREE(fd_tree, KCMP_FILE); -#define FDID_BITS 5 -#define FDID_SIZE (1 << FDID_BITS) -#define FDID_MASK (FDID_SIZE - 1) +#define FDID_BITS 5 +#define FDID_SIZE (1 << FDID_BITS) +#define FDID_MASK (FDID_SIZE - 1) static inline int fdid_hashfn(unsigned int s_dev, unsigned long i_ino) { @@ -63,11 +63,8 @@ static struct fd_id *fd_id_cache_lookup(struct fd_parms *p) struct stat *st = &p->stat; struct fd_id *fi; - for (fi = fd_id_cache[fdid_hashfn(st->st_dev, st->st_ino)]; - fi; fi = fi->n) - if (fi->dev == st->st_dev && - fi->ino == st->st_ino && - fi->mnt_id == p->mnt_id) + for (fi = fd_id_cache[fdid_hashfn(st->st_dev, st->st_ino)]; fi; fi = fi->n) + if (fi->dev == st->st_dev && fi->ino == st->st_ino && fi->mnt_id == p->mnt_id) return fi; return NULL; @@ -80,8 +77,14 @@ int fd_id_generate_special(struct fd_parms *p, u32 *id) fi = 
fd_id_cache_lookup(p); if (fi) { - *id = fi->id; - return 0; + if (p->stat.st_mode & (S_IFCHR | S_IFBLK)) { + /* Don't cache the id for mapped devices */ + *id = fd_tree.subid++; + return 1; + } else { + *id = fi->id; + return 0; + } } } diff --git a/criu/file-lock.c b/criu/file-lock.c index 8be7589df..6334462b6 100644 --- a/criu/file-lock.c +++ b/criu/file-lock.c @@ -76,11 +76,10 @@ void free_file_locks(void) static int dump_one_file_lock(FileLockEntry *fle) { - pr_info("LOCK flag: %d,type: %d,pid: %d,fd: %d,start: %8"PRIx64",len: %8"PRIx64"\n", - fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); + pr_info("LOCK flag: %d,type: %d,pid: %d,fd: %d,start: %8" PRIx64 ",len: %8" PRIx64 "\n", fle->flag, fle->type, + fle->pid, fle->fd, fle->start, fle->len); - return pb_write_one(img_from_set(glob_imgset, CR_FD_FILE_LOCKS), - fle, PB_FILE_LOCK); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILE_LOCKS), fle, PB_FILE_LOCK); } static void fill_flock_entry(FileLockEntry *fle, int fl_kind, int fl_ltype) @@ -91,23 +90,28 @@ static void fill_flock_entry(FileLockEntry *fle, int fl_kind, int fl_ltype) int dump_file_locks(void) { - FileLockEntry fle; + FileLockEntry fle; struct file_lock *fl; - int ret = 0; + int ret = 0; pr_info("Dumping file-locks\n"); list_for_each_entry(fl, &file_lock_list, list) { if (fl->real_owner == -1) { if (fl->fl_kind == FL_POSIX) { - pr_err("Unresolved lock found pid %d ino %ld\n", - fl->fl_owner, fl->i_no); + pr_err("Unresolved lock found pid %d ino %ld\n", fl->fl_owner, fl->i_no); return -1; } continue; } + if (!opts.handle_file_locks) { + pr_err("Some file locks are hold by dumping tasks! 
" + "You can try --" OPT_FILE_LOCKS " to dump them.\n"); + return -1; + } + file_lock_entry__init(&fle); fle.pid = fl->real_owner; fle.fd = fl->owners_fd; @@ -148,7 +152,7 @@ static int lock_btrfs_file_match(pid_t pid, int fd, struct file_lock *fl, struct link[ret] = 0; ns = lookup_nsid_by_mnt_id(p->mnt_id); - return phys_stat_dev_match(p->stat.st_dev, phys_dev, ns, link); + return phys_stat_dev_match(p->stat.st_dev, phys_dev, ns, link); } static inline int lock_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p) @@ -195,7 +199,7 @@ static int lock_check_fd(int lfd, struct file_lock *fl) } else { /* * The ret == 0 means, that new lock doesn't conflict - * with any others on the file. But since we do know, + * with any others on the file. But since we do know, * that there should be some other one (file is found * in /proc/locks), it means that the lock is already * on file pointed by fd. @@ -218,11 +222,7 @@ static int lock_ofd_check_fd(int lfd, struct file_lock *fl) { int ret; - struct flock lck = { - .l_whence = SEEK_SET, - .l_type = F_WRLCK, - .l_start = fl->start - }; + struct flock lck = { .l_whence = SEEK_SET, .l_type = F_WRLCK, .l_start = fl->start }; if (strcmp(fl->end, "EOF")) { unsigned long end; @@ -360,7 +360,7 @@ int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p) if (!opts.handle_file_locks) { pr_err("Some file locks are hold by dumping tasks!" - "You can try --" OPT_FILE_LOCKS " to dump them.\n"); + "You can try --" OPT_FILE_LOCKS " to dump them.\n"); return -1; } @@ -374,8 +374,7 @@ int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p) } else if (fl->fl_kind == FL_LEASE) { if (fl->owners_fd >= 0) continue; - if (fl->fl_owner != pid->real && - fl->real_owner != -1) + if (fl->fl_owner != pid->real && fl->real_owner != -1) continue; ret = lease_check_fd(lfd, p->flags, fl); @@ -393,8 +392,7 @@ int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p) * anyway. 
*/ - if (fl->fl_owner != pid->real && - fl->real_owner != -1) + if (fl->fl_owner != pid->real && fl->real_owner != -1) continue; pr_debug("Checking lock holder %d:%d\n", pid->real, fd); @@ -413,9 +411,7 @@ int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p) fl->real_owner = pid->ns[0].virt; fl->owners_fd = fd; - pr_info("Found lock entry %d.%d %d vs %d\n", - pid->real, pid->ns[0].virt, fd, - fl->fl_owner); + pr_info("Found lock entry %d.%d %d vs %d\n", pid->real, pid->ns[0].virt, fd, fl->fl_owner); } return 0; @@ -444,8 +440,7 @@ int correct_file_leases_type(struct pid *pid, int fd, int lfd) if (fl->fl_holder != pid->real || fl->owners_fd != fd) continue; - if (fl->fl_kind == FL_LEASE && - (fl->fl_ltype & LEASE_BREAKING)) { + if (fl->fl_kind == FL_LEASE && (fl->fl_ltype & LEASE_BREAKING)) { /* * Set lease type to actual 'target lease type' * instead of 'READ' returned by procfs. @@ -473,7 +468,7 @@ static int open_break_cb(int ns_root_fd, struct reg_file_info *rfi, void *arg) close(fd); return -1; } else if (errno != EWOULDBLOCK) { - pr_perror("Can't break lease\n"); + pr_perror("Can't break lease"); return -1; } return 0; @@ -506,7 +501,7 @@ static int set_file_lease(int fd, int type) struct stat st; if (fstat(fd, &st)) { - pr_perror("Can't get file stat (%i)\n", fd); + pr_perror("Can't get file stat (%i)", fd); return -1; } @@ -518,7 +513,7 @@ static int set_file_lease(int fd, int type) ret = fcntl(fd, F_SETLEASE, type); if (ret < 0) - pr_perror("Can't set lease\n"); + pr_perror("Can't set lease"); setfsuid(old_fsuid); return ret; @@ -532,8 +527,7 @@ static int restore_lease_prebreaking_state(int fd, int fd_type) return set_file_lease(fd, lease_type); } -static struct fdinfo_list_entry *find_fd_unordered(struct pstree_item *task, - int fd) +static struct fdinfo_list_entry *find_fd_unordered(struct pstree_item *task, int fd) { struct list_head *head = &rsti(task)->fds; struct fdinfo_list_entry *fle; @@ -583,20 +577,19 @@ static int 
restore_file_lease(FileLockEntry *fle) signum_fcntl = fcntl(fle->fd, F_GETSIG); signum = signum_fcntl ? signum_fcntl : SIGIO; if (signum_fcntl < 0) { - pr_perror("Can't get file i/o signum\n"); + pr_perror("Can't get file i/o signum"); return -1; } - if (sigemptyset(&blockmask) || - sigaddset(&blockmask, signum) || - sigprocmask(SIG_BLOCK, &blockmask, &oldmask)) { - pr_perror("Can't block file i/o signal\n"); + if (sigemptyset(&blockmask) || sigaddset(&blockmask, signum) || + sigprocmask(SIG_BLOCK, &blockmask, &oldmask)) { + pr_perror("Can't block file i/o signal"); return -1; } ret = restore_breaking_file_lease(fle); if (sigprocmask(SIG_SETMASK, &oldmask, NULL)) { - pr_perror("Can't restore sigmask\n"); + pr_perror("Can't restore sigmask"); ret = -1; } return ret; @@ -627,8 +620,8 @@ static int restore_file_lock(FileLockEntry *fle) goto err; } - pr_info("(flock)flag: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", - fle->flag, fle->type, cmd, fle->pid, fle->fd); + pr_info("(flock)flag: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", fle->flag, fle->type, cmd, fle->pid, + fle->fd); ret = flock(fle->fd, cmd); if (ret < 0) { @@ -640,15 +633,14 @@ static int restore_file_lock(FileLockEntry *fle) memset(&flk, 0, sizeof(flk)); flk.l_whence = SEEK_SET; - flk.l_start = fle->start; - flk.l_len = fle->len; - flk.l_pid = fle->pid; - flk.l_type = fle->type; + flk.l_start = fle->start; + flk.l_len = fle->len; + flk.l_pid = fle->pid; + flk.l_type = fle->type; pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " - "start: %8"PRIx64", len: %8"PRIx64"\n", - fle->flag, fle->type, fle->pid, fle->fd, - fle->start, fle->len); + "start: %8" PRIx64 ", len: %8" PRIx64 "\n", + fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); ret = fcntl(fle->fd, F_SETLKW, &flk); if (ret < 0) { @@ -657,17 +649,12 @@ static int restore_file_lock(FileLockEntry *fle) } } else if (fle->flag & FL_OFD) { struct flock flk = { - .l_whence = SEEK_SET, - .l_start = fle->start, - .l_len = fle->len, - .l_pid = 
0, - .l_type = fle->type + .l_whence = SEEK_SET, .l_start = fle->start, .l_len = fle->len, .l_pid = 0, .l_type = fle->type }; pr_info("(ofd)flag: %d, type: %d, pid: %d, fd: %d, " - "start: %8"PRIx64", len: %8"PRIx64"\n", - fle->flag, fle->type, fle->pid, fle->fd, - fle->start, fle->len); + "start: %8" PRIx64 ", len: %8" PRIx64 "\n", + fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); ret = fcntl(fle->fd, F_OFD_SETLK, &flk); if (ret < 0) { @@ -676,9 +663,8 @@ static int restore_file_lock(FileLockEntry *fle) } } else if (fle->flag & FL_LEASE) { pr_info("(lease)flag: %d, type: %d, pid: %d, fd: %d, " - "start: %8"PRIx64", len: %8"PRIx64"\n", - fle->flag, fle->type, fle->pid, fle->fd, - fle->start, fle->len); + "start: %8" PRIx64 ", len: %8" PRIx64 "\n", + fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); ret = restore_file_lease(fle); if (ret < 0) goto err; @@ -714,5 +700,4 @@ int prepare_file_locks(int pid) return 0; return restore_file_locks(pid); - } diff --git a/criu/files-ext.c b/criu/files-ext.c index a6247d673..4cc99d921 100644 --- a/criu/files-ext.c +++ b/criu/files-ext.c @@ -20,8 +20,8 @@ static int dump_one_ext_file(int lfd, u32 id, const struct fd_parms *p) if (ret < 0) return ret; - xfe.id = id; - xfe.fown = (FownEntry *)&p->fown; + xfe.id = id; + xfe.fown = (FownEntry *)&p->fown; fe.type = FD_TYPES__EXT; fe.id = xfe.id; @@ -32,23 +32,24 @@ static int dump_one_ext_file(int lfd, u32 id, const struct fd_parms *p) } const struct fdtype_ops ext_dump_ops = { - .type = FD_TYPES__EXT, - .dump = dump_one_ext_file, + .type = FD_TYPES__EXT, + .dump = dump_one_ext_file, }; struct ext_file_info { - struct file_desc d; - ExtFileEntry *xfe; + struct file_desc d; + ExtFileEntry *xfe; }; static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; + bool retry_needed; xfi = container_of(d, struct ext_file_info, d); - fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); + fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, 
&retry_needed); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; @@ -57,8 +58,11 @@ static int open_fd(struct file_desc *d, int *new_fd) if (restore_fown(fd, xfi->xfe->fown)) return -1; - *new_fd = fd; - return 0; + if (!retry_needed) + *new_fd = fd; + else + *new_fd = -1; + return retry_needed; } static struct file_desc_ops ext_desc_ops = { @@ -83,8 +87,7 @@ struct collect_image_info ext_file_cinfo = { .collect = collect_one_ext, }; -int dump_unsupp_fd(struct fd_parms *p, int lfd, - char *more, char *info, FdinfoEntry *e) +int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *e) { int ret; @@ -92,7 +95,6 @@ int dump_unsupp_fd(struct fd_parms *p, int lfd, if (ret == 0) return 0; if (ret == -ENOTSUP) - pr_err("Can't dump file %d of that type [%o] (%s %s)\n", - p->fd, p->stat.st_mode, more, info); + pr_err("Can't dump file %d of that type [%o] (%s %s)\n", p->fd, p->stat.st_mode, more, info); return -1; } diff --git a/criu/files-reg.c b/criu/files-reg.c index 2f68bc03f..66c0e6cda 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -11,17 +11,32 @@ #include #include #include -#include +#include +#include +#include +#include + +#include "tty.h" +#include "stats.h" #ifndef SEEK_DATA -#define SEEK_DATA 3 -#define SEEK_HOLE 4 +#define SEEK_DATA 3 +#define SEEK_HOLE 4 #endif /* Stolen from kernel/fs/nfs/unlink.c */ -#define SILLYNAME_PREF ".nfs" +#define SILLYNAME_PREF ".nfs" #define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1)) +/* + * If the build-id exists, then it will most likely be present in the + * beginning of the file. Therefore only the first 1MB will be mapped + * and checked. 
+ */ +#define BUILD_ID_MAP_SIZE 1048576 +#define ST_UNIT 512 +#define EXTENT_MAX_COUNT 512 + #include "cr_options.h" #include "imgset.h" #include "file-ids.h" @@ -33,8 +48,10 @@ #include "namespaces.h" #include "proc_parse.h" #include "pstree.h" +#include "string.h" #include "fault-injection.h" #include "external.h" +#include "memfd.h" #include "protobuf.h" #include "util.h" @@ -43,6 +60,7 @@ #include "files-reg.h" #include "plugin.h" +#include "string.h" int setfsuid(uid_t fsuid); int setfsgid(gid_t fsuid); @@ -53,13 +71,13 @@ int setfsgid(gid_t fsuid); * us. Any brave soul to implement link unlinked file back? */ struct ghost_file { - struct list_head list; - u32 id; + struct list_head list; + u32 id; - u32 dev; - u32 ino; + u32 dev; + u32 ino; - struct file_remap remap; + struct file_remap remap; }; static u32 ghost_file_ids = 1; @@ -68,7 +86,7 @@ static LIST_HEAD(ghost_files); /* * When opening remaps we first create a link on the remap * target, then open one, then unlink. In case the remap - * source has more than one instance, these tree steps + * source has more than one instance, these three steps * should be serialized with each other. */ static mutex_t *remap_open_lock; @@ -92,9 +110,9 @@ static LIST_HEAD(remaps); * we keep all data in memory. 
*/ struct link_remap_rlb { - struct list_head list; - struct ns_id *mnt_ns; - char *path; + struct list_head list; + struct ns_id *mnt_ns; + char *path; }; static int note_link_remap(char *path, struct ns_id *nsid) @@ -148,11 +166,10 @@ static int trim_last_parent(char *path) return 0; } -#define BUFSIZE (4096) +#define BUFSIZE (4096) static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) { - char *buf = NULL; int ret; while (len > 0) { @@ -165,7 +182,6 @@ static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) len -= ret; } - xfree(buf); return 0; } @@ -209,9 +225,94 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) return 0; } +static int skip_outstanding(struct fiemap_extent *fe, size_t file_size) +{ + /* Skip outstanding extent */ + if (fe->fe_logical > file_size) + return 1; + + /* Skip outstanding part of the extent */ + if (fe->fe_logical + fe->fe_length > file_size) + fe->fe_length = file_size - fe->fe_logical; + return 0; +} + +static int copy_file_to_chunks_fiemap(int fd, struct cr_img *img, size_t file_size) +{ + GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT; + struct fiemap *fiemap_buf; + struct fiemap_extent *ext_buf; + int ext_buf_size, fie_buf_size; + off_t pos = 0; + unsigned int i; + int ret = 0; + int exit_code = 0; + + ext_buf_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent); + fie_buf_size = sizeof(struct fiemap) + ext_buf_size; + + fiemap_buf = xzalloc(fie_buf_size); + if (!fiemap_buf) { + pr_perror("Out of memory when allocating fiemap"); + return -1; + } + + ext_buf = fiemap_buf->fm_extents; + fiemap_buf->fm_length = FIEMAP_MAX_OFFSET; + fiemap_buf->fm_flags |= FIEMAP_FLAG_SYNC; + fiemap_buf->fm_extent_count = EXTENT_MAX_COUNT; + + do { + fiemap_buf->fm_start = pos; + memzero(ext_buf, ext_buf_size); + ret = ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + exit_code = -EOPNOTSUPP; + } else { + exit_code = -1; + pr_perror("fiemap ioctl() 
failed"); + } + goto out; + } else if (fiemap_buf->fm_mapped_extents == 0) { + goto out; + } + + for (i = 0; i < fiemap_buf->fm_mapped_extents; i++) { + if (skip_outstanding(&fiemap_buf->fm_extents[i], file_size)) + continue; + + ce.len = fiemap_buf->fm_extents[i].fe_length; + ce.off = fiemap_buf->fm_extents[i].fe_logical; + + if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) { + exit_code = -1; + goto out; + } + + if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) { + exit_code = -1; + goto out; + } + + if (fiemap_buf->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) { + /* there are no extents left, break. */ + goto out; + } + } + + /* Record file's logical offset as pos */ + pos = ce.len + ce.off; + + /* Since there are still extents left, continue. */ + } while (fiemap_buf->fm_mapped_extents == EXTENT_MAX_COUNT); +out: + xfree(fiemap_buf); + return exit_code; +} + static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { - char *buf = NULL; int ret; while (len > 0) { @@ -219,7 +320,11 @@ static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) pr_perror("Can't seek file"); return -1; } - ret = sendfile(fd, img, NULL, len); + + if (opts.stream) + ret = splice(img, NULL, fd, NULL, len, SPLICE_F_MOVE); + else + ret = sendfile(fd, img, NULL, len); if (ret < 0) { pr_perror("Can't send data"); return -1; @@ -229,7 +334,6 @@ static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) len -= ret; } - xfree(buf); return 0; } @@ -280,63 +384,54 @@ static int mkreg_ghost(char *path, GhostFileEntry *gfe, struct cr_img *img) return ret; } +static int mklnk_ghost(char *path, GhostFileEntry *gfe) +{ + if (!gfe->symlnk_target) { + pr_err("Ghost symlink target is NULL for %s. 
Image from old CRIU?\n", path); + return -1; + } + + if (symlink(gfe->symlnk_target, path) < 0) { + /* + * ENOENT case is OK + * Take a look closer on create_ghost() function + */ + if (errno != ENOENT) + pr_perror("symlink(%s, %s) failed", gfe->symlnk_target, path); + return -1; + } + + return 0; +} + static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; - int ret = -1; - if (chown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; + if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0) + return -1; + + if (!gfe->atim) + return 0; + + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; + + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + return -1; } - if (chmod(path, gfe->mode)) { - pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); - goto err; - } - - if (gfe->atim) { - tv[0].tv_sec = gfe->atim->tv_sec; - tv[0].tv_usec = gfe->atim->tv_usec; - tv[1].tv_sec = gfe->mtim->tv_sec; - tv[1].tv_usec = gfe->mtim->tv_usec; - if (lutimes(path, tv)) { - pr_perror("Can't set access and modification times on ghost %s", path); - goto err; - } - } - - ret = 0; -err: - return ret; + return 0; } -static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) +static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img) { - struct mount_info *mi; - char path[PATH_MAX]; - int ret, root_len; + int ret = -1; char *msg; - root_len = ret = rst_get_mnt_root(gf->remap.rmnt_id, path, sizeof(path)); - if (ret < 0) { - pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id); - goto err; - } - - /* Add a '/' only if we have no at the end */ - if (path[root_len-1] != '/') { - path[root_len++] = '/'; - path[root_len] = '\0'; - } - - snprintf(path + 
root_len, sizeof(path) - root_len, "%s", gf->remap.rpath); - ret = -1; - - mi = lookup_mnt_id(gf->remap.rmnt_id); - /* We get here while in service mntns */ - if (mi && try_remount_writable(mi, false)) - goto err; again: if (S_ISFIFO(gfe->mode)) { if ((ret = mknod(path, gfe->mode, 0)) < 0) @@ -351,6 +446,9 @@ again: } else if (S_ISDIR(gfe->mode)) { if ((ret = mkdirpat(AT_FDCWD, path, gfe->mode)) < 0) msg = "Can't make ghost dir"; + } else if (S_ISLNK(gfe->mode)) { + if ((ret = mklnk_ghost(path, gfe)) < 0) + msg = "Can't create ghost symlink"; } else { if ((ret = mkreg_ghost(path, gfe, img)) < 0) msg = "Can't create ghost regfile"; @@ -370,26 +468,89 @@ again: goto err; } - strcpy(gf->remap.rpath, path + root_len); - pr_debug("Remap rpath is %s\n", gf->remap.rpath); - - ret = -1; - if (ghost_apply_metadata(path, gfe)) - goto err; - ret = 0; err: return ret; } -static inline void ghost_path(char *path, int plen, - struct reg_file_info *rfi, RemapFilePathEntry *rpe) +static int nomntns_create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "/%s", gf->remap.rpath); + + if (create_ghost_dentry(path, gfe, img)) + return -1; + + if (ghost_apply_metadata(path, gfe)) + return -1; + + __strlcpy(gf->remap.rpath, path + 1, PATH_MAX); + pr_debug("Remap rpath is %s\n", gf->remap.rpath); + return 0; +} + +static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) +{ + struct mount_info *mi; + char path[PATH_MAX], *rel_path, *rel_mp; + + if (!(root_ns_mask & CLONE_NEWNS)) + return nomntns_create_ghost(gf, gfe, img); + + mi = lookup_mnt_id(gf->remap.rmnt_id); + if (!mi) { + pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id); + return -1; + } + + /* Get path relative to mountpoint from path relative to mntns */ + rel_path = get_relative_path(gf->remap.rpath, mi->ns_mountpoint); + if (!rel_path) { + pr_err("Can't get path %s relative to %s\n", gf->remap.rpath, 
mi->ns_mountpoint); + return -1; + } + + snprintf(path, sizeof(path), "%s%s%s", service_mountpoint(mi), rel_path[0] ? "/" : "", rel_path); + pr_debug("Trying to create ghost on path %s\n", path); + + /* We get here while in service mntns */ + if (try_remount_writable(mi, false)) + return -1; + + if (create_ghost_dentry(path, gfe, img)) + return -1; + + if (ghost_apply_metadata(path, gfe)) + return -1; + + /* + * Convert the path back to mntns relative, as create_ghost_dentry + * might have changed it. + */ + rel_path = get_relative_path(path, service_mountpoint(mi)); + if (!rel_path) { + pr_err("Can't get path %s relative to %s\n", path, service_mountpoint(mi)); + return -1; + } + + rel_mp = get_relative_path(mi->ns_mountpoint, "/"); + if (!rel_mp) { + pr_err("Can't get path %s relative to %s\n", mi->ns_mountpoint, "/"); + return -1; + } + + snprintf(gf->remap.rpath, PATH_MAX, "%s%s%s", rel_mp, (rel_mp[0] && rel_path[0]) ? "/" : "", rel_path); + pr_debug("Remap rpath is %s\n", gf->remap.rpath); + return 0; +} + +static inline void ghost_path(char *path, int plen, struct reg_file_info *rfi, RemapFilePathEntry *rpe) { snprintf(path, plen, "%s.cr.%x.ghost", rfi->path, rpe->remap_id); } -static int collect_remap_ghost(struct reg_file_info *rfi, - RemapFilePathEntry *rpe) +static int collect_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rpe) { struct ghost_file *gf; @@ -429,8 +590,7 @@ gf_found: return 0; } -static int open_remap_ghost(struct reg_file_info *rfi, - RemapFilePathEntry *rpe) +static int open_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rpe) { struct ghost_file *gf = container_of(rfi->remap, struct ghost_file, remap); GhostFileEntry *gfe = NULL; @@ -456,7 +616,7 @@ static int open_remap_ghost(struct reg_file_info *rfi, gf->remap.rmnt_id = rfi->rfe->mnt_id; if (S_ISDIR(gfe->mode)) - strncpy(gf->remap.rpath, rfi->path, PATH_MAX); + __strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); else ghost_path(gf->remap.rpath, PATH_MAX, rfi, rpe); 
@@ -480,8 +640,7 @@ err: return -1; } -static int collect_remap_linked(struct reg_file_info *rfi, - RemapFilePathEntry *rpe) +static int collect_remap_linked(struct reg_file_info *rfi, RemapFilePathEntry *rpe) { struct file_remap *rm; struct file_desc *rdesc; @@ -528,8 +687,7 @@ static int open_remap_linked(struct reg_file_info *rfi) return 0; } -static int collect_remap_dead_process(struct reg_file_info *rfi, - RemapFilePathEntry *rfe) +static int collect_remap_dead_process(struct reg_file_info *rfi, RemapFilePathEntry *rfe) { struct pstree_item *helper; @@ -542,7 +700,6 @@ static int collect_remap_dead_process(struct reg_file_info *rfi, return 0; } - helper->sid = root_item->sid; helper->pgid = root_item->pgid; helper->pid->ns[0].virt = rfe->remap_id; @@ -662,44 +819,50 @@ int prepare_remaps(void) static int clean_one_remap(struct remap_info *ri) { struct file_remap *remap = ri->rfi->remap; - int mnt_id, ret, rmntns_root; + int mnt_id, ret; struct mount_info *mi; - char path[PATH_MAX]; + char path[PATH_MAX], *rel_path; if (remap->rpath[0] == 0) return 0; + if (!(root_ns_mask & CLONE_NEWNS)) { + snprintf(path, sizeof(path), "/%s", remap->rpath); + goto nomntns; + } + mnt_id = ri->rfi->rfe->mnt_id; /* rirfirfe %) */ - ret = rst_get_mnt_root(mnt_id, path, sizeof(path)); - if (ret < 0) - return -1; - if (ret >= sizeof(path) - 1) { - pr_err("The path buffer is too small\n"); - return -1; - } - - rmntns_root = open(path, O_RDONLY); - if (rmntns_root < 0) { - pr_perror("Unable to open %s", path); - return -1; - } - mi = lookup_mnt_id(mnt_id); + if (!mi) { + pr_err("The %d mount is not found for ghost\n", mnt_id); + return -1; + } + + rel_path = get_relative_path(remap->rpath, mi->ns_mountpoint); + if (!rel_path) { + pr_err("Can't get path %s relative to %s\n", remap->rpath, mi->ns_mountpoint); + return -1; + } + + snprintf(path, sizeof(path), "%s%s%s", service_mountpoint(mi), strlen(rel_path) ? 
"/" : "", rel_path); + /* We get here while in service mntns */ - if (mi && try_remount_writable(mi, false)) { - close(rmntns_root); + if (try_remount_writable(mi, false)) + return -1; + +nomntns: + pr_info("Unlink remap %s\n", path); + + if (remap->is_dir) + ret = rmdir(path); + else + ret = unlink(path); + + if (ret) { + pr_perror("Couldn't unlink remap %s", path); return -1; } - pr_info("Unlink remap %s\n", remap->rpath); - - ret = unlinkat(rmntns_root, remap->rpath, remap->is_dir ? AT_REMOVEDIR : 0); - if (ret < 0) { - close(rmntns_root); - pr_perror("Couldn't unlink remap %s %s", path, remap->rpath); - return -1; - } - close(rmntns_root); remap->rpath[0] = 0; return 0; @@ -730,7 +893,7 @@ static struct collect_image_info remap_cinfo = { }; /* Tiny files don't need to generate chunks in ghost image. */ -#define GHOST_CHUNKS_THRESH (3 * 4096) +#define GHOST_CHUNKS_THRESH (3 * 4096) static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_dev) { @@ -738,6 +901,7 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de int exit_code = -1; GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT; Timeval atim = TIMEVAL__INIT, mtim = TIMEVAL__INIT; + char pathbuf[PATH_MAX]; pr_info("Dumping ghost file contents (id %#x)\n", id); @@ -771,28 +935,66 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de gfe.size = st->st_size; } + /* + * We set gfe.symlnk_target only if we need to dump + * symlink content, otherwise we leave it NULL. + * It will be taken into account on restore in mklnk_ghost function. + */ + if (S_ISLNK(st->st_mode)) { + ssize_t ret; + + /* + * We assume that _fd opened with O_PATH | O_NOFOLLOW + * flags because S_ISLNK(st->st_mode). With current kernel version, + * it's looks like correct assumption in any case. 
+ */ + ret = readlinkat(_fd, "", pathbuf, sizeof(pathbuf) - 1); + if (ret < 0) { + pr_perror("Can't readlinkat"); + goto err_out; + } + + pathbuf[ret] = 0; + + if (ret != st->st_size) { + pr_err("Buffer for readlinkat is too small: ret %zd, st_size %" PRId64 ", buf %u %s\n", ret, + st->st_size, PATH_MAX, pathbuf); + goto err_out; + } + + gfe.symlnk_target = pathbuf; + } + if (pb_write_one(img, &gfe, PB_GHOST_FILE)) goto err_out; if (S_ISREG(st->st_mode)) { int fd, ret; - char lpath[PSFDS]; /* * Reopen file locally since it may have no read * permissions when drained */ - sprintf(lpath, "/proc/self/fd/%d", _fd); - fd = open(lpath, O_RDONLY); + fd = open_proc(PROC_SELF, "fd/%d", _fd); if (fd < 0) { pr_perror("Can't open ghost original file"); goto err_out; } - if (gfe.chunks) - ret = copy_file_to_chunks(fd, img, st->st_size); - else + if (gfe.chunks) { + if (opts.ghost_fiemap) { + ret = copy_file_to_chunks_fiemap(fd, img, st->st_size); + if (ret == -EOPNOTSUPP) { + pr_debug("file system don't support fiemap\n"); + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { ret = copy_file(fd, img_raw_fd(img), st->st_size); + } + close(fd); if (ret) goto err_out; @@ -817,8 +1019,7 @@ struct file_remap *lookup_ghost_remap(u32 dev, u32 ino) return NULL; } -static int dump_ghost_remap(char *path, const struct stat *st, - int lfd, u32 id, struct ns_id *nsid) +static int dump_ghost_remap(char *path, const struct stat *st, int lfd, u32 id, struct ns_id *nsid) { struct ghost_file *gf; RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; @@ -826,9 +1027,8 @@ static int dump_ghost_remap(char *path, const struct stat *st, pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id); - if (st->st_size > opts.ghost_limit) { - pr_err("Can't dump ghost file %s of %"PRIu64" size, increase limit\n", - path, st->st_size); + if (st->st_blocks * ST_UNIT > opts.ghost_limit) { + pr_err("Can't dump ghost file %s of %" 
PRIu64 " size, increase limit\n", path, st->st_blocks * ST_UNIT); return -1; } @@ -844,10 +1044,13 @@ static int dump_ghost_remap(char *path, const struct stat *st, gf->dev = phys_dev; gf->ino = st->st_ino; gf->id = ghost_file_ids++; - list_add_tail(&gf->list, &ghost_files); - if (dump_ghost_file(lfd, gf->id, st, phys_dev)) + if (dump_ghost_file(lfd, gf->id, st, phys_dev)) { + xfree(gf); return -1; + } + + list_add_tail(&gf->list, &ghost_files); dump_entry: rpe.orig_id = id; @@ -855,8 +1058,7 @@ dump_entry: rpe.has_remap_type = true; rpe.remap_type = REMAP_TYPE__GHOST; - return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), - &rpe, PB_REMAP_FPATH); + return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), &rpe, PB_REMAP_FPATH); } static void __rollback_link_remaps(bool do_unlink) @@ -879,24 +1081,47 @@ static void __rollback_link_remaps(bool do_unlink) } } -void delete_link_remaps(void) { __rollback_link_remaps(true); } -void free_link_remaps(void) { __rollback_link_remaps(false); } +void delete_link_remaps(void) +{ + __rollback_link_remaps(true); +} +void free_link_remaps(void) +{ + __rollback_link_remaps(false); +} static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags); -static int create_link_remap(char *path, int len, int lfd, - u32 *idp, struct ns_id *nsid, - const struct stat *st) +static void check_overlayfs_fallback(char *path, const struct fd_parms *parms, bool *fallback) +{ + if (!fallback || parms->fs_type != OVERLAYFS_SUPER_MAGIC) + return; + + /* + * In overlayFS, linkat() fails with ENOENT if the removed file is + * originated from lower layer. The cause of failure is that linkat() + * sees the file has st_nlink=0, which is different than st_nlink=1 we + * got from earlier fstat() on lfd. By setting *fb=true, we will fall + * back to dump_ghost_remap() as it is what should have been done to + * removed files with st_nlink=0. 
+ */ + pr_info("Unable to link-remap %s on overlayFS, fall back to dump_ghost_remap\n", path); + *fallback = true; +} + +static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_id *nsid, const struct fd_parms *parms, + bool *fallback) { char link_name[PATH_MAX], *tmp; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; FownEntry fwn = FOWN_ENTRY__INIT; int mntns_root; - int ret; + const struct stat *ost = &parms->stat; if (!opts.link_remap_ok) { pr_err("Can't create link remap for %s. " - "Use " LREMAP_PARAM " option.\n", path); + "Use " LREMAP_PARAM " option.\n", + path); return -1; } @@ -918,30 +1143,29 @@ static int create_link_remap(char *path, int len, int lfd, } fd_id_generate_special(NULL, idp); - rfe.id = *idp; - rfe.flags = 0; - rfe.pos = 0; - rfe.fown = &fwn; - rfe.name = link_name + 1; + rfe.id = *idp; + rfe.flags = 0; + rfe.pos = 0; + rfe.fown = &fwn; + rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. */ - snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); + snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name) - 1, "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); -again: - ret = linkat_hard(lfd, "", mntns_root, link_name, - st->st_uid, st->st_gid, AT_EMPTY_PATH); - if (ret < 0 && errno == ENOENT) { + while (linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH) < 0) { + if (errno != ENOENT) { + pr_perror("Can't link remap to %s", path); + return -1; + } + /* Use grand parent, if parent directory does not exist. 
*/ if (trim_last_parent(link_name) < 0) { pr_err("trim failed: @%s@\n", link_name); + check_overlayfs_fallback(path, parms, fallback); return -1; } - goto again; - } else if (ret < 0) { - pr_perror("Can't link remap to %s", path); - return -1; } if (note_link_remap(link_name, nsid)) @@ -954,13 +1178,13 @@ again: return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } -static int dump_linked_remap(char *path, int len, const struct stat *ost, - int lfd, u32 id, struct ns_id *nsid) +static int dump_linked_remap(char *path, int len, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid, + bool *fallback) { u32 lid; RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; - if (create_link_remap(path, len, lfd, &lid, nsid, ost)) + if (create_link_remap(path, len, lfd, &lid, nsid, parms, fallback)) return -1; rpe.orig_id = id; @@ -968,8 +1192,7 @@ static int dump_linked_remap(char *path, int len, const struct stat *ost, rpe.has_remap_type = true; rpe.remap_type = REMAP_TYPE__LINKED; - return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), - &rpe, PB_REMAP_FPATH); + return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), &rpe, PB_REMAP_FPATH); } static pid_t *dead_pids; @@ -992,7 +1215,7 @@ int dead_pid_conflict(void) continue; pr_err("Conflict with a dead task with the same PID as of this thread (virt %d, real %d).\n", - node->ns[0].virt, node->real); + node->ns[0].virt, node->real); return -1; } @@ -1033,8 +1256,7 @@ static int dump_dead_process_remap(pid_t pid, u32 id) rpe.has_remap_type = true; rpe.remap_type = REMAP_TYPE__PROCFS; - return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), - &rpe, PB_REMAP_FPATH); + return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), &rpe, PB_REMAP_FPATH); } static bool is_sillyrename_name(char *name) @@ -1071,48 +1293,15 @@ static inline bool nfs_silly_rename(char *rpath, const struct fd_parms *parms) return (parms->fs_type == NFS_SUPER_MAGIC) && 
is_sillyrename_name(rpath); } -int strip_deleted(struct fd_link *link) -{ - struct dcache_prepends { - const char *str; - size_t len; - } static const prepends[] = { - { - .str = " (deleted)", - .len = 10, - }, { - .str = "//deleted", - .len = 9, - } - }; - size_t i; - - for (i = 0; i < ARRAY_SIZE(prepends); i++) { - size_t at; - - if (link->len <= prepends[i].len) - continue; - - at = link->len - prepends[i].len; - if (!strcmp(&link->name[at], prepends[i].str)) { - pr_debug("Strip '%s' tag from '%s'\n", - prepends[i].str, link->name); - link->name[at] = '\0'; - link->len -= prepends[i].len; - return 1; - } - } - return 0; -} - -static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, - int lfd, u32 id, struct ns_id *nsid) +static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid) { char *rpath = link->name; int plen = link->len; int ret, mntns_root; struct stat pst; const struct stat *ost = &parms->stat; + int flags = 0; + bool fallback = false; if (parms->fs_type == PROC_SUPER_MAGIC) { /* The file points to /proc/pid/ where pid is a dead @@ -1139,7 +1328,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * cases. */ if (pid != 0) { - bool is_dead = strip_deleted(link); + bool is_dead = link_strip_deleted(link); mntns_root = mntns_get_root_fd(nsid); if (mntns_root < 0) return -1; @@ -1172,7 +1361,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * this FS and can't have a valid " (deleted)" * postfix as a part of not deleted filename. */ - strip_deleted(link); + link_strip_deleted(link); /* * Devpts devices/files are generated by the * kernel itself so we should not try to generate @@ -1189,7 +1378,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * be careful whether anybody still has any of its hardlinks * also open. 
*/ - strip_deleted(link); + link_strip_deleted(link); return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); } @@ -1202,14 +1391,17 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * links on it) to have some persistent name at hands. */ pr_debug("Dump silly-rename linked remap for %x\n", id); - return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); + return dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, NULL); } mntns_root = mntns_get_root_fd(nsid); if (mntns_root < 0) return -1; - ret = fstatat(mntns_root, rpath, &pst, 0); + if (S_ISLNK(parms->stat.st_mode)) + flags = AT_SYMLINK_NOFOLLOW; + + ret = fstatat(mntns_root, rpath, &pst, flags); if (ret < 0) { /* * Linked file, but path is not accessible (unless any @@ -1218,17 +1410,25 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * name. */ - if (errno == ENOENT) - return dump_linked_remap(rpath + 1, plen - 1, - ost, lfd, id, nsid); + if (errno == ENOENT) { + link_strip_deleted(link); + ret = dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, &fallback); + if (ret < 0 && fallback) { + /* fallback is true only if following conditions are true: + * 1. linkat() inside dump_linked_remap() failed with ENOENT + * 2. 
parms->fs_type == overlayFS + */ + return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); + } + return ret; + } pr_perror("Can't stat path"); return -1; } if ((pst.st_ino != ost->st_ino) || (pst.st_dev != ost->st_dev)) { - if (opts.evasive_devices && - (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) && + if (opts.evasive_devices && (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) && pst.st_rdev == ost->st_rdev) return 0; /* @@ -1241,9 +1441,8 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * have the "(deleted)" suffix in proc and name conflict * is unlikely :) */ - pr_err("Unaccessible path opened %u:%u, need %u:%u\n", - (int)pst.st_dev, (int)pst.st_ino, - (int)ost->st_dev, (int)ost->st_ino); + pr_err("Unaccessible path opened %u:%u, need %u:%u\n", (int)pst.st_dev, (int)pst.st_ino, + (int)ost->st_dev, (int)ost->st_ino); return -1; } @@ -1257,21 +1456,315 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, static bool should_check_size(int flags) { /* Skip size if file has O_APPEND and O_WRONLY flags (e.g. log file). */ - if (((flags & O_ACCMODE) == O_WRONLY) && - (flags & O_APPEND)) + if (((flags & O_ACCMODE) == O_WRONLY) && (flags & O_APPEND)) return false; return true; } +/* + * Gets the build-id (If it exists) from 32-bit ELF files. + * Returns the number of bytes of the build-id if it could + * be obtained, else -1. + */ +static int get_build_id_32(Elf32_Ehdr *file_header, unsigned char **build_id, const int fd, size_t mapped_size) +{ + int size, num_iterations; + size_t file_header_end; + Elf32_Phdr *program_header, *program_header_end; + Elf32_Nhdr *note_header_end, *note_header = NULL; + + file_header_end = (size_t)file_header + mapped_size; + if (sizeof(Elf32_Ehdr) > mapped_size) + return -1; + + /* + * If the file doesn't have at least 1 program header entry, it definitely can't + * have a build-id. 
+ */ + if (!file_header->e_phnum) { + pr_warn("Couldn't find any program headers for file with fd %d\n", fd); + return -1; + } + + program_header = (Elf32_Phdr *)(file_header->e_phoff + (char *)file_header); + if (program_header <= (Elf32_Phdr *)file_header) + return -1; + + program_header_end = (Elf32_Phdr *)(file_header_end - sizeof(Elf32_Phdr)); + + /* + * If the file has a build-id, it will be in the PT_NOTE program header + * entry AKA the note sections. + */ + for (num_iterations = 0; num_iterations < file_header->e_phnum; num_iterations++, program_header++) { + if (program_header > program_header_end) + break; + if (program_header->p_type != PT_NOTE) + continue; + + note_header = (Elf32_Nhdr *)(program_header->p_offset + (char *)file_header); + if (note_header <= (Elf32_Nhdr *)file_header) { + note_header = NULL; + continue; + } + + note_header_end = (Elf32_Nhdr *)min_t(char *, (char *)note_header + program_header->p_filesz, + (char *)(file_header_end - sizeof(Elf32_Nhdr))); + + /* The note type for the build-id is NT_GNU_BUILD_ID. */ + while (note_header <= note_header_end && note_header->n_type != NT_GNU_BUILD_ID) + note_header = (Elf32_Nhdr *)((char *)note_header + sizeof(Elf32_Nhdr) + + ALIGN(note_header->n_namesz, 4) + ALIGN(note_header->n_descsz, 4)); + + if (note_header > note_header_end) { + note_header = NULL; + continue; + } + break; + } + + if (!note_header) { + pr_debug("Couldn't find the build-id note for file with fd %d\n", fd); + return -1; + } + + /* + * If the size of the notes description is too large or is invalid + * then the build-id could not be obtained. 
+ */ + if (note_header->n_descsz <= 0 || note_header->n_descsz > 512) { + pr_warn("Invalid description size for build-id note for file with fd %d\n", fd); + return -1; + } + + size = note_header->n_descsz; + note_header = (Elf32_Nhdr *)((char *)note_header + sizeof(Elf32_Nhdr) + ALIGN(note_header->n_namesz, 4)); + note_header_end = (Elf32_Nhdr *)(file_header_end - size); + if (note_header <= (Elf32_Nhdr *)file_header || note_header > note_header_end) + return -1; + + *build_id = (unsigned char *)xmalloc(size); + if (!*build_id) + return -1; + + memcpy(*build_id, (void *)note_header, size); + return size; +} + +/* + * Gets the build-id (If it exists) from 64-bit ELF files. + * Returns the number of bytes of the build-id if it could + * be obtained, else -1. + */ +static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, const int fd, size_t mapped_size) +{ + int size, num_iterations; + size_t file_header_end; + Elf64_Phdr *program_header, *program_header_end; + Elf64_Nhdr *note_header_end, *note_header = NULL; + + file_header_end = (size_t)file_header + mapped_size; + if (sizeof(Elf64_Ehdr) > mapped_size) + return -1; + + /* + * If the file doesn't have at least 1 program header entry, it definitely can't + * have a build-id. + */ + if (!file_header->e_phnum) { + pr_warn("Couldn't find any program headers for file with fd %d\n", fd); + return -1; + } + + program_header = (Elf64_Phdr *)(file_header->e_phoff + (char *)file_header); + if (program_header <= (Elf64_Phdr *)file_header) + return -1; + + program_header_end = (Elf64_Phdr *)(file_header_end - sizeof(Elf64_Phdr)); + + /* + * If the file has a build-id, it will be in the PT_NOTE program header + * entry AKA the note sections. 
+ */ + for (num_iterations = 0; num_iterations < file_header->e_phnum; num_iterations++, program_header++) { + if (program_header > program_header_end) + break; + if (program_header->p_type != PT_NOTE) + continue; + + note_header = (Elf64_Nhdr *)(program_header->p_offset + (char *)file_header); + if (note_header <= (Elf64_Nhdr *)file_header) { + note_header = NULL; + continue; + } + + note_header_end = (Elf64_Nhdr *)min_t(char *, (char *)note_header + program_header->p_filesz, + (char *)(file_header_end - sizeof(Elf64_Nhdr))); + + /* The note type for the build-id is NT_GNU_BUILD_ID. */ + while (note_header <= note_header_end && note_header->n_type != NT_GNU_BUILD_ID) + note_header = (Elf64_Nhdr *)((char *)note_header + sizeof(Elf64_Nhdr) + + ALIGN(note_header->n_namesz, 4) + ALIGN(note_header->n_descsz, 4)); + + if (note_header > note_header_end) { + note_header = NULL; + continue; + } + break; + } + + if (!note_header) { + pr_debug("Couldn't find the build-id note for file with fd %d\n", fd); + return -1; + } + + /* + * If the size of the notes description is too large or is invalid + * then the build-id could not be obtained. + */ + if (note_header->n_descsz <= 0 || note_header->n_descsz > 512) { + pr_warn("Invalid description size for build-id note for file with fd %d\n", fd); + return -1; + } + + size = note_header->n_descsz; + note_header = (Elf64_Nhdr *)((char *)note_header + sizeof(Elf64_Nhdr) + ALIGN(note_header->n_namesz, 4)); + note_header_end = (Elf64_Nhdr *)(file_header_end - size); + if (note_header <= (Elf64_Nhdr *)file_header || note_header > note_header_end) + return -1; + + *build_id = (unsigned char *)xmalloc(size); + if (!*build_id) + return -1; + + memcpy(*build_id, (void *)note_header, size); + return size; +} + +/* + * Finds the build-id of the file by checking if the file is an ELF file + * and then calling either the 32-bit or the 64-bit function as necessary. 
+ * Returns the number of bytes of the build-id if it could be + * obtained, else -1. + */ +static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id) +{ + char *start_addr; + size_t mapped_size; + int ret = -1; + + /* + * If the build-id exists, then it will most likely be present in the + * beginning of the file. Therefore at most only the first 1 MB of the + * file is mapped. + */ + mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); + start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); + if ((void*)start_addr == MAP_FAILED) { + pr_warn("Couldn't mmap file with fd %d\n", fd); + return -1; + } + + /* + * The first 4 bytes contain a magic number identifying the file as an + * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and + * ‘F’, respectively. These characters are together defined as ELFMAG. + */ + if (memcmp(start_addr, ELFMAG, SELFMAG)) + goto out; + + if (start_addr[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size); + if (start_addr[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size); + +out: + munmap(start_addr, mapped_size); + return ret; +} + +/* + * Finds and stores the build-id of a file, if it exists, so that it can be validated + * while restoring. + * Returns 1 if the build-id of the file could be stored, -1 if there was an error + * or 0 if the build-id could not be obtained. + */ +static int store_validation_data_build_id(RegFileEntry *rfe, int lfd, const struct fd_parms *p) +{ + unsigned char *build_id = NULL; + int build_id_size, allocated_size; + int fd; + + /* + * Checks whether the file is at least big enough to try and read the first + * four (SELFMAG) bytes which should correspond to the ELF magic number + * and the next byte which indicates whether the file is 32-bit or 64-bit. 
+ */ + if (p->stat.st_size < SELFMAG + 1) + return 0; + + fd = open_proc(PROC_SELF, "fd/%d", lfd); + if (fd < 0) { + pr_err("Build-ID (For validation) could not be obtained for file %s because can't open the file\n", + rfe->name); + return -1; + } + + build_id_size = get_build_id(fd, &(p->stat), &build_id); + close(fd); + if (!build_id || build_id_size == -1) + return 0; + + allocated_size = round_up(build_id_size, sizeof(uint32_t)); + rfe->build_id = xzalloc(allocated_size); + if (!rfe->build_id) { + pr_warn("Build-ID (For validation) could not be set for file %s\n", rfe->name); + xfree(build_id); + return -1; + } + + rfe->n_build_id = allocated_size / sizeof(uint32_t); + memcpy(rfe->build_id, (void *)build_id, build_id_size); + + xfree(build_id); + return 1; +} + +/* + * This routine stores metadata about the open file (File size, build-id, CRC32C checksum) + * so that validation can be done while restoring to make sure that the right file is + * being restored. + * Returns true if at least some metadata was stored, if there was an error it returns false. 
+ */ +static bool store_validation_data(RegFileEntry *rfe, const struct fd_parms *p, int lfd) +{ + int result = 1; + + rfe->has_size = true; + rfe->size = p->stat.st_size; + + if (opts.file_validation_method == FILE_VALIDATION_BUILD_ID) + result = store_validation_data_build_id(rfe, lfd, p); + + if (result == -1) + return false; + + if (!result) + pr_info("Only file size could be stored for validation for file %s\n", rfe->name); + return true; +} + int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) { struct fd_link _link, *link; struct mount_info *mi; struct cr_img *rimg; char ext_id[64]; + int ret; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; + bool skip_for_shell_job = false; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) @@ -1280,9 +1773,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) } else link = p->link; - - - snprintf(ext_id, sizeof(ext_id), "file[%x:%"PRIx64"]", p->mnt_id, p->stat.st_ino); + snprintf(ext_id, sizeof(ext_id), "file[%x:%" PRIx64 "]", p->mnt_id, p->stat.st_ino); if (external_lookup_id(ext_id)) { /* the first symbol will be cut on restore to get an relative path*/ rfe.name = xstrdup(ext_id); @@ -1293,13 +1784,17 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) mi = lookup_mnt_id(p->mnt_id); if (mi == NULL) { - pr_err("Can't lookup mount=%d for fd=%d path=%s\n", - p->mnt_id, p->fd, link->name + 1); - return -1; + if (opts.shell_job && is_tty(p->stat.st_rdev, p->stat.st_dev)) { + skip_for_shell_job = true; + } else { + pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); + return -1; + } } - if (mnt_is_overmounted(mi)) { - pr_err("Open files on overmounted mounts are not supported yet\n"); + if (!skip_for_shell_job && mnt_is_overmounted(mi)) { + pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", + p->mnt_id, p->fd, link->name + 1); return -1; } @@ -1308,8 +1803,7 @@ int 
dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) rfe.has_mnt_id = true; } - pr_info("Dumping path for %d fd via self %d [%s]\n", - p->fd, lfd, &link->name[1]); + pr_info("Dumping path for %d fd via self %d [%s]\n", p->fd, lfd, &link->name[1]); /* * The regular path we can handle should start with slash. @@ -1319,38 +1813,39 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) return -1; } - if (check_path_remap(link, p, lfd, id, mi->nsid)) + if (!skip_for_shell_job && check_path_remap(link, p, lfd, id, mi->nsid)) return -1; - rfe.name = &link->name[1]; + rfe.name = &link->name[1]; ext: - rfe.id = id; - rfe.flags = p->flags; - rfe.pos = p->pos; - rfe.fown = (FownEntry *)&p->fown; - rfe.has_mode = true; - rfe.mode = p->stat.st_mode; + rfe.id = id; + rfe.flags = p->flags; + rfe.pos = p->pos; + rfe.fown = (FownEntry *)&p->fown; + rfe.has_mode = true; + rfe.mode = p->stat.st_mode; - if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags)) { - rfe.has_size = true; - rfe.size = p->stat.st_size; - } + if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) && !store_validation_data(&rfe, p, lfd)) + return -1; fe.type = FD_TYPES__REG; fe.id = rfe.id; fe.reg = &rfe; rimg = img_from_set(glob_imgset, CR_FD_FILES); - return pb_write_one(rimg, &fe, PB_FILE); + ret = pb_write_one(rimg, &fe, PB_FILE); + + if (rfe.build_id) + xfree(rfe.build_id); + + return ret; } const struct fdtype_ops regfile_dump_ops = { - .type = FD_TYPES__REG, - .dump = dump_one_reg_file, + .type = FD_TYPES__REG, + .dump = dump_one_reg_file, }; -static void convert_path_from_another_mp(char *src, char *dst, int dlen, - struct mount_info *smi, - struct mount_info *dmi) +static void convert_path_from_another_mp(char *src, char *dst, int dlen, struct mount_info *smi, struct mount_info *dmi) { int off; @@ -1367,10 +1862,7 @@ static void convert_path_from_another_mp(char *src, char *dst, int dlen, * Absolute path to the mount point + difference between source * and destination 
roots + path relative to the mountpoint. */ - snprintf(dst, dlen, "./%s/%s/%s", - dmi->ns_mountpoint + 1, - smi->root + strlen(dmi->root), - src + off); + snprintf(dst, dlen, "./%s/%s/%s", dmi->ns_mountpoint + 1, smi->root + strlen(dmi->root), src + off); } static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags) @@ -1384,7 +1876,7 @@ static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, if (ret == 0) return 0; - if (!( (errno == EPERM || errno == EOVERFLOW) && (root_ns_mask & CLONE_NEWUSER) )) { + if (!((errno == EPERM || errno == EOVERFLOW) && (root_ns_mask & CLONE_NEWUSER))) { errno_save = errno; pr_warn("Can't link %s -> %s\n", opath, npath); errno = errno_save; @@ -1463,34 +1955,46 @@ out: return ret; } -static void rm_parent_dirs(int mntns_root, char *path, int count) +int rm_parent_dirs(int mntns_root, char *path, int count) { char *p, *prev = NULL; + int ret = -1; - if (!count) - return; - - while (count > 0) { - count -= 1; + while (count-- > 0) { p = strrchr(path, '/'); - if (p) + if (p) { + /* We don't handle "//" in path */ + BUG_ON(prev && (prev - p == 1)); *p = '\0'; + } else { + /* Inconsistent path and count */ + pr_perror("Can't strrchr \"/\" in \"%s\"/\"%s\"]" + " left count=%d\n", + path, prev ? 
prev + 1 : "", count + 1); + goto err; + } + if (prev) *prev = '/'; - - if (unlinkat(mntns_root, path, AT_REMOVEDIR)) - pr_perror("Can't remove %s AT %d", path, mntns_root); - else - pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); prev = p; + + if (unlinkat(mntns_root, path, AT_REMOVEDIR)) { + pr_perror("Can't remove %s AT %d", path, mntns_root); + goto err; + } + pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); } + ret = 0; +err: if (prev) *prev = '/'; + + return ret; } /* Construct parent dir name and mkdir parent/grandparents if they're not exist */ -static int make_parent_dirs_if_need(int mntns_root, char *path) +int make_parent_dirs_if_need(int mntns_root, char *path) { char *p, *last_delim; int err, count = 0; @@ -1518,6 +2022,7 @@ static int make_parent_dirs_if_need(int mntns_root, char *path) err = mkdirat(mntns_root, path, 0777); if (err && errno != EEXIST) { pr_perror("Can't create dir: %s AT %d", path, mntns_root); + /* Failing anyway -> no retcode check */ rm_parent_dirs(mntns_root, path, count); count = -1; goto out; @@ -1538,6 +2043,9 @@ out: * This routine properly resolves d's path handling ghost/link-remaps. * The open_cb is a routine that does actual open, it differs for * files, directories, fifos, etc. + * + * Return 0 on success, -1 on error and 1 to indicate soft error, which can be + * retried. 
*/ static int rfi_remap(struct reg_file_info *rfi, int *level) @@ -1582,7 +2090,7 @@ static int rfi_remap(struct reg_file_info *rfi, int *level) BUG_ON(tmi->s_dev != rmi->s_dev); BUG_ON(tmi->s_dev != mi->s_dev); - /* Calcalate paths on the device (root mount) */ + /* Calculate paths on the device (root mount) */ convert_path_from_another_mp(rfi->path, path, sizeof(_path), mi, tmi); convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi); @@ -1599,25 +2107,95 @@ out_root: if (*level < 0) return -1; - if (linkat_hard(mntns_root, rpath, mntns_root, path, - rfi->remap->uid, rfi->remap->gid, 0) < 0) { + if (linkat_hard(mntns_root, rpath, mntns_root, path, rfi->remap->uid, rfi->remap->gid, 0) < 0) { int errno_saved = errno; - rm_parent_dirs(mntns_root, path, *level); - errno = errno_saved; + + if (!rm_parent_dirs(mntns_root, path, *level) && errno_saved == EEXIST) { + errno = errno_saved; + return 1; + } return -1; } return 0; } -int open_path(struct file_desc *d, - int(*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) +/* + * Compares the file's build-id with the stored value. + * Returns 1 if the build-id of the file matches the build-id that was stored + * while dumping, -1 if there is a mismatch or 0 if the build-id has not been + * stored or could not be obtained. 
+ */ +static int validate_with_build_id(const int fd, const struct stat *fd_status, const struct reg_file_info *rfi) { - int tmp, mntns_root, level = 0; + unsigned char *build_id; + int build_id_size; + + if (!rfi->rfe->has_size) + return 1; + + if (!rfi->rfe->n_build_id) + return 0; + + build_id = NULL; + build_id_size = get_build_id(fd, fd_status, &build_id); + if (!build_id || build_id_size == -1) + return 0; + + if (round_up(build_id_size, sizeof(uint32_t)) != rfi->rfe->n_build_id * sizeof(uint32_t)) { + pr_err("File %s has bad build-ID length %d (expect %d)\n", rfi->path, + round_up(build_id_size, sizeof(uint32_t)), (int)(rfi->rfe->n_build_id * sizeof(uint32_t))); + xfree(build_id); + return -1; + } + + if (memcmp(build_id, rfi->rfe->build_id, build_id_size)) { + pr_err("File %s has bad build-ID\n", rfi->path); + xfree(build_id); + return -1; + } + + xfree(build_id); + return 1; +} + +/* + * This function determines whether it was the same file that was open during dump + * by checking the file's size, build-id and/or checksum with the same metadata + * that was stored before dumping. + * Checksum is calculated with CRC32C. + * Returns true if the metadata of the file matches the metadata stored while + * dumping else returns false. 
+ */ +static bool validate_file(const int fd, const struct stat *fd_status, const struct reg_file_info *rfi) +{ + int result = 1; + + if (rfi->rfe->has_size && (fd_status->st_size != rfi->rfe->size)) { + pr_err("File %s has bad size %" PRIu64 " (expect %" PRIu64 ")\n", rfi->path, fd_status->st_size, + rfi->rfe->size); + return false; + } + + if (opts.file_validation_method == FILE_VALIDATION_BUILD_ID) + result = validate_with_build_id(fd, fd_status, rfi); + + if (result == -1) + return false; + + if (!result) + pr_info("File %s could only be validated with file size\n", rfi->path); + return true; +} + +int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) +{ + int tmp = -1, mntns_root, level = 0; struct reg_file_info *rfi; char *orig_path = NULL; char path[PATH_MAX]; int inh_fd = -1; + int ret; if (inherited_fd(d, &tmp)) return tmp; @@ -1628,7 +2206,7 @@ int open_path(struct file_desc *d, tmp = inherit_fd_lookup_id(rfi->rfe->name); if (tmp >= 0) { inh_fd = tmp; - /* + /* * PROC_SELF isn't used, because only service * descriptors can be used here. */ @@ -1654,15 +2232,9 @@ int open_path(struct file_desc *d, */ orig_path = rfi->path; rfi->path = rfi->remap->rpath; - } else if (rfi_remap(rfi, &level) < 0) { + } else if ((ret = rfi_remap(rfi, &level)) == 1) { static char tmp_path[PATH_MAX]; - if (errno != EEXIST) { - pr_perror("Can't link %s -> %s", - rfi->remap->rpath, rfi->path); - return -1; - } - /* * The file whose name we're trying to create * exists. 
Need to pick some other one, we're @@ -1676,12 +2248,15 @@ int open_path(struct file_desc *d, orig_path = rfi->path; rfi->path = tmp_path; snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path); - pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->rpath); + pr_debug("Fake %s -> %s link\n", rfi->remap->rpath, rfi->path); - if (rfi_remap(rfi, &level) < 0) { + if (rfi_remap(rfi, &level)) { pr_perror("Can't create even fake link!"); - return -1; + goto err; } + } else if (ret < 0) { + pr_perror("Can't link %s -> %s", rfi->remap->rpath, rfi->path); + goto err; } } @@ -1691,31 +2266,36 @@ ext: if (tmp < 0) { pr_perror("Can't open file %s", rfi->path); close_safe(&inh_fd); - return -1; + goto err; } close_safe(&inh_fd); - if ((rfi->rfe->has_size || rfi->rfe->has_mode) && - !rfi->size_mode_checked) { + if ((rfi->rfe->has_size || rfi->rfe->has_mode) && !rfi->size_mode_checked) { struct stat st; if (fstat(tmp, &st) < 0) { pr_perror("Can't fstat opened file"); - return -1; + goto err; } - if (rfi->rfe->has_size && (st.st_size != rfi->rfe->size)) { - pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n", - rfi->path, st.st_size, - rfi->rfe->size); - return -1; - } + if (!validate_file(tmp, &st, rfi)) + goto err; - if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { - pr_err("File %s has bad mode 0%o (expect 0%o)\n", - rfi->path, (int)st.st_mode, - rfi->rfe->mode); - return -1; + if (rfi->rfe->has_mode) { + mode_t curr_mode = st.st_mode; + mode_t saved_mode = rfi->rfe->mode; + + if (opts.skip_file_rwx_check) { + curr_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + saved_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + } + + if (curr_mode != saved_mode) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n" + "File r/w/x checks can be skipped with the --skip-file-rwx-check option\n", + rfi->path, (int)curr_mode, saved_mode); + goto err; + } } /* @@ -1728,8 +2308,18 @@ ext: if (rfi->remap) { if (!rfi->remap->is_dir) { - unlinkat(mntns_root, rfi->path, 0); - 
rm_parent_dirs(mntns_root, rfi->path, level); + struct mount_info *mi = lookup_mnt_id(rfi->rfe->mnt_id); + + if (mi && try_remount_writable(mi, true)) + goto err; + + pr_debug("Unlink: %d:%s\n", rfi->rfe->mnt_id, rfi->path); + if (unlinkat(mntns_root, rfi->path, 0)) { + pr_perror("Failed to unlink the remap file"); + goto err; + } + if (rm_parent_dirs(mntns_root, rfi->path, level)) + goto err; } mutex_unlock(remap_open_lock); @@ -1737,10 +2327,17 @@ ext: if (orig_path) rfi->path = orig_path; - if (restore_fown(tmp, rfi->rfe->fown)) + if (restore_fown(tmp, rfi->rfe->fown)) { + close(tmp); return -1; + } return tmp; +err: + if (rfi->remap) + mutex_unlock(remap_open_lock); + close_safe(&tmp); + return -1; } int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg) @@ -1773,11 +2370,16 @@ static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) if (fd < 0) return fd; - if ((rfi->rfe->pos != -1ULL) && - lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file pos"); - close(fd); - return -1; + /* + * O_PATH opened files carry empty fops in kernel, + * just ignore positioning at all. 
+ */ + if (!(rfi->rfe->flags & O_PATH)) { + if (rfi->rfe->pos != -1ULL && lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { + pr_perror("Can't restore file pos"); + close(fd); + return -1; + } } return fd; @@ -1843,10 +2445,10 @@ static struct filemap_ctx ctx; void filemap_ctx_init(bool auto_close) { - ctx.desc = NULL; /* to fail the first comparison in open_ */ - ctx.fd = -1; /* not to close random fd in _fini */ - ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ - /* flags may remain any */ + ctx.desc = NULL; /* to fail the first comparison in open_ */ + ctx.fd = -1; /* not to close random fd in _fini */ + ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ + /* flags may remain any */ ctx.close = auto_close; } @@ -1865,6 +2467,7 @@ static int open_filemap(int pid, struct vma_area *vma) { u32 flags; int ret; + int plugin_fd = -1; /* * The vma->fd should have been assigned in collect_filemap @@ -1875,8 +2478,38 @@ static int open_filemap(int pid, struct vma_area *vma) BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); flags = vma->e->fdflags; + /* update the new device file page offsets and file paths set during restore */ + if (vma->e->status & VMA_EXT_PLUGIN) { + uint64_t new_pgoff; + int ret; + + struct reg_file_info *rfi = container_of(vma->vmfd, struct reg_file_info, d); + ret = run_plugins(UPDATE_VMA_MAP, rfi->rfe->name, vma->e->start, vma->e->pgoff, &new_pgoff, &plugin_fd); + if (ret == 1) { + pr_info("New mmap %#016" PRIx64 ":%#016" PRIx64 "->%#016" PRIx64 " fd %d\n", vma->e->start, + vma->e->pgoff, new_pgoff, plugin_fd); + vma->e->pgoff = new_pgoff; + } + /* Device plugin will restore vma contents, so no need for write permission */ + vma->e->status |= VMA_NO_PROT_WRITE; + } + if (ctx.flags != flags || ctx.desc != vma->vmfd) { - ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); + if (plugin_fd >= 0) { + /* + * Vma handled by device plugin. 
+ * Some device drivers (e.g DRM) only allow the file descriptor that was used to create vma to + * be used when calling mmap. In this case, use the FD returned by plugin. FD can be copied + * using dup because dup returns a reference to the same struct file inside kernel, but we + * cannot open a new FD. + */ + ret = plugin_fd; + } else if (vma->e->status & VMA_AREA_MEMFD) { + if (!inherited_fd(vma->vmfd, &ret)) + ret = memfd_open(vma->vmfd, &flags, true); + } else { + ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); + } if (ret < 0) return ret; @@ -1899,14 +2532,16 @@ int collect_filemap(struct vma_area *vma) if (!vma->e->has_fdflags) { /* Make a wild guess for the fdflags */ vma->e->has_fdflags = true; - if ((vma->e->prot & PROT_WRITE) && - vma_area_is(vma, VMA_FILE_SHARED)) + if ((vma->e->prot & PROT_WRITE) && vma_area_is(vma, VMA_FILE_SHARED)) vma->e->fdflags = O_RDWR; else vma->e->fdflags = O_RDONLY; } - fd = collect_special_file(vma->e->shmid); + if (vma->e->status & VMA_AREA_MEMFD) + fd = collect_memfd(vma->e->shmid); + else + fd = collect_special_file(vma->e->shmid); if (!fd) return -1; diff --git a/criu/files.c b/criu/files.c index ffdaa459f..af4b8aeac 100644 --- a/criu/files.c +++ b/criu/files.c @@ -21,7 +21,7 @@ #include "image.h" #include "common/list.h" #include "rst-malloc.h" -#include "util-pie.h" +#include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" @@ -34,6 +34,7 @@ #include "sk-packet.h" #include "mount.h" #include "signalfd.h" +#include "memfd.h" #include "namespaces.h" #include "tun.h" #include "timerfd.h" @@ -44,8 +45,11 @@ #include "autofs.h" #include "parasite.h" #include "parasite-syscall.h" +#include "string.h" #include "kerndat.h" #include "fdstore.h" +#include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -54,7 +58,7 @@ #include "plugin.h" -#define FDESC_HASH_SIZE 64 +#define FDESC_HASH_SIZE 64 static struct hlist_head file_desc_hash[FDESC_HASH_SIZE]; /* 
file_desc's, which fle is not owned by a process, that is able to open them */ static LIST_HEAD(fake_master_head); @@ -75,8 +79,8 @@ void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops) INIT_LIST_HEAD(&d->fake_master_list); INIT_HLIST_NODE(&d->hash); - d->id = id; - d->ops = ops; + d->id = id; + d->ops = ops; } int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops) @@ -97,8 +101,7 @@ struct file_desc *find_file_desc_raw(int type, u32 id) chain = &file_desc_hash[id % FDESC_HASH_SIZE]; hlist_for_each_entry(d, chain, hash) - if ((d->id == id) && - (d->ops->type == type || type == FD_TYPES__UND)) + if ((d->id == id) && (d->ops->type == type || type == FD_TYPES__UND)) /* * Warning -- old CRIU might generate matching IDs * for different file types! So any code that uses @@ -181,6 +184,18 @@ out: return fd; } +int find_unused_fd_pid(pid_t pid) +{ + struct pstree_item *task; + + task = pstree_item_by_virt(pid); + if (!task) { + pr_err("Invalid pid:%d\n", pid); + return -1; + } + return find_unused_fd(task, -1); +} + int set_fds_event(pid_t virt) { struct pstree_item *item; @@ -216,8 +231,7 @@ struct fdinfo_list_entry *try_file_master(struct file_desc *d) if (list_empty(&d->fd_info_head)) return NULL; - return list_first_entry(&d->fd_info_head, - struct fdinfo_list_entry, desc_list); + return list_first_entry(&d->fd_info_head, struct fdinfo_list_entry, desc_list); } struct fdinfo_list_entry *file_master(struct file_desc *d) @@ -226,8 +240,7 @@ struct fdinfo_list_entry *file_master(struct file_desc *d) fle = try_file_master(d); if (!fle) { - pr_err("Empty list on file desc id %#x(%d)\n", d->id, - d->ops ? d->ops->type : -1); + pr_err("Empty list on file desc id %#x(%d)\n", d->id, d->ops ? d->ops->type : -1); BUG(); } @@ -286,13 +299,12 @@ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) * If the bug is present, the file path from /proc//fd * does not include the mountpoint, so we prepend it ourselves. 
*/ - if (strcmp("./", m->mountpoint) != 0) { + if (strcmp("./", m->ns_mountpoint) != 0) { char buf[PATH_MAX]; int n; - strncpy(buf, link->name, PATH_MAX); - buf[PATH_MAX - 1] = 0; - n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2); + __strlcpy(buf, link->name, PATH_MAX); + n = snprintf(link->name, PATH_MAX, "%s/%s", m->ns_mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); return -1; @@ -317,16 +329,13 @@ uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos) return st_dev ^ st_ino ^ pos_hi ^ pos_low; } -int do_dump_gen_file(struct fd_parms *p, int lfd, - const struct fdtype_ops *ops, FdinfoEntry *e) +int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e) { int ret = -1; - e->type = ops->type; - e->id = make_gen_id((uint32_t)p->stat.st_dev, - (uint32_t)p->stat.st_ino, - (uint64_t)p->pos); - e->fd = p->fd; + e->type = ops->type; + e->id = make_gen_id((uint32_t)p->stat.st_dev, (uint32_t)p->stat.st_ino, (uint64_t)p->pos); + e->fd = p->fd; e->flags = p->fd_flags; ret = fd_id_generate(p->pid, e, p); @@ -359,8 +368,7 @@ int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link) return 0; } -static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, - struct fd_opts *opts, struct fd_parms *p) +static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, struct fd_opts *opts, struct fd_parms *p) { int ret; struct statfs fsbuf; @@ -379,20 +387,29 @@ static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, if (parse_fdinfo_pid(owner_pid->real, fd, FD_TYPES__UND, &fdinfo)) return -1; - p->fs_type = fsbuf.f_type; - p->fd = fd; - p->pos = fdinfo.pos; - p->flags = fdinfo.flags; - p->mnt_id = fdinfo.mnt_id; - p->pid = owner_pid->real; - p->fd_flags = opts->flags; + p->fs_type = fsbuf.f_type; + p->fd = fd; + p->pos = fdinfo.pos; + /* + * The kernel artificially adds the O_CLOEXEC flag on the file pointer + * flags by looking at the 
flags on the file descriptor (see kernel + * code fs/proc/fd.c). FD_CLOEXEC is a file descriptor property, which + * is saved in fd_flags. + */ + p->flags = fdinfo.flags & ~O_CLOEXEC; + p->mnt_id = fdinfo.mnt_id; + p->pid = owner_pid->real; + p->fd_flags = opts->flags; fown_entry__init(&p->fown); - pr_info("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n", - owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags); + pr_info("%d fdinfo %d: pos: %#16" PRIx64 " flags: %16o/%#x\n", owner_pid->real, fd, p->pos, p->flags, + (int)p->fd_flags); - ret = fcntl(lfd, F_GETSIG, 0); + if (p->flags & O_PATH) + ret = 0; + else + ret = fcntl(lfd, F_GETSIG, 0); if (ret < 0) { pr_perror("Can't get owner signum on %d", lfd); return -1; @@ -402,10 +419,10 @@ static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, if (opts->fown.pid == 0) return 0; - p->fown.pid = opts->fown.pid; + p->fown.pid = opts->fown.pid; p->fown.pid_type = opts->fown.pid_type; - p->fown.uid = opts->fown.uid; - p->fown.euid = opts->fown.euid; + p->fown.uid = opts->fown.uid; + p->fown.euid = opts->fown.euid; return 0; } @@ -478,9 +495,8 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) return err; } -static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, - struct parasite_ctl *ctl, FdinfoEntry *e, - struct parasite_drain_fd *dfds) +static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, struct parasite_ctl *ctl, + FdinfoEntry *e, struct parasite_drain_fd *dfds) { struct fd_parms p = FD_PARMS_INIT; const struct fdtype_ops *ops; @@ -503,7 +519,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, } p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ - p.dfds = dfds; /* epoll needs to verify if target fd exist */ + p.dfds = dfds; /* epoll needs to verify if target fd exist */ if (S_ISSOCK(p.stat.st_mode)) return dump_socket(&p, lfd, e); @@ -529,24 +545,40 @@ static int 
dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; +#ifdef CONFIG_HAS_LIBBPF + else if (is_bpfmap_link(link)) + ops = &bpfmap_dump_ops; +#endif else return dump_unsupp_fd(&p, lfd, "anon", link, e); return do_dump_gen_file(&p, lfd, ops, e); } - if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode)) { + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; p.link = &link; - if (link.name[1] == '/') - return do_dump_gen_file(&p, lfd, ®file_dump_ops, e); - if (check_ns_proc(&link)) - return do_dump_gen_file(&p, lfd, &nsfile_dump_ops, e); + /* TODO: Dump for hugetlb fd when memfd hugetlb is not supported */ + if (is_memfd(p.stat.st_dev) || (kdat.has_memfd_hugetlb && is_hugetlb_dev(p.stat.st_dev, NULL))) + ops = &memfd_dump_ops; + else if (link.name[1] == '/') + ops = ®file_dump_ops; + else if (check_ns_proc(&link)) + ops = &nsfile_dump_ops; + else + return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); - return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); + return do_dump_gen_file(&p, lfd, ops, e); } if (S_ISFIFO(p.stat.st_mode)) { @@ -572,13 +604,13 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, int dump_my_file(int lfd, u32 *id, int *type) { struct pid me = {}; - struct fd_opts fo = {}; + struct fd_opts fdo = {}; FdinfoEntry e = FDINFO_ENTRY__INIT; me.real = getpid(); me.ns[0].virt = -1; /* FIXME */ - if (dump_one_file(&me, lfd, lfd, &fo, NULL, &e, NULL)) + if (dump_one_file(&me, lfd, lfd, &fdo, NULL, &e, NULL)) return -1; *id = e.id; @@ -586,14 +618,13 @@ int dump_my_file(int lfd, u32 *id, int *type) return 0; } -int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item 
*item, - struct parasite_drain_fd *dfds) +int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) { int *lfds = NULL; struct cr_img *img = NULL; struct fd_opts *opts = NULL; int i, ret = -1; - int off, nr_fds = min((int) PARASITE_MAX_FDS, dfds->nr_fds); + int off, nr_fds = min((int)PARASITE_MAX_FDS, dfds->nr_fds); pr_info("\n"); pr_info("Dumping opened files (pid: %d)\n", item->pid->real); @@ -616,16 +647,14 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, if (nr_fds + off > dfds->nr_fds) nr_fds = dfds->nr_fds - off; - ret = parasite_drain_fds_seized(ctl, dfds, nr_fds, - off, lfds, opts); + ret = parasite_drain_fds_seized(ctl, dfds, nr_fds, off, lfds, opts); if (ret) goto err; for (i = 0; i < nr_fds; i++) { FdinfoEntry e = FDINFO_ENTRY__INIT; - ret = dump_one_file(item->pid, dfds->fds[i + off], - lfds[i], opts + i, ctl, &e, dfds); + ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); if (ret) break; @@ -790,8 +819,7 @@ static void __collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_des list_add(&new_le->desc_list, &le->desc_list); } -static void collect_desc_fle(struct fdinfo_list_entry *new_le, - struct file_desc *fdesc, bool force_master) +static void collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc, bool force_master) { new_le->desc = fdesc; @@ -803,9 +831,8 @@ static void collect_desc_fle(struct fdinfo_list_entry *new_le, } } -struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, - struct rst_info *rst_info, struct file_desc *fdesc, - bool fake, bool force_master) +struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info *rst_info, struct file_desc *fdesc, + bool fake, bool force_master) { struct fdinfo_list_entry *new_le; @@ -823,8 +850,7 @@ int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) { struct file_desc *fdesc; - pr_info("Collect 
fdinfo pid=%d fd=%d id=%#x\n", - pid, e->fd, e->id); + pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", pid, e->fd, e->id); fdesc = find_file_desc(e); if (fdesc == NULL) { @@ -848,15 +874,14 @@ FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags) fdinfo_entry__init(e); - e->id = old->id; - e->type = old->type; - e->fd = fd; - e->flags = flags; + e->id = old->id; + e->type = old->type; + e->fd = fd; + e->flags = flags; return e; } -int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, - int fd, unsigned flags) +int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, int fd, unsigned flags) { FdinfoEntry *e; @@ -932,7 +957,8 @@ int set_fd_flags(int fd, int flags) if (ret != flags) { pr_err("fcntl call on fd %d (flags %#o) succeeded, " - "but some flags were dropped: %#o\n", fd, flags, ret); + "but some flags were dropped: %#o\n", + fd, flags, ret); return -1; } return 0; @@ -952,7 +978,7 @@ static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d", pid); + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%s", pid, criu_run_id); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } @@ -1109,12 +1135,12 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) static int open_fd(struct fdinfo_list_entry *fle) { struct file_desc *d = fle->desc; - struct fdinfo_list_entry *flem; + struct fdinfo_list_entry *fle_m; int new_fd = -1, ret; - flem = file_master(d); - if (fle != flem) { - BUG_ON (fle->stage != FLE_INITIALIZED); + fle_m = file_master(d); + if (fle != fle_m) { + BUG_ON(fle->stage != FLE_INITIALIZED); ret = receive_fd(fle); if (ret != 0) return ret; @@ -1192,8 +1218,7 @@ static int open_fdinfos(struct pstree_item *me) BUG_ON(st == FLE_RESTORED); ret = open_fd(fle); if (ret == -1) { - pr_err("Unable to open fd=%d id=%#x\n", - fle->fe->fd, fle->fe->id); 
+ pr_err("Unable to open fd=%d id=%#x\n", fle->fe->fd, fle->fe->id); goto splice; } if (st != fle->stage || ret == 0) @@ -1211,7 +1236,7 @@ static int open_fdinfos(struct pstree_item *me) list_add(&fle->ps_list, &fake); } if (ret == 1) - again = true; + again = true; } if (!progress && again) wait_fds_event(); @@ -1236,6 +1261,14 @@ int close_old_fds(void) struct dirent *de; int fd, ret; + /** + * Close previous /proc/self/ service fd, as we don't want to reuse it + * from a different task. Also there can be some junk fd in it's place + * after we've moved our service fds (e.g. from other task of parents + * shared fdtable), we need to close it before opendir_proc() below. + */ + __close_service_fd(PROC_SELF_FD_OFF); + dir = opendir_proc(PROC_SELF, "fd"); if (dir == NULL) return -1; @@ -1277,7 +1310,6 @@ int prepare_fds(struct pstree_item *me) sfds_protected = false; close_service_fd(CGROUP_YARD); sfds_protected = true; - set_proc_self_fd(-1); /* flush any proc cached fds we may have */ if (rsti(me)->fdt) { struct fdt *fdt = rsti(me)->fdt; @@ -1297,7 +1329,6 @@ int prepare_fds(struct pstree_item *me) } } - BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) @@ -1322,10 +1353,35 @@ static int fchroot(int fd) return chroot("."); } +static int need_chroot(int saved_root) +{ + struct stat saved_root_stat, cur_root_stat; + int psd; + + if (fstat(saved_root, &saved_root_stat) == -1) { + pr_perror("Failed to stat saved root dir"); + return -1; + } + + psd = open_pid_proc(PROC_SELF); + if (psd < 0) { + pr_perror("Failed to open PROC_SELF"); + return -1; + } + + if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { + pr_perror("Failed to stat current root dir"); + return -1; + } + + return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; +} + int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); + bool do_chroot = true; /* * 
First -- open both descriptors. We will not @@ -1344,15 +1400,24 @@ int restore_fs(struct pstree_item *me) goto out; } + /* + * In unprivileged mode chroot() may fail if we don't have + * sufficient privileges, therefore only do it if the process + * is actually chrooted. + */ + if (opts.unprivileged) + do_chroot = need_chroot(dd_root); + /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. */ - - ret = fchroot(dd_root); - if (ret < 0) { - pr_perror("Can't change root"); - goto out; + if (do_chroot) { + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; + } } ret = fchdir(dd_cwd); @@ -1474,8 +1539,8 @@ int shared_fdt_prepare(struct pstree_item *item) struct inherit_fd { struct list_head inh_list; - char *inh_id; /* file identifier */ - int inh_fd; /* criu's descriptor to inherit */ + char *inh_id; /* file identifier */ + int inh_fd; /* criu's descriptor to inherit */ int inh_fd_id; }; @@ -1514,8 +1579,7 @@ int inherit_fd_parse(char *optarg) if (dbg) { n = strlen(cp); if (write(fd, cp, n) != n) { - pr_err("Can't write debug message %s to inherit fd %d\n", - cp, fd); + pr_err("Can't write debug message %s to inherit fd %d\n", cp, fd); return -1; } return 0; @@ -1541,7 +1605,12 @@ int inherit_fd_add(int fd, char *key) if (fd > inh_fd_max) inh_fd_max = fd; - inh->inh_id = key; + inh->inh_id = xstrdup(key); + if (inh->inh_id == NULL) { + xfree(inh); + return -1; + } + inh->inh_fd = fd; list_add_tail(&inh->inh_list, &opts.inherit_fds); return 0; @@ -1556,8 +1625,7 @@ void inherit_fd_log(void) struct inherit_fd *inh; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { - pr_info("File %s will be restored from inherit fd %d\n", - inh->inh_id, inh->inh_fd); + pr_info("File %s will be restored from inherit fd %d\n", inh->inh_id, inh->inh_fd); } } @@ -1587,8 +1655,7 @@ int inherit_fd_lookup_id(char *id) list_for_each_entry(inh, &opts.inherit_fds, inh_list) { if 
(!strcmp(inh->inh_id, id)) { ret = fdstore_get(inh->inh_fd_id); - pr_debug("Found id %s (fd %d) in inherit fd list\n", - id, ret); + pr_debug("Found id %s (fd %d) in inherit fd list\n", id, ret); break; } } @@ -1597,7 +1664,7 @@ int inherit_fd_lookup_id(char *id) bool inherited_fd(struct file_desc *d, int *fd_p) { - char buf[32], *id_str; + char buf[PATH_MAX], *id_str; int i_fd; if (!d->ops->name) @@ -1613,7 +1680,8 @@ bool inherited_fd(struct file_desc *d, int *fd_p) *fd_p = i_fd; pr_info("File %s will be restored from fd %d dumped " - "from inherit fd %d\n", id_str, *fd_p, i_fd); + "from inherit fd %d\n", + id_str, *fd_p, i_fd); return true; } @@ -1643,8 +1711,7 @@ out: return ret; } -static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, - struct collect_image_info *cinfo) +static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, struct collect_image_info *cinfo) { if (fe->id != id) { pr_err("ID mismatch %u != %u\n", fe->id, id); @@ -1715,6 +1782,17 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__TTY: ret = collect_one_file_entry(fe, fe->tty->id, &fe->tty->base, &tty_cinfo); break; + case FD_TYPES__MEMFD: + ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); + break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; +#ifdef CONFIG_HAS_LIBBPF + case FD_TYPES__BPFMAP: + ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); + break; +#endif } return ret; @@ -1731,5 +1809,7 @@ struct collect_image_info files_cinfo = { int prepare_files(void) { init_fdesc_hash(); + init_sk_info_hash(); + init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/filesystems.c b/criu/filesystems.c index 1e4550b37..093e1c492 100644 --- a/criu/filesystems.c +++ b/criu/filesystems.c @@ -44,7 +44,6 @@ static int 
binfmt_misc_parse_or_collect(struct mount_info *pm) { opts.has_binfmt_misc = true; return 0; - } static int binfmt_misc_virtual(struct mount_info *pm) @@ -78,13 +77,13 @@ static int parse_binfmt_misc_entry(struct bfd *f, BinfmtMiscEntry *bme) continue; } -#define DUP_EQUAL_AS(key, member) \ - if (!strncmp(str, key, strlen(key))) { \ - bme->member = xstrdup(str + strlen(key)); \ - if (!bme->member) \ - return -1; \ - continue; \ - } +#define DUP_EQUAL_AS(key, member) \ + if (!strncmp(str, key, strlen(key))) { \ + bme->member = xstrdup(str + strlen(key)); \ + if (!bme->member) \ + return -1; \ + continue; \ + } DUP_EQUAL_AS("interpreter ", interpreter) DUP_EQUAL_AS("flags: ", flags) DUP_EQUAL_AS("extension .", extension) @@ -130,7 +129,6 @@ err: free(bme.mask); bclose(&f); return ret; - } static int binfmt_misc_dump(struct mount_info *pm) @@ -192,7 +190,7 @@ out: static int write_binfmt_misc_entry(char *mp, char *buf, BinfmtMiscEntry *bme) { int fd, len, ret = -1; - char path[PATH_MAX+1]; + char path[PATH_MAX + 1]; snprintf(path, PATH_MAX, "%s/register", mp); @@ -243,9 +241,9 @@ static int make_bfmtm_magic_str(char *buf, BinfmtMiscEntry *bme) * dump them without changes. But for registering a new entry * it expects every byte is prepended with \x, i.e. \x61\x62\x63. */ - len = strlen(bme->name) + 3 /* offset < 128 */ + 2 * strlen(bme->magic) - + (bme->mask ? 2 * strlen(bme->mask) : 0) + strlen(bme->interpreter) - + (bme->flags ? strlen(bme->flags) : 0) + strlen(":::::::"); + len = strlen(bme->name) + 3 /* offset < 128 */ + 2 * strlen(bme->magic) + + (bme->mask ? 2 * strlen(bme->mask) : 0) + strlen(bme->interpreter) + + (bme->flags ? strlen(bme->flags) : 0) + strlen(":::::::"); if ((len > BINFMT_MISC_STR - 1) || bme->offset > 128) return -1; @@ -264,7 +262,7 @@ static int make_bfmtm_magic_str(char *buf, BinfmtMiscEntry *bme) buf += sprintf(buf, "\\x%c%c", bme->mask[i], bme->mask[i + 1]); } - sprintf(buf, ":%s:%s", bme->interpreter, bme->flags ? 
: "\0"); + sprintf(buf, ":%s:%s", bme->interpreter, bme->flags ?: "\0"); return 1; } @@ -281,9 +279,8 @@ static int binfmt_misc_restore_bme(struct mount_info *mi, BinfmtMiscEntry *bme, ret = make_bfmtm_magic_str(buf, bme); } else if (bme->extension) { /* :name:E::extension::interpreter:flags */ - ret = snprintf(buf, BINFMT_MISC_STR, ":%s:E::%s::%s:%s", - bme->name, bme->extension, bme->interpreter, - bme->flags ? : "\0"); + ret = snprintf(buf, BINFMT_MISC_STR, ":%s:E::%s::%s:%s", bme->name, bme->extension, bme->interpreter, + bme->flags ?: "\0"); if (ret >= BINFMT_MISC_STR) /* output truncated */ ret = -1; } else @@ -293,7 +290,7 @@ static int binfmt_misc_restore_bme(struct mount_info *mi, BinfmtMiscEntry *bme, goto bad_dump; pr_debug("binfmt_misc_pattern=%s\n", buf); - ret = write_binfmt_misc_entry(mi->mountpoint, buf, bme); + ret = write_binfmt_misc_entry(service_mountpoint(mi), buf, bme); return ret; @@ -374,8 +371,8 @@ int collect_binfmt_misc(void) return collect_image(&binfmt_misc_cinfo); } #else -#define binfmt_misc_dump NULL -#define binfmt_misc_restore NULL +#define binfmt_misc_dump NULL +#define binfmt_misc_restore NULL #define binfmt_misc_parse_or_collect NULL #endif @@ -419,17 +416,11 @@ static int tmpfs_dump(struct mount_info *pm) if (root_ns_mask & CLONE_NEWUSER) userns_pid = root_item->pid->real; - ret = cr_system_userns(fd, img_raw_fd(img), -1, "tar", (char *[]) - { "tar", "--create", - "--gzip", - "--no-unquote", - "--no-wildcards", - "--one-file-system", - "--check-links", - "--preserve-permissions", - "--sparse", - "--numeric-owner", - "--directory", "/proc/self/fd/0", ".", NULL }, 0, userns_pid); + ret = cr_system_userns(fd, img_raw_fd(img), -1, "tar", + (char *[]){ "tar", "--create", "--gzip", "--no-unquote", "--no-wildcards", + "--one-file-system", "--check-links", "--preserve-permissions", "--sparse", + "--numeric-owner", "--directory", "/proc/self/fd/0", ".", NULL }, + 0, userns_pid); if (ret) pr_err("Can't dump tmpfs content\n"); @@ -460,9 
+451,9 @@ static int tmpfs_restore(struct mount_info *pm) } ret = cr_system(img_raw_fd(img), -1, -1, "tar", - (char *[]) {"tar", "--extract", "--gzip", - "--no-unquote", "--no-wildcards", - "--directory", pm->mountpoint, NULL}, 0); + (char *[]){ "tar", "--extract", "--gzip", "--no-unquote", "--no-wildcards", "--directory", + service_mountpoint(pm), NULL }, + 0); close_image(img); if (ret) { @@ -556,9 +547,9 @@ static int fusectl_dump(struct mount_info *pm) } for (it = mntinfo; it; it = it->next) { - if (it->fstype->code == FSTYPE__FUSE && - id == kdev_minor(it->s_dev) && !it->external) { - pr_err("%s is a fuse mount but not external\n", it->mountpoint); + if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && + !mnt_is_external_bind(it)) { + pr_err("%s is a fuse mount but not external\n", it->ns_mountpoint); goto out; } } @@ -588,8 +579,7 @@ static int tracefs_parse(struct mount_info *pm) static bool cgroup_sb_equal(struct mount_info *a, struct mount_info *b) { - if (a->private && b->private && - strcmp(a->private, b->private)) + if (a->private && b->private && strcmp(a->private, b->private)) return false; if (strcmp(a->options, b->options)) return false; @@ -656,13 +646,12 @@ static int dump_empty_fs(struct mount_info *pm) return fd; ret = is_empty_dir(fd); - close(fd); - if (ret < 0) { + if (ret == 0) { pr_err("%s isn't empty\n", pm->fstype->name); return -1; } - return ret ? 0 : -1; + return ret == 1 ? 
0 : -1; } /* @@ -671,8 +660,7 @@ static int dump_empty_fs(struct mount_info *pm) */ static int always_fail(struct mount_info *pm) { - pr_err("failed to dump fs %s (%s): always fail\n", pm->mountpoint, - pm->fstype->name); + pr_err("failed to dump fs %s (%s): always fail\n", pm->ns_mountpoint, pm->fstype->name); return -1; } @@ -680,87 +668,113 @@ static struct fstype fstypes[] = { { .name = "unsupported", .code = FSTYPE__UNSUPPORTED, - }, { + }, + { .name = "auto_cr", .code = FSTYPE__AUTO, - }, { + }, + { .name = "proc", .code = FSTYPE__PROC, - }, { + }, + { .name = "sysfs", .code = FSTYPE__SYSFS, - }, { + }, + { .name = "devtmpfs", .code = FSTYPE__DEVTMPFS, .dump = devtmpfs_dump, .restore = devtmpfs_restore, - }, { + }, + { .name = "binfmt_misc", .parse = binfmt_misc_parse_or_collect, .collect = binfmt_misc_parse_or_collect, .code = FSTYPE__BINFMT_MISC, .dump = binfmt_misc_dump, .restore = binfmt_misc_restore, - }, { + }, + { .name = "tmpfs", .code = FSTYPE__TMPFS, .dump = tmpfs_dump, .restore = tmpfs_restore, - }, { + }, + { .name = "devpts", .parse = devpts_parse, .code = FSTYPE__DEVPTS, .restore = devpts_restore, .check_bindmount = devpts_check_bindmount, - }, { + }, + { .name = "simfs", .code = FSTYPE__SIMFS, - }, { + }, + { .name = "btrfs", .code = FSTYPE__UNSUPPORTED, .sb_equal = btrfs_sb_equal, - }, { + }, + { .name = "pstore", .dump = dump_empty_fs, .code = FSTYPE__PSTORE, - }, { + }, + { .name = "mqueue", .dump = dump_empty_fs, .code = FSTYPE__MQUEUE, - }, { + }, + { .name = "securityfs", .code = FSTYPE__SECURITYFS, - }, { + }, + { .name = "fusectl", .dump = fusectl_dump, .code = FSTYPE__FUSECTL, - }, { + }, + { .name = "debugfs", .code = FSTYPE__DEBUGFS, .parse = debugfs_parse, - }, { + }, + { .name = "tracefs", .code = FSTYPE__TRACEFS, .parse = tracefs_parse, - }, { + }, + { .name = "cgroup", .code = FSTYPE__CGROUP, .parse = cgroup_parse, .sb_equal = cgroup_sb_equal, - }, { + }, + { + .name = "cgroup2", + .code = FSTYPE__CGROUP2, + .parse = 
cgroup_parse, + .sb_equal = cgroup_sb_equal, + }, + { .name = "aufs", .code = FSTYPE__AUFS, .parse = aufs_parse, - }, { + }, + { .name = "fuse", .code = FSTYPE__FUSE, .dump = always_fail, .restore = always_fail, - }, { + }, + { .name = "overlay", .code = FSTYPE__OVERLAYFS, .parse = overlayfs_parse, - }, { + }, + { .name = "autofs", .code = FSTYPE__AUTOFS, .parse = autofs_parse, @@ -769,7 +783,10 @@ static struct fstype fstypes[] = { }, }; -struct fstype *fstype_auto(void) { return &fstypes[1]; } +struct fstype *fstype_auto(void) +{ + return &fstypes[1]; +} static char fsauto_all[] = "all"; static char *fsauto_names; @@ -813,9 +830,11 @@ bool add_fsname_auto(const char *names) if (css_contains(names, fsauto_all)) fsauto_names = fsauto_all; - else if (!old) + else if (!old) { fsauto_names = xstrdup(names); - else { + if (!fsauto_names) + abort(); + } else { if (asprintf(&fsauto_names, "%s,%s", old, names) < 0) fsauto_names = NULL; } @@ -867,4 +886,3 @@ struct fstype *decode_fstype(u32 fst) uns: return &fstypes[0]; } - diff --git a/criu/fsnotify.c b/criu/fsnotify.c index 09093c0be..8572dc2f3 100644 --- a/criu/fsnotify.c +++ b/criu/fsnotify.c @@ -46,26 +46,26 @@ #include "images/fsnotify.pb-c.h" #include "images/mnt.pb-c.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "fsnotify: " struct fsnotify_mark_info { - struct list_head list; + struct list_head list; union { - InotifyWdEntry *iwe; - FanotifyMarkEntry *fme; + InotifyWdEntry *iwe; + FanotifyMarkEntry *fme; }; - struct pprep_head prep; /* XXX union with remap */ - struct file_remap *remap; + struct pprep_head prep; /* XXX union with remap */ + struct file_remap *remap; }; struct fsnotify_file_info { union { - InotifyFileEntry *ife; - FanotifyFileEntry *ffe; + InotifyFileEntry *ife; + FanotifyFileEntry *ffe; }; - struct list_head marks; - struct file_desc d; + struct list_head marks; + struct file_desc d; }; /* File handle */ @@ -91,12 +91,10 @@ static void decode_handle(fh_t *handle, FhEntry *img) { 
memzero(handle, sizeof(*handle)); - handle->type = img->type; - handle->bytes = img->bytes; + handle->type = img->type; + handle->bytes = img->bytes; - memcpy(handle->__handle, img->handle, - min(pb_repeated_size(img, handle), - sizeof(handle->__handle))); + memcpy(handle->__handle, img->handle, min(pb_repeated_size(img, handle), sizeof(handle->__handle))); } static int open_by_handle(void *arg, int fd, int pid) @@ -104,12 +102,15 @@ static int open_by_handle(void *arg, int fd, int pid) return syscall(__NR_open_by_handle_at, fd, arg, O_PATH); } +enum { ERR_NO_MOUNT = -1, ERR_NO_PATH_IN_MOUNT = -2, ERR_GENERIC = -3 }; + static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { struct mount_info *m; fh_t handle; int fd = -1; char *path; + char suitable_mount_found = 0; decode_handle(&handle, f_handle); @@ -131,17 +132,17 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ if (!mnt_is_dir(m)) continue; - mntfd = __open_mountpoint(m, -1); - pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n", - m->mnt_id, m->root, m->ns_mountpoint, mntfd); + mntfd = __open_mountpoint(m); + pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n", m->mnt_id, m->root, + m->ns_mountpoint, mntfd); if (mntfd < 0) continue; - fd = userns_call(open_by_handle, UNS_FDOUT, &handle, - sizeof(handle), mntfd); + fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd); close(mntfd); if (fd < 0) continue; + suitable_mount_found = 1; if (read_fd_link(fd, buf, sizeof(buf)) < 0) { close(fd); @@ -164,18 +165,17 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ if (fstat(openable_fd, &st)) { pr_perror("Can't stat on %s", __path); close(openable_fd); - return ERR_PTR(-errno); + goto err; } close(openable_fd); - pr_debug("\t\t\topenable (inode %s) as %s\n", - st.st_ino == i_ino ? 
- "match" : "don't match", __path); + pr_debug("\t\t\topenable (inode %s) as %s\n", st.st_ino == i_ino ? "match" : "don't match", + __path); if (st.st_ino == i_ino) { path = xstrdup(buf); if (path == NULL) - return ERR_PTR(-ENOMEM); + return ERR_PTR(ERR_GENERIC); if (root_ns_mask & CLONE_NEWNS) { f_handle->has_mnt_id = true; f_handle->mnt_id = m->mnt_id; @@ -183,16 +183,16 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ return path; } } else - pr_debug("\t\t\tnot openable as %s (%m)\n", __path); + pr_debug("\t\t\tnot openable as %s (%s)\n", __path, strerror(errno)); } - return ERR_PTR(-ENOENT); err: - return ERR_PTR(-1); + if (suitable_mount_found) + return ERR_PTR(ERR_NO_PATH_IN_MOUNT); + return ERR_PTR(ERR_NO_MOUNT); } -static int open_handle(unsigned int s_dev, unsigned long i_ino, - FhEntry *f_handle) +static int open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { struct mount_info *m; int mntfd, fd = -1; @@ -200,16 +200,15 @@ static int open_handle(unsigned int s_dev, unsigned long i_ino, decode_handle(&handle, f_handle); - pr_debug("Opening fhandle %x:%llx...\n", - s_dev, (unsigned long long)handle.__handle[0]); + pr_debug("Opening fhandle %x:%llx...\n", s_dev, (unsigned long long)handle.__handle[0]); for (m = mntinfo; m; m = m->next) { if (m->s_dev != s_dev || !mnt_is_dir(m)) continue; - mntfd = __open_mountpoint(m, -1); + mntfd = __open_mountpoint(m); if (mntfd < 0) { - pr_err("Can't open mount for s_dev %x, continue\n", s_dev); + pr_warn("Can't open mount for s_dev %x, continue\n", s_dev); continue; } @@ -224,67 +223,61 @@ out: return fd; } -int check_open_handle(unsigned int s_dev, unsigned long i_ino, - FhEntry *f_handle) +int check_open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) { char *path, *irmap_path; - int fd = -1; + struct mount_info *mi; - if (fault_injected(FI_CHECK_OPEN_HANDLE)) { - fd = -1; + if (fault_injected(FI_CHECK_OPEN_HANDLE)) goto fault; - } - fd = 
open_handle(s_dev, i_ino, f_handle); -fault: - if (fd >= 0) { - struct mount_info *mi; + /* + * Always try to fetch watchee path first. There are several reasons: + * + * - tmpfs/devtmps do not save inode numbers between mounts, + * so it is critical to have the complete path under our + * hands for restore purpose; + * + * - in case of migration the inodes might be changed as well + * so the only portable solution is to carry the whole path + * to the watchee inside image. + */ + path = alloc_openable(s_dev, i_ino, f_handle); + if (!IS_ERR_OR_NULL(path)) { pr_debug("\tHandle 0x%x:0x%lx is openable\n", s_dev, i_ino); - - mi = lookup_mnt_sdev(s_dev); - if (mi == NULL) { - pr_err("Unable to lookup a mount by dev 0x%x\n", s_dev); - goto err; - } - - /* - * Always try to fetch watchee path first. There are several reasons: - * - * - tmpfs/devtmps do not save inode numbers between mounts, - * so it is critical to have the complete path under our - * hands for restore purpose; - * - * - in case of migration the inodes might be changed as well - * so the only portable solution is to carry the whole path - * to the watchee inside image. - */ - path = alloc_openable(s_dev, i_ino, f_handle); - if (!IS_ERR_OR_NULL(path)) - goto out; - else if (IS_ERR(path) && PTR_ERR(path) == -ENOMEM) - goto err; - - if ((mi->fstype->code == FSTYPE__TMPFS) || - (mi->fstype->code == FSTYPE__DEVTMPFS)) { - pr_err("Can't find suitable path for handle (dev %#x ino %#lx): %d\n", - s_dev, i_ino, (int)PTR_ERR(path)); - goto err; - } - - if (!opts.force_irmap) - /* - * If we're not forced to do irmap, then - * say we have no path for watch. Otherwise - * do irmap scan even if the handle is - * working. 
- * - * FIXME -- no need to open-by-handle if - * we are in force-irmap and not on tempfs - */ - goto out_nopath; + goto out; + } else if (IS_ERR(path) && PTR_ERR(path) == ERR_NO_MOUNT) { + goto fault; + } else if (IS_ERR(path) && PTR_ERR(path) == ERR_GENERIC) { + goto err; } + mi = lookup_mnt_sdev(s_dev); + if (mi == NULL) { + pr_err("Unable to lookup a mount by dev 0x%x\n", s_dev); + goto err; + } + + if ((mi->fstype->code == FSTYPE__TMPFS) || (mi->fstype->code == FSTYPE__DEVTMPFS)) { + pr_err("Can't find suitable path for handle (dev %#x ino %#lx): %d\n", s_dev, i_ino, + (int)PTR_ERR(path)); + goto err; + } + + if (!opts.force_irmap) + /* + * If we're not forced to do irmap, then + * say we have no path for watch. Otherwise + * do irmap scan even if the handle is + * working. + * + * FIXME -- no need to open-by-handle if + * we are in force-irmap and not on tempfs + */ + goto out_nopath; + +fault: pr_warn("\tHandle 0x%x:0x%lx cannot be opened\n", s_dev, i_ino); irmap_path = irmap_lookup(s_dev, i_ino); if (!irmap_path) { @@ -298,20 +291,16 @@ out: pr_debug("\tDumping %s as path for handle\n", path); f_handle->path = path; out_nopath: - close_safe(&fd); return 0; err: - close_safe(&fd); return -1; } static int check_one_wd(InotifyWdEntry *we) { - pr_info("wd: wd %#08x s_dev %#08x i_ino %#16"PRIx64" mask %#08x\n", - we->wd, we->s_dev, we->i_ino, we->mask); - pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016"PRIx64":%#016"PRIx64"\n", - we->f_handle->bytes, we->f_handle->type, - we->f_handle->handle[0], we->f_handle->handle[1]); + pr_info("wd: wd %#08x s_dev %#08x i_ino %#16" PRIx64 " mask %#08x\n", we->wd, we->s_dev, we->i_ino, we->mask); + pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016" PRIx64 ":%#016" PRIx64 "\n", we->f_handle->bytes, + we->f_handle->type, we->f_handle->handle[0], we->f_handle->handle[1]); if (we->mask & KERNEL_FS_EVENT_ON_CHILD) pr_warn_once("\t\tDetected FS_EVENT_ON_CHILD bit " @@ -384,23 +373,21 @@ static int 
pre_dump_one_inotify(int pid, int lfd) } const struct fdtype_ops inotify_dump_ops = { - .type = FD_TYPES__INOTIFY, - .dump = dump_one_inotify, - .pre_dump = pre_dump_one_inotify, + .type = FD_TYPES__INOTIFY, + .dump = dump_one_inotify, + .pre_dump = pre_dump_one_inotify, }; static int check_one_mark(FanotifyMarkEntry *fme) { if (fme->type == MARK_TYPE__INODE) { - BUG_ON(!fme->ie); - pr_info("mark: s_dev %#08x i_ino %#016"PRIx64" mask %#08x\n", - fme->s_dev, fme->ie->i_ino, fme->mask); + pr_info("mark: s_dev %#08x i_ino %#016" PRIx64 " mask %#08x\n", fme->s_dev, fme->ie->i_ino, fme->mask); - pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016"PRIx64":%#016"PRIx64"\n", - fme->ie->f_handle->bytes, fme->ie->f_handle->type, - fme->ie->f_handle->handle[0], fme->ie->f_handle->handle[1]); + pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016" PRIx64 ":%#016" PRIx64 "\n", + fme->ie->f_handle->bytes, fme->ie->f_handle->type, fme->ie->f_handle->handle[0], + fme->ie->f_handle->handle[1]); if (check_open_handle(fme->s_dev, fme->ie->i_ino, fme->ie->f_handle)) return -1; @@ -417,12 +404,10 @@ static int check_one_mark(FanotifyMarkEntry *fme) return -1; } if (!(root_ns_mask & CLONE_NEWNS)) - fme->me->path = m->mountpoint + 1; + fme->me->path = m->ns_mountpoint + 1; fme->s_dev = m->s_dev; - pr_info("mark: s_dev %#08x mnt_id %#08x mask %#08x\n", - fme->s_dev, fme->me->mnt_id, fme->mask); - + pr_info("mark: s_dev %#08x mnt_id %#08x mask %#08x\n", fme->s_dev, fme->me->mnt_id, fme->mask); } return 0; @@ -477,9 +462,7 @@ static int pre_dump_one_fanotify(int pid, int lfd) for (i = 0; i < fe.n_mark; i++) { FanotifyMarkEntry *me = fe.mark[i]; - if (me->type == MARK_TYPE__INODE && - irmap_queue_cache(me->s_dev, me->ie->i_ino, - me->ie->f_handle)) + if (me->type == MARK_TYPE__INODE && irmap_queue_cache(me->s_dev, me->ie->i_ino, me->ie->f_handle)) return -1; xfree(me); @@ -489,13 +472,12 @@ static int pre_dump_one_fanotify(int pid, int lfd) } const struct fdtype_ops 
fanotify_dump_ops = { - .type = FD_TYPES__FANOTIFY, - .dump = dump_one_fanotify, - .pre_dump = pre_dump_one_fanotify, + .type = FD_TYPES__FANOTIFY, + .dump = dump_one_fanotify, + .pre_dump = pre_dump_one_fanotify, }; -static char *get_mark_path(const char *who, struct file_remap *remap, - FhEntry *f_handle, unsigned long i_ino, +static char *get_mark_path(const char *who, struct file_remap *remap, FhEntry *f_handle, unsigned long i_ino, unsigned int s_dev, char *buf, int *target) { char *path = NULL; @@ -505,11 +487,10 @@ static char *get_mark_path(const char *who, struct file_remap *remap, mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id); - pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s)\n", - who, s_dev, i_ino, remap->rpath); + pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s)\n", who, s_dev, i_ino, remap->rpath); *target = openat(mntns_root, remap->rpath, O_PATH); } else if (f_handle->path) { - int mntns_root; + int mntns_root; char *path = "."; uint32_t mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1; @@ -533,7 +514,7 @@ static char *get_mark_path(const char *who, struct file_remap *remap, /* * fanotify/inotify open syscalls want path to attach * watch to. But the only thing we have is an FD obtained - * via fhandle. Fortunatelly, when trying to attach the + * via fhandle. Fortunately, when trying to attach the * /proc/pid/fd/ link, we will watch the inode the link * points to, i.e. -- just what we want. 
*/ @@ -547,8 +528,7 @@ static char *get_mark_path(const char *who, struct file_remap *remap, if (read_fd_link(*target, link, sizeof(link)) < 0) link[0] = '\0'; - pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s -> %s)\n", - who, s_dev, i_ino, path, link); + pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s -> %s)\n", who, s_dev, i_ino, path, link); } err: return path; @@ -561,15 +541,13 @@ static int restore_one_inotify(int inotify_fd, struct fsnotify_mark_info *info) char buf[PSFDS], *path; uint32_t mask; - path = get_mark_path("inotify", info->remap, iwe->f_handle, - iwe->i_ino, iwe->s_dev, buf, &target); + path = get_mark_path("inotify", info->remap, iwe->f_handle, iwe->i_ino, iwe->s_dev, buf, &target); if (!path) goto err; mask = iwe->mask & IN_ALL_EVENTS; if (iwe->mask & ~IN_ALL_EVENTS) { - pr_info("\t\tfilter event mask %#x -> %#x\n", - iwe->mask, mask); + pr_info("\t\tfilter event mask %#x -> %#x\n", iwe->mask, mask); } if (kdat.has_inotify_setnextwd) { @@ -640,9 +618,8 @@ static int restore_one_fanotify(int fd, struct fsnotify_mark_info *mark) snprintf(buf, sizeof(buf), "/proc/self/fd/%d", target); path = buf; } else if (fme->type == MARK_TYPE__INODE) { - path = get_mark_path("fanotify", mark->remap, - fme->ie->f_handle, fme->ie->i_ino, - fme->s_dev, buf, &target); + path = get_mark_path("fanotify", mark->remap, fme->ie->f_handle, fme->ie->i_ino, fme->s_dev, buf, + &target); if (!path) goto err; } else { @@ -655,18 +632,16 @@ static int restore_one_fanotify(int fd, struct fsnotify_mark_info *mark) if (mark->fme->mask) { ret = fanotify_mark(fd, flags, fme->mask, AT_FDCWD, path); if (ret) { - pr_err("Adding fanotify mask 0x%x on 0x%x/%s failed (%d)\n", - fme->mask, fme->id, path, ret); + pr_err("Adding fanotify mask 0x%x on 0x%x/%s failed (%d)\n", fme->mask, fme->id, path, ret); goto err; } } if (fme->ignored_mask) { - ret = fanotify_mark(fd, flags | FAN_MARK_IGNORED_MASK, - fme->ignored_mask, AT_FDCWD, path); + ret = fanotify_mark(fd, flags | 
FAN_MARK_IGNORED_MASK, fme->ignored_mask, AT_FDCWD, path); if (ret) { - pr_err("Adding fanotify ignored-mask 0x%x on 0x%x/%s failed (%d)\n", - fme->ignored_mask, fme->id, path, ret); + pr_err("Adding fanotify ignored-mask 0x%x on 0x%x/%s failed (%d)\n", fme->ignored_mask, fme->id, + path, ret); goto err; } } @@ -788,8 +763,7 @@ static int __collect_inotify_mark(struct fsnotify_file_info *p, struct fsnotify_ return 0; } -static int __collect_fanotify_mark(struct fsnotify_file_info *p, - struct fsnotify_mark_info *mark) +static int __collect_fanotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark) { list_add(&mark->list, &p->marks); if (mark->fme->type == MARK_TYPE__INODE) { @@ -827,10 +801,10 @@ static int collect_one_inotify(void *o, ProtobufCMessage *msg, struct cr_img *im } struct collect_image_info inotify_cinfo = { - .fd_type = CR_FD_INOTIFY_FILE, - .pb_type = PB_INOTIFY_FILE, - .priv_size = sizeof(struct fsnotify_file_info), - .collect = collect_one_inotify, + .fd_type = CR_FD_INOTIFY_FILE, + .pb_type = PB_INOTIFY_FILE, + .priv_size = sizeof(struct fsnotify_file_info), + .collect = collect_one_inotify, }; static int collect_one_fanotify(void *o, ProtobufCMessage *msg, struct cr_img *img) @@ -861,10 +835,10 @@ static int collect_one_fanotify(void *o, ProtobufCMessage *msg, struct cr_img *i } struct collect_image_info fanotify_cinfo = { - .fd_type = CR_FD_FANOTIFY_FILE, - .pb_type = PB_FANOTIFY_FILE, - .priv_size = sizeof(struct fsnotify_file_info), - .collect = collect_one_fanotify, + .fd_type = CR_FD_FANOTIFY_FILE, + .pb_type = PB_FANOTIFY_FILE, + .priv_size = sizeof(struct fsnotify_file_info), + .collect = collect_one_fanotify, }; static int collect_one_inotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) @@ -899,10 +873,10 @@ static int collect_one_inotify_mark(void *o, ProtobufCMessage *msg, struct cr_im } struct collect_image_info inotify_mark_cinfo = { - .fd_type = CR_FD_INOTIFY_WD, - .pb_type = PB_INOTIFY_WD, - .priv_size = 
sizeof(struct fsnotify_mark_info), - .collect = collect_one_inotify_mark, + .fd_type = CR_FD_INOTIFY_WD, + .pb_type = PB_INOTIFY_WD, + .priv_size = sizeof(struct fsnotify_mark_info), + .collect = collect_one_inotify_mark, }; static int collect_one_fanotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) @@ -927,8 +901,8 @@ static int collect_one_fanotify_mark(void *o, ProtobufCMessage *msg, struct cr_i } struct collect_image_info fanotify_mark_cinfo = { - .fd_type = CR_FD_FANOTIFY_MARK, - .pb_type = PB_FANOTIFY_MARK, - .priv_size = sizeof(struct fsnotify_mark_info), - .collect = collect_one_fanotify_mark, + .fd_type = CR_FD_FANOTIFY_MARK, + .pb_type = PB_FANOTIFY_MARK, + .priv_size = sizeof(struct fsnotify_mark_info), + .collect = collect_one_fanotify_mark, }; diff --git a/criu/hugetlb.c b/criu/hugetlb.c new file mode 100644 index 000000000..866c4050f --- /dev/null +++ b/criu/hugetlb.c @@ -0,0 +1,60 @@ +#include "hugetlb.h" +#include "kerndat.h" +#include "sizes.h" + +// clang-format off +struct htlb_info hugetlb_info[HUGETLB_MAX] = { + [HUGETLB_16KB] = { SZ_16K, MAP_HUGETLB_16KB }, + [HUGETLB_64KB] = { SZ_64K, MAP_HUGETLB_64KB }, + [HUGETLB_512KB] = { SZ_512K, MAP_HUGETLB_512KB }, + [HUGETLB_1MB] = { SZ_1M, MAP_HUGETLB_1MB }, + [HUGETLB_2MB] = { SZ_2M, MAP_HUGETLB_2MB }, + [HUGETLB_8MB] = { SZ_8M, MAP_HUGETLB_8MB }, + [HUGETLB_16MB] = { SZ_16M, MAP_HUGETLB_16MB }, + [HUGETLB_32MB] = { SZ_32M, MAP_HUGETLB_32MB }, + [HUGETLB_256MB] = { SZ_256M, MAP_HUGETLB_256MB }, + [HUGETLB_512MB] = { SZ_512M, MAP_HUGETLB_512MB }, + [HUGETLB_1GB] = { SZ_1G, MAP_HUGETLB_1GB }, + [HUGETLB_2GB] = { SZ_2G, MAP_HUGETLB_2GB }, + [HUGETLB_16GB] = { SZ_16G, MAP_HUGETLB_16GB }, +}; +// clang-format on + +int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag) +{ + int i; + + for (i = 0; i < HUGETLB_MAX; i++) { + if (kdat.hugetlb_dev[i] == dev) { + if (hugetlb_size_flag) + *hugetlb_size_flag = hugetlb_info[i].flag; + return 1; + } + } + + return 0; +} + +int 
can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma) +{ + /* + * Dump the hugetlb backed mapping using memfd_hugetlb when it is not + * anonymous private mapping. + */ + if (kdat.has_memfd_hugetlb && is_hugetlb_dev(dev, hugetlb_size_flag) && + !((vma->e->flags & MAP_PRIVATE) && !strncmp(file_path, ANON_HUGEPAGE_PREFIX, ANON_HUGEPAGE_PREFIX_LEN))) + return 1; + + return 0; +} + +unsigned long get_size_from_hugetlb_flag(int flag) +{ + int i; + + for (i = 0; i < HUGETLB_MAX; i++) + if (flag == hugetlb_info[i].flag) + return hugetlb_info[i].size; + + return -1; +} diff --git a/criu/image-desc.c b/criu/image-desc.c index 053e7af21..2d87c7381 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -11,17 +11,17 @@ * for more details. */ -#define FD_ENTRY(_name, _fmt) \ - [CR_FD_##_name] = { \ - .fmt = _fmt ".img", \ - .magic = _name##_MAGIC, \ +#define FD_ENTRY(_name, _fmt) \ + [CR_FD_##_name] = { \ + .fmt = _fmt ".img", \ + .magic = _name##_MAGIC, \ } -#define FD_ENTRY_F(_name, _fmt, _f) \ - [CR_FD_##_name] = { \ - .fmt = _fmt ".img", \ - .magic = _name##_MAGIC, \ - .oflags = _f, \ +#define FD_ENTRY_F(_name, _fmt, _f) \ + [CR_FD_##_name] = { \ + .fmt = _fmt ".img", \ + .magic = _name##_MAGIC, \ + .oflags = _f, \ } struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { @@ -66,6 +66,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY(FS, "fs-%u"), FD_ENTRY(REMAP_FPATH, "remap-fpath"), FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF), + FD_ENTRY_F(MEMFD_INODE, "memfd", O_NOBUF), FD_ENTRY(TCP_STREAM, "tcp-stream-%x"), FD_ENTRY(MNTS, "mountpoints-%u"), FD_ENTRY(NETDEV, "netdev-%u"), @@ -76,6 +77,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(RULE, "rule-%u", O_NOBUF), FD_ENTRY_F(IPTABLES, "iptables-%u", O_NOBUF), FD_ENTRY_F(IP6TABLES, "ip6tables-%u", O_NOBUF), + FD_ENTRY_F(NFTABLES, "nftables-%u", O_NOBUF), FD_ENTRY_F(TMPFS_IMG, "tmpfs-%u.tar.gz", O_NOBUF), FD_ENTRY_F(TMPFS_DEV, 
"tmpfs-dev-%u.tar.gz", O_NOBUF), FD_ENTRY_F(AUTOFS, "autofs-%u", O_NOBUF), @@ -100,6 +102,12 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY(NETNF_CT, "netns-ct-%u"), FD_ENTRY(NETNF_EXP, "netns-exp-%u"), FD_ENTRY(FILES, "files"), + FD_ENTRY(TIMENS, "timens-%u"), + FD_ENTRY(PIDNS, "pidns-%u"), + FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), + FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), + FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", @@ -112,9 +120,4 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { .magic = IRMAP_CACHE_MAGIC, .oflags = O_SERVICE | O_FORCE_LOCAL, }, - - [CR_FD_FILE_LOCKS_PID] = { - .fmt = "filelocks-%u.img", - .magic = FILE_LOCKS_MAGIC, - }, }; diff --git a/criu/image.c b/criu/image.c index 2eb926929..91101c3eb 100644 --- a/criu/image.c +++ b/criu/image.c @@ -17,6 +17,7 @@ #include "images/inventory.pb-c.h" #include "images/pagemap.pb-c.h" #include "proc_parse.h" +#include "img-streamer.h" #include "namespaces.h" bool ns_per_id = false; @@ -24,8 +25,17 @@ bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +char dump_criu_run_id[RUN_ID_HASH_LENGTH]; -int check_img_inventory(void) +struct inventory_plugin { + struct list_head node; + char *name; +}; + +struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); +static int n_inventory_plugins; + +int check_img_inventory(bool restore) { int ret = -1; struct cr_img *img; @@ -80,6 +90,62 @@ int check_img_inventory(void) goto out_err; } + if (restore && he->tcp_close && !opts.tcp_close) { + pr_err("Need to set the --tcp-close options.\n"); + goto out_err; + } + + if (restore && he->allow_uprobes && !opts.allow_uprobes) { + pr_err("Dumped with --" OPT_ALLOW_UPROBES ". 
Need to set it on restore as well.\n"); + goto out_err; + } + + if (restore) { + if (!he->has_network_lock_method) { + /* + * Image files were generated with an older version of CRIU + * so we should fall back to iptables because this is the + * network-lock mechanism used in older versions. + */ + pr_info("Network lock method not found in inventory image\n"); + pr_info("Falling back to iptables network lock method\n"); + opts.network_lock_method = NETWORK_LOCK_IPTABLES; + } else { + opts.network_lock_method = he->network_lock_method; + } + + if (!he->plugins_entry) { + /* backwards compatibility: if the 'plugins_entry' field is missing, + * all plugins should be enabled during restore. + */ + n_inventory_plugins = -1; + } else { + PluginsEntry *pe = he->plugins_entry; + for (int i = 0; i < pe->n_plugins; i++) { + if (add_inventory_plugin(pe->plugins[i])) + goto out_err; + } + } + + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. + */ + if (he->dump_criu_run_id) { + strncpy(dump_criu_run_id, he->dump_criu_run_id, sizeof(dump_criu_run_id) - 1); + pr_info("Dump CRIU run id = %s\n", dump_criu_run_id); + } else { + /** + * If restoring from an old image this is a marker + * that no dump_criu_run_id exists. + */ + dump_criu_run_id[0] = NO_DUMP_CRIU_RUN_ID; + } + + } + ret = 0; out_err: @@ -89,8 +155,92 @@ out_close: return ret; } +/** + * Check if the 'plugins' field in the inventory image contains + * the specified plugin name. If found, the plugin is removed + * from the linked list. 
+ */ +bool check_and_remove_inventory_plugin(const char *name, size_t n) +{ + if (n_inventory_plugins == -1) + return true; /* backwards compatibility */ + + if (n_inventory_plugins > 0) { + struct inventory_plugin *p, *tmp; + + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + if (!strncmp(name, p->name, n)) { + xfree(p->name); + list_del(&p->node); + xfree(p); + n_inventory_plugins--; + return true; + } + } + } + + return false; +} + +/** + * We expect during restore all loaded plugins to be removed from + * the inventory_plugins_list. If the list is not empty, show an + * error message for each missing plugin. + */ +int check_inventory_plugins(void) +{ + struct inventory_plugin *p; + + if (n_inventory_plugins <= 0) + return 0; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pr_err("Missing required plugin: %s\n", p->name); + } + + return -1; +} + +/** + * Add plugin name to the inventory image. These values + * can be used to identify required plugins during restore. 
+ */ +int add_inventory_plugin(const char *name) +{ + struct inventory_plugin *p; + + p = xmalloc(sizeof(struct inventory_plugin)); + if (p == NULL) + return -1; + + p->name = xstrdup(name); + if (!p->name) { + xfree(p); + return -1; + } + list_add(&p->node, &inventory_plugins_list); + n_inventory_plugins++; + + return 0; +} + +void free_inventory_plugins_list(void) +{ + struct inventory_plugin *p, *tmp; + + if (!list_empty(&inventory_plugins_list)) { + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + xfree(p->name); + list_del(&p->node); + xfree(p); + } + } + n_inventory_plugins = 0; +} + int write_img_inventory(InventoryEntry *he) { + PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -100,8 +250,27 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; + if (!list_empty(&inventory_plugins_list)) { + struct inventory_plugin *p; + int i = 0; + + pe.n_plugins = n_inventory_plugins; + pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); + if (!pe.plugins) + return -1; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pe.plugins[i] = p->name; + i++; + } + } + he->plugins_entry = &pe; + ret = pb_write_one(img, he, PB_INVENTORY); + free_inventory_plugins_list(); + xfree(pe.plugins); + xfree(he->root_ids); close_image(img); if (ret < 0) @@ -143,8 +312,7 @@ InventoryEntry *get_parent_inventory(void) InventoryEntry *ie; int dir; - dir = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); - if (dir == -1) { + if (open_parent(get_service_fd(IMG_FD_OFF), &dir)) { /* * We print the warning below to be notified that we had some * unexpected problem on open. For instance we have a parent @@ -152,10 +320,11 @@ InventoryEntry *get_parent_inventory(void) * when also having no parent directory is an expected case of * first dump iteration. 
*/ - if (errno != ENOENT) - pr_warn("Failed to open parent directory\n"); + pr_warn("Failed to open parent directory\n"); return NULL; } + if (dir < 0) + return NULL; img = open_image_at(dir, CR_FD_INVENTORY, O_RSTR); if (!img) { @@ -190,7 +359,7 @@ int prepare_inventory(InventoryEntry *he) struct dmp_info d; } crt = { .i.pid = &pid }; - pr_info("Perparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); + pr_info("Preparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); he->img_version = CRTOOLS_IMAGES_V1_1; he->fdinfo_per_id = true; @@ -205,12 +374,34 @@ int prepare_inventory(InventoryEntry *he) if (get_task_ids(&crt.i)) return -1; - he->has_root_cg_set = true; - if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) + if (!opts.unprivileged) + he->has_root_cg_set = true; + if (dump_thread_cgroup(NULL, &he->root_cg_set, NULL, -1)) return -1; he->root_ids = crt.i.ids; + /* tcp_close has to be set on restore if it has been set on dump. */ + if (opts.tcp_close) { + he->tcp_close = true; + he->has_tcp_close = true; + } + + /* Save network lock method to reuse in restore */ + he->has_network_lock_method = true; + he->network_lock_method = opts.network_lock_method; + + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. 
+ */ + he->dump_criu_run_id = xstrdup(criu_run_id); + + if (!he->dump_criu_run_id) + return -1; + return 0; } @@ -262,8 +453,7 @@ void close_cr_imgset(struct cr_imgset **cr_imgset) *cr_imgset = NULL; } -struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, - unsigned long flags) +struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, unsigned long flags) { struct cr_imgset *imgset; unsigned int i; @@ -391,10 +581,10 @@ static int img_write_magic(struct cr_img *img, int oflags, int type) } struct openat_args { - char path[PATH_MAX]; - int flags; - int err; - int mode; + char path[PATH_MAX]; + int flags; + int err; + int mode; }; static int userns_openat(void *arg, int dfd, int pid) @@ -415,13 +605,15 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL); - /* - * For pages images dedup we need to open images read-write on - * restore, that may require proper capabilities, so we ask - * usernsd to do it for us - */ - if (root_ns_mask & CLONE_NEWUSER && - type == CR_FD_PAGES && oflags & O_RDWR) { + if (opts.stream && !(oflags & O_FORCE_LOCAL)) { + ret = img_streamer_open(path, flags); + errno = EIO; /* errno value is meaningless, only the ret value is meaningful */ + } else if (root_ns_mask & CLONE_NEWUSER && type == CR_FD_PAGES && oflags & O_RDWR) { + /* + * For pages images dedup we need to open images read-write on + * restore, that may require proper capabilities, so we ask + * usernsd to do it for us + */ struct openat_args pa = { .flags = flags, .err = 0, @@ -520,7 +712,12 @@ struct cr_img *img_from_fd(int fd) return img; } -int open_image_dir(char *dir) +/* + * `mode` should be O_RSTR or O_DUMP depending on the intent. + * This is used when opts.stream is enabled for picking the right streamer + * socket name. `mode` is ignored when opts.stream is not enabled. 
+ */ +int open_image_dir(const char *dir, int mode) { int fd, ret; @@ -531,11 +728,21 @@ int open_image_dir(char *dir) } ret = install_service_fd(IMG_FD_OFF, fd); - if (ret < 0) + if (ret < 0) { + pr_err("install_service_fd failed.\n"); return -1; + } fd = ret; - if (opts.img_parent) { + if (opts.stream) { + if (img_streamer_init(dir, mode) < 0) + goto err; + } else if (opts.img_parent) { + if (faccessat(fd, opts.img_parent, R_OK, 0)) { + pr_perror("Invalid parent image directory provided"); + goto err; + } + ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK); if (ret < 0 && errno != EEXIST) { pr_perror("Can't link parent snapshot"); @@ -544,7 +751,7 @@ int open_image_dir(char *dir) if (opts.img_parent[0] == '/') pr_warn("Absolute paths for parent links " - "may not work on restore!\n"); + "may not work on restore!\n"); } return 0; @@ -556,9 +763,31 @@ err: void close_image_dir(void) { + if (opts.stream) + img_streamer_finish(); close_service_fd(IMG_FD_OFF); } +int open_parent(int dfd, int *pfd) +{ + struct stat st; + + *pfd = -1; + /* Check if the parent symlink exists */ + if (fstatat(dfd, CR_PARENT_LINK, &st, AT_SYMLINK_NOFOLLOW) && errno == ENOENT) { + pr_debug("No parent images directory provided\n"); + return 0; + } + + *pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY); + if (*pfd < 0) { + pr_perror("Can't open parent path"); + return -1; + } + + return 0; +} + static unsigned long page_ids = 1; void up_page_ids_base(void) diff --git a/criu/img-streamer.c b/criu/img-streamer.c new file mode 100644 index 000000000..305e6fae5 --- /dev/null +++ b/criu/img-streamer.c @@ -0,0 +1,240 @@ +#include +#include +#include +#include + +#include "cr_options.h" +#include "img-streamer.h" +#include "image.h" +#include "images/img-streamer.pb-c.h" +#include "protobuf.h" +#include "servicefd.h" +#include "rst-malloc.h" +#include "common/scm.h" +#include "common/lock.h" +#include "action-scripts.h" + +/* + * We use different path names for the dump and restore sockets because: + * 
1) The user may want to perform both at the same time (akin to live + * migration). Specifying the same images-dir is convenient. + * 2) It fails quickly when the user mixes up the streamer and CRIU operations. + * (e.g., streamer is in capture mode, while CRIU is in restore mode). + */ +#define IMG_STREAMER_CAPTURE_SOCKET_NAME "streamer-capture.sock" +#define IMG_STREAMER_SERVE_SOCKET_NAME "streamer-serve.sock" + +/* All requests go through the same socket connection. We must synchronize */ +static mutex_t *img_streamer_fd_lock; + +/* Either O_DUMP or O_RSTR */ +static int img_streamer_mode; + +static const char *socket_name_for_mode(int mode) +{ + switch (mode) { + case O_DUMP: + return IMG_STREAMER_CAPTURE_SOCKET_NAME; + case O_RSTR: + return IMG_STREAMER_SERVE_SOCKET_NAME; + default: + BUG(); + return NULL; + } +} + +/* + * img_streamer_init() connects to the image streamer socket. + * mode should be either O_DUMP or O_RSTR. + */ +int img_streamer_init(const char *image_dir, int mode) +{ + struct sockaddr_un addr; + int pre_stream_ret; + int sockfd; + + img_streamer_mode = mode; + + pre_stream_ret = run_scripts(ACT_PRE_STREAM); + if (pre_stream_ret != 0) { + pr_err("Pre-stream script failed with %d!\n", pre_stream_ret); + return -1; + } + + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) { + pr_perror("Unable to instantiate UNIX socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", image_dir, socket_name_for_mode(mode)); + + if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + pr_perror("Unable to connect to image streamer socket: %s", addr.sun_path); + goto err; + } + + img_streamer_fd_lock = shmalloc(sizeof(*img_streamer_fd_lock)); + if (!img_streamer_fd_lock) { + pr_err("Failed to allocate memory\n"); + goto err; + } + mutex_init(img_streamer_fd_lock); + + if (install_service_fd(IMG_STREAMER_FD_OFF, sockfd) < 0) + return -1; + + return 0; 
+ +err: + close(sockfd); + return -1; +} + +/* + * img_streamer_finish() indicates that no more files will be opened. + * In other words, img_streamer_open() will no longer be called. + */ +void img_streamer_finish(void) +{ + if (get_service_fd(IMG_STREAMER_FD_OFF) >= 0) { + pr_info("Dismissing the image streamer\n"); + close_service_fd(IMG_STREAMER_FD_OFF); + } +} + +/* + * The regular protobuf APIs pb_write_one() and pb_read_one() operate over a + * `struct cr_img` object. Sadly, we don't have such object. We just have a + * file descriptor. The following pb_write_one_fd() and pb_read_one_fd() + * provide a protobuf API over a file descriptor. The implementation is a bit + * of a hack, but should be fine. At some point we can revisit to have a + * proper protobuf API over fds. + */ +static int pb_write_one_fd(int fd, void *obj, int type) +{ + int ret; + struct cr_img img; + memset(&img, 0, sizeof(img)); + + img._x.fd = fd; + ret = pb_write_one(&img, obj, type); + if (ret < 0) + pr_perror("Failed to communicate with the image streamer"); + return ret; +} + +static int pb_read_one_fd(int fd, void **pobj, int type) +{ + int ret; + struct cr_img img; + memset(&img, 0, sizeof(img)); + + img._x.fd = fd; + ret = pb_read_one(&img, pobj, type); + if (ret < 0) + pr_perror("Failed to communicate with the image streamer"); + return ret; +} + +static int send_file_request(char *filename) +{ + ImgStreamerRequestEntry req = IMG_STREAMER_REQUEST_ENTRY__INIT; + req.filename = filename; + return pb_write_one_fd(get_service_fd(IMG_STREAMER_FD_OFF), &req, PB_IMG_STREAMER_REQUEST); +} + +static int recv_file_reply(bool *exists) +{ + ImgStreamerReplyEntry *reply; + int ret = pb_read_one_fd(get_service_fd(IMG_STREAMER_FD_OFF), (void **)&reply, PB_IMG_STREAMER_REPLY); + if (ret < 0) + return ret; + + *exists = reply->exists; + free(reply); + + return 0; +} + +/* + * Using a pipe for image file transfers allows the data to be spliced by the + * image streamer, greatly improving 
performance. + * Transfer rates of up to 15GB/s can be seen with this technique. + */ +#define READ_PIPE 0 /* index of the read pipe returned by pipe() */ +#define WRITE_PIPE 1 +static int establish_streamer_file_pipe(void) +{ + /* + * If the other end of the pipe closes, the kernel will want to kill + * us with a SIGPIPE. This signal must be ignored, which we do in + * crtools.c:main() with signal(SIGPIPE, SIG_IGN). + */ + int ret = -1; + int criu_pipe_direction = img_streamer_mode == O_DUMP ? WRITE_PIPE : READ_PIPE; + int streamer_pipe_direction = 1 - criu_pipe_direction; + int fds[2]; + + if (pipe(fds) < 0) { + pr_perror("Unable to create pipe"); + return -1; + } + + if (send_fd(get_service_fd(IMG_STREAMER_FD_OFF), NULL, 0, fds[streamer_pipe_direction]) < 0) + close(fds[criu_pipe_direction]); + else + ret = fds[criu_pipe_direction]; + + close(fds[streamer_pipe_direction]); + + return ret; +} + +static int _img_streamer_open(char *filename) +{ + if (send_file_request(filename) < 0) + return -1; + + if (img_streamer_mode == O_RSTR) { + /* The streamer replies whether the file exists */ + bool exists; + if (recv_file_reply(&exists) < 0) + return -1; + + if (!exists) + return -ENOENT; + } + + /* + * When the image streamer encounters a fatal error, it won't report + * errors via protobufs. Instead, CRIU will get a broken pipe error + * when trying to access a streaming pipe. This behavior is similar to + * what would happen if we were connecting criu and criu-image-streamer + * via a shell pipe. + */ + + return establish_streamer_file_pipe(); +} + +/* + * Opens an image file via a UNIX pipe with the image streamer. + * + * Return: + * A file descriptor on success + * -ENOENT when the file was not found. + * -1 on any other error. 
+ */ +int img_streamer_open(char *filename, int flags) +{ + int ret; + + BUG_ON(flags != img_streamer_mode); + + mutex_lock(img_streamer_fd_lock); + ret = _img_streamer_open(filename); + mutex_unlock(img_streamer_fd_lock); + return ret; +} diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index 40b09b160..6a331a32f 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -4,6 +4,7 @@ #include "asm/int.h" enum script_actions { + ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, @@ -15,6 +16,8 @@ enum script_actions { ACT_POST_RESUME, ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, + ACT_STATUS_READY, + ACT_QUERY_EXT_FILES, ACT_MAX }; @@ -23,6 +26,8 @@ extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); +extern int rpc_query_external_files(void); +extern int exec_rpc_query_external_files(char *name, int sk); extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ diff --git a/criu/include/aio.h b/criu/include/aio.h index 858ccd3cf..38e704020 100644 --- a/criu/include/aio.h +++ b/criu/include/aio.h @@ -1,7 +1,7 @@ #ifndef __CR_AIO_H__ #define __CR_AIO_H__ -#include +#include "linux/aio_abi.h" #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); @@ -13,18 +13,18 @@ struct task_restore_args; int prepare_aios(struct pstree_item *t, struct task_restore_args *ta); struct aio_ring { - unsigned id; /* kernel internal index number */ - unsigned nr; /* number of io_events */ - unsigned head; /* Written to by userland or under ring_lock + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). 
*/ - unsigned tail; + unsigned tail; - unsigned magic; - unsigned compat_features; - unsigned incompat_features; - unsigned header_length; /* size of aio_ring */ + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ - struct io_event io_events[0]; + struct io_event io_events[0]; }; struct rst_aio_ring { diff --git a/criu/include/asm-generic/int.h b/criu/include/asm-generic/int.h index ac3088d5a..5bf484776 100644 --- a/criu/include/asm-generic/int.h +++ b/criu/include/asm-generic/int.h @@ -3,13 +3,13 @@ #include -typedef uint64_t u64; -typedef int64_t s64; -typedef uint32_t u32; -typedef int32_t s32; -typedef uint16_t u16; -typedef int16_t s16; -typedef uint8_t u8; -typedef int8_t s8; +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; #endif /* __CR_INT_H__ */ diff --git a/criu/include/asm-generic/vdso.h b/criu/include/asm-generic/vdso.h index 6c3e3d137..cddd1a7f0 100644 --- a/criu/include/asm-generic/vdso.h +++ b/criu/include/asm-generic/vdso.h @@ -1,15 +1,15 @@ #ifndef __CR_ASM_GENERIC_VDSO_H__ #define __CR_ASM_GENERIC_VDSO_H__ -#define VDSO_PROT (PROT_READ | PROT_EXEC) -#define VVAR_PROT (PROT_READ) +#define VDSO_PROT (PROT_READ | PROT_EXEC) +#define VVAR_PROT (PROT_READ) /* Just in case of LPAE system PFN is u64. 
*/ -#define VDSO_BAD_PFN (-1ull) -#define VVAR_BAD_PFN (-1ull) -#define VDSO_BAD_ADDR (-1ul) -#define VVAR_BAD_ADDR (-1ul) -#define VDSO_BAD_SIZE (-1ul) -#define VVAR_BAD_SIZE (-1ul) +#define VDSO_BAD_PFN (-1ull) +#define VVAR_BAD_PFN (-1ull) +#define VDSO_BAD_ADDR (-1ul) +#define VVAR_BAD_ADDR (-1ul) +#define VDSO_BAD_SIZE (-1ul) +#define VVAR_BAD_SIZE (-1ul) #endif /* __CR_ASM_GENERIC_VDSO_H__ */ diff --git a/criu/include/autofs.h b/criu/include/autofs.h index c4618859b..b158025c7 100644 --- a/criu/include/autofs.h +++ b/criu/include/autofs.h @@ -2,7 +2,7 @@ #define __CR_AUTOFS_H__ #ifndef AUTOFS_MINOR -#define AUTOFS_MINOR 235 +#define AUTOFS_MINOR 235 #endif #include @@ -12,78 +12,76 @@ bool is_autofs_pipe(unsigned long inode); struct mount_info; int autofs_parse(struct mount_info *pm); int autofs_dump(struct mount_info *pm); -int autofs_mount(struct mount_info *mi, const char *source, const - char *filesystemtype, unsigned long mountflags); +int autofs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags); #include #include #include -#define AUTOFS_DEVICE_NAME "autofs" +#define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 #define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 -#define AUTOFS_DEVID_LEN 16 +#define AUTOFS_DEVID_LEN 16 -#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) /* * An ioctl interface for autofs mount point control. 
*/ struct args_protover { - __u32 version; + __u32 version; }; struct args_protosubver { - __u32 sub_version; + __u32 sub_version; }; struct args_openmount { - __u32 devid; + __u32 devid; }; struct args_ready { - __u32 token; + __u32 token; }; struct args_fail { - __u32 token; - __s32 status; + __u32 token; + __s32 status; }; struct args_setpipefd { - __s32 pipefd; + __s32 pipefd; }; struct args_timeout { - __u64 timeout; + __u64 timeout; }; struct args_requester { - __u32 uid; - __u32 gid; + __u32 uid; + __u32 gid; }; struct args_expire { - __u32 how; + __u32 how; }; - struct args_askumount { - __u32 may_umount; + __u32 may_umount; }; struct args_ismountpoint { union { struct args_in { - __u32 type; + __u32 type; } in; struct args_out { - __u32 devid; - __u32 magic; + __u32 devid; + __u32 magic; } out; }; }; @@ -98,24 +96,24 @@ struct args_ismountpoint { struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; - __u32 size; /* total size of data passed in + __u32 size; /* total size of data passed in * including this struct */ - __s32 ioctlfd; /* automount command fd */ + __s32 ioctlfd; /* automount command fd */ /* Command parameters */ union { - struct args_protover protover; - struct args_protosubver protosubver; - struct args_openmount openmount; - struct args_ready ready; - struct args_fail fail; - struct args_setpipefd setpipefd; - struct args_timeout timeout; - struct args_requester requester; - struct args_expire expire; - struct args_askumount askumount; - struct args_ismountpoint ismountpoint; + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; }; char path[0]; @@ -131,7 +129,6 @@ static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl 
*in) return; } - /* * If you change this make sure you make the corresponding change * to autofs-dev-ioctl.c:lookup_ioctl() @@ -174,61 +171,32 @@ enum { #define AUTOFS_IOCTL 0x93 -#define AUTOFS_DEV_IOCTL_VERSION \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_VERSION _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_PROTOVER \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_PROTOVER _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_PROTOSUBVER \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_PROTOSUBVER _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_OPENMOUNT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_OPENMOUNT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_CLOSEMOUNT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_READY _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_READY \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_FAIL _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_FAIL \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_SETPIPEFD _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_SETPIPEFD \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct 
autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_CATATONIC _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_CATATONIC \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_TIMEOUT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_TIMEOUT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_REQUESTER _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_REQUESTER \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_EXPIRE _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_EXPIRE \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_ASKUMOUNT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) -#define AUTOFS_DEV_IOCTL_ASKUMOUNT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) +#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) #endif diff --git a/criu/include/bfd.h b/criu/include/bfd.h index 9f4bdb27b..2846ec628 100644 --- a/criu/include/bfd.h +++ b/criu/include/bfd.h @@ -5,9 +5,9 @@ struct bfd_buf; struct xbuf { - char *mem; /* buffer */ - char *data; /* position we see bytes at */ - unsigned int sz; /* bytes sitting after b->pos */ + char *mem; /* buffer */ + char *data; /* position we see bytes at */ + unsigned int sz; /* bytes sitting after b->pos */ struct bfd_buf *buf; }; diff --git a/criu/include/bpfmap.h b/criu/include/bpfmap.h new file mode 100644 index 000000000..8ab2d3ce6 --- /dev/null 
+++ b/criu/include/bpfmap.h @@ -0,0 +1,33 @@ +#ifndef __CR_BPFMAP_H__ +#define __CR_BPFMAP_H__ + +#include "files.h" +#include "bpfmap-file.pb-c.h" +#include "bpfmap-data.pb-c.h" + +struct bpfmap_file_info { + BpfmapFileEntry *bpfe; + struct file_desc d; +}; + +struct bpfmap_data_rst { + BpfmapDataEntry *bde; + void *data; + struct bpfmap_data_rst *next; +}; + +#define BPFMAP_DATA_HASH_BITS 5 +#define BPFMAP_DATA_TABLE_SIZE (1 << BPFMAP_DATA_HASH_BITS) +#define BPFMAP_DATA_HASH_MASK (BPFMAP_DATA_TABLE_SIZE - 1) + +extern int is_bpfmap_link(char *link); +extern int dump_one_bpfmap_data(BpfmapFileEntry *bpf, int lfd, const struct fd_parms *p); +extern int do_collect_bpfmap_data(struct bpfmap_data_rst *, ProtobufCMessage *, struct cr_img *, + struct bpfmap_data_rst **); +extern int restore_bpfmap_data(int, uint32_t, struct bpfmap_data_rst **); + +extern const struct fdtype_ops bpfmap_dump_ops; +extern struct collect_image_info bpfmap_cinfo; +extern struct collect_image_info bpfmap_data_cinfo; + +#endif /* __CR_BPFMAP_H__ */ diff --git a/criu/include/cgroup-props.h b/criu/include/cgroup-props.h index 0e5201098..10a7061b8 100644 --- a/criu/include/cgroup-props.h +++ b/criu/include/cgroup-props.h @@ -4,12 +4,13 @@ #include typedef struct { - const char *name; - size_t nr_props; - const char **props; + const char *name; + size_t nr_props; + const char **props; } cgp_t; extern cgp_t cgp_global; +extern cgp_t cgp_global_v2; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 949266d40..dc264032e 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -7,9 +7,10 @@ struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; -int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); +int dump_thread_cgroup(const struct pstree_item *, u32 *, 
struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); -int prepare_task_cgroup(struct pstree_item *); +int restore_task_cgroup(struct pstree_item *); +int prepare_cgroup_namespace(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. */ int prepare_cgroup_properties(void); @@ -19,47 +20,50 @@ void fini_cgroup(void); struct cg_controller; struct cgroup_prop { - char *name; - char *value; - mode_t mode; - uid_t uid; - gid_t gid; - struct list_head list; + char *name; + char *value; + mode_t mode; + uid_t uid; + gid_t gid; + struct list_head list; }; /* This describes a particular cgroup path, e.g. the '/lxc/u1' part of * 'blkio/lxc/u1' and any properties it has. */ struct cgroup_dir { - char *path; - mode_t mode; - uid_t uid; - gid_t gid; + char *path; + mode_t mode; + uid_t uid; + gid_t gid; - struct list_head properties; - unsigned int n_properties; + struct list_head properties; + unsigned int n_properties; /* this is how children are linked together */ - struct list_head siblings; + struct list_head siblings; /* more cgroup_dirs */ - struct list_head children; - unsigned int n_children; + struct list_head children; + unsigned int n_children; }; /* This describes a particular cgroup controller, e.g. blkio or cpuset. * The heads are subdirectories organized in their tree format. 
*/ struct cg_controller { - unsigned int n_controllers; - char **controllers; + unsigned int n_controllers; + char **controllers; /* cgroup_dirs */ - struct list_head heads; - unsigned int n_heads; + struct list_head heads; + unsigned int n_heads; /* for cgroup list in cgroup.c */ - struct list_head l; + struct list_head l; + + /* whether the controller is a threaded cgroup */ + int is_threaded; }; struct cg_controller *new_controller(const char *name); @@ -87,9 +91,12 @@ struct cg_ctl { */ struct list_head; struct parasite_dump_cgroup_args; -extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); +extern int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *l, + unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); +int stop_cgroupd(void); + #endif /* __CR_CGROUP_H__ */ diff --git a/criu/include/clone-noasan.h b/criu/include/clone-noasan.h index 8ef75fa73..aff773296 100644 --- a/criu/include/clone-noasan.h +++ b/criu/include/clone-noasan.h @@ -2,5 +2,6 @@ #define __CR_CLONE_NOASAN_H__ int clone_noasan(int (*fn)(void *), int flags, void *arg); +int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_signal, pid_t pid); #endif /* __CR_CLONE_NOASAN_H__ */ diff --git a/criu/include/cr-errno.h b/criu/include/cr-errno.h index 1f94988cf..0c7c1796e 100644 --- a/criu/include/cr-errno.h +++ b/criu/include/cr-errno.h @@ -11,7 +11,7 @@ int get_cr_errno(void); * EBADRQC - bad options */ -#define set_task_cr_err(new_err) atomic_cmpxchg(&task_entries->cr_err, 0, new_err) -#define get_task_cr_err() atomic_read(&task_entries->cr_err) +#define set_task_cr_err(new_err) atomic_cmpxchg(&task_entries->cr_err, 0, new_err) +#define get_task_cr_err() atomic_read(&task_entries->cr_err) #endif /* __CR_ERRNO_H__ */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index
82f76ad94..8c5707b41 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -1,36 +1,41 @@ #ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ -#include #include +#include #include "common/config.h" #include "common/list.h" +#include "int.h" +#include "image.h" /* Configuration and CLI parsing order defines */ -#define PARSING_GLOBAL_CONF 1 -#define PARSING_USER_CONF 2 -#define PARSING_ENV_CONF 3 -#define PARSING_CMDLINE_CONF 4 -#define PARSING_ARGV 5 -#define PARSING_RPC_CONF 6 -#define PARSING_LAST 7 +#define PARSING_GLOBAL_CONF 1 +#define PARSING_USER_CONF 2 +#define PARSING_ENV_CONF 3 +#define PARSING_CMDLINE_CONF 4 +#define PARSING_ARGV 5 +#define PARSING_RPC_CONF 6 +#define PARSING_LAST 7 -#define SET_CHAR_OPTS(__dest, __src) \ - do { \ - free(opts.__dest); \ - opts.__dest = xstrdup(__src); \ - } while(0) +#define SET_CHAR_OPTS(__dest, __src) \ + do { \ + char *__src_dup = xstrdup(__src); \ + if (!__src_dup) \ + abort(); \ + xfree(opts.__dest); \ + opts.__dest = __src_dup; \ + } while (0) /* * CPU capability options. 
*/ -#define CPU_CAP_NONE (0u << 0) /* Don't check capability at all */ -#define CPU_CAP_FPU (1u << 0) /* Only FPU capability required */ -#define CPU_CAP_CPU (1u << 1) /* Strict CPU capability required */ -#define CPU_CAP_INS (1u << 2) /* Instructions CPU capability */ -#define CPU_CAP_IMAGE (1u << 3) /* Write capability on dump and read on restore*/ -#define CPU_CAP_ALL (CPU_CAP_FPU | CPU_CAP_CPU | CPU_CAP_INS) -#define CPU_CAP_DEFAULT (CPU_CAP_FPU | CPU_CAP_INS) +#define CPU_CAP_NONE (0u << 0) /* Don't check capability at all */ +#define CPU_CAP_FPU (1u << 0) /* Only FPU capability required */ +#define CPU_CAP_CPU (1u << 1) /* Strict CPU capability required */ +#define CPU_CAP_INS (1u << 2) /* Instructions CPU capability */ +#define CPU_CAP_IMAGE (1u << 3) /* Write capability on dump and read on restore*/ +#define CPU_CAP_ALL (CPU_CAP_FPU | CPU_CAP_CPU | CPU_CAP_INS) +#define CPU_CAP_DEFAULT (CPU_CAP_FPU | CPU_CAP_INS) struct cg_root_opt { struct list_head node; @@ -38,24 +43,69 @@ struct cg_root_opt { char *newroot; }; +/* + * Pre-dump variants + */ +#define PRE_DUMP_SPLICE 1 /* Pre-dump using parasite */ +#define PRE_DUMP_READ 2 /* Pre-dump using process_vm_readv syscall */ + /* * Cgroup management options. */ -#define CG_MODE_IGNORE (0u << 0) /* Zero is important here */ -#define CG_MODE_NONE (1u << 0) -#define CG_MODE_PROPS (1u << 1) -#define CG_MODE_SOFT (1u << 2) -#define CG_MODE_FULL (1u << 3) -#define CG_MODE_STRICT (1u << 4) +#define CG_MODE_IGNORE (0u << 0) /* Zero is important here */ +#define CG_MODE_NONE (1u << 0) +#define CG_MODE_PROPS (1u << 1) +#define CG_MODE_SOFT (1u << 2) +#define CG_MODE_FULL (1u << 3) +#define CG_MODE_STRICT (1u << 4) -#define CG_MODE_DEFAULT (CG_MODE_SOFT) +#define CG_MODE_DEFAULT (CG_MODE_SOFT) + +/* + * Network locking method + */ +enum NETWORK_LOCK_METHOD { + NETWORK_LOCK_IPTABLES, + NETWORK_LOCK_NFTABLES, + NETWORK_LOCK_SKIP, +}; + +/** + * CRIU currently defaults to the iptables locking backend. 
+ * + * It is, however, possible to change this by defining + * NETWORK_LOCK_DEFAULT to a different value on the command-line. + */ +#ifndef NETWORK_LOCK_DEFAULT +#define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES +#endif /* * Ghost file size we allow to carry by default. */ -#define DEFAULT_GHOST_LIMIT (1 << 20) +#define DEFAULT_GHOST_LIMIT (1 << 20) -#define DEFAULT_TIMEOUT 10 +#define DEFAULT_TIMEOUT 10 + +enum FILE_VALIDATION_OPTIONS { + /* + * This constant indicates that the file validation should be tried with the + * file size method by default. + */ + FILE_VALIDATION_FILE_SIZE, + + /* + * This constant indicates that the file validation should be tried with the + * build-ID method by default. + */ + FILE_VALIDATION_BUILD_ID +}; + +/* This constant dictates which file validation method should be tried by default. */ +#define FILE_VALIDATION_DEFAULT FILE_VALIDATION_BUILD_ID + +/* This constant dictates that criu uses fiemap to copy ghost files by default. */ +#define FIEMAP_DEFAULT 1 struct irmap; @@ -64,65 +114,89 @@ struct irmap_path_opt { struct irmap *ir; }; +enum criu_mode { + CR_UNSET = 0, + CR_DUMP, + CR_PRE_DUMP, + CR_RESTORE, + CR_LAZY_PAGES, + CR_CHECK, + CR_PAGE_SERVER, + CR_SERVICE, + CR_SWRK, + CR_DEDUP, + CR_CPUINFO_DUMP, + CR_CPUINFO_CHECK, + CR_EXEC_DEPRECATED, + CR_SHOW_DEPRECATED, +}; + struct cr_options { - int final_state; - int check_extra_features; - int check_experimental_features; + int final_state; + int check_extra_features; + int check_experimental_features; union { - int restore_detach; - bool daemon_mode; + int restore_detach; + bool daemon_mode; }; - int restore_sibling; - bool ext_unix_sk; - int shell_job; - int handle_file_locks; - int tcp_established_ok; - int tcp_close; - int evasive_devices; - int link_remap_ok; - int log_file_per_pid; - bool swrk_restore; - char *output; - char *root; - char *pidfile; - char *freeze_cgroup; - struct list_head ext_mounts; - struct list_head inherit_fds; - struct list_head external; - struct
list_head join_ns; - char *libdir; - int use_page_server; - unsigned short port; - char *addr; - int ps_socket; - int track_mem; - char *img_parent; - int auto_dedup; - unsigned int cpu_cap; - int force_irmap; - char **exec_cmd; - unsigned int manage_cgroups; - char *new_global_cg_root; - char *cgroup_props; - char *cgroup_props_file; - struct list_head new_cgroup_roots; - bool autodetect_ext_mounts; - int enable_external_sharing; - int enable_external_masters; - bool aufs; /* auto-detected, not via cli */ - bool overlayfs; + int restore_sibling; + bool ext_unix_sk; + int shell_job; + int handle_file_locks; + int tcp_established_ok; + int tcp_close; + int evasive_devices; + int link_remap_ok; + int log_file_per_pid; + int pre_dump_mode; + bool swrk_restore; + char *output; + char *root; + char *pidfile; + char *freeze_cgroup; + struct list_head ext_mounts; + struct list_head inherit_fds; + struct list_head external; + struct list_head join_ns; + char *libdir; + int use_page_server; + unsigned short port; + char *addr; + int ps_socket; + int track_mem; + char *img_parent; + int auto_dedup; + unsigned int cpu_cap; + int force_irmap; + char **exec_cmd; + unsigned int manage_cgroups; + char *new_global_cg_root; + char *cgroup_props; + char *cgroup_props_file; + struct list_head new_cgroup_roots; + char *cgroup_yard; + bool autodetect_ext_mounts; + int enable_external_sharing; + int enable_external_masters; + bool aufs; /* auto-detected, not via cli */ + bool overlayfs; + int ghost_fiemap; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED - bool has_binfmt_misc; /* auto-detected */ + bool has_binfmt_misc; /* auto-detected */ #endif - size_t ghost_limit; - struct list_head irmap_scan_paths; - bool lsm_supplied; - char *lsm_profile; - unsigned int timeout; - unsigned int empty_ns; - int tcp_skip_in_flight; - bool lazy_pages; - char *work_dir; + size_t ghost_limit; + struct list_head irmap_scan_paths; + bool lsm_supplied; + char *lsm_profile; + char *lsm_mount_context; + unsigned int 
timeout; + unsigned int empty_ns; + int tcp_skip_in_flight; + bool lazy_pages; + char *work_dir; + int network_lock_method; + int skip_file_rwx_check; + int allow_uprobes; /* * When we scheduler for removal some functionality we first @@ -130,27 +204,56 @@ struct cr_options { * the deprecated stuff is not working, but it's still possible * to turn one ON while the code is in. */ - int deprecated_ok; - int display_stats; - int weak_sysctls; - int status_fd; - bool orphan_pts_master; - pid_t tree_id; - int log_level; - char *imgs_dir; - char *tls_cacert; - char *tls_cacrl; - char *tls_cert; - char *tls_key; - int tls; - int tls_no_cn_verify; + int deprecated_ok; + int display_stats; + int weak_sysctls; + int status_fd; + bool orphan_pts_master; + int stream; + pid_t tree_id; + int log_level; + char *imgs_dir; + char *tls_cacert; + char *tls_cacrl; + char *tls_cert; + char *tls_key; + int tls; + int tls_no_cn_verify; + + /* This stores which method to use for file validation. */ + int file_validation_method; + + /* Shows the mode criu is running at the moment: dump/pre-dump/restore/... */ + enum criu_mode mode; + + int mntns_compat_mode; + + /* Remember the program name passed to main() so we can use it in + * error messages elsewhere. + */ + char *argv_0; + /* + * This contains the eUID of the current CRIU user. It + * will only be set to a non-zero value if CRIU has + * the necessary capabilities to run as non root. + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN + */ + uid_t uid; + /* This contains the value from capget()->effective */ + u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; + /* + * If CRIU should be running as non-root with the help of + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should + * explicitly request it as it comes with many limitations. 
+ */ + int unprivileged; }; extern struct cr_options opts; -char *rpc_cfg_file; +extern char *rpc_cfg_file; extern int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, int state); -extern int check_options(); -extern void init_opts(); +extern int check_options(void); +extern void init_opts(void); #endif /* __CR_OPTIONS_H__ */ diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h index c2a635ba7..9d52fbdb1 100644 --- a/criu/include/criu-log.h +++ b/criu/include/criu-log.h @@ -21,11 +21,11 @@ #define __CRIU_LOG_H__ #include "log.h" +#include extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); -extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index b76f5f839..c3bea1385 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -22,47 +22,71 @@ #include #include +#include +#include -#define CRIU_PLUGIN_GEN_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) -#define CRIU_PLUGIN_VERSION_MAJOR 0 -#define CRIU_PLUGIN_VERSION_MINOR 2 -#define CRIU_PLUGIN_VERSION_SUBLEVEL 0 +#define CRIU_PLUGIN_GEN_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c)) +#define CRIU_PLUGIN_VERSION_MAJOR 0 +#define CRIU_PLUGIN_VERSION_MINOR 2 +#define CRIU_PLUGIN_VERSION_SUBLEVEL 0 -#define CRIU_PLUGIN_VERSION_OLD CRIU_PLUGIN_GEN_VERSION(0,1,0) +#define CRIU_PLUGIN_VERSION_OLD CRIU_PLUGIN_GEN_VERSION(0, 1, 0) -#define CRIU_PLUGIN_VERSION \ - CRIU_PLUGIN_GEN_VERSION(CRIU_PLUGIN_VERSION_MAJOR, \ - CRIU_PLUGIN_VERSION_MINOR, \ - CRIU_PLUGIN_VERSION_SUBLEVEL) +#define CRIU_PLUGIN_VERSION \ + CRIU_PLUGIN_GEN_VERSION(CRIU_PLUGIN_VERSION_MAJOR, CRIU_PLUGIN_VERSION_MINOR, CRIU_PLUGIN_VERSION_SUBLEVEL) /* * Plugin hook points and their arguments in hooks. 
*/ enum { - CR_PLUGIN_HOOK__DUMP_UNIX_SK = 0, - CR_PLUGIN_HOOK__RESTORE_UNIX_SK = 1, + CR_PLUGIN_HOOK__DUMP_UNIX_SK = 0, + CR_PLUGIN_HOOK__RESTORE_UNIX_SK = 1, - CR_PLUGIN_HOOK__DUMP_EXT_FILE = 2, - CR_PLUGIN_HOOK__RESTORE_EXT_FILE = 3, + CR_PLUGIN_HOOK__DUMP_EXT_FILE = 2, + CR_PLUGIN_HOOK__RESTORE_EXT_FILE = 3, - CR_PLUGIN_HOOK__DUMP_EXT_MOUNT = 4, - CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT = 5, + CR_PLUGIN_HOOK__DUMP_EXT_MOUNT = 4, + CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT = 5, - CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6, + CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6, + + CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA = 7, + + CR_PLUGIN_HOOK__UPDATE_VMA_MAP = 8, + + CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, + + CR_PLUGIN_HOOK__PAUSE_DEVICES = 10, + + CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + + CR_PLUGIN_HOOK__POST_FORKING = 12, + + CR_PLUGIN_HOOK__RESTORE_INIT = 13, + + CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14, CR_PLUGIN_HOOK__MAX }; -#define DECLARE_PLUGIN_HOOK_ARGS(__hook, ...) \ - typedef int (__hook ##_t)(__VA_ARGS__) +#define DECLARE_PLUGIN_HOOK_ARGS(__hook, ...) 
typedef int(__hook##_t)(__VA_ARGS__) DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct stat *stat); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, + const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id); enum { CR_PLUGIN_STAGE__DUMP, @@ -76,42 +100,47 @@ enum { * Plugin descriptor. 
*/ typedef struct { - const char *name; - int (*init)(int stage); - void (*exit)(int stage, int ret); - unsigned int version; - unsigned int max_hooks; - void *hooks[CR_PLUGIN_HOOK__MAX]; + const char *name; + int (*init)(int stage); + void (*exit)(int stage, int ret); + unsigned int version; + unsigned int max_hooks; + void *hooks[CR_PLUGIN_HOOK__MAX]; } cr_plugin_desc_t; extern cr_plugin_desc_t CR_PLUGIN_DESC; -#define CR_PLUGIN_REGISTER(___name, ___init, ___exit) \ - cr_plugin_desc_t CR_PLUGIN_DESC = { \ - .name = ___name, \ - .init = ___init, \ - .exit = ___exit, \ - .version = CRIU_PLUGIN_VERSION, \ - .max_hooks = CR_PLUGIN_HOOK__MAX, \ +#define CR_PLUGIN_REGISTER(___name, ___init, ___exit) \ + cr_plugin_desc_t CR_PLUGIN_DESC = { \ + .name = ___name, \ + .init = ___init, \ + .exit = ___exit, \ + .version = CRIU_PLUGIN_VERSION, \ + .max_hooks = CR_PLUGIN_HOOK__MAX, \ }; -static inline int cr_plugin_dummy_init(int stage) { return 0; } -static inline void cr_plugin_dummy_exit(int stage, int ret) { } - -#define CR_PLUGIN_REGISTER_DUMMY(___name) \ - cr_plugin_desc_t CR_PLUGIN_DESC = { \ - .name = ___name, \ - .init = cr_plugin_dummy_init, \ - .exit = cr_plugin_dummy_exit, \ - .version = CRIU_PLUGIN_VERSION, \ - .max_hooks = CR_PLUGIN_HOOK__MAX, \ - }; - -#define CR_PLUGIN_REGISTER_HOOK(__hook, __func) \ -static void __attribute__((constructor)) cr_plugin_register_hook_##__func (void) \ -{ \ - CR_PLUGIN_DESC.hooks[__hook] = (void *)__func; \ +static inline int cr_plugin_dummy_init(int stage) +{ + return 0; } +static inline void cr_plugin_dummy_exit(int stage, int ret) +{ +} + +#define CR_PLUGIN_REGISTER_DUMMY(___name) \ + cr_plugin_desc_t CR_PLUGIN_DESC = { \ + .name = ___name, \ + .init = cr_plugin_dummy_init, \ + .exit = cr_plugin_dummy_exit, \ + .version = CRIU_PLUGIN_VERSION, \ + .max_hooks = CR_PLUGIN_HOOK__MAX, \ + }; + +#define CR_PLUGIN_REGISTER_HOOK(__hook, __func) \ + static void __attribute__((constructor)) cr_plugin_register_hook_##__func(void) \ + { \ + 
CR_PLUGIN_DESC.hooks[__hook] = (void *)__func; \ + } /* Public API */ extern int criu_get_image_dir(void); @@ -119,14 +148,19 @@ extern int criu_get_image_dir(void); /* * Deprecated, will be removed in next version. */ -typedef int (cr_plugin_init_t)(void); -typedef void (cr_plugin_fini_t)(void); -typedef int (cr_plugin_dump_unix_sk_t)(int fd, int id); -typedef int (cr_plugin_restore_unix_sk_t)(int id); -typedef int (cr_plugin_dump_file_t)(int fd, int id); -typedef int (cr_plugin_restore_file_t)(int id); -typedef int (cr_plugin_dump_ext_mount_t)(char *mountpoint, int id); -typedef int (cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file); -typedef int (cr_plugin_dump_ext_link_t)(int index, int type, char *kind); +typedef int(cr_plugin_init_t)(void); +typedef void(cr_plugin_fini_t)(void); +typedef int(cr_plugin_dump_unix_sk_t)(int fd, int id); +typedef int(cr_plugin_restore_unix_sk_t)(int id); +typedef int(cr_plugin_dump_file_t)(int fd, int id); +typedef int(cr_plugin_restore_file_t)(int id); +typedef int(cr_plugin_dump_ext_mount_t)(char *mountpoint, int id); +typedef int(cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file); +typedef int(cr_plugin_dump_ext_link_t)(int index, int type, char *kind); +typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); +typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, + uint64_t *new_pgoff, int *plugin_fd); +typedef int(cr_plugin_resume_devices_late_t)(int pid); +typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/include/crtools.h b/criu/include/crtools.h index c5a5b6499..b54b9d929 100644 --- a/criu/include/crtools.h +++ b/criu/include/crtools.h @@ -8,9 +8,9 @@ #include "images/inventory.pb-c.h" -#define CR_FD_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) +#define CR_FD_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) -extern int 
check_img_inventory(void); +extern int check_img_inventory(bool restore); extern int write_img_inventory(InventoryEntry *he); extern int inventory_save_uptime(InventoryEntry *he); extern InventoryEntry *get_parent_inventory(void); @@ -26,22 +26,25 @@ extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); +extern int check_caps(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); extern int check_add_feature(char *arg); extern void pr_check_features(const char *offset, const char *sep, int width); -#define PPREP_HEAD_INACTIVE ((struct pprep_head *)-1) +#define PPREP_HEAD_INACTIVE ((struct pprep_head *)-1) -#define add_post_prepare_cb_once(phead) do { \ - if ((phead)->next == PPREP_HEAD_INACTIVE)\ - add_post_prepare_cb(phead); \ +#define add_post_prepare_cb_once(phead) \ + do { \ + if ((phead)->next == PPREP_HEAD_INACTIVE) \ + add_post_prepare_cb(phead); \ } while (0) -#define MAKE_PPREP_HEAD(name) struct pprep_head name = { \ - .next = PPREP_HEAD_INACTIVE, \ - .actor = name##_cb, \ +#define MAKE_PPREP_HEAD(name) \ + struct pprep_head name = { \ + .next = PPREP_HEAD_INACTIVE, \ + .actor = name##_cb, \ } #endif /* __CR_CRTOOLS_H__ */ diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 852d27166..e987c18ce 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -17,28 +17,26 @@ enum faults { FI_NO_BREAKPOINTS = 130, FI_PARTIAL_PAGES = 131, FI_HUGE_ANON_SHMEM_ID = 132, + FI_CANNOT_MAP_VDSO = 133, + FI_CORRUPT_EXTREGS = 134, + FI_DONT_USE_PAGEMAP_SCAN = 135, + FI_DUMP_CRASH = 136, + FI_COMPEL_INTERRUPT_ONLY_MODE = 137, + FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { - /* - * Temporary workaround for Xen guests. Breakpoints degrade - * performance linearly, so until we find out the reason, - * let's disable them. 
- */ - if (f == FI_NO_BREAKPOINTS) - return true; - return fi_strategy == f; } -#define FI_HUGE_ANON_SHMEM_ID_BASE (0xfffffffflu) +#define FI_HUGE_ANON_SHMEM_ID_BASE (0xfffffffflu) #ifndef CR_NOGLIBC extern enum faults fi_strategy; -#define fault_injected(f) __fault_injected(f, fi_strategy) +#define fault_injected(f) __fault_injected(f, fi_strategy) extern int fault_injection_init(void); diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h index d9c5c5e7b..35f880523 100644 --- a/criu/include/fcntl.h +++ b/criu/include/fcntl.h @@ -5,41 +5,49 @@ #include #ifndef F_SETOWN_EX -#define F_SETOWN_EX 15 -#define F_GETOWN_EX 16 +#define F_SETOWN_EX 15 +#define F_GETOWN_EX 16 struct f_owner_ex { - int type; - pid_t pid; + int type; + pid_t pid; }; #endif #ifndef F_GETOWNER_UIDS -#define F_GETOWNER_UIDS 17 +#define F_GETOWNER_UIDS 17 #endif /* * These things are required to compile on CentOS-6 */ #ifndef F_LINUX_SPECIFIC_BASE -# define F_LINUX_SPECIFIC_BASE 1024 +#define F_LINUX_SPECIFIC_BASE 1024 #endif #ifndef F_SETPIPE_SZ -# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) #endif #ifndef F_GETPIPE_SZ -# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#endif + +#ifndef F_GET_SEALS +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) #endif #ifndef O_PATH -# define O_PATH 010000000 +#define O_PATH 010000000 #endif #ifndef __O_TMPFILE -#define __O_TMPFILE 020000000 +#define __O_TMPFILE 020000000 #endif #ifndef O_TMPFILE diff --git a/criu/include/file-ids.h b/criu/include/file-ids.h index 9a39f0d2d..4479a9b37 100644 --- a/criu/include/file-ids.h +++ b/criu/include/file-ids.h @@ -6,8 +6,8 @@ #include "images/fdinfo.pb-c.h" -#define FD_PID_INVALID (-2U) -#define FD_DESC_INVALID (-3U) +#define FD_PID_INVALID (-2U) +#define FD_DESC_INVALID (-3U) struct fdinfo_entry; struct stat; diff --git 
a/criu/include/file-lock.h b/criu/include/file-lock.h index dc4f38216..9ab79b66b 100644 --- a/criu/include/file-lock.h +++ b/criu/include/file-lock.h @@ -6,57 +6,58 @@ #include "protobuf.h" #include "images/file-lock.pb-c.h" -#define FL_UNKNOWN -1 -#define FL_POSIX 1 -#define FL_FLOCK 2 -#define FL_OFD 4 -#define FL_LEASE 8 +#define FL_UNKNOWN -1 +#define FL_POSIX 1 +#define FL_FLOCK 2 +#define FL_OFD 4 +#define FL_LEASE 8 /* for posix fcntl() and lockf() */ #ifndef F_RDLCK -#define F_RDLCK 0 -#define F_WRLCK 1 -#define F_UNLCK 2 +#define F_RDLCK 0 +#define F_WRLCK 1 +#define F_UNLCK 2 #endif /* for OFD locks fcntl() */ #ifndef F_OFD_GETLK -#define F_OFD_GETLK 36 -#define F_OFD_SETLK 37 -#define F_OFD_SETLKW 38 +#define F_OFD_GETLK 36 +#define F_OFD_SETLK 37 +#define F_OFD_SETLKW 38 #endif /* operations for bsd flock(), also used by the kernel implementation */ -#define LOCK_SH 1 /* shared lock */ -#define LOCK_EX 2 /* exclusive lock */ -#define LOCK_NB 4 /* or'd with one of the above to prevent +#define LOCK_SH 1 /* shared lock */ +#define LOCK_EX 2 /* exclusive lock */ +#define LOCK_NB \ + 4 /* or'd with one of the above to prevent blocking */ -#define LOCK_UN 8 /* remove lock */ +#define LOCK_UN 8 /* remove lock */ -#define LOCK_MAND 32 /* This is a mandatory flock ... */ -#define LOCK_READ 64 /* which allows concurrent read operations */ -#define LOCK_WRITE 128 /* which allows concurrent write operations */ -#define LOCK_RW 192 /* which allows concurrent read & write ops */ +#define LOCK_MAND 32 /* This is a mandatory flock ... 
*/ +#define LOCK_READ 64 /* which allows concurrent read operations */ +#define LOCK_WRITE 128 /* which allows concurrent write operations */ +#define LOCK_RW 192 /* which allows concurrent read & write ops */ /* for leases */ -#define LEASE_BREAKING 4 +#define LEASE_BREAKING 4 struct file_lock { - long long fl_id; - int fl_kind; - int fl_ltype; + long long fl_id; + int fl_kind; + int fl_ltype; - pid_t fl_owner; /* process, which created the lock */ - pid_t fl_holder; /* pid of fd on whose the lock is found */ - int maj, min; - unsigned long i_no; - long long start; - char end[32]; + pid_t fl_owner; /* process that created the lock */ + pid_t fl_holder; /* pid on whose fd the lock is found */ + int maj, min; + unsigned long i_no; + long long start; + char end[32]; - struct list_head list; /* list of all file locks */ + struct list_head list; /* list of all file locks */ - int real_owner; - int owners_fd; + int real_owner; + int owners_fd; }; extern struct list_head file_lock_list; @@ -74,6 +75,6 @@ extern int correct_file_leases_type(struct pid *, int fd, int lfd); extern int note_file_lock(struct pid *, int fd, int lfd, struct fd_parms *); extern int dump_file_locks(void); -#define OPT_FILE_LOCKS "file-locks" +#define OPT_FILE_LOCKS "file-locks" #endif /* __FILE_LOCK_H__ */ diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h index 7a22d4d82..d4934c4ae 100644 --- a/criu/include/files-reg.h +++ b/criu/include/files-reg.h @@ -2,6 +2,7 @@ #define __CR_FILES_REG_H__ #include "files.h" +#include "util.h" #include "images/regfile.pb-c.h" #include "images/ghost-file.pb-c.h" @@ -12,25 +13,23 @@ struct fd_parms; struct file_remap { char *rpath; bool is_dir; - int rmnt_id; + int rmnt_id; uid_t uid; gid_t gid; }; struct reg_file_info { - struct file_desc d; - RegFileEntry *rfe; - struct file_remap *remap; - bool size_mode_checked; - bool is_dir; - char *path; + struct file_desc d; + RegFileEntry *rfe; + struct file_remap *remap; + bool size_mode_checked; +
bool is_dir; + char *path; }; extern int open_reg_by_id(u32 id); extern int open_reg_fd(struct file_desc *); -extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, - struct reg_file_info *, void *), void *arg); -extern void clear_ghost_files(void); +extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, struct reg_file_info *, void *), void *arg); extern const struct fdtype_ops regfile_dump_ops; extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); @@ -39,7 +38,7 @@ extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p); extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino); extern struct file_desc *try_collect_special_file(u32 id, int optional); -#define collect_special_file(id) try_collect_special_file(id, 0) +#define collect_special_file(id) try_collect_special_file(id, 0) extern int collect_filemap(struct vma_area *); extern void filemap_ctx_init(bool auto_close); extern void filemap_ctx_fini(void); @@ -52,8 +51,14 @@ extern void free_link_remaps(void); extern int prepare_remaps(void); extern int try_clean_remaps(bool only_ghosts); -extern int strip_deleted(struct fd_link *link); +static inline int link_strip_deleted(struct fd_link *link) +{ + return strip_deleted(link->name, link->len); +} extern int dead_pid_conflict(void); +extern int rm_parent_dirs(int mntns_root, char *path, int count); +extern int make_parent_dirs_if_need(int mntns_root, char *path); + #endif /* __CR_FILES_REG_H__ */ diff --git a/criu/include/files.h b/criu/include/files.h index 2c1e1e723..31ebb0ca0 100644 --- a/criu/include/files.h +++ b/criu/include/files.h @@ -26,8 +26,8 @@ struct fd_link { union { /* Link info for generic file (path) */ struct { - char name[PATH_MAX]; - size_t len; + char name[PATH_MAX]; + size_t len; }; /* Link info for proc-ns file */ @@ -39,28 +39,26 @@ struct fd_link { }; struct fd_parms { - int fd; - off_t pos; - unsigned int flags; - char fd_flags; - struct stat 
stat; - pid_t pid; - FownEntry fown; - struct fd_link *link; - long fs_type; - int mnt_id; + int fd; + off_t pos; + unsigned int flags; + char fd_flags; + struct stat stat; + pid_t pid; + FownEntry fown; + struct fd_link *link; + long fs_type; + int mnt_id; struct parasite_ctl *fd_ctl; struct parasite_drain_fd *dfds; }; -#define FD_PARMS_INIT \ -(struct fd_parms) { \ - .fd = FD_DESC_INVALID, \ - .fown = FOWN_ENTRY__INIT, \ - .link = NULL, \ - .mnt_id = -1, \ -} +#define FD_PARMS_INIT \ + (struct fd_parms) \ + { \ + .fd = FD_DESC_INVALID, .fown = FOWN_ENTRY__INIT, .link = NULL, .mnt_id = -1, \ + } extern int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link); extern uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos); @@ -83,15 +81,15 @@ enum { }; struct fdinfo_list_entry { - struct list_head desc_list; /* To chain on @fd_info_head */ - struct file_desc *desc; /* Associated file descriptor */ - struct list_head ps_list; /* To chain per-task files */ - struct pstree_item *task; - FdinfoEntry *fe; - int pid; - u8 received:1; - u8 stage:3; - u8 fake:1; + struct list_head desc_list; /* To chain on @fd_info_head */ + struct file_desc *desc; /* Associated file descriptor */ + struct list_head ps_list; /* To chain per-task files */ + struct pstree_item *task; + FdinfoEntry *fe; + int pid; + u8 received : 1; + u8 stage : 3; + u8 fake : 1; }; extern int inh_fd_max; @@ -99,55 +97,50 @@ extern int inh_fd_max; /* reports whether fd_a takes prio over fd_b */ static inline int fdinfo_rst_prio(struct fdinfo_list_entry *fd_a, struct fdinfo_list_entry *fd_b) { - return pid_rst_prio(fd_a->pid, fd_b->pid) || - ((fd_a->pid == fd_b->pid) && (fd_a->fe->fd < fd_b->fe->fd)); + return pid_rst_prio(fd_a->pid, fd_b->pid) || ((fd_a->pid == fd_b->pid) && (fd_a->fe->fd < fd_b->fe->fd)); } struct file_desc_ops { /* fd_types from images/fdinfo.proto */ - unsigned int type; + unsigned int type; /* * Opens a file by whatever syscall is required for that. 
* The returned descriptor may be closed (dup2-ed to another) * so it shouldn't be saved for any post-actions. */ - int (*open)(struct file_desc *d, int *new_fd); - char * (*name)(struct file_desc *, char *b, size_t s); + int (*open)(struct file_desc *d, int *new_fd); + char *(*name)(struct file_desc *, char *b, size_t s); }; int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool ghost); -struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, - struct rst_info *rst_info, struct file_desc *fdesc, - bool fake, bool force_master); +struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info *rst_info, struct file_desc *fdesc, + bool fake, bool force_master); u32 find_unused_file_desc_id(void); unsigned int find_unused_fd(struct pstree_item *, int hint_fd); struct fdinfo_list_entry *find_used_fd(struct pstree_item *, int fd); struct file_desc { - u32 id; /* File id, unique */ - struct hlist_node hash; /* Descriptor hashing and lookup */ - struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ - struct file_desc_ops *ops; /* Associated operations */ - struct list_head fake_master_list;/* To chain in the list of file_desc, which don't - have a fle in a task, that having permissions */ + u32 id; /* File id, unique */ + struct hlist_node hash; /* Descriptor hashing and lookup */ + struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ + struct file_desc_ops *ops; /* Associated operations */ + struct list_head fake_master_list; /* To chain in the list of file_desc, which don't + * have a fle in a task, that having permissions */ }; struct fdtype_ops { - unsigned int type; - int (*dump)(int lfd, u32 id, const struct fd_parms *p); - int (*pre_dump)(int pid, int lfd); + unsigned int type; + int (*dump)(int lfd, u32 id, const struct fd_parms *p); + int (*pre_dump)(int pid, int lfd); }; struct cr_img; extern int dump_my_file(int 
lfd, u32 *, int *type); -extern int do_dump_gen_file(struct fd_parms *p, int lfd, - const struct fdtype_ops *ops, - FdinfoEntry *e); +extern int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e); struct parasite_drain_fd; -int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, - struct parasite_drain_fd *dfds); +int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds); int predump_task_files(int pid); extern void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops); @@ -179,13 +172,12 @@ extern int close_old_fds(void); #define AT_EMPTY_PATH 0x1000 #endif -#define LREMAP_PARAM "link-remap" +#define LREMAP_PARAM "link-remap" extern int shared_fdt_prepare(struct pstree_item *item); extern struct collect_image_info ext_file_cinfo; -extern int dump_unsupp_fd(struct fd_parms *p, int lfd, - char *more, char *info, FdinfoEntry *); +extern int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *); extern int inherit_fd_parse(char *optarg); extern int inherit_fd_add(int fd, char *key); @@ -197,11 +189,11 @@ extern int inherit_fd_lookup_id(char *id); extern bool inherited_fd(struct file_desc *, int *fdp); extern FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags); -int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, - int fd, unsigned flags); +int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, int fd, unsigned flags); extern int open_transport_socket(void); extern int set_fds_event(pid_t virt); extern void wait_fds_event(void); +int find_unused_fd_pid(pid_t pid); #endif /* __CR_FILES_H__ */ diff --git a/criu/include/filesystems.h b/criu/include/filesystems.h index bd798062d..251e9e2aa 100644 --- a/criu/include/filesystems.h +++ b/criu/include/filesystems.h @@ -5,8 +5,7 @@ extern struct fstype *decode_fstype(u32 fst); extern bool add_fsname_auto(const char *names); 
struct mount_info; -typedef int (*mount_fn_t)(struct mount_info *mi, const char *src, const - char *fstype, unsigned long mountflags); +typedef int (*mount_fn_t)(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags); struct fstype { char *name; diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ced3377f9..ffc0455d5 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -10,47 +10,55 @@ */ #ifndef NFS_SUPER_MAGIC -# define NFS_SUPER_MAGIC 0x6969 +#define NFS_SUPER_MAGIC 0x6969 #endif #ifndef PIPEFS_MAGIC -# define PIPEFS_MAGIC 0x50495045 +#define PIPEFS_MAGIC 0x50495045 #endif #ifndef ANON_INODE_FS_MAGIC -# define ANON_INODE_FS_MAGIC 0x09041934 +#define ANON_INODE_FS_MAGIC 0x09041934 #endif #ifndef TMPFS_MAGIC -# define TMPFS_MAGIC 0x01021994 +#define TMPFS_MAGIC 0x01021994 #endif #ifndef SOCKFS_MAGIC -# define SOCKFS_MAGIC 0x534f434b +#define SOCKFS_MAGIC 0x534f434b #endif #ifndef DEVPTS_SUPER_MAGIC -#define DEVPTS_SUPER_MAGIC 0x1cd1 +#define DEVPTS_SUPER_MAGIC 0x1cd1 #endif #ifndef BTRFS_SUPER_MAGIC -#define BTRFS_SUPER_MAGIC 0x9123683E +#define BTRFS_SUPER_MAGIC 0x9123683E #endif #ifndef AUFS_SUPER_MAGIC -#define AUFS_SUPER_MAGIC 0x61756673 +#define AUFS_SUPER_MAGIC 0x61756673 #endif #ifndef PROC_SUPER_MAGIC -#define PROC_SUPER_MAGIC 0x9fa0 +#define PROC_SUPER_MAGIC 0x9fa0 #endif #ifndef BINFMTFS_MAGIC -#define BINFMTFS_MAGIC 0x42494e4d +#define BINFMTFS_MAGIC 0x42494e4d #endif #ifndef AUTOFS_SUPER_MAGIC -#define AUTOFS_SUPER_MAGIC 0x0187 +#define AUTOFS_SUPER_MAGIC 0x0187 +#endif + +#ifndef OVERLAYFS_SUPER_MAGIC +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 +#endif + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 #endif #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/fsnotify.h b/criu/include/fsnotify.h index 935dd60f4..3435f0c86 100644 --- a/criu/include/fsnotify.h +++ b/criu/include/fsnotify.h @@ -9,7 +9,7 @@ #define KERNEL_FS_EVENT_ON_CHILD 0x08000000 #ifndef INOTIFY_IOC_SETNEXTWD 
-#define INOTIFY_IOC_SETNEXTWD _IOW('I', 0, __s32) +#define INOTIFY_IOC_SETNEXTWD _IOW('I', 0, __s32) #endif extern int is_inotify_link(char *link); diff --git a/criu/include/hugetlb.h b/criu/include/hugetlb.h new file mode 100644 index 000000000..9aee5bed3 --- /dev/null +++ b/criu/include/hugetlb.h @@ -0,0 +1,61 @@ +#ifndef __CR_HUGETLB_H_ +#define __CR_HUGETLB_H_ + +#include +#include + +#include "vma.h" + +#define ANON_HUGEPAGE_PREFIX "/anon_hugepage" +#define ANON_HUGEPAGE_PREFIX_LEN (sizeof(ANON_HUGEPAGE_PREFIX) - 1) + +enum hugepage_size { + HUGETLB_16KB, + HUGETLB_64KB, + HUGETLB_512KB, + HUGETLB_1MB, + HUGETLB_2MB, + HUGETLB_8MB, + HUGETLB_16MB, + HUGETLB_32MB, + HUGETLB_256MB, + HUGETLB_512MB, + HUGETLB_1GB, + HUGETLB_2GB, + HUGETLB_16GB, + HUGETLB_MAX +}; + +#define MAP_HUGETLB_SHIFT 26 +#define MAP_HUGETLB_SIZE_MASK (0x3f << MAP_HUGETLB_SHIFT) + +#define MAP_HUGETLB_16KB (14 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_64KB (16 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_512KB (19 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_1MB (20 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_2MB (21 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_8MB (23 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_16MB (24 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_32MB (25 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_256MB (28 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_512MB (29 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_1GB (30 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_2GB (31 << MAP_HUGETLB_SHIFT) +#define MAP_HUGETLB_16GB (34 << MAP_HUGETLB_SHIFT) + +struct htlb_info { + unsigned long long size; + int flag; +}; + +extern struct htlb_info hugetlb_info[HUGETLB_MAX]; + +int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag); +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma); +unsigned long get_size_from_hugetlb_flag(int flag); + +#ifndef MFD_HUGETLB +#define MFD_HUGETLB 4 +#endif + +#endif diff --git a/criu/include/image-desc.h 
b/criu/include/image-desc.h index 3135f56b4..79e1ac111 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -26,6 +26,8 @@ enum { CR_FD_UTSNS, CR_FD_MNTS, CR_FD_USERNS, + CR_FD_TIMENS, + CR_FD_PIDNS, _CR_FD_IPCNS_FROM, CR_FD_IPC_VAR, @@ -42,6 +44,7 @@ enum { CR_FD_RULE, CR_FD_IPTABLES, CR_FD_IP6TABLES, + CR_FD_NFTABLES, CR_FD_NETNS, CR_FD_NETNF_CT, CR_FD_NETNF_EXP, @@ -64,6 +67,10 @@ enum { CR_FD_CGROUP, CR_FD_FILE_LOCKS, CR_FD_SECCOMP, + CR_FD_APPARMOR, + CR_FD_MEMFD_INODE, + CR_FD_BPFMAP_FILE, + CR_FD_BPFMAP_DATA, _CR_FD_GLOB_TO, CR_FD_TMPFS_IMG, @@ -79,7 +86,6 @@ enum { CR_FD_RLIMIT, CR_FD_ITIMERS, CR_FD_POSIX_TIMERS, - CR_FD_FILE_LOCKS_PID, CR_FD_IRMAP_CACHE, CR_FD_CPUINFO, @@ -106,6 +112,8 @@ enum { CR_FD_FIFO, CR_FD_PIPES, CR_FD_TTY_FILES, + CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, @@ -114,9 +122,9 @@ enum { /* file descriptors template */ struct cr_fd_desc_tmpl { - const char *fmt; /* format for the name */ - u32 magic; /* magic in the header */ - int oflags; /* flags for image_open */ + const char *fmt; /* format for the name */ + u32 magic; /* magic in the header */ + int oflags; /* flags for image_open */ }; extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; diff --git a/criu/include/image.h b/criu/include/image.h index 2baa39496..30e32323d 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -12,14 +12,14 @@ #include "log.h" #include "common/bug.h" -#define PAGE_RSS 1 -#define PAGE_ANON 2 +#define PAGE_RSS 1 +#define PAGE_ANON 2 /* * Top bit set in the tgt id means we've remapped * to a ghost file. 
*/ -#define REMAP_GHOST (1 << 31) +#define REMAP_GHOST (1 << 31) /* * VMA_AREA status: @@ -35,13 +35,15 @@ * - stack * the memory area is used in application stack so we * should be careful about guard page here + * - shadow stack + * the memory area is used by shadow stack * - vsyscall * special memory area injected into the task memory * space by the kernel itself, represent virtual syscall * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar - * the vDSO area, it might reqire additional memory + * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap @@ -66,43 +68,62 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. + * - guard + * stands for a fake VMA (not represented in the kernel + * by a struct vm_area_struct). Used to keep an information + * about virtual address space ranges covered by + * MADV_GUARD_INSTALL guards. These ones must be always at + * the end of the vma_area_list and properly skipped a.e. + * - uprobes + * stands for a "[uprobes]" vma that's automatically mapped by + * the kernel when an active uprobe is hit. Contents of this vma + * are not dumped and neither are its madvise bits restored, + * because the kernel is in complete control of this vma. This is + * just used to track the existence of the uprobes vma. 
*/ -#define VMA_AREA_NONE (0 << 0) -#define VMA_AREA_REGULAR (1 << 0) -#define VMA_AREA_STACK (1 << 1) -#define VMA_AREA_VSYSCALL (1 << 2) -#define VMA_AREA_VDSO (1 << 3) -#define VMA_AREA_HEAP (1 << 5) +#define VMA_AREA_NONE (0 << 0) +#define VMA_AREA_REGULAR (1 << 0) +#define VMA_AREA_STACK (1 << 1) +#define VMA_AREA_VSYSCALL (1 << 2) +#define VMA_AREA_VDSO (1 << 3) +#define VMA_AREA_HEAP (1 << 5) -#define VMA_FILE_PRIVATE (1 << 6) -#define VMA_FILE_SHARED (1 << 7) -#define VMA_ANON_SHARED (1 << 8) -#define VMA_ANON_PRIVATE (1 << 9) +#define VMA_FILE_PRIVATE (1 << 6) +#define VMA_FILE_SHARED (1 << 7) +#define VMA_ANON_SHARED (1 << 8) +#define VMA_ANON_PRIVATE (1 << 9) -#define VMA_AREA_SYSVIPC (1 << 10) -#define VMA_AREA_SOCKET (1 << 11) -#define VMA_AREA_VVAR (1 << 12) -#define VMA_AREA_AIORING (1 << 13) +#define VMA_AREA_SYSVIPC (1 << 10) +#define VMA_AREA_SOCKET (1 << 11) +#define VMA_AREA_VVAR (1 << 12) +#define VMA_AREA_AIORING (1 << 13) +#define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_SHSTK (1 << 15) +#define VMA_AREA_GUARD (1 << 16) +#define VMA_AREA_UPROBES (1 << 17) -#define VMA_CLOSE (1 << 28) -#define VMA_NO_PROT_WRITE (1 << 29) -#define VMA_PREMMAPED (1 << 30) -#define VMA_UNSUPP (1 << 31) +#define VMA_EXT_PLUGIN (1 << 27) +#define VMA_CLOSE (1 << 28) +#define VMA_NO_PROT_WRITE (1 << 29) +#define VMA_PREMMAPED (1 << 30) +#define VMA_UNSUPP (1 << 31) -#define CR_CAP_SIZE 2 +#define CR_CAP_SIZE 2 #define TASK_COMM_LEN 16 #define CR_PARENT_LINK "parent" +#define OPT_ALLOW_UPROBES "allow-uprobes" + extern bool ns_per_id; extern bool img_common_magic; -#define O_NOBUF (O_DIRECT) -#define O_SERVICE (O_DIRECTORY) -#define O_DUMP (O_WRONLY | O_CREAT | O_TRUNC) -#define O_RSTR (O_RDONLY) -#define O_FORCE_LOCAL (O_SYNC) +#define O_NOBUF (O_DIRECT) +#define O_SERVICE (O_DIRECTORY) +#define O_DUMP (O_WRONLY | O_CREAT | O_TRUNC) +#define O_RSTR (O_RDONLY) +#define O_FORCE_LOCAL (O_SYNC) struct cr_img { union { @@ -116,8 +137,8 @@ struct cr_img { }; }; -#define 
EMPTY_IMG_FD (-404) -#define LAZY_IMG_FD (-505) +#define EMPTY_IMG_FD (-404) +#define LAZY_IMG_FD (-505) static inline bool empty_image(struct cr_img *img) { @@ -144,8 +165,14 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(char *dir); +extern int open_image_dir(const char *dir, int mode); extern void close_image_dir(void); +/* + * Return -1 -- parent symlink points to invalid target + * Return 0 && pfd < 0 -- parent symlink does not exist + * Return 0 && pfd >= 0 -- opened + */ +extern int open_parent(int dfd, int *pfd); extern struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...); #define open_image(typ, flags, ...) open_image_at(-1, typ, flags, ##__VA_ARGS__) @@ -157,13 +184,17 @@ extern void up_page_ids_base(void); extern struct cr_img *img_from_fd(int fd); /* for cr-show mostly */ extern int write_img_buf(struct cr_img *, const void *ptr, int size); -#define write_img(img, ptr) write_img_buf((img), (ptr), sizeof(*(ptr))) +#define write_img(img, ptr) write_img_buf((img), (ptr), sizeof(*(ptr))) extern int read_img_buf_eof(struct cr_img *, void *ptr, int size); -#define read_img_eof(img, ptr) read_img_buf_eof((img), (ptr), sizeof(*(ptr))) +#define read_img_eof(img, ptr) read_img_buf_eof((img), (ptr), sizeof(*(ptr))) extern int read_img_buf(struct cr_img *, void *ptr, int size); -#define read_img(img, ptr) read_img_buf((img), (ptr), sizeof(*(ptr))) +#define read_img(img, ptr) read_img_buf((img), (ptr), sizeof(*(ptr))) extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); +extern int add_inventory_plugin(const char *name); +extern int check_inventory_plugins(void); +extern bool check_and_remove_inventory_plugin(const char *name, size_t n); + #endif /* __CR_IMAGE_H__ */ diff --git a/criu/include/img-streamer.h b/criu/include/img-streamer.h new file mode 100644 index 000000000..0c380c915 --- /dev/null +++ 
b/criu/include/img-streamer.h @@ -0,0 +1,8 @@ +#ifndef IMAGE_STREAMER_H +#define IMAGE_STREAMER_H + +extern int img_streamer_init(const char *image_dir, int mode); +extern void img_streamer_finish(void); +extern int img_streamer_open(char *filename, int flags); + +#endif /* IMAGE_STREAMER_H */ diff --git a/criu/include/imgset.h b/criu/include/imgset.h index 02ad169df..9846f6cdc 100644 --- a/criu/include/imgset.h +++ b/criu/include/imgset.h @@ -27,10 +27,8 @@ extern struct cr_imgset *glob_imgset; extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; extern struct cr_imgset *cr_task_imgset_open(int pid, int mode); -extern struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, - unsigned long flags); -#define cr_imgset_open(pid, type, flags) cr_imgset_open_range(pid, \ - _CR_FD_##type##_FROM, _CR_FD_##type##_TO, flags) +extern struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, unsigned long flags); +#define cr_imgset_open(pid, type, flags) cr_imgset_open_range(pid, _CR_FD_##type##_FROM, _CR_FD_##type##_TO, flags) extern struct cr_imgset *cr_glob_imgset_open(int mode); extern void close_cr_imgset(struct cr_imgset **cr_imgset); diff --git a/criu/include/inet_diag.h b/criu/include/inet_diag.h index 95be2c19d..4996dd556 100644 --- a/criu/include/inet_diag.h +++ b/criu/include/inet_diag.h @@ -4,42 +4,42 @@ #include /* Just some random number */ -#define TCPDIAG_GETSOCK 18 +#define TCPDIAG_GETSOCK 18 #define DCCPDIAG_GETSOCK 19 #define INET_DIAG_GETSOCK_MAX 24 /* Socket identity */ struct inet_diag_sockid { - __be16 idiag_sport; - __be16 idiag_dport; - __be32 idiag_src[4]; - __be32 idiag_dst[4]; - __u32 idiag_if; - __u32 idiag_cookie[2]; + __be16 idiag_sport; + __be16 idiag_dport; + __be32 idiag_src[4]; + __be32 idiag_dst[4]; + __u32 idiag_if; + __u32 idiag_cookie[2]; #define INET_DIAG_NOCOOKIE (~0U) }; /* Request structure */ struct inet_diag_req_compat { - __u8 idiag_family; /* Family of addresses. 
*/ - __u8 idiag_src_len; - __u8 idiag_dst_len; - __u8 idiag_ext; /* Query extended information */ + __u8 idiag_family; /* Family of addresses. */ + __u8 idiag_src_len; + __u8 idiag_dst_len; + __u8 idiag_ext; /* Query extended information */ struct inet_diag_sockid id; - __u32 idiag_states; /* States to dump */ - __u32 idiag_dbs; /* Tables to dump (NI) */ + __u32 idiag_states; /* States to dump */ + __u32 idiag_dbs; /* Tables to dump (NI) */ }; struct inet_diag_req_v2 { - __u8 sdiag_family; - __u8 sdiag_protocol; - __u8 idiag_ext; - __u8 pad; - __u32 idiag_states; + __u8 sdiag_family; + __u8 sdiag_protocol; + __u8 idiag_ext; + __u8 pad; + __u32 idiag_states; struct inet_diag_sockid id; }; @@ -57,9 +57,9 @@ enum { */ struct inet_diag_bc_op { - unsigned char code; - unsigned char yes; - unsigned short no; + unsigned char code; + unsigned char yes; + unsigned short no; }; enum { @@ -75,27 +75,27 @@ enum { }; struct inet_diag_hostcond { - __u8 family; - __u8 prefix_len; - int port; - __be32 addr[0]; + __u8 family; + __u8 prefix_len; + int port; + __be32 addr[0]; }; /* Base info structure. It contains socket identity (addrs/ports/cookie) * and, alas, the information shown by netstat. 
*/ struct inet_diag_msg { - __u8 idiag_family; - __u8 idiag_state; - __u8 idiag_timer; - __u8 idiag_retrans; + __u8 idiag_family; + __u8 idiag_state; + __u8 idiag_timer; + __u8 idiag_retrans; struct inet_diag_sockid id; - __u32 idiag_expires; - __u32 idiag_rqueue; - __u32 idiag_wqueue; - __u32 idiag_uid; - __u32 idiag_inode; + __u32 idiag_expires; + __u32 idiag_rqueue; + __u32 idiag_wqueue; + __u32 idiag_uid; + __u32 idiag_inode; }; /* Extensions */ @@ -114,23 +114,22 @@ enum { #define INET_DIAG_MAX INET_DIAG_SHUTDOWN - /* INET_DIAG_MEM */ struct inet_diag_meminfo { - __u32 idiag_rmem; - __u32 idiag_wmem; - __u32 idiag_fmem; - __u32 idiag_tmem; + __u32 idiag_rmem; + __u32 idiag_wmem; + __u32 idiag_fmem; + __u32 idiag_tmem; }; /* INET_DIAG_VEGASINFO */ struct tcpvegas_info { - __u32 tcpv_enabled; - __u32 tcpv_rttcnt; - __u32 tcpv_rtt; - __u32 tcpv_minrtt; + __u32 tcpv_enabled; + __u32 tcpv_rttcnt; + __u32 tcpv_rtt; + __u32 tcpv_minrtt; }; #endif /* __CR_INET_DIAG_H__ */ diff --git a/criu/include/irmap.h b/criu/include/irmap.h index 033f71e37..e668952b8 100644 --- a/criu/include/irmap.h +++ b/criu/include/irmap.h @@ -1,13 +1,13 @@ #ifndef __CR_IRMAP__H__ #define __CR_IRMAP__H__ + +#include "images/fh.pb-c.h" + char *irmap_lookup(unsigned int s_dev, unsigned long i_ino); -struct _FhEntry; -int irmap_queue_cache(unsigned int dev, unsigned long ino, - struct _FhEntry *fh); +int irmap_queue_cache(unsigned int dev, unsigned long ino, FhEntry *fh); int irmap_predump_prep(void); int irmap_predump_run(void); -int check_open_handle(unsigned int s_dev, unsigned long i_ino, - struct _FhEntry *f_handle); +int check_open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle); int irmap_load_cache(void); int irmap_scan_path_add(char *path); #endif diff --git a/criu/include/kcmp-ids.h b/criu/include/kcmp-ids.h index a37622c50..b6cdbb262 100644 --- a/criu/include/kcmp-ids.h +++ b/criu/include/kcmp-ids.h @@ -7,30 +7,26 @@ #include "kcmp.h" struct kid_tree { - struct 
rb_root root; - unsigned int kcmp_type; - unsigned long subid; - + struct rb_root root; + unsigned int kcmp_type; + unsigned long subid; }; -#define DECLARE_KCMP_TREE(name, type) \ - struct kid_tree name = { \ - .root = RB_ROOT, \ - .kcmp_type = type, \ - .subid = 1, \ +#define DECLARE_KCMP_TREE(name, type) \ + struct kid_tree name = { \ + .root = RB_ROOT, \ + .kcmp_type = type, \ + .subid = 1, \ } struct kid_elem { - pid_t pid; - unsigned int genid; - unsigned int idx; + pid_t pid; + unsigned int genid; + unsigned int idx; }; -extern uint32_t kid_generate_gen(struct kid_tree *tree, - struct kid_elem *elem, int *new_id); +extern uint32_t kid_generate_gen(struct kid_tree *tree, struct kid_elem *elem, int *new_id); -extern struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, - struct kid_elem *elem, - kcmp_epoll_slot_t *slot); +extern struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, struct kid_elem *elem, kcmp_epoll_slot_t *slot); #endif /* __CR_KCMP_IDS_H__ */ diff --git a/criu/include/kcmp.h b/criu/include/kcmp.h index f1c898d71..575135f80 100644 --- a/criu/include/kcmp.h +++ b/criu/include/kcmp.h @@ -18,9 +18,9 @@ enum kcmp_type { /* Slot for KCMP_EPOLL_TFD */ typedef struct { - uint32_t efd; /* epoll file descriptor */ - uint32_t tfd; /* target file number */ - uint32_t toff; /* target offset within same numbered sequence */ + uint32_t efd; /* epoll file descriptor */ + uint32_t tfd; /* target file number */ + uint32_t toff; /* target offset within same numbered sequence */ } kcmp_epoll_slot_t; #endif /* __CR_KCMP_H__ */ diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index d93e07813..e4922f401 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -6,6 +6,8 @@ #include "common/config.h" #include "asm/kerndat.h" #include "util-vdso.h" +#include "hugetlb.h" +#include struct stat; @@ -18,8 +20,8 @@ extern int kerndat_init(void); enum pagemap_func { PM_UNKNOWN, - PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) 
*/ - PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */ + PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */ + PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */ PM_FULL, }; @@ -36,6 +38,7 @@ struct kerndat_s { u64 zero_page_pfn; bool has_dirty_track; bool has_memfd; + bool has_memfd_hugetlb; bool has_fdinfo_lock; unsigned long task_size; bool ipv6; @@ -50,14 +53,15 @@ struct kerndat_s { bool has_tcp_half_closed; bool stack_guard_gap_hidden; int lsm; + bool apparmor_ns_dumping_enabled; bool has_uffd; unsigned long uffd_features; bool has_thp_disable; bool can_map_vdso; bool vdso_hint_reliable; - struct vdso_symtable vdso_sym; + struct vdso_symtable vdso_sym; #ifdef CONFIG_COMPAT - struct vdso_symtable vdso_sym_compat; + struct vdso_symtable vdso_sym_compat; #endif bool has_nsid; bool has_link_nsid; @@ -65,6 +69,30 @@ struct kerndat_s { bool x86_has_ptrace_fpu_xsave_bug; bool has_inotify_setnextwd; bool has_kcmp_epoll_tfd; + bool has_fsopen; + bool has_clone3_set_tid; + bool has_timens; + bool has_newifindex; + bool has_pidfd_open; + bool has_pidfd_getfd; + bool has_nspid; + bool has_nftables_concat; + bool has_sockopt_buf_lock; + dev_t hugetlb_dev[HUGETLB_MAX]; + bool has_move_mount_set_group; + bool has_openat2; + bool has_rseq; + bool has_ptrace_get_rseq_conf; + struct __ptrace_rseq_configuration libc_rseq_conf; + bool has_ipv6_freebind; + bool has_membarrier_get_registrations; + bool has_pagemap_scan; + bool has_shstk; + bool has_close_range; + bool has_timer_cr_ids; + bool has_breakpoints; + bool has_madv_guard; + bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -85,4 +113,8 @@ enum { */ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); +extern int kerndat_has_nspid(void); + +extern void kerndat_warn_about_madv_guards(void); + #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/libnetlink.h b/criu/include/libnetlink.h index f21a0e750..2cdb543c0 100644 --- a/criu/include/libnetlink.h +++ 
b/criu/include/libnetlink.h @@ -1,24 +1,21 @@ #ifndef __CR_LIBNETLINK_H__ #define __CR_LIBNETLINK_H__ -#define CR_NLMSG_SEQ 24680 /* arbitrary chosen */ +#define CR_NLMSG_SEQ 24680 /* arbitrary chosen */ struct ns_id; extern int do_rtnl_req(int nl, void *req, int size, - int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), - int (*error_callback)(int err, struct ns_id *ns, void *), struct ns_id *ns, void *); + int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), + int (*error_callback)(int err, struct ns_id *ns, void *), struct ns_id *ns, void *); -extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, - const void *data, int alen); +extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, int alen); extern int32_t nla_get_s32(const struct nlattr *nla); -#define NLMSG_TAIL(nmsg) \ - ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) +#define NLMSG_TAIL(nmsg) ((struct rtattr *)(((void *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) #ifndef NETNS_RTA -#define NETNS_RTA(r) \ - ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg)))) +#define NETNS_RTA(r) ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg)))) #endif #endif /* __CR_LIBNETLINK_H__ */ diff --git a/criu/include/linux/aio_abi.h b/criu/include/linux/aio_abi.h new file mode 100644 index 000000000..d9ce78720 --- /dev/null +++ b/criu/include/linux/aio_abi.h @@ -0,0 +1,14 @@ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +typedef __kernel_ulong_t aio_context_t; + +/* read() from /dev/aio returns these structures. 
*/ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#endif /* __LINUX__AIO_ABI_H */ diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h new file mode 100644 index 000000000..fefafa89e --- /dev/null +++ b/criu/include/linux/mount.h @@ -0,0 +1,43 @@ +#ifndef _CRIU_LINUX_MOUNT_H +#define _CRIU_LINUX_MOUNT_H + +#include "common/config.h" +#include "compel/plugins/std/syscall-codes.h" + +/* Copied from /usr/include/sys/mount.h */ + +#ifndef FSOPEN_CLOEXEC +/* The type of fsconfig call made. */ +enum fsconfig_command { + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ +#define FSCONFIG_SET_FLAG FSCONFIG_SET_FLAG + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ +#define FSCONFIG_SET_STRING FSCONFIG_SET_STRING + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ +#define FSCONFIG_SET_BINARY FSCONFIG_SET_BINARY + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ +#define FSCONFIG_SET_PATH FSCONFIG_SET_PATH + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ +#define FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_PATH_EMPTY + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ +#define FSCONFIG_SET_FD FSCONFIG_SET_FD + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ +#define FSCONFIG_CMD_CREATE FSCONFIG_CMD_CREATE + FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +#define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE +}; + +#endif // FSOPEN_CLOEXEC + +/* fsopen flags. With the redundant definition, we check if the kernel, + * glibc value and our value still match. + */ +#define FSOPEN_CLOEXEC 0x00000001 + +#ifndef MS_MGC_VAL +/* Magic mount flag number. Has to be or-ed to the flag values. 
*/ +#define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */ +#define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */ +#endif + +#endif diff --git a/criu/include/linux/openat2.h b/criu/include/linux/openat2.h new file mode 100644 index 000000000..1e9ccff05 --- /dev/null +++ b/criu/include/linux/openat2.h @@ -0,0 +1,18 @@ +#ifndef _CRIU_LINUX_OPENAT2_H +#define _CRIU_LINUX_OPENAT2_H + +#include + +#include "common/config.h" + +#ifdef CONFIG_HAS_OPENAT2 +#include +#else +struct open_how { + __u64 flags; + __u64 mode; + __u64 resolve; +}; +#endif + +#endif diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h new file mode 100644 index 000000000..5ceefbf8e --- /dev/null +++ b/criu/include/linux/rseq.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_RSEQ_H +#define _UAPI_LINUX_RSEQ_H + +#ifdef __has_include +#if __has_include("sys/rseq.h") +#include +#include "asm/thread_pointer.h" +#endif +#endif + +#include +#include + +#include "common/config.h" + +#ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS +/* + * linux/rseq.h + * + * Restartable sequences system call API + * + * Copyright (c) 2015-2018 Mathieu Desnoyers + */ + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +enum rseq_flags { + RSEQ_FLAG_UNREGISTER = (1 << 0), +}; + +enum rseq_cs_flags_bit { + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, +}; + +enum rseq_cs_flags { + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), +}; +#endif /* CONFIG_HAS_NO_LIBC_RSEQ_DEFS */ + +/* + * Let's use our own definition of struct rseq_cs because some distros + * (for example Mariner GNU/Linux) 
declare this structure in their own way. + * This causes inconsistencies between printf format specifiers and + * the struct rseq_cs field types. + */ +/* + * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always + * contained within a single cache-line. It is usually declared as + * link-time constant data. + */ +struct criu_rseq_cs { + /* Version of this structure. */ + __u32 version; + /* enum rseq_cs_flags */ + __u32 flags; + __u64 start_ip; + /* Offset from start_ip. */ + __u64 post_commit_offset; + __u64 abort_ip; +} __attribute__((aligned(4 * sizeof(__u64)))); + +/* + * We need our own copy of the struct rseq definition because + * of a breaking UAPI change: + * https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=bfdf4e6208051ed7165b2e92035b4bf11f43eb63 + */ +/* + * struct rseq is aligned on 4 * 8 bytes to ensure it is always + * contained within a single cache-line. + * + * A single struct rseq per thread is allowed. + */ +struct criu_rseq { + /* + * Restartable sequences cpu_id_start field. Updated by the + * kernel. Read by user-space with single-copy atomicity + * semantics. This field should only be read by the thread which + * registered this data structure. Aligned on 32-bit. Always + * contains a value in the range of possible CPUs, although the + * value may not be the actual current CPU (e.g. if rseq is not + * initialized). This CPU number value should always be compared + * against the value of the cpu_id field before performing a rseq + * commit or returning a value read from a data structure indexed + * using the cpu_id_start value. + */ + __u32 cpu_id_start; + /* + * Restartable sequences cpu_id field. Updated by the kernel. + * Read by user-space with single-copy atomicity semantics. This + * field should only be read by the thread which registered this + * data structure. Aligned on 32-bit.
Values + * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED + * have a special semantic: the former means "rseq uninitialized", + * and latter means "rseq initialization failed". This value is + * meant to be read within rseq critical sections and compared + * with the cpu_id_start value previously read, before performing + * the commit instruction, or read and compared with the + * cpu_id_start value before returning a value loaded from a data + * structure indexed using the cpu_id_start value. + */ + __u32 cpu_id; + /* + * Restartable sequences rseq_cs field. + * + * Contains NULL when no critical section is active for the current + * thread, or holds a pointer to the currently active struct rseq_cs. + * + * Updated by user-space, which sets the address of the currently + * active rseq_cs at the beginning of assembly instruction sequence + * block, and set to NULL by the kernel when it restarts an assembly + * instruction sequence block, as well as when the kernel detects that + * it is preempting or delivering a signal outside of the range + * targeted by the rseq_cs. Also needs to be set to NULL by user-space + * before reclaiming memory that contains the targeted struct rseq_cs. + * + * Read and set by the kernel. Set by user-space with single-copy + * atomicity semantics. This field should only be updated by the + * thread which registered this data structure. Aligned on 64-bit. + * + * 32-bit architectures should update the low order bits of the + * rseq_cs field, leaving the high order bits initialized to 0. + */ + __u64 rseq_cs; + + /* + * Restartable sequences flags field. + * + * This field should only be updated by the thread which + * registered this data structure. Read by the kernel. + * Mainly used for single-stepping through rseq critical sections + * with debuggers. + * + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT + * Inhibit instruction sequence block restart on preemption + * for this thread. 
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL + * Inhibit instruction sequence block restart on signal + * delivery for this thread. + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE + * Inhibit instruction sequence block restart on migration for + * this thread. + */ + __u32 flags; +} __attribute__((aligned(4 * sizeof(__u64)))); + +#endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/criu/include/linux/userfaultfd.h b/criu/include/linux/userfaultfd.h index 3b059530d..cfcf48571 100644 --- a/criu/include/linux/userfaultfd.h +++ b/criu/include/linux/userfaultfd.h @@ -18,23 +18,12 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_REMOVE | \ - UFFD_FEATURE_EVENT_UNMAP | \ - UFFD_FEATURE_MISSING_HUGETLBFS | \ - UFFD_FEATURE_MISSING_SHMEM) -#define UFFD_API_IOCTLS \ - ((__u64)1 << _UFFDIO_REGISTER | \ - (__u64)1 << _UFFDIO_UNREGISTER | \ - (__u64)1 << _UFFDIO_API) -#define UFFD_API_RANGE_IOCTLS \ - ((__u64)1 << _UFFDIO_WAKE | \ - (__u64)1 << _UFFDIO_COPY | \ - (__u64)1 << _UFFDIO_ZEROPAGE) -#define UFFD_API_RANGE_IOCTLS_BASIC \ - ((__u64)1 << _UFFDIO_WAKE | \ - (__u64)1 << _UFFDIO_COPY) +#define UFFD_API_FEATURES \ + (UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE | UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | UFFD_FEATURE_MISSING_SHMEM) +#define UFFD_API_IOCTLS ((__u64)1 << _UFFDIO_REGISTER | (__u64)1 << _UFFDIO_UNREGISTER | (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY | (__u64)1 << _UFFDIO_ZEROPAGE) +#define UFFD_API_RANGE_IOCTLS_BASIC ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to @@ -44,62 +33,56 @@ * which ioctl the running kernel implements through the ioctl command * bitmask written by the UFFDIO_API. 
*/ -#define _UFFDIO_REGISTER (0x00) -#define _UFFDIO_UNREGISTER (0x01) -#define _UFFDIO_WAKE (0x02) -#define _UFFDIO_COPY (0x03) -#define _UFFDIO_ZEROPAGE (0x04) -#define _UFFDIO_API (0x3F) +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ -#define UFFDIO 0xAA -#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ - struct uffdio_api) -#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ - struct uffdio_register) -#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ - struct uffdio_range) -#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ - struct uffdio_range) -#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ - struct uffdio_copy) -#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ - struct uffdio_zeropage) +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, struct uffdio_zeropage) /* read() structure */ struct uffd_msg { - __u8 event; + __u8 event; - __u8 reserved1; - __u16 reserved2; - __u32 reserved3; + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; union { struct { - __u64 flags; - __u64 address; + __u64 flags; + __u64 address; } pagefault; struct { - __u32 ufd; + __u32 ufd; } fork; struct { - __u64 from; - __u64 to; - __u64 len; + __u64 from; + __u64 to; + __u64 len; } remap; struct { - __u64 start; - __u64 end; + __u64 start; + __u64 end; } remove; struct { /* unused reserved fields */ - __u64 reserved1; - __u64 reserved2; - __u64 reserved3; + __u64 reserved1; + __u64 reserved2; + __u64 
reserved3; } reserved; } arg; } __packed; @@ -107,15 +90,15 @@ struct uffd_msg { /* * Start at 0x12 and not at 0 to be more strict against bugs. */ -#define UFFD_EVENT_PAGEFAULT 0x12 -#define UFFD_EVENT_FORK 0x13 -#define UFFD_EVENT_REMAP 0x14 -#define UFFD_EVENT_REMOVE 0x15 -#define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ -#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ -#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1 << 0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1 << 1) /* If reason is VM_UFFD_WP */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -154,13 +137,13 @@ struct uffdio_api { * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem * (i.e. tmpfs and other shmem based APIs). 
*/ -#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) -#define UFFD_FEATURE_EVENT_FORK (1<<1) -#define UFFD_FEATURE_EVENT_REMAP (1<<2) -#define UFFD_FEATURE_EVENT_REMOVE (1<<3) -#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) -#define UFFD_FEATURE_MISSING_SHMEM (1<<5) -#define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1 << 0) +#define UFFD_FEATURE_EVENT_FORK (1 << 1) +#define UFFD_FEATURE_EVENT_REMAP (1 << 2) +#define UFFD_FEATURE_EVENT_REMOVE (1 << 3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1 << 4) +#define UFFD_FEATURE_MISSING_SHMEM (1 << 5) +#define UFFD_FEATURE_EVENT_UNMAP (1 << 6) __u64 features; __u64 ioctls; @@ -173,8 +156,8 @@ struct uffdio_range { struct uffdio_register { struct uffdio_range range; -#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) -#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1 << 0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1 << 1) __u64 mode; /* @@ -194,7 +177,7 @@ struct uffdio_copy { * available if the wrprotection ioctl are implemented for the * range according to the uffdio_register.ioctls. 
*/ -#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1 << 0) __u64 mode; /* @@ -206,7 +189,7 @@ struct uffdio_copy { struct uffdio_zeropage { struct uffdio_range range; -#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1 << 0) __u64 mode; /* diff --git a/criu/include/log.h b/criu/include/log.h index 15787b09f..cbed33007 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -9,72 +9,58 @@ #include #include -extern void vprint_on_level(unsigned int loglevel, const char *format, - va_list params); - #endif /* CR_NOGLIBC */ -#define LOG_UNSET (-1) -#define LOG_MSG (0) /* Print message regardless of log level */ -#define LOG_ERROR (1) /* Errors only, when we're in trouble */ -#define LOG_WARN (2) /* Warnings, dazen and confused but trying to continue */ -#define LOG_INFO (3) /* Informative, everything is fine */ -#define LOG_DEBUG (4) /* Debug only */ +#define LOG_UNSET (-1) +#define LOG_MSG (0) /* Print message regardless of log level */ +#define LOG_ERROR (1) /* Errors only, when we're in trouble */ +#define LOG_WARN (2) /* Warnings, dazed and confused but trying to continue */ +#define LOG_INFO (3) /* Informative, everything is fine */ +#define LOG_DEBUG (4) /* Debug only */ -#define DEFAULT_LOGLEVEL LOG_WARN +#define DEFAULT_LOGLEVEL LOG_WARN +/* + * This is a low-level printing helper; try hard not to use it directly + * and use the pr_foo() helpers below instead. + */ extern void print_on_level(unsigned int loglevel, const char *format, ...) - __attribute__ ((__format__ (__printf__, 2, 3))); + __attribute__((__format__(__printf__, 2, 3))); #ifndef LOG_PREFIX -# define LOG_PREFIX +#define LOG_PREFIX #endif void flush_early_log_buffer(int fd); -#define print_once(loglevel, fmt, ...) \ - do { \ - static bool __printed; \ - if (!__printed) { \ - print_on_level(loglevel, fmt, ##__VA_ARGS__); \ - __printed = 1; \ - } \ +#define print_once(loglevel, fmt, ...)
\ + do { \ + static bool __printed; \ + if (!__printed) { \ + print_on_level(loglevel, fmt, ##__VA_ARGS__); \ + __printed = 1; \ + } \ } while (0) -#define pr_msg(fmt, ...) \ - print_on_level(LOG_MSG, \ - fmt, ##__VA_ARGS__) +#define pr_msg(fmt, ...) print_on_level(LOG_MSG, fmt, ##__VA_ARGS__) -#define pr_info(fmt, ...) \ - print_on_level(LOG_INFO, \ - LOG_PREFIX fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) -#define pr_err(fmt, ...) \ - print_on_level(LOG_ERROR, \ - "Error (%s:%d): " LOG_PREFIX fmt, \ - __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_err(fmt, ...) print_on_level(LOG_ERROR, "Error (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) -#define pr_err_once(fmt, ...) \ - print_once(LOG_ERROR, fmt, ##__VA_ARGS__) +#define pr_err_once(fmt, ...) print_once(LOG_ERROR, fmt, ##__VA_ARGS__) -#define pr_warn(fmt, ...) \ - print_on_level(LOG_WARN, \ - "Warn (%s:%d): " LOG_PREFIX fmt, \ - __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_warn(fmt, ...) print_on_level(LOG_WARN, "Warn (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) -#define pr_warn_once(fmt, ...) \ - print_once(LOG_WARN, \ - "Warn (%s:%d): " LOG_PREFIX fmt, \ - __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_warn_once(fmt, ...) print_once(LOG_WARN, "Warn (%s:%d): " LOG_PREFIX fmt, __FILE__, __LINE__, ##__VA_ARGS__) -#define pr_debug(fmt, ...) \ - print_on_level(LOG_DEBUG, \ - LOG_PREFIX fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) print_on_level(LOG_DEBUG, LOG_PREFIX fmt, ##__VA_ARGS__) #ifndef CR_NOGLIBC -#define pr_perror(fmt, ...) \ - pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) + +#define pr_pwarn(fmt, ...) 
pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) #endif /* CR_NOGLIBC */ diff --git a/criu/include/lsm.h b/criu/include/lsm.h index 3b8271282..b4891b4b3 100644 --- a/criu/include/lsm.h +++ b/criu/include/lsm.h @@ -17,10 +17,8 @@ extern Lsmtype host_lsm_type(void); */ extern void kerndat_lsm(void); -/* - * Read the LSM profile for the pstree item - */ -extern int collect_lsm_profile(pid_t, CredsEntry *); +int collect_and_suspend_lsm(void); +int unsuspend_lsm(void); /* * Validate that the LSM profiles can be correctly applied (must happen after @@ -30,7 +28,8 @@ int validate_lsm(char *profile); /* * Render the profile name in the way that the LSM wants it written to - * /proc//attr/current. + * /proc//attr/current, according to whatever is in the images and + * specified by --lsm-profile. */ int render_lsm_profile(char *profile, char **val); @@ -39,15 +38,18 @@ extern int lsm_check_opts(void); #ifdef CONFIG_HAS_SELINUX int dump_xattr_security_selinux(int fd, FdinfoEntry *e); int run_setsockcreatecon(FdinfoEntry *e); -int reset_setsockcreatecon(); +int reset_setsockcreatecon(void); #else -static inline int dump_xattr_security_selinux(int fd, FdinfoEntry *e) { +static inline int dump_xattr_security_selinux(int fd, FdinfoEntry *e) +{ return 0; } -static inline int run_setsockcreatecon(FdinfoEntry *e) { +static inline int run_setsockcreatecon(FdinfoEntry *e) +{ return 0; } -static inline int reset_setsockcreatecon() { +static inline int reset_setsockcreatecon(void) +{ return 0; } #endif diff --git a/criu/include/magic.h b/criu/include/magic.h index 05101f436..6f0aff26d 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -5,121 +5,129 @@ * Basic multi-file images */ -#define CRTOOLS_IMAGES_V1 1 +#define CRTOOLS_IMAGES_V1 1 /* * v1.1 has common magic in the head of each image file, * except for inventory */ -#define CRTOOLS_IMAGES_V1_1 2 +#define CRTOOLS_IMAGES_V1_1 2 /* * Raw images are images in which data is stored in some * non-crtool format (ip tool 
dumps, tarballs, etc.) */ -#define RAW_IMAGE_MAGIC 0x0 +#define RAW_IMAGE_MAGIC 0x0 /* * Images have the IMG_COMMON_MAGIC in the head. Service files * such as stats and irmap-cache have the IMG_SERVICE_MAGIC. */ -#define IMG_COMMON_MAGIC 0x54564319 /* Sarov (a.k.a. Arzamas-16) */ -#define IMG_SERVICE_MAGIC 0x55105940 /* Zlatoust */ +#define IMG_COMMON_MAGIC 0x54564319 /* Sarov (a.k.a. Arzamas-16) */ +#define IMG_SERVICE_MAGIC 0x55105940 /* Zlatoust */ /* * The magic-s below correspond to coordinates - * of various Russian towns in the NNNNEEEE form. + * of various towns in the NNNNEEEE form. */ -#define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ -#define PSTREE_MAGIC 0x50273030 /* Kyiv */ -#define FDINFO_MAGIC 0x56213732 /* Dmitrov */ -#define PAGEMAP_MAGIC 0x56084025 /* Vladimir */ -#define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC -#define PAGES_MAGIC RAW_IMAGE_MAGIC -#define CORE_MAGIC 0x55053847 /* Kolomna */ -#define IDS_MAGIC 0x54432030 /* Konigsberg */ -#define VMAS_MAGIC 0x54123737 /* Tula */ -#define PIPES_MAGIC 0x56513555 /* Tver */ -#define PIPES_DATA_MAGIC 0x56453709 /* Dubna */ -#define FIFO_MAGIC 0x58364939 /* Kirov */ -#define FIFO_DATA_MAGIC 0x59333054 /* Tosno */ -#define SIGACT_MAGIC 0x55344201 /* Murom */ -#define UNIXSK_MAGIC 0x54373943 /* Ryazan */ -#define INETSK_MAGIC 0x56443851 /* Pereslavl */ -#define PACKETSK_MAGIC 0x60454618 /* Veliky Ustyug */ -#define ITIMERS_MAGIC 0x57464056 /* Kostroma */ -#define POSIX_TIMERS_MAGIC 0x52603957 /* Lipetsk */ -#define SK_QUEUES_MAGIC 0x56264026 /* Suzdal */ -#define UTSNS_MAGIC 0x54473203 /* Smolensk */ -#define CREDS_MAGIC 0x54023547 /* Kozelsk */ -#define IPC_VAR_MAGIC 0x53115007 /* Samara */ -#define IPCNS_SHM_MAGIC 0x46283044 /* Odessa */ -#define IPCNS_MSG_MAGIC 0x55453737 /* Moscow */ -#define IPCNS_SEM_MAGIC 0x59573019 /* St. 
Petersburg */ -#define REG_FILES_MAGIC 0x50363636 /* Belgorod */ -#define EXT_FILES_MAGIC 0x59255641 /* Usolye */ -#define FS_MAGIC 0x51403912 /* Voronezh */ -#define MM_MAGIC 0x57492820 /* Pskov */ -#define REMAP_FPATH_MAGIC 0x59133954 /* Vologda */ -#define GHOST_FILE_MAGIC 0x52583605 /* Oryol */ -#define TCP_STREAM_MAGIC 0x51465506 /* Orenburg */ -#define EVENTFD_FILE_MAGIC 0x44523722 /* Anapa */ -#define EVENTPOLL_FILE_MAGIC 0x45023858 /* Krasnodar */ -#define EVENTPOLL_TFD_MAGIC 0x44433746 /* Novorossiysk */ -#define SIGNALFD_MAGIC 0x57323820 /* Uglich */ -#define INOTIFY_FILE_MAGIC 0x48424431 /* Volgograd */ -#define INOTIFY_WD_MAGIC 0x54562009 /* Svetlogorsk (Rauschen) */ -#define MNTS_MAGIC 0x55563928 /* Petushki */ -#define NETDEV_MAGIC 0x57373951 /* Yaroslavl */ -#define NETNS_MAGIC 0x55933752 /* Dolgoprudny */ -#define TTY_FILES_MAGIC 0x59433025 /* Pushkin */ -#define TTY_INFO_MAGIC 0x59453036 /* Kolpino */ -#define TTY_DATA_MAGIC 0x59413026 /* Pavlovsk */ -#define FILE_LOCKS_MAGIC 0x54323616 /* Kaluga */ -#define RLIMIT_MAGIC 0x57113925 /* Rostov */ -#define FANOTIFY_FILE_MAGIC 0x55096122 /* Chelyabinsk */ -#define FANOTIFY_MARK_MAGIC 0x56506035 /* Yekaterinburg */ -#define SIGNAL_MAGIC 0x59255647 /* Berezniki */ -#define PSIGNAL_MAGIC SIGNAL_MAGIC -#define NETLINK_SK_MAGIC 0x58005614 /* Perm */ -#define NS_FILES_MAGIC 0x61394011 /* Nyandoma */ -#define TUNFILE_MAGIC 0x57143751 /* Kalyazin */ -#define CGROUP_MAGIC 0x59383330 /* Tikhvin */ -#define TIMERFD_MAGIC 0x50493712 /* Korocha */ -#define CPUINFO_MAGIC 0x61404013 /* Nyandoma */ -#define USERNS_MAGIC 0x55474906 /* Kazan */ -#define SECCOMP_MAGIC 0x64413049 /* Kostomuksha */ -#define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */ -#define AUTOFS_MAGIC 0x49353943 /* Sochi */ -#define FILES_MAGIC 0x56303138 /* Toropets */ +#define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ +#define PSTREE_MAGIC 0x50273030 /* Kyiv */ +#define FDINFO_MAGIC 0x56213732 /* Dmitrov */ +#define PAGEMAP_MAGIC 0x56084025 /* 
Vladimir */ +#define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC +#define PAGES_MAGIC RAW_IMAGE_MAGIC +#define CORE_MAGIC 0x55053847 /* Kolomna */ +#define IDS_MAGIC 0x54432030 /* Konigsberg */ +#define VMAS_MAGIC 0x54123737 /* Tula */ +#define PIPES_MAGIC 0x56513555 /* Tver */ +#define PIPES_DATA_MAGIC 0x56453709 /* Dubna */ +#define FIFO_MAGIC 0x58364939 /* Kirov */ +#define FIFO_DATA_MAGIC 0x59333054 /* Tosno */ +#define SIGACT_MAGIC 0x55344201 /* Murom */ +#define UNIXSK_MAGIC 0x54373943 /* Ryazan */ +#define INETSK_MAGIC 0x56443851 /* Pereslavl */ +#define PACKETSK_MAGIC 0x60454618 /* Veliky Ustyug */ +#define ITIMERS_MAGIC 0x57464056 /* Kostroma */ +#define POSIX_TIMERS_MAGIC 0x52603957 /* Lipetsk */ +#define SK_QUEUES_MAGIC 0x56264026 /* Suzdal */ +#define UTSNS_MAGIC 0x54473203 /* Smolensk */ +#define CREDS_MAGIC 0x54023547 /* Kozelsk */ +#define IPC_VAR_MAGIC 0x53115007 /* Samara */ +#define IPCNS_SHM_MAGIC 0x46283044 /* Odessa */ +#define IPCNS_MSG_MAGIC 0x55453737 /* Moscow */ +#define IPCNS_SEM_MAGIC 0x59573019 /* St. 
Petersburg */ +#define REG_FILES_MAGIC 0x50363636 /* Belgorod */ +#define EXT_FILES_MAGIC 0x59255641 /* Usolye */ +#define FS_MAGIC 0x51403912 /* Voronezh */ +#define MM_MAGIC 0x57492820 /* Pskov */ +#define REMAP_FPATH_MAGIC 0x59133954 /* Vologda */ +#define GHOST_FILE_MAGIC 0x52583605 /* Oryol */ +#define TCP_STREAM_MAGIC 0x51465506 /* Orenburg */ +#define EVENTFD_FILE_MAGIC 0x44523722 /* Anapa */ +#define EVENTPOLL_FILE_MAGIC 0x45023858 /* Krasnodar */ +#define EVENTPOLL_TFD_MAGIC 0x44433746 /* Novorossiysk */ +#define SIGNALFD_MAGIC 0x57323820 /* Uglich */ +#define INOTIFY_FILE_MAGIC 0x48424431 /* Volgograd */ +#define INOTIFY_WD_MAGIC 0x54562009 /* Svetlogorsk (Rauschen) */ +#define MNTS_MAGIC 0x55563928 /* Petushki */ +#define NETDEV_MAGIC 0x57373951 /* Yaroslavl */ +#define NETNS_MAGIC 0x55933752 /* Dolgoprudny */ +#define TTY_FILES_MAGIC 0x59433025 /* Pushkin */ +#define TTY_INFO_MAGIC 0x59453036 /* Kolpino */ +#define TTY_DATA_MAGIC 0x59413026 /* Pavlovsk */ +#define FILE_LOCKS_MAGIC 0x54323616 /* Kaluga */ +#define RLIMIT_MAGIC 0x57113925 /* Rostov */ +#define FANOTIFY_FILE_MAGIC 0x55096122 /* Chelyabinsk */ +#define FANOTIFY_MARK_MAGIC 0x56506035 /* Yekaterinburg */ +#define SIGNAL_MAGIC 0x59255647 /* Berezniki */ +#define PSIGNAL_MAGIC SIGNAL_MAGIC +#define NETLINK_SK_MAGIC 0x58005614 /* Perm */ +#define NS_FILES_MAGIC 0x61394011 /* Nyandoma */ +#define TUNFILE_MAGIC 0x57143751 /* Kalyazin */ +#define CGROUP_MAGIC 0x59383330 /* Tikhvin */ +#define TIMERFD_MAGIC 0x50493712 /* Korocha */ +#define CPUINFO_MAGIC 0x61404013 /* Nyandoma */ +#define USERNS_MAGIC 0x55474906 /* Kazan */ +#define SECCOMP_MAGIC 0x64413049 /* Kostomuksha */ +#define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */ +#define AUTOFS_MAGIC 0x49353943 /* Sochi */ +#define FILES_MAGIC 0x56303138 /* Toropets */ +#define MEMFD_INODE_MAGIC 0x48453499 /* Dnipro */ +#define TIMENS_MAGIC 0x43114433 /* Beslan */ +#define PIDNS_MAGIC 0x61157326 /* Surgut */ +#define BPFMAP_FILE_MAGIC 0x57506142 /* 
Alapayevsk */ +#define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ +#define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ -#define IFADDR_MAGIC RAW_IMAGE_MAGIC -#define ROUTE_MAGIC RAW_IMAGE_MAGIC -#define ROUTE6_MAGIC RAW_IMAGE_MAGIC -#define RULE_MAGIC RAW_IMAGE_MAGIC -#define TMPFS_IMG_MAGIC RAW_IMAGE_MAGIC -#define TMPFS_DEV_MAGIC RAW_IMAGE_MAGIC -#define IPTABLES_MAGIC RAW_IMAGE_MAGIC -#define IP6TABLES_MAGIC RAW_IMAGE_MAGIC -#define NETNF_CT_MAGIC RAW_IMAGE_MAGIC -#define NETNF_EXP_MAGIC RAW_IMAGE_MAGIC +#define IFADDR_MAGIC RAW_IMAGE_MAGIC +#define ROUTE_MAGIC RAW_IMAGE_MAGIC +#define ROUTE6_MAGIC RAW_IMAGE_MAGIC +#define RULE_MAGIC RAW_IMAGE_MAGIC +#define TMPFS_IMG_MAGIC RAW_IMAGE_MAGIC +#define TMPFS_DEV_MAGIC RAW_IMAGE_MAGIC +#define IPTABLES_MAGIC RAW_IMAGE_MAGIC +#define IP6TABLES_MAGIC RAW_IMAGE_MAGIC +#define NFTABLES_MAGIC RAW_IMAGE_MAGIC +#define NETNF_CT_MAGIC RAW_IMAGE_MAGIC +#define NETNF_EXP_MAGIC RAW_IMAGE_MAGIC -#define PAGES_OLD_MAGIC PAGEMAP_MAGIC -#define SHM_PAGES_OLD_MAGIC PAGEMAP_MAGIC -#define BINFMT_MISC_OLD_MAGIC BINFMT_MISC_MAGIC +#define PAGES_OLD_MAGIC PAGEMAP_MAGIC +#define SHM_PAGES_OLD_MAGIC PAGEMAP_MAGIC +#define BINFMT_MISC_OLD_MAGIC BINFMT_MISC_MAGIC /* * These are special files, not exactly images */ -#define STATS_MAGIC 0x57093306 /* Ostashkov */ -#define IRMAP_CACHE_MAGIC 0x57004059 /* Ivanovo */ +#define STATS_MAGIC 0x57093306 /* Ostashkov */ +#define IRMAP_CACHE_MAGIC 0x57004059 /* Ivanovo */ /* * Main magic for kerndat_s structure. 
*/ -#define KDAT_MAGIC 0x57023458 /* Torzhok */ +#define KDAT_MAGIC 0x57023458 /* Torzhok */ #endif /* __CR_MAGIC_H__ */ diff --git a/criu/include/mem.h b/criu/include/mem.h index 251cb1a9e..e9ce3518a 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -7,6 +7,7 @@ #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" +#include "pagemap-cache.h" struct parasite_ctl; struct vm_area_list; @@ -15,10 +16,10 @@ struct pstree_item; struct vma_area; struct mem_dump_ctl { - bool pre_dump; - bool lazy; - struct proc_pid_stat *stat; - InventoryEntry *parent_ie; + bool pre_dump; + bool lazy; + struct proc_pid_stat *stat; + InventoryEntry *parent_ie; }; extern bool vma_has_guard_gap_hidden(struct vma_area *vma); @@ -28,26 +29,33 @@ extern int prepare_mm_pid(struct pstree_item *i); extern void prepare_cow_vmas(void); extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); -extern int parasite_dump_pages_seized(struct pstree_item *item, - struct vm_area_list *vma_area_list, - struct mem_dump_ctl *mdc, - struct parasite_ctl *ctl); +extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, + struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); +extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); -#define PME_PRESENT (1ULL << 63) -#define PME_SWAP (1ULL << 62) -#define PME_FILE (1ULL << 61) -#define PME_SOFT_DIRTY (1ULL << 55) -#define PME_PSHIFT_BITS (6) -#define PME_STATUS_BITS (3) -#define PME_STATUS_OFFSET (64 - PME_STATUS_BITS) -#define PME_PSHIFT_OFFSET (PME_STATUS_OFFSET - PME_PSHIFT_BITS) -#define PME_PFRAME_MASK ((1ULL << PME_PSHIFT_OFFSET) - 1) -#define PME_PFRAME(x) ((x) & PME_PFRAME_MASK) +#define PME_PRESENT (1ULL << 63) +#define PME_SWAP (1ULL << 62) +#define PME_FILE (1ULL << 61) +#define PME_GUARD_REGION (1ULL << 58) +#define PME_SOFT_DIRTY (1ULL << 55) +#define PME_PSHIFT_BITS (6) +#define PME_STATUS_BITS (3) 
+#define PME_STATUS_OFFSET (64 - PME_STATUS_BITS) +#define PME_PSHIFT_OFFSET (PME_STATUS_OFFSET - PME_PSHIFT_BITS) +#define PME_PFRAME_MASK ((1ULL << PME_PSHIFT_OFFSET) - 1) +#define PME_PFRAME(x) ((x)&PME_PFRAME_MASK) struct task_restore_args; int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -bool should_dump_page(VmaEntry *vmae, u64 pme); + +struct page_info { + u64 next; + bool softdirty; +}; + +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); + #endif /* __CR_MEM_H__ */ diff --git a/criu/include/memfd.h b/criu/include/memfd.h new file mode 100644 index 000000000..78d810019 --- /dev/null +++ b/criu/include/memfd.h @@ -0,0 +1,35 @@ +#ifndef __CR_MEMFD_H__ +#define __CR_MEMFD_H__ + +#include +#include + +#include "int.h" +#include "common/config.h" + +struct fd_parms; +struct file_desc; + +extern int is_memfd(dev_t dev); +extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); +extern const struct fdtype_ops memfd_dump_ops; + +extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); +extern struct collect_image_info memfd_cinfo; +extern struct file_desc *collect_memfd(u32 id); +extern int apply_memfd_seals(void); + +extern int prepare_memfd_inodes(void); + +#ifdef CONFIG_HAS_MEMFD_CREATE +#include +#else +#include +#include +static inline int memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} +#endif /* CONFIG_HAS_MEMFD_CREATE */ + +#endif /* __CR_MEMFD_H__ */ diff --git a/criu/include/mman.h b/criu/include/mman.h index 340d36927..43e0b6cc7 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -2,16 +2,25 @@ #define __CR_MMAN_H__ #ifndef MAP_HUGETLB -# define MAP_HUGETLB 0x40000 +#define MAP_HUGETLB 0x40000 +#endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 #endif #ifndef 
MADV_HUGEPAGE -# define MADV_HUGEPAGE 14 +#define MADV_HUGEPAGE 14 #endif #ifndef MADV_NOHUGEPAGE -# define MADV_NOHUGEPAGE 15 +#define MADV_NOHUGEPAGE 15 #endif #ifndef MADV_DONTDUMP -# define MADV_DONTDUMP 16 +#define MADV_DONTDUMP 16 +#endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 #endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/include/mount-v2.h b/criu/include/mount-v2.h new file mode 100644 index 000000000..096f08f3b --- /dev/null +++ b/criu/include/mount-v2.h @@ -0,0 +1,95 @@ +#ifndef __CR_MOUNT_V2_H__ +#define __CR_MOUNT_V2_H__ + +#include "linux/mount.h" +#include "linux/openat2.h" + +#include "common/list.h" + +#include + +#ifndef MOVE_MOUNT_SET_GROUP +#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ +#endif +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#endif +#ifndef MOVE_MOUNT_T_EMPTY_PATH +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#endif + +static inline int sys_move_mount(int from_dirfd, const char *from_pathname, int to_dirfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dirfd, from_pathname, to_dirfd, to_pathname, flags); +} + +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#endif +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ +#endif +#ifndef AT_SYMLINK_NOFOLLOW +#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. 
*/ +#endif +#ifndef AT_NO_AUTOMOUNT +#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ +#endif +#ifndef AT_EMPTY_PATH +#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ +#endif +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +#endif + +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + +#ifndef RESOLVE_NO_XDEV +#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings (includes bind-mounts). */ +#endif + +static inline long sys_openat2(int dirfd, const char *pathname, struct open_how *how, size_t size) +{ + return syscall(__NR_openat2, dirfd, pathname, how, size); +} + +extern int check_mount_v2(void); + +struct sharing_group { + /* This pair identifies the group */ + int shared_id; + int master_id; + + /* List of shared groups */ + struct list_head list; + + /* List of mounts in this group */ + struct list_head mnt_list; + + /* + * List of dependent shared groups: + * - all siblings have equal master_id + * - the parent has shared_id equal to children's master_id + * + * This is a bit tricky: parent pointer indicates if there is one + * parent sharing_group in list or only siblings. 
+ * So for traversal if parent pointer is set we can do: + * list_for_each_entry(t, &sg->parent->children, siblings) + * and otherwise we can do: + * list_for_each_entry(t, &sg->siblings, siblings) + */ + struct list_head children; + struct list_head siblings; + struct sharing_group *parent; + + char *source; +}; + +extern int resolve_shared_mounts_v2(void); +extern int prepare_mnt_ns_v2(void); + +#endif /* __CR_MOUNT_V2_H__ */ diff --git a/criu/include/mount.h b/criu/include/mount.h index d9b375f5d..6587c63b2 100644 --- a/criu/include/mount.h +++ b/criu/include/mount.h @@ -10,7 +10,21 @@ struct pstree_item; struct fstype; struct ns_id; -#define MOUNT_INVALID_DEV (0) +#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE) + +/* + * Here is a set of flags which we know how to handle in a single mount call. + * All of them except MS_RDONLY are set only as mnt flags. + * MS_RDONLY is set for both mnt and sb flags, so we can restore it in a single + * mount call only if it is set in both masks. + */ +#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_RDONLY) + +#define BINFMT_MISC_HOME "proc/sys/fs/binfmt_misc" + +#define HELPER_MNT_ID 0 + +#define MOUNT_INVALID_DEV (0) #define MNT_UNREACHABLE INT_MIN @@ -26,12 +40,16 @@ struct ns_id; */ #define REMOUNTED_RW_SERVICE 2 +struct rst_mount_info { + int remounted_rw; +}; + struct mount_info { - int mnt_id; - int parent_mnt_id; - unsigned int s_dev; - unsigned int s_dev_rt; - char *root; + int mnt_id; + int parent_mnt_id; + unsigned int s_dev; + unsigned int s_dev_rt; + char *root; /* * During dump mountpoint contains path with dot at the * beginning. It allows to use openat, statat, etc without @@ -42,61 +60,101 @@ struct mount_info { * mount tree is constructed. Check mnt_roots for details. * The ns_mountpoint contains path w/o this prefix.
*/ - char *mountpoint; - char *ns_mountpoint; - int fd; - unsigned flags; - unsigned sb_flags; - int master_id; - int shared_id; - struct fstype *fstype; - char *source; - char *options; - char *fsname; - union { - bool mounted; - bool dumped; - }; - bool need_plugin; - bool is_ns_root; - bool deleted; - struct mount_info *next; - struct ns_id *nsid; + char *mountpoint; + char *ns_mountpoint; - char *external; - bool internal_sharing; + /* Mount-v2 specific */ + char *plain_mountpoint; + int is_dir; + int mp_fd_id; + int mnt_fd_id; + struct sharing_group *sg; + struct list_head mnt_sharing; + + int fd; + unsigned flags; + unsigned sb_flags; + int master_id; + int shared_id; + struct fstype *fstype; + char *source; + char *options; + char *fsname; + union { + bool mounted; + bool dumped; + }; + bool need_plugin; + bool is_ns_root; + bool deleted; + int deleted_level; + struct list_head deleted_list; + struct mount_info *next; + struct ns_id *nsid; + + char *external; + bool internal_sharing; /* tree linkage */ - struct mount_info *parent; - struct mount_info *bind; - struct list_head children; - struct list_head siblings; + struct mount_info *parent; + struct mount_info *bind; + struct list_head children; + struct list_head siblings; - struct list_head mnt_bind; /* circular list of derivatives of one real mount */ - struct list_head mnt_share; /* circular list of shared mounts */ - struct list_head mnt_slave_list; /* list of slave mounts */ - struct list_head mnt_slave; /* slave list entry */ - struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ - struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ - struct list_head mnt_notprop; /* temporary list used in can_mount_now */ + struct list_head mnt_bind; /* circular list of derivatives of one real mount */ + bool mnt_bind_is_populated; /* indicate that mnt_bind list is ready to use */ + struct list_head mnt_share; /* circular list of shared mounts */ + struct 
list_head mnt_slave_list; /* list of slave mounts */ + struct list_head mnt_slave; /* slave list entry */ + struct list_head mnt_ext_slave; /* external slave list entry */ + struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ + struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ + struct list_head mnt_notprop; /* temporary list used in can_mount_now */ + struct list_head mnt_unbindable; /* list of mounts with delayed unbindable */ - struct list_head postpone; + struct list_head postpone; - int is_overmounted; - int remounted_rw; + int is_overmounted; - void *private; /* associated filesystem data */ + struct rst_mount_info *rmi; + + void *private; /* associated filesystem data */ }; extern struct mount_info *mntinfo; + +extern void mntinfo_add_list_before(struct mount_info **head, struct mount_info *new); + +/* + * Put a : in here since those are invalid on + * the cli, so we know it's autogenerated in + * debugging. + */ +#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED" +#define EXTERNAL_DEV_MOUNT "CRIU:EXTERNAL_DEV" +#define NO_ROOT_MOUNT "CRIU:NO_ROOT" + +static inline bool mnt_is_dev_external(struct mount_info *mi) +{ + return mi->external && !strcmp(mi->external, EXTERNAL_DEV_MOUNT); +} + +static inline bool mnt_is_nodev_external(struct mount_info *mi) +{ + return mi->external && strcmp(mi->external, EXTERNAL_DEV_MOUNT); +} + extern struct ns_desc mnt_ns_desc; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED extern int collect_binfmt_misc(void); #else -static inline int collect_binfmt_misc(void) { return 0; } +static inline int collect_binfmt_misc(void) +{ + return 0; +} #endif -extern struct mount_info *mnt_entry_alloc(); +extern struct mount_info *mnt_entry_alloc(bool rst); extern void mnt_entry_free(struct mount_info *mi); extern int __mntns_get_root_fd(pid_t pid); @@ -105,7 +163,9 @@ extern int mntns_get_root_by_mnt_id(int mnt_id); extern struct ns_id *lookup_nsid_by_mnt_id(int mnt_id); extern int 
open_mount(unsigned int s_dev); -extern int __open_mountpoint(struct mount_info *pm, int mnt_fd); +extern int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo); +extern int check_mountpoint_fd(struct mount_info *pm, int mnt_fd); +extern int __open_mountpoint(struct mount_info *pm); extern int mnt_is_dir(struct mount_info *pm); extern int open_mountpoint(struct mount_info *pm); @@ -114,14 +174,12 @@ extern int prepare_mnt_ns(void); extern int pivot_root(const char *new_root, const char *put_old); -extern struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev, - unsigned int st_ino, unsigned int mnt_id); +extern struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev, unsigned int st_ino, unsigned int mnt_id); extern struct mount_info *lookup_mnt_id(unsigned int id); extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev); extern dev_t phys_stat_resolve_dev(struct ns_id *, dev_t st_dev, const char *path); -extern bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, - struct ns_id *, const char *path); +extern bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, struct ns_id *, const char *path); extern int restore_task_mnt_ns(struct pstree_item *current); extern void fini_restore_mntns(void); @@ -135,8 +193,10 @@ extern int read_mnt_ns_img(void); extern void cleanup_mnt_ns(void); extern void clean_cr_time_mounts(void); +extern char *get_plain_mountpoint(int mnt_id, char *name); + extern bool add_skip_mount(const char *mountpoint); -struct ns_id; +extern int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo); extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump); extern int check_mnt_id(void); @@ -145,4 +205,35 @@ extern int remount_readonly_mounts(void); extern int try_remount_writable(struct mount_info *mi, bool ns); extern bool mnt_is_overmounted(struct mount_info *mi); +extern struct mount_info *mnt_get_external_bind(struct mount_info *mi); +extern 
bool mnt_is_external_bind(struct mount_info *mi); +extern bool has_mounted_external_bind(struct mount_info *mi); +extern bool rst_mnt_is_root(struct mount_info *mi); +extern struct mount_info *mnt_get_root_bind(struct mount_info *mi); +extern bool mnt_is_root_bind(struct mount_info *mi); +extern struct mount_info *mnt_get_external_bind_nodev(struct mount_info *mi); + +extern struct mount_info *mnt_bind_pick(struct mount_info *mi, + bool (*pick)(struct mount_info *mi, struct mount_info *bind)); + +extern int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *)); + +extern char *service_mountpoint(const struct mount_info *mi); + +extern int validate_mounts(struct mount_info *info, bool for_dump); +extern __maybe_unused struct mount_info *add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, + unsigned int s_dev, bool rst); +extern char *resolve_source(struct mount_info *mi); +extern int fetch_rt_stat(struct mount_info *m, const char *where); +extern int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags); +extern char *mnt_fsname(struct mount_info *mi); +extern int apply_sb_flags(void *args, int fd, pid_t pid); +extern int mount_root(void *args, int fd, pid_t pid); +extern int restore_ext_mount(struct mount_info *mi); +extern int cr_pivot_root(char *root); +extern int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs); + +extern struct mount_info *root_yard_mp; +extern char *mnt_roots; + #endif /* __CR_MOUNT_H__ */ diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index 287abb3c8..183a3b852 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -1,67 +1,75 @@ #ifndef __CR_NS_H__ #define __CR_NS_H__ +#include + #include "common/compiler.h" #include "files.h" #include "common/list.h" #include "images/netdev.pb-c.h" #ifndef CLONE_NEWNS -#define CLONE_NEWNS 0x00020000 +#define CLONE_NEWNS 0x00020000 #endif #ifndef CLONE_NEWPID 
-#define CLONE_NEWPID 0x20000000 +#define CLONE_NEWPID 0x20000000 #endif #ifndef CLONE_NEWUTS -#define CLONE_NEWUTS 0x04000000 +#define CLONE_NEWUTS 0x04000000 #endif #ifndef CLONE_NEWIPC -#define CLONE_NEWIPC 0x08000000 +#define CLONE_NEWIPC 0x08000000 #endif #ifndef CLONE_NEWNET -#define CLONE_NEWNET 0x40000000 +#define CLONE_NEWNET 0x40000000 #endif #ifndef CLONE_NEWUSER -#define CLONE_NEWUSER 0x10000000 +#define CLONE_NEWUSER 0x10000000 #endif #ifndef CLONE_NEWCGROUP -#define CLONE_NEWCGROUP 0x02000000 +#define CLONE_NEWCGROUP 0x02000000 #endif -#define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP) +#ifndef CLONE_NEWTIME +#define CLONE_NEWTIME 0x00000080 +#endif + +#define CLONE_ALLNS \ + (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP | \ + CLONE_NEWTIME) /* Nested namespaces are supported only for these types */ -#define CLONE_SUBNS (CLONE_NEWNS | CLONE_NEWNET) +#define CLONE_SUBNS (CLONE_NEWNS | CLONE_NEWNET) -#define EXTRA_SIZE 20 +#define EXTRA_SIZE 20 struct ns_desc { - unsigned int cflag; - char *str; - size_t len; + unsigned int cflag; + char *str; + size_t len; }; struct user_ns_extra { - char *uid; - char *gid; + char *uid; + char *gid; }; /* struct join_ns is used for storing parameters specified by --join-ns */ struct join_ns { - struct list_head list; - char *ns_file; - struct ns_desc *nd; /* namespace descriptor */ - int ns_fd; + struct list_head list; + char *ns_file; + struct ns_desc *nd; /* namespace descriptor */ + int ns_fd; /* extra options of --join-ns, like uid&gid in user namespace */ union { - struct user_ns_extra user_extra; - char *common_extra; + struct user_ns_extra user_extra; + char *common_extra; } extra_opts; }; @@ -73,15 +81,15 @@ enum ns_type { }; struct netns_id { - unsigned target_ns_id; - unsigned netnsid_value; - struct list_head node; + unsigned target_ns_id; + unsigned netnsid_value; + 
struct list_head node; }; struct net_link { - NetDeviceEntry *nde; - bool created; - struct list_head node; + NetDeviceEntry *nde; + bool created; + struct list_head node; }; struct ns_id { @@ -110,7 +118,6 @@ struct ns_id { } mnt; struct { - /* * ns_fd is used when network namespaces are being * restored. On this stage we access these file @@ -122,11 +129,11 @@ struct ns_id { * with restored file descriptors. */ union { - int nsfd_id; /* a namespace descriptor id in fdstore */ - int ns_fd; /* a namespace file descriptor */ + int nsfd_id; /* a namespace descriptor id in fdstore */ + int ns_fd; /* a namespace file descriptor */ }; - int nlsk; /* for sockets collection */ - int seqsk; /* to talk to parasite daemons */ + int nlsk; /* for sockets collection */ + int seqsk; /* to talk to parasite daemons */ struct list_head ids; struct list_head links; NetnsEntry *netns; @@ -135,17 +142,16 @@ struct ns_id { }; extern struct ns_id *ns_ids; -#define NS_DESC_ENTRY(_cflag, _str) \ - { \ - .cflag = _cflag, \ - .str = _str, \ - .len = sizeof(_str) - 1, \ +#define NS_DESC_ENTRY(_cflag, _str) \ + { \ + .cflag = _cflag, .str = _str, .len = sizeof(_str) - 1, \ } extern bool check_ns_proc(struct fd_link *link); extern struct ns_desc pid_ns_desc; extern struct ns_desc user_ns_desc; +extern struct ns_desc time_ns_desc; extern unsigned long root_ns_mask; extern const struct fdtype_ops nsfile_dump_ops; @@ -161,12 +167,13 @@ extern int prepare_namespace(struct pstree_item *item, unsigned long clone_flags extern int prepare_userns_creds(void); extern int switch_ns(int pid, struct ns_desc *nd, int *rst); +extern int switch_mnt_ns(int pid, int *rst, int *cwd_fd); extern int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst); extern int restore_ns(int rst, struct ns_desc *nd); +extern int restore_mnt_ns(int rst, int *cwd_fd); extern int dump_task_ns_ids(struct pstree_item *); extern int predump_task_ns_ids(struct pstree_item *); -extern struct ns_id *rst_new_ns_id(unsigned int id, 
pid_t pid, struct ns_desc *nd, enum ns_type t); extern int rst_add_ns_id(unsigned int id, struct pstree_item *, struct ns_desc *nd); extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd); @@ -191,12 +198,12 @@ typedef int (*uns_call_t)(void *arg, int fd, pid_t pid); * W/o flag the call is synchronous -- this function returns * strictly after the call finishes. */ -#define UNS_ASYNC 0x1 +#define UNS_ASYNC 0x1 /* * The call returns an FD which should be sent back. Conflicts * with UNS_ASYNC. */ -#define UNS_FDOUT 0x2 +#define UNS_FDOUT 0x2 #define MAX_UNSFD_MSG_SIZE 8192 @@ -209,16 +216,29 @@ typedef int (*uns_call_t)(void *arg, int fd, pid_t pid); * In case we're not in userns, just call the callback immediately * in the context of calling task. */ -extern int __userns_call(const char *func_name, uns_call_t call, int flags, - void *arg, size_t arg_size, int fd); +extern int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, size_t arg_size, int fd); -#define userns_call(__call, __flags, __arg, __arg_size, __fd) \ - __userns_call(__stringify(__call), __call, __flags, \ - __arg, __arg_size, __fd) +#define userns_call(__call, __flags, __arg, __arg_size, __fd) \ + __userns_call(__stringify(__call), __call, __flags, __arg, __arg_size, __fd) extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + +extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); +extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); +extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int 
sk)); + #endif /* __CR_NS_H__ */ diff --git a/criu/include/net.h b/criu/include/net.h index 9976f6eb0..7c5ede21e 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -7,7 +7,7 @@ #include "external.h" #ifndef RTM_GETNSID -#define RTM_GETNSID 90 +#define RTM_GETNSID 90 #endif struct cr_imgset; @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(); +extern int network_lock_internal(bool restore); extern struct ns_desc net_ns_desc; @@ -45,13 +45,14 @@ extern int veth_pair_add(char *in, char *out); extern int macvlan_ext_add(struct external *ext); extern int move_veth_to_bridge(void); +extern int kerndat_has_newifindex(void); + extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); -extern struct ns_id *net_get_root_ns(); -extern int kerndat_nsid(void); +extern struct ns_id *net_get_root_ns(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); -extern struct ns_id *get_root_netns(); -extern int read_net_ns_img(); +extern struct ns_id *get_root_netns(void); +extern int read_net_ns_img(void); #endif /* __CR_NET_H__ */ diff --git a/criu/include/netfilter.h b/criu/include/netfilter.h index 35ef26205..005573a4f 100644 --- a/criu/include/netfilter.h +++ b/criu/include/netfilter.h @@ -2,12 +2,24 @@ #define __CR_NETFILTER_H__ struct inet_sk_desc; -extern int nf_lock_connection(struct inet_sk_desc *); -extern int nf_unlock_connection(struct inet_sk_desc *); +extern int iptables_lock_connection(struct inet_sk_desc *); +extern int iptables_unlock_connection(struct inet_sk_desc *); struct inet_sk_info; -extern int nf_unlock_connection_info(struct inet_sk_info *); +extern int iptables_unlock_connection_info(struct inet_sk_info *); extern void preload_netfilter_modules(void); +extern int nftables_init_connection_lock(void); +extern int 
nftables_lock_connection(struct inet_sk_desc *); +extern int nftables_get_table(char *table, int n); + +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) +#define NFT_RUN_CMD(nft, cmd) nft_run_cmd_from_buffer(nft, cmd, strlen(cmd)) +#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) +#define NFT_RUN_CMD(nft, cmd) nft_run_cmd_from_buffer(nft, cmd) +#else +#define NFT_RUN_CMD(nft, cmd) BUILD_BUG_ON(1) +#endif + #endif /* __CR_NETFILTER_H__ */ diff --git a/criu/include/netlink_diag.h b/criu/include/netlink_diag.h index 14ca403b8..65ff938c0 100644 --- a/criu/include/netlink_diag.h +++ b/criu/include/netlink_diag.h @@ -4,25 +4,25 @@ #include struct netlink_diag_req { - __u8 sdiag_family; - __u8 sdiag_protocol; - __u16 pad; - __u32 ndiag_ino; - __u32 ndiag_show; - __u32 ndiag_cookie[2]; + __u8 sdiag_family; + __u8 sdiag_protocol; + __u16 pad; + __u32 ndiag_ino; + __u32 ndiag_show; + __u32 ndiag_cookie[2]; }; struct netlink_diag_msg { - __u8 ndiag_family; - __u8 ndiag_type; - __u8 ndiag_protocol; - __u8 ndiag_state; + __u8 ndiag_family; + __u8 ndiag_type; + __u8 ndiag_protocol; + __u8 ndiag_state; - __u32 ndiag_portid; - __u32 ndiag_dst_portid; - __u32 ndiag_dst_group; - __u32 ndiag_ino; - __u32 ndiag_cookie[2]; + __u32 ndiag_portid; + __u32 ndiag_dst_portid; + __u32 ndiag_dst_group; + __u32 ndiag_ino; + __u32 ndiag_cookie[2]; }; enum { @@ -34,9 +34,9 @@ enum { #define NETLINK_DIAG_MAX (__NETLINK_DIAG_MAX - 1) -#define NDIAG_PROTO_ALL ((__u8) ~0) +#define NDIAG_PROTO_ALL ((__u8)~0) -#define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ -#define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ +#define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ +#define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ #endif /* __CR_NETLINK_DIAG_H__ */ diff --git a/criu/include/packet_diag.h b/criu/include/packet_diag.h index 287de84ec..69e317adc 100644 --- a/criu/include/packet_diag.h +++ b/criu/include/packet_diag.h @@ -4,26 
+4,26 @@ #include struct packet_diag_req { - __u8 sdiag_family; - __u8 sdiag_protocol; - __u16 pad; - __u32 pdiag_ino; - __u32 pdiag_show; - __u32 pdiag_cookie[2]; + __u8 sdiag_family; + __u8 sdiag_protocol; + __u16 pad; + __u32 pdiag_ino; + __u32 pdiag_show; + __u32 pdiag_cookie[2]; }; -#define PACKET_SHOW_INFO 0x00000001 /* Basic packet_sk information */ -#define PACKET_SHOW_MCLIST 0x00000002 /* A set of packet_diag_mclist-s */ -#define PACKET_SHOW_RING_CFG 0x00000004 /* Rings configuration parameters */ -#define PACKET_SHOW_FANOUT 0x00000008 +#define PACKET_SHOW_INFO 0x00000001 /* Basic packet_sk information */ +#define PACKET_SHOW_MCLIST 0x00000002 /* A set of packet_diag_mclist-s */ +#define PACKET_SHOW_RING_CFG 0x00000004 /* Rings configuration parameters */ +#define PACKET_SHOW_FANOUT 0x00000008 struct packet_diag_msg { - __u8 pdiag_family; - __u8 pdiag_type; - __u16 pdiag_num; + __u8 pdiag_family; + __u8 pdiag_type; + __u16 pdiag_num; - __u32 pdiag_ino; - __u32 pdiag_cookie[2]; + __u32 pdiag_ino; + __u32 pdiag_cookie[2]; }; enum { @@ -37,40 +37,40 @@ enum { }; struct packet_diag_info { - __u32 pdi_index; - __u32 pdi_version; - __u32 pdi_reserve; - __u32 pdi_copy_thresh; - __u32 pdi_tstamp; - __u32 pdi_flags; + __u32 pdi_index; + __u32 pdi_version; + __u32 pdi_reserve; + __u32 pdi_copy_thresh; + __u32 pdi_tstamp; + __u32 pdi_flags; -#define PDI_RUNNING 0x1 -#define PDI_AUXDATA 0x2 -#define PDI_ORIGDEV 0x4 -#define PDI_VNETHDR 0x8 -#define PDI_LOSS 0x10 +#define PDI_RUNNING 0x1 +#define PDI_AUXDATA 0x2 +#define PDI_ORIGDEV 0x4 +#define PDI_VNETHDR 0x8 +#define PDI_LOSS 0x10 }; #ifndef MAX_ADDR_LEN -#define MAX_ADDR_LEN 32 +#define MAX_ADDR_LEN 32 #endif struct packet_diag_mclist { - __u32 pdmc_index; - __u32 pdmc_count; - __u16 pdmc_type; - __u16 pdmc_alen; - __u8 pdmc_addr[MAX_ADDR_LEN]; + __u32 pdmc_index; + __u32 pdmc_count; + __u16 pdmc_type; + __u16 pdmc_alen; + __u8 pdmc_addr[MAX_ADDR_LEN]; }; struct packet_diag_ring { - __u32 pdr_block_size; - __u32 
pdr_block_nr; - __u32 pdr_frame_size; - __u32 pdr_frame_nr; - __u32 pdr_retire_tmo; - __u32 pdr_sizeof_priv; - __u32 pdr_features; + __u32 pdr_block_size; + __u32 pdr_block_nr; + __u32 pdr_frame_size; + __u32 pdr_frame_nr; + __u32 pdr_retire_tmo; + __u32 pdr_sizeof_priv; + __u32 pdr_features; }; #endif /* __CR_PACKET_DIAG_H__ */ diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index decd14321..65292b7ab 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -19,8 +19,7 @@ struct kernel_pipe_buffer { * fails very often, so we need to restrict the pipe capacity to not * allocate big chunks. */ -#define PIPE_MAX_SIZE ((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / \ - sizeof(struct kernel_pipe_buffer)) +#define PIPE_MAX_SIZE ((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / sizeof(struct kernel_pipe_buffer)) /* The number of pipes for one chunk */ #define NR_PIPES_PER_CHUNK 8 @@ -91,15 +90,15 @@ struct kernel_pipe_buffer { */ struct page_pipe_buf { - int p[2]; /* pipe with pages */ - unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ - unsigned int nr_segs; /* how many iov-s are busy */ + int p[2]; /* pipe with pages */ + unsigned int pipe_size; /* how many pages can be fit into pipe */ + unsigned int nr_segs; /* how many iov-s are busy */ + unsigned long pipe_off; /* where this buf is started in a pipe */ + unsigned long pages_in; /* how many pages are there */ #define PPB_LAZY (1 << 0) - unsigned int flags; - struct iovec *iov; /* vaddr:len map */ - struct list_head l; /* links into page_pipe->bufs */ + unsigned int flags; + struct iovec *iov; /* vaddr:len map */ + struct list_head l; /* links into page_pipe->bufs */ }; /* @@ -109,38 +108,35 @@ struct page_pipe_buf { * Currently we have 2 types: the buffers that are always stored in * the images and the buffers that are lazily migrated */ -#define 
PP_PIPE_TYPES 2 +#define PP_PIPE_TYPES 2 #define PP_HOLE_PARENT (1 << 0) struct page_pipe { - unsigned int nr_pipes; /* how many page_pipe_bufs in there */ - struct list_head bufs; /* list of bufs */ - struct list_head free_bufs; /* list of bufs */ - struct page_pipe_buf *prev[PP_PIPE_TYPES]; /* last ppb of each type for pipe sharing */ - unsigned int nr_iovs; /* number of iovs */ - unsigned int free_iov; /* first free iov */ + unsigned int nr_pipes; /* how many page_pipe_bufs in there */ + struct list_head bufs; /* list of bufs */ + struct list_head free_bufs; /* list of bufs */ + struct page_pipe_buf *prev[PP_PIPE_TYPES]; /* last ppb of each type for pipe sharing */ + unsigned int nr_iovs; /* number of iovs */ + unsigned int free_iov; /* first free iov */ - struct iovec *iovs; /* iovs. They are provided into create_page_pipe + struct iovec *iovs; /* iovs. They are provided into create_page_pipe and all bufs have their iov-s in there */ - unsigned int nr_holes; /* number of holes allocated */ - unsigned int free_hole; /* number of holes in use */ - struct iovec *holes; /* holes */ - unsigned int *hole_flags; - unsigned int flags; /* PP_FOO flags below */ + unsigned int nr_holes; /* number of holes allocated */ + unsigned int free_hole; /* number of holes in use */ + struct iovec *holes; /* holes */ + unsigned int *hole_flags; + unsigned int flags; /* PP_FOO flags below */ }; -#define PP_CHUNK_MODE 0x1 /* Restrict the maximum buffer size of pipes - and dump memory for a few iterations */ -#define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ +#define PP_CHUNK_MODE 0x1 /* Restrict the maximum buffer size of pipes and dump memory for a few iterations */ +#define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags); extern void destroy_page_pipe(struct page_pipe *p); -extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr, - unsigned int 
flags); -extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr, - unsigned int flags); +extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr, unsigned int flags); +extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr, unsigned int flags); extern void debug_show_page_pipe(struct page_pipe *pp); void page_pipe_reinit(struct page_pipe *pp); @@ -153,8 +149,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, - unsigned long addr, unsigned int *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index fa72273ea..0d9b35019 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -9,6 +9,9 @@ struct ps_info { extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); +/* User buffer for read-mode pre-dump*/ +#define PIPE_MAX_BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) + /* * page_xfer -- transfer pages into image file. 
* Two images backends are implemented -- local image file @@ -48,6 +51,7 @@ struct page_xfer { extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id); struct page_pipe; extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *); +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *); extern int connect_to_page_server_to_send(void); extern int connect_to_page_server_to_recv(int epfd); extern int disconnect_from_page_server(void); @@ -65,10 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); -extern int page_server_start_read(void *buf, int nr_pages, - ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); +extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index d3ace242a..875e69e56 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -1,29 +1,41 @@ #ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ +#include #include #include "int.h" #include "common/list.h" +#include "pagemap_scan.h" struct vma_area; -#define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64)) +#define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64)) typedef struct { - pid_t pid; /* which process it belongs */ - unsigned long start; /* start of area */ - unsigned long end; /* end of area */ - const struct list_head *vma_head; /* list head 
of VMAs we're serving */ - u64 *map; /* local buffer */ - size_t map_len; /* length of a buffer */ - int fd; /* file to read PMs from */ + pid_t pid; /* which process it belongs */ + unsigned long start; /* start of area */ + unsigned long end; /* end of area */ + const struct list_head *vma_head; /* list head of VMAs we're serving */ + int fd; /* file to read PMs from */ + + u64 *map; /* local buffer */ + size_t map_len; /* length of a buffer */ + + struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */ + size_t regs_len; /* actual length of regs */ + size_t regs_max_len; /* maximum length of regs */ + size_t regs_idx; /* current index in the regs array */ } pmc_t; -#define PMC_INIT (pmc_t){ } +#define PMC_INIT \ + (pmc_t) \ + { \ + } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); -extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma); extern void pmc_fini(pmc_t *pmc); +extern int pmc_fill(pmc_t *pmc, u64 start, u64 end); #endif /* __CR_PAGEMAP_H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 45284b87d..4cbc87cc6 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,8 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, - void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned long vaddr, unsigned long nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); void (*close)(struct page_read *); @@ -53,49 +52,48 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, - int nr, void *buf, unsigned flags); + int 
(*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; + /* Whether or not disable image deduplication*/ + bool disable_dedup; + /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; u32 pages_img_id; - PagemapEntry *pe; /* current pagemap we are on */ - struct page_read *parent; /* parent pagemap (if ->in_parent - pagemap is met in image, then - go to this guy for page, see - read_pagemap_page */ - unsigned long cvaddr; /* vaddr we are on */ - off_t pi_off; /* current offset in pages file */ + PagemapEntry *pe; /* current pagemap we are on */ + struct page_read *parent; /* parent pagemap (if ->in_parent pagemap is met in image, + * then go to this guy for page, see read_pagemap_page */ + unsigned long cvaddr; /* vaddr we are on */ + off_t pi_off; /* current offset in pages file */ - struct iovec bunch; /* record consequent neighbour - iovecs to punch together */ - unsigned id; /* for logging */ - unsigned long img_id; /* pagemap image file ID */ + struct iovec bunch; /* record consequent neighbour iovecs to punch together */ + unsigned id; /* for logging */ + unsigned long img_id; /* pagemap image file ID */ PagemapEntry **pmes; int nr_pmes; int curr_pme; - struct list_head async; + struct list_head async; }; /* flags for ->read_pages */ -#define PR_ASYNC 0x1 /* may exit w/o data in the buffer */ -#define PR_ASAP 0x2 /* PR_ASYNC, but start the IO right now */ +#define PR_ASYNC 0x1 /* may exit w/o data in the buffer */ +#define PR_ASAP 0x2 /* PR_ASYNC, but start the IO right now */ /* flags for open_page_read */ -#define PR_SHMEM 0x1 -#define PR_TASK 0x2 +#define PR_SHMEM 0x1 +#define PR_TASK 0x2 -#define PR_TYPE_MASK 0x3 -#define PR_MOD 0x4 /* Will need to modify */ -#define PR_REMOTE 0x8 +#define PR_TYPE_MASK 0x3 +#define PR_MOD 0x4 /* Will need to modify 
*/ +#define PR_REMOTE 0x8 /* * -1 -- error @@ -103,13 +101,11 @@ struct page_read { * 1 -- opened */ extern int open_page_read(unsigned long id, struct page_read *, int pr_flags); -extern int open_page_read_at(int dfd, unsigned long id, struct page_read *pr, - int pr_flags); +extern int open_page_read_at(int dfd, unsigned long id, struct page_read *pr, int pr_flags); struct task_restore_args; -int pagemap_enqueue_iovec(struct page_read *pr, void *buf, - unsigned long len, struct list_head *to); +int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, struct list_head *to); int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); /* @@ -119,8 +115,9 @@ int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); */ extern void dup_page_read(struct page_read *src, struct page_read *dst); -extern int dedup_one_iovec(struct page_read *pr, unsigned long base, - unsigned long len); +extern void page_read_disable_dedup(struct page_read *pr); + +extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) { @@ -133,9 +130,9 @@ static inline bool page_read_has_parent(struct page_read *pr) } /* Pagemap flags */ -#define PE_PARENT (1 << 0) /* pages are in parent snapshot */ -#define PE_LAZY (1 << 1) /* pages can be lazily restored */ -#define PE_PRESENT (1 << 2) /* pages are present in pages*img */ +#define PE_PARENT (1 << 0) /* pages are in parent snapshot */ +#define PE_LAZY (1 << 1) /* pages can be lazily restored */ +#define PE_PRESENT (1 << 2) /* pages are present in pages*img */ static inline bool pagemap_in_parent(PagemapEntry *pe) { diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h new file mode 100644 index 000000000..9046e01ed --- /dev/null +++ b/criu/include/pagemap_scan.h @@ -0,0 +1,69 @@ +#ifndef __CR_PAGEMAP_SCAN_H__ +#define __CR_PAGEMAP_SCAN_H__ + +#ifndef PAGEMAP_SCAN +#include +#include 
"int.h" + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + u64 start; + u64 end; + u64 categories; +}; + +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. 
+ * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + u64 size; + u64 flags; + u64 start; + u64 end; + u64 walk_end; + u64 vec; + u64 vec_len; + u64 max_pages; + u64 category_inverted; + u64 category_mask; + u64 category_anyof_mask; + u64 return_mask; +}; +#endif /* PAGEMAP_SCAN */ + +#endif /* __CR_PAGEMAP_SCAN_H__ */ diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index c86a724fd..4a8ec2fee 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -11,8 +11,6 @@ struct parasite_dump_misc; struct parasite_drain_fd; struct vm_area_list; struct pstree_item; -struct _CredsEntry; -struct _CoreEntry; struct list_head; struct cr_imgset; struct fd_opts; @@ -23,29 +21,18 @@ struct rt_sigframe; struct parasite_ctl; struct parasite_thread_ctl; -extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); -extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); - -struct proc_posix_timers_stat; -extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, - struct parasite_ctl *ctl, struct pstree_item *); - extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); -extern int parasite_dump_creds(struct parasite_ctl *ctl, struct _CredsEntry *ce); -extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, struct _CoreEntry *core); -extern int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, - struct 
parasite_ctl *ctl, int id, - struct pid *tid, struct _CoreEntry *core); -extern int dump_thread_core(int pid, CoreEntry *core, - const struct parasite_dump_thread *dt); +extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); +extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); +extern int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasite_ctl *ctl, int id, + struct pid *tid, CoreEntry *core); +extern int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *dt); -extern int parasite_drain_fds_seized(struct parasite_ctl *ctl, - struct parasite_drain_fd *dfds, int nr_fds, int off, - int *lfds, struct fd_opts *flags); +extern int parasite_drain_fds_seized(struct parasite_ctl *ctl, struct parasite_drain_fd *dfds, int nr_fds, int off, + int *lfds, struct fd_opts *flags); extern int parasite_get_proc_fd_seized(struct parasite_ctl *ctl); -extern struct parasite_ctl *parasite_infect_seized(pid_t pid, - struct pstree_item *item, +extern struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, struct vm_area_list *vma_area_list); extern void parasite_ensure_args_size(unsigned long sz); extern unsigned long get_exec_start(struct vm_area_list *); diff --git a/criu/include/parasite-vdso.h b/criu/include/parasite-vdso.h index 9ee32f2a7..a50594d2a 100644 --- a/criu/include/parasite-vdso.h +++ b/criu/include/parasite-vdso.h @@ -32,28 +32,28 @@ static inline bool vdso_symbol_empty(struct vdso_symbol *s) * from list of VMAs to save in images, we save rt-vvar address also. 
*/ struct vdso_mark { - u64 signature; - unsigned long orig_vdso_addr; - unsigned long version; - unsigned long orig_vvar_addr; - unsigned long rt_vvar_addr; + u64 signature; + unsigned long orig_vdso_addr; + unsigned long version; + unsigned long orig_vvar_addr; + unsigned long rt_vvar_addr; }; -#define VDSO_MARK_SIGNATURE_V1 (0x6f73647675697263ULL) /* Magic number (criuvdso) */ -#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */ -#define VDSO_MARK_SIGNATURE_V3 (0x4f53447655495243ULL) /* Magic number (CRIUvDSO) */ -#define VDSO_MARK_CUR_VERSION (3) +#define VDSO_MARK_SIGNATURE_V1 (0x6f73647675697263ULL) /* Magic number (criuvdso) */ +#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */ +#define VDSO_MARK_SIGNATURE_V3 (0x4f53447655495243ULL) /* Magic number (CRIUvDSO) */ +#define VDSO_MARK_CUR_VERSION (3) -static inline void vdso_put_mark(void *where, unsigned long rt_vvar_addr, - unsigned long orig_vdso_addr, unsigned long orig_vvar_addr) +static inline void vdso_put_mark(void *where, unsigned long rt_vvar_addr, unsigned long orig_vdso_addr, + unsigned long orig_vvar_addr) { struct vdso_mark *m = where; - m->signature = VDSO_MARK_SIGNATURE_V3; - m->orig_vdso_addr = orig_vdso_addr; - m->version = VDSO_MARK_CUR_VERSION; - m->orig_vvar_addr = orig_vvar_addr; - m->rt_vvar_addr = rt_vvar_addr; + m->signature = VDSO_MARK_SIGNATURE_V3; + m->orig_vdso_addr = orig_vdso_addr; + m->version = VDSO_MARK_CUR_VERSION; + m->orig_vvar_addr = orig_vvar_addr; + m->rt_vvar_addr = rt_vvar_addr; } static inline bool is_vdso_mark(void *addr) @@ -68,13 +68,11 @@ static inline bool is_vdso_mark(void *addr) * to the version we support. 
*/ case VDSO_MARK_SIGNATURE_V2: - vdso_put_mark(m, VVAR_BAD_ADDR, - m->orig_vdso_addr, m->orig_vvar_addr); + vdso_put_mark(m, VVAR_BAD_ADDR, m->orig_vdso_addr, m->orig_vvar_addr); return true; case VDSO_MARK_SIGNATURE_V1: - vdso_put_mark(m, VVAR_BAD_ADDR, - m->orig_vdso_addr, VVAR_BAD_ADDR); + vdso_put_mark(m, VVAR_BAD_ADDR, m->orig_vdso_addr, VVAR_BAD_ADDR); return true; } @@ -82,14 +80,11 @@ static inline bool is_vdso_mark(void *addr) } extern void vdso_update_gtod_addr(struct vdso_maps *rt); -extern int vdso_do_park(struct vdso_maps *rt, unsigned long park_at, - unsigned long park_size); +extern int vdso_do_park(struct vdso_maps *rt, unsigned long park_at, unsigned long park_size); extern int vdso_map_compat(unsigned long map_at); -extern int vdso_proxify(struct vdso_maps *rt, bool *added_proxy, - VmaEntry *vmas, size_t nr_vmas, - bool compat_vdso, bool force_trampolines); -extern int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, - struct vdso_symtable *to, struct vdso_symtable *from, - bool compat_vdso); +extern int vdso_proxify(struct vdso_maps *rt, bool *added_proxy, VmaEntry *vmas, size_t nr_vmas, bool compat_vdso, + bool force_trampolines); +extern int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *to, + struct vdso_symtable *from, bool compat_vdso); #endif /* __CR_PARASITE_VDSO_H__ */ diff --git a/criu/include/parasite.h b/criu/include/parasite.h index d9570948a..176357711 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -1,7 +1,7 @@ #ifndef __CR_PARASITE_H__ #define __CR_PARASITE_H__ -#define PARASITE_MAX_SIZE (64 << 10) +#define PARASITE_MAX_SIZE (64 << 10) #ifndef __ASSEMBLY__ @@ -10,6 +10,8 @@ #include #include +#include "linux/rseq.h" + #include "image.h" #include "util-pie.h" #include "common/lock.h" @@ -39,30 +41,29 @@ enum { PARASITE_CMD_MAX, }; -struct parasite_vma_entry -{ - unsigned long start; - unsigned long len; - int prot; +struct parasite_vma_entry { + 
unsigned long start; + unsigned long len; + int prot; }; struct parasite_vdso_vma_entry { - unsigned long start; - unsigned long len; - unsigned long orig_vdso_addr; - unsigned long orig_vvar_addr; - unsigned long rt_vvar_addr; - int is_marked; - bool try_fill_symtable; - bool is_vdso; + unsigned long start; + unsigned long len; + unsigned long orig_vdso_addr; + unsigned long orig_vvar_addr; + unsigned long rt_vvar_addr; + int is_marked; + bool try_fill_symtable; + bool is_vdso; }; struct parasite_dump_pages_args { - unsigned int nr_vmas; - unsigned int add_prot; - unsigned int off; - unsigned int nr_segs; - unsigned int nr_pages; + unsigned int nr_vmas; + unsigned int add_prot; + unsigned int off; + unsigned int nr_segs; + unsigned long nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) @@ -117,7 +118,9 @@ static inline int posix_timers_dump_size(int timer_n) */ struct parasite_dump_misc { - unsigned long brk; + bool has_membarrier_get_registrations; /* this is sent from criu to parasite. 
*/ + + unsigned long brk; u32 pid; u32 sid; @@ -127,28 +130,31 @@ struct parasite_dump_misc { int dumpable; int thp_disabled; int child_subreaper; + int membarrier_registration_mask; }; /* * Calculate how long we can make the groups array in parasite_dump_creds * and still fit the struct in one page */ -#define PARASITE_MAX_GROUPS \ - ((PAGE_SIZE - sizeof(struct parasite_dump_thread) - \ - offsetof(struct parasite_dump_creds, groups)) / sizeof(unsigned int)) /* groups */ +#define PARASITE_MAX_GROUPS \ + ((PAGE_SIZE - sizeof(struct parasite_dump_thread) - offsetof(struct parasite_dump_creds, groups)) / \ + sizeof(unsigned int)) /* groups */ struct parasite_dump_creds { - unsigned int cap_last_cap; + unsigned int cap_last_cap; - u32 cap_inh[CR_CAP_SIZE]; - u32 cap_prm[CR_CAP_SIZE]; - u32 cap_eff[CR_CAP_SIZE]; - u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_inh[CR_CAP_SIZE]; + u32 cap_prm[CR_CAP_SIZE]; + u32 cap_eff[CR_CAP_SIZE]; + u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; - int uids[4]; - int gids[4]; - unsigned int secbits; - unsigned int ngroups; + int uids[4]; + int gids[4]; + int no_new_privs; + unsigned int secbits; + unsigned int ngroups; /* * FIXME -- this structure is passed to parasite code * through parasite args area so in parasite_dump_creds() @@ -162,17 +168,24 @@ struct parasite_dump_creds { * of memory in use doesn't exceed the PAGE_SIZE and the * args area is at least one page (PARASITE_ARG_SIZE_MIN). 
*/ - unsigned int groups[0]; + unsigned int groups[0]; +}; + +struct parasite_check_rseq { + bool has_rseq; + bool has_ptrace_get_rseq_conf; /* no need to check if supported */ + bool rseq_inited; }; struct parasite_dump_thread { - unsigned int *tid_addr; - pid_t tid; - tls_t tls; - stack_t sas; - int pdeath_sig; - char comm[TASK_COMM_LEN]; - struct parasite_dump_creds creds[0]; + unsigned int *tid_addr; + pid_t tid; + tls_t tls; + struct parasite_check_rseq rseq; + stack_t sas; + int pdeath_sig; + char comm[TASK_COMM_LEN]; + struct parasite_dump_creds creds[0]; }; static inline void copy_sas(ThreadSasEntry *dst, const stack_t *src) @@ -190,11 +203,11 @@ static inline void copy_sas(ThreadSasEntry *dst, const stack_t *src) * are transferred with help of send_fds and recv_fds. * 3) criu should work with a default value of the file limit (1024) */ -#define PARASITE_MAX_FDS CR_SCM_MAX_FD * 3 +#define PARASITE_MAX_FDS CR_SCM_MAX_FD * 3 struct parasite_drain_fd { - int nr_fds; - int fds[0]; + int nr_fds; + int fds[0]; }; struct fd_opts { @@ -215,16 +228,16 @@ static inline int drain_fds_size(struct parasite_drain_fd *dfds) } struct parasite_tty_args { - int fd; - int type; + int fd; + int type; - int sid; - int pgrp; - bool hangup; + int sid; + int pgrp; + bool hangup; - int st_pckt; - int st_lock; - int st_excl; + int st_pckt; + int st_lock; + int st_excl; }; struct parasite_dump_cgroup_args { @@ -233,7 +246,12 @@ struct parasite_dump_cgroup_args { * * The string is null terminated. */ - char contents[1 << 12]; + char contents[(1 << 12) - 32]; + /* + * Contains the path to thread cgroup procfs. 
+ * "self/task//cgroup" + */ + char thread_cgrp[32]; }; #endif /* !__ASSEMBLY__ */ diff --git a/criu/include/path.h b/criu/include/path.h index c475986eb..70e09587d 100644 --- a/criu/include/path.h +++ b/criu/include/path.h @@ -35,7 +35,6 @@ char *cut_root_for_bind(char *target_root, char *source_root); * Get a mount point for a sibling of m if m->parent and p are in the same * shared group. */ -char *mnt_get_sibling_path(struct mount_info *m, - struct mount_info *p, char *buf, int len); +char *mnt_get_sibling_path(struct mount_info *m, struct mount_info *p, char *buf, int len); #endif diff --git a/criu/include/pid.h b/criu/include/pid.h index c749176fa..b2b7a361a 100644 --- a/criu/include/pid.h +++ b/criu/include/pid.h @@ -8,18 +8,17 @@ /* * Task states, used in e.g. struct pid's state. */ -enum __criu_task_state -{ +enum __criu_task_state { /* Values shared with compel */ - TASK_ALIVE = COMPEL_TASK_ALIVE, - TASK_DEAD = COMPEL_TASK_DEAD, - TASK_STOPPED = COMPEL_TASK_STOPPED, - TASK_ZOMBIE = COMPEL_TASK_ZOMBIE, + TASK_ALIVE = COMPEL_TASK_ALIVE, + TASK_DEAD = COMPEL_TASK_DEAD, + TASK_STOPPED = COMPEL_TASK_STOPPED, + TASK_ZOMBIE = COMPEL_TASK_ZOMBIE, /* Own internal states */ - TASK_HELPER = COMPEL_TASK_MAX + 1, + TASK_HELPER = COMPEL_TASK_MAX + 1, TASK_THREAD, /* new values are to be added before this line */ - TASK_UNDEF = 0xff + TASK_UNDEF = 0xff }; struct pid { @@ -31,7 +30,11 @@ struct pid { */ pid_t real; - int state; /* TASK_XXX constants */ + int state; /* TASK_XXX constants */ + /* If an item is in stopped state it has a signal number + * that caused task to stop. 
+ */ + int stop_signo; /* * The @virt pid is one which used in the image itself and keeps diff --git a/criu/include/pidfd-store.h b/criu/include/pidfd-store.h new file mode 100644 index 000000000..a76e681b8 --- /dev/null +++ b/criu/include/pidfd-store.h @@ -0,0 +1,13 @@ +#ifndef __CR_PIDFD_STORE_H__ +#define __CR_PIDFD_STORE_H__ + +#include + +int init_pidfd_store_sk(pid_t pid, int fd); +int init_pidfd_store_hash(void); +void free_pidfd_store(void); +int pidfd_store_add(pid_t pid); +int pidfd_store_check_pid_reuse(pid_t pid); +bool pidfd_store_ready(void); + +#endif /* __CR_PIDFD_STORE_H__ */ diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 000000000..bcc0fb45a --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern void init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/pipes.h b/criu/include/pipes.h index 83fb71cfc..f442d7f65 100644 --- a/criu/include/pipes.h +++ b/criu/include/pipes.h @@ -13,28 +13,28 @@ static inline u32 pipe_id(const struct fd_parms *p) return p->stat.st_ino; } -#define NR_PIPES_WITH_DATA 1024 +#define NR_PIPES_WITH_DATA 1024 struct pipe_data_dump { - int img_type; - unsigned int nr; - u32 ids[NR_PIPES_WITH_DATA]; + int img_type; + unsigned int nr; + u32 ids[NR_PIPES_WITH_DATA]; }; extern int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p); struct pipe_data_rst { - PipeDataEntry *pde; + PipeDataEntry *pde; void *data; - struct pipe_data_rst *next; + struct pipe_data_rst *next; }; -#define PIPE_DATA_HASH_BITS 5 -#define PIPE_DATA_HASH_SIZE (1 << PIPE_DATA_HASH_BITS) -#define PIPE_DATA_HASH_MASK (PIPE_DATA_HASH_SIZE - 1) +#define 
PIPE_DATA_HASH_BITS 5 +#define PIPE_DATA_HASH_SIZE (1 << PIPE_DATA_HASH_BITS) +#define PIPE_DATA_HASH_MASK (PIPE_DATA_HASH_SIZE - 1) -extern int do_collect_pipe_data(struct pipe_data_rst *, - ProtobufCMessage *, struct cr_img *, struct pipe_data_rst **hash); +extern int do_collect_pipe_data(struct pipe_data_rst *, ProtobufCMessage *, struct cr_img *, + struct pipe_data_rst **hash); extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash); /* @@ -47,17 +47,15 @@ extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst #include "images/pipe.pb-c.h" struct pipe_info { - PipeEntry *pe; - struct list_head pipe_list; /* All pipe_info with the same pipe_id - * This is pure circular list without head */ - struct list_head list; /* global list of pipes */ - struct file_desc d; - unsigned int create : 1, - reopen : 1; + PipeEntry *pe; + struct list_head pipe_list; /* All pipe_info with the same pipe_id + * This is pure circular list without head */ + struct list_head list; /* global list of pipes */ + struct file_desc d; + unsigned int create : 1, reopen : 1; }; -extern int collect_one_pipe_ops(void *o, ProtobufCMessage *base, - struct file_desc_ops *ops); +extern int collect_one_pipe_ops(void *o, ProtobufCMessage *base, struct file_desc_ops *ops); extern int open_pipe(struct file_desc *d, int *new_fd); #endif /* __CR_PIPES_H__ */ diff --git a/criu/include/plugin.h b/criu/include/plugin.h index 82a6723d0..0115e6ea0 100644 --- a/criu/include/plugin.h +++ b/criu/include/plugin.h @@ -5,42 +5,43 @@ #include "common/compiler.h" #include "common/list.h" -#define CR_PLUGIN_DEFAULT "/var/lib/criu/" +#ifndef CR_PLUGIN_DEFAULT +#define CR_PLUGIN_DEFAULT "/usr/lib/criu/" +#endif void cr_plugin_fini(int stage, int err); int cr_plugin_init(int stage); typedef struct { - struct list_head head; - struct list_head hook_chain[CR_PLUGIN_HOOK__MAX]; + struct list_head head; + struct list_head hook_chain[CR_PLUGIN_HOOK__MAX]; } 
cr_plugin_ctl_t; extern cr_plugin_ctl_t cr_plugin_ctl; typedef struct { - cr_plugin_desc_t *d; - struct list_head list; - void *dlhandle; - struct list_head link[CR_PLUGIN_HOOK__MAX]; + cr_plugin_desc_t *d; + struct list_head list; + void *dlhandle; + struct list_head link[CR_PLUGIN_HOOK__MAX]; } plugin_desc_t; -#define run_plugins(__hook, ...) \ -({ \ - plugin_desc_t *this; \ - int __ret = -ENOTSUP; \ - \ - list_for_each_entry(this, &cr_plugin_ctl.hook_chain[CR_PLUGIN_HOOK__ ##__hook], \ - link[CR_PLUGIN_HOOK__ ##__hook]) { \ - pr_debug("plugin: `%s' hook %u -> %p\n", \ - this->d->name, CR_PLUGIN_HOOK__ ##__hook, \ - this->d->hooks[CR_PLUGIN_HOOK__ ##__hook]); \ - __ret = ((CR_PLUGIN_HOOK__ ##__hook ##_t *) \ - this->d->hooks[CR_PLUGIN_HOOK__ ##__hook])(__VA_ARGS__); \ - if (__ret == -ENOTSUP) \ - continue; \ - break; \ - } \ - __ret; \ -}) +#define run_plugins(__hook, ...) \ + ({ \ + plugin_desc_t *this; \ + int __ret = -ENOTSUP; \ + \ + list_for_each_entry(this, &cr_plugin_ctl.hook_chain[CR_PLUGIN_HOOK__##__hook], \ + link[CR_PLUGIN_HOOK__##__hook]) { \ + pr_debug("plugin: `%s' hook %u -> %p\n", this->d->name, CR_PLUGIN_HOOK__##__hook, \ + this->d->hooks[CR_PLUGIN_HOOK__##__hook]); \ + __ret = ((CR_PLUGIN_HOOK__##__hook##_t *)this->d->hooks[CR_PLUGIN_HOOK__##__hook])( \ + __VA_ARGS__); \ + if (__ret == -ENOTSUP) \ + continue; \ + break; \ + } \ + __ret; \ + }) #endif diff --git a/criu/include/posix-timer.h b/criu/include/posix-timer.h index fa99d8628..b1f4b1ab6 100644 --- a/criu/include/posix-timer.h +++ b/criu/include/posix-timer.h @@ -8,7 +8,8 @@ struct str_posix_timer { int clock_id; int si_signo; int it_sigev_notify; - void * sival_ptr; + int notify_thread_id; + void *sival_ptr; }; struct proc_posix_timer { @@ -21,7 +22,7 @@ struct proc_posix_timers_stat { struct list_head timers; }; -extern int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat * args); +extern int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args); void 
free_posix_timers(struct proc_posix_timers_stat *st); #endif /* __CR_PROC_POSIX_TIMER_H__ */ diff --git a/criu/include/prctl.h b/criu/include/prctl.h index 8e7fef317..2966659da 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -4,82 +4,104 @@ #include "int.h" #ifndef PR_SET_NAME -# define PR_SET_NAME 15 +#define PR_SET_NAME 15 #endif #ifndef PR_GET_NAME -# define PR_GET_NAME 16 +#define PR_GET_NAME 16 #endif #ifndef PR_SET_SECCOMP -# define PR_SET_SECCOMP 22 +#define PR_SET_SECCOMP 22 #endif #ifndef PR_CAPBSET_READ -# define PR_CAPBSET_READ 23 +#define PR_CAPBSET_READ 23 #endif #ifndef PR_CAPBSET_DROP -# define PR_CAPBSET_DROP 24 +#define PR_CAPBSET_DROP 24 #endif #ifndef PR_GET_SECUREBITS -# define PR_GET_SECUREBITS 27 +#define PR_GET_SECUREBITS 27 #endif #ifndef PR_SET_SECUREBITS -# define PR_SET_SECUREBITS 28 +#define PR_SET_SECUREBITS 28 #endif #ifndef PR_GET_DUMPABLE -# define PR_GET_DUMPABLE 3 +#define PR_GET_DUMPABLE 3 #endif #ifndef PR_SET_DUMPABLE -# define PR_SET_DUMPABLE 4 +#define PR_SET_DUMPABLE 4 +#endif +#ifndef PR_GET_NO_NEW_PRIVS +#define PR_GET_NO_NEW_PRIVS 39 +#endif +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +#endif +#ifndef PR_CAP_AMBIENT_IS_SET +#define PR_CAP_AMBIENT_IS_SET 1 +#endif +#ifndef PR_CAP_AMBIENT_RAISE +#define PR_CAP_AMBIENT_RAISE 2 #endif #ifndef PR_SET_MM -#define PR_SET_MM 35 -# define PR_SET_MM_START_CODE 1 -# define PR_SET_MM_END_CODE 2 -# define PR_SET_MM_START_DATA 3 -# define PR_SET_MM_END_DATA 4 -# define PR_SET_MM_START_STACK 5 -# define PR_SET_MM_START_BRK 6 -# define PR_SET_MM_BRK 7 -# define PR_SET_MM_ARG_START 8 -# define PR_SET_MM_ARG_END 9 -# define PR_SET_MM_ENV_START 10 -# define PR_SET_MM_ENV_END 11 -# define PR_SET_MM_AUXV 12 -# define PR_SET_MM_EXE_FILE 13 +#define PR_SET_MM 35 +#define PR_SET_MM_START_CODE 1 +#define PR_SET_MM_END_CODE 2 +#define PR_SET_MM_START_DATA 3 +#define PR_SET_MM_END_DATA 4 +#define 
PR_SET_MM_START_STACK 5 +#define PR_SET_MM_START_BRK 6 +#define PR_SET_MM_BRK 7 +#define PR_SET_MM_ARG_START 8 +#define PR_SET_MM_ARG_END 9 +#define PR_SET_MM_ENV_START 10 +#define PR_SET_MM_ENV_END 11 +#define PR_SET_MM_AUXV 12 +#define PR_SET_MM_EXE_FILE 13 #endif #ifndef PR_SET_MM_MAP -# define PR_SET_MM_MAP 14 -# define PR_SET_MM_MAP_SIZE 15 +#define PR_SET_MM_MAP 14 +#define PR_SET_MM_MAP_SIZE 15 struct prctl_mm_map { - u64 start_code; - u64 end_code; - u64 start_data; - u64 end_data; - u64 start_brk; - u64 brk; - u64 start_stack; - u64 arg_start; - u64 arg_end; - u64 env_start; - u64 env_end; - u64 *auxv; - u32 auxv_size; - u32 exe_fd; + u64 start_code; + u64 end_code; + u64 start_data; + u64 end_data; + u64 start_brk; + u64 brk; + u64 start_stack; + u64 arg_start; + u64 arg_end; + u64 env_start; + u64 env_end; + u64 *auxv; + u32 auxv_size; + u32 exe_fd; }; #endif #ifndef PR_GET_TID_ADDRESS -# define PR_GET_TID_ADDRESS 40 +#define PR_GET_TID_ADDRESS 40 #endif #ifndef PR_SET_THP_DISABLE -# define PR_SET_THP_DISABLE 41 +#define PR_SET_THP_DISABLE 41 #endif #ifndef PR_GET_THP_DISABLE -# define PR_GET_THP_DISABLE 42 +#define PR_GET_THP_DISABLE 42 +#endif + +#ifndef PR_TIMER_CREATE_RESTORE_IDS +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 #endif #endif /* __CR_PRCTL_H__ */ diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 96a097b3d..76d3242d2 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -3,67 +3,67 @@ #include -#include +#include "compel/infect.h" -#define PROC_TASK_COMM_LEN 32 -#define PROC_TASK_COMM_LEN_FMT "(%31s" +#define PROC_TASK_COMM_LEN 32 +#define PROC_TASK_COMM_LEN_FMT "(%31s" struct proc_pid_stat { - int pid; - char comm[PROC_TASK_COMM_LEN]; - char state; - int ppid; - int pgid; - int sid; - int tty_nr; - int tty_pgrp; - unsigned int flags; - unsigned long min_flt; - unsigned 
long cmin_flt; - unsigned long maj_flt; - unsigned long cmaj_flt; - unsigned long utime; - unsigned long stime; - long cutime; - long cstime; - long priority; - long nice; - int num_threads; - int zero0; - unsigned long long start_time; - unsigned long vsize; - long mm_rss; - unsigned long rsslim; - unsigned long start_code; - unsigned long end_code; - unsigned long start_stack; - unsigned long esp; - unsigned long eip; - unsigned long sig_pending; - unsigned long sig_blocked; - unsigned long sig_ignored; - unsigned long sig_handled; - unsigned long wchan; - unsigned long zero1; - unsigned long zero2; - int exit_signal; - int task_cpu; - unsigned int rt_priority; - unsigned int policy; - unsigned long long delayacct_blkio_ticks; - unsigned long gtime; - long cgtime; - unsigned long start_data; - unsigned long end_data; - unsigned long start_brk; - unsigned long arg_start; - unsigned long arg_end; - unsigned long env_start; - unsigned long env_end; - int exit_code; + int pid; + char comm[PROC_TASK_COMM_LEN]; + char state; + int ppid; + int pgid; + int sid; + int tty_nr; + int tty_pgrp; + unsigned int flags; + unsigned long min_flt; + unsigned long cmin_flt; + unsigned long maj_flt; + unsigned long cmaj_flt; + unsigned long utime; + unsigned long stime; + long cutime; + long cstime; + long priority; + long nice; + int num_threads; + int zero0; + unsigned long long start_time; + unsigned long vsize; + long mm_rss; + unsigned long rsslim; + unsigned long start_code; + unsigned long end_code; + unsigned long start_stack; + unsigned long esp; + unsigned long eip; + unsigned long sig_pending; + unsigned long sig_blocked; + unsigned long sig_ignored; + unsigned long sig_handled; + unsigned long wchan; + unsigned long zero1; + unsigned long zero2; + int exit_signal; + int task_cpu; + unsigned int rt_priority; + unsigned int policy; + unsigned long long delayacct_blkio_ticks; + unsigned long gtime; + long cgtime; + unsigned long start_data; + unsigned long end_data; + 
unsigned long start_brk; + unsigned long arg_start; + unsigned long arg_end; + unsigned long env_start; + unsigned long env_end; + int exit_code; }; -#define PROC_CAP_SIZE 2 +#define PROC_CAP_SIZE 2 struct proc_status_creds { struct seize_task_status s; @@ -71,16 +71,17 @@ struct proc_status_creds { unsigned int uids[4]; unsigned int gids[4]; - u32 last_filter; + u32 last_filter; /* * Keep them at the end of structure * for fast comparison reason. */ - u32 cap_inh[PROC_CAP_SIZE]; - u32 cap_prm[PROC_CAP_SIZE]; - u32 cap_eff[PROC_CAP_SIZE]; - u32 cap_bnd[PROC_CAP_SIZE]; + u32 cap_inh[PROC_CAP_SIZE]; + u32 cap_prm[PROC_CAP_SIZE]; + u32 cap_eff[PROC_CAP_SIZE]; + u32 cap_bnd[PROC_CAP_SIZE]; + u32 cap_amb[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) @@ -88,7 +89,7 @@ struct proc_status_creds { extern int parse_pid_stat(pid_t pid, struct proc_pid_stat *s); extern unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent); extern int parse_pid_oom_score_adj(pid_t pid, int *err); -extern int prepare_loginuid(unsigned int value, unsigned int loglevel); +extern int prepare_loginuid(unsigned int value); extern int parse_pid_status(pid_t pid, struct seize_task_status *, void *data); extern int parse_file_locks(void); extern int get_fd_mntid(int fd, int *mnt_id); @@ -102,4 +103,8 @@ extern bool is_vma_range_fmt(char *line); extern void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf); extern int parse_uptime(uint64_t *upt); +extern int parse_timens_offsets(struct timespec *boff, struct timespec *moff); + +extern bool found_uprobes_vma(void); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 31f5b9a79..c4241be55 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -6,7 +6,7 @@ enum { /* PB_AUTOGEN_START */ - PB_INVENTORY, /* 0 */ + PB_INVENTORY, /* 0 */ PB_STATS, PB_FDINFO, PB_CORE, @@ -16,7 +16,7 @@ enum { PB_POSIX_TIMER, PB_CREDS, PB_FS, - PB_UTSNS, /* 10 */ 
+ PB_UTSNS, /* 10 */ PB_IPC_VAR, PB_IPC_SHM, PB_IPC_SEM, @@ -26,7 +26,7 @@ enum { PB_TCP_STREAM, PB_REG_FILE, PB_EXT_FILE, - PB_NS_FILE, /* 20 */ + PB_NS_FILE, /* 20 */ PB_INET_SK, PB_UNIX_SK, PB_PACKET_SOCK, @@ -36,7 +36,7 @@ enum { PB_PIPE_DATA, PB_EVENTFD_FILE, PB_EVENTPOLL_FILE, - PB_EVENTPOLL_TFD, /* 30 */ + PB_EVENTPOLL_TFD, /* 30 */ PB_SIGNALFD, PB_INOTIFY_FILE, PB_INOTIFY_WD, @@ -46,7 +46,7 @@ enum { PB_TTY_INFO, PB_FILE_LOCK, PB_RLIMIT, - PB_PAGEMAP, /* 40 */ + PB_PAGEMAP, /* 40 */ PB_SIGINFO, PB_TUNFILE, PB_IRMAP_CACHE, @@ -56,11 +56,21 @@ enum { PB_CPUINFO, PB_USERNS, PB_NETNS, - PB_BINFMT_MISC, /* 50 */ + PB_BINFMT_MISC, /* 50 */ PB_TTY_DATA, PB_AUTOFS, PB_GHOST_CHUNK, PB_FILE, + PB_MEMFD_FILE, + PB_MEMFD_INODE, + PB_TIMENS, + PB_IMG_STREAMER_REQUEST, + PB_IMG_STREAMER_REPLY, + PB_PIDNS, + PB_BPFMAP_FILE, + PB_BPFMAP_DATA, + PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ @@ -78,15 +88,15 @@ enum { typedef size_t (*pb_getpksize_t)(void *obj); typedef size_t (*pb_pack_t)(void *obj, void *where); -typedef void *(*pb_unpack_t)(void *allocator, size_t size, void *from); -typedef void (*pb_free_t)(void *obj, void *allocator); +typedef void *(*pb_unpack_t)(void *allocator, size_t size, void *from); +typedef void (*pb_free_t)(void *obj, void *allocator); struct cr_pb_message_desc { - pb_getpksize_t getpksize; - pb_pack_t pack; - pb_unpack_t unpack; - pb_free_t free; - const ProtobufCMessageDescriptor *pb_desc; + pb_getpksize_t getpksize; + pb_pack_t pack; + pb_unpack_t unpack; + pb_free_t free; + const ProtobufCMessageDescriptor *pb_desc; }; extern void cr_pb_init(void); diff --git a/criu/include/protobuf.h b/criu/include/protobuf.h index fb7489e9d..11cca1564 100644 --- a/criu/include/protobuf.h +++ b/criu/include/protobuf.h @@ -11,19 +11,16 @@ struct cr_img; extern int do_pb_read_one(struct cr_img *, void **objp, int type, bool eof); -#define pb_read_one(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, false) +#define pb_read_one(fd, objp, type) 
do_pb_read_one(fd, (void **)objp, type, false) #define pb_read_one_eof(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, true) extern int pb_write_one(struct cr_img *, void *obj, int type); -#define pb_pksize(__obj, __proto_message_name) \ - (__proto_message_name ##__get_packed_size(__obj) + sizeof(u32)) +#define pb_pksize(__obj, __proto_message_name) (__proto_message_name##__get_packed_size(__obj) + sizeof(u32)) -#define pb_repeated_size(__obj, __member) \ - ((size_t)(sizeof(*(__obj)->__member) * (__obj)->n_ ##__member)) +#define pb_repeated_size(__obj, __member) ((size_t)(sizeof(*(__obj)->__member) * (__obj)->n_##__member)) -#define pb_msg(__base, __type) \ - container_of(__base, __type, base) +#define pb_msg(__base, __type) container_of(__base, __type, base) #include @@ -35,9 +32,9 @@ struct collect_image_info { unsigned flags; }; -#define COLLECT_SHARED 0x1 /* use shared memory for obj-s */ -#define COLLECT_NOFREE 0x2 /* don't free entry after callback */ -#define COLLECT_HAPPENED 0x4 /* image was opened and collected */ +#define COLLECT_SHARED 0x1 /* use shared memory for obj-s */ +#define COLLECT_NOFREE 0x2 /* don't free entry after callback */ +#define COLLECT_HAPPENED 0x4 /* image was opened and collected */ extern int collect_image(struct collect_image_info *); extern int collect_entry(ProtobufCMessage *base, struct collect_image_info *cinfo); @@ -52,4 +49,11 @@ static inline int collect_images(struct collect_image_info **array, unsigned siz return 0; } +/* + * To speed up reading of packed objects + * by providing space on stack, this should + * be more than enough for most objects. + */ +#define PB_PKOBJ_LOCAL_SIZE 1024 + #endif /* __CR_PROTOBUF_H__ */ diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 7303c1fed..b750a919e 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -11,24 +11,24 @@ * That's the init process which usually inherit * all orphaned children in the system. 
*/ -#define INIT_PID (1) +#define INIT_PID (1) struct pstree_item { - struct pstree_item *parent; - struct list_head children; /* list of my children */ - struct list_head sibling; /* linkage in my parent's children list */ + struct pstree_item *parent; + struct list_head children; /* list of my children */ + struct list_head sibling; /* linkage in my parent's children list */ - struct pid *pid; - pid_t pgid; - pid_t sid; - pid_t born_sid; + struct pid *pid; + pid_t pgid; + pid_t sid; + pid_t born_sid; - int nr_threads; /* number of threads */ - struct pid *threads; /* array of threads */ - CoreEntry **core; - TaskKobjIdsEntry *ids; + int nr_threads; /* number of threads */ + struct pid *threads; /* array of threads */ + CoreEntry **core; + TaskKobjIdsEntry *ids; union { - futex_t task_st; - unsigned long task_st_le_bits; + futex_t task_st; + unsigned long task_st_le_bits; }; }; @@ -38,11 +38,11 @@ static inline pid_t vpid(const struct pstree_item *i) } enum { - FDS_EVENT_BIT = 0, + FDS_EVENT_BIT = 0, }; #define FDS_EVENT (1 << FDS_EVENT_BIT) -struct pstree_item *current; +extern struct pstree_item *current; struct rst_info; /* See alloc_pstree_item() for details */ @@ -51,6 +51,11 @@ static inline struct rst_info *rsti(struct pstree_item *i) return (struct rst_info *)(i + 1); } +struct thread_lsm { + char *profile; + char *sockcreate; +}; + struct ns_id; struct dmp_info { struct ns_id *netns; @@ -58,6 +63,14 @@ struct dmp_info { struct parasite_ctl *parasite_ctl; struct parasite_thread_ctl **thread_ctls; uint64_t *thread_sp; + struct criu_rseq_cs *thread_rseq_cs; + + /* + * Although we don't support dumping different struct creds in general, + * we do for threads. Let's keep track of their profiles here; a NULL + * entry means there was no LSM profile for this thread. 
+ */ + struct thread_lsm **thread_lsms; }; static inline struct dmp_info *dmpi(const struct pstree_item *i) @@ -68,8 +81,7 @@ static inline struct dmp_info *dmpi(const struct pstree_item *i) /* ids is allocated and initialized for all alive tasks */ static inline int shared_fdtable(struct pstree_item *item) { - return (item->parent && - item->ids->files_id == item->parent->ids->files_id); + return (item->parent && item->ids->files_id == item->parent->ids->files_id); } static inline bool is_alive_state(int state) @@ -92,9 +104,9 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; +extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); -#define for_each_pstree_item(pi) \ - for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) +#define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) extern bool restore_before_setsid(struct pstree_item *child); extern int prepare_pstree(void); @@ -113,7 +125,7 @@ extern int prepare_task_entries(void); extern int prepare_dummy_task_state(struct pstree_item *pi); extern int get_task_ids(struct pstree_item *); -extern struct _TaskKobjIdsEntry *root_ids; +extern TaskKobjIdsEntry *root_ids; extern void core_entry_free(CoreEntry *core); extern CoreEntry *core_entry_alloc(int alloc_thread_info, int alloc_tc); diff --git a/criu/include/ptrace-compat.h b/criu/include/ptrace-compat.h index e16fef036..a2f211129 100644 --- a/criu/include/ptrace-compat.h +++ b/criu/include/ptrace-compat.h @@ -7,9 +7,9 @@ #ifndef CONFIG_HAS_PTRACE_PEEKSIGINFO struct ptrace_peeksiginfo_args { - __u64 off; /* from which siginfo to start */ + __u64 off; /* from which siginfo to start */ __u32 flags; - __u32 nr; /* how may siginfos to take */ + __u32 nr; /* how may siginfos to take */ }; #endif diff --git a/criu/include/rbtree.h b/criu/include/rbtree.h index 
007950638..6981aa8f9 100644 --- a/criu/include/rbtree.h +++ b/criu/include/rbtree.h @@ -9,26 +9,32 @@ #include "common/compiler.h" -#define RB_RED 0 -#define RB_BLACK 1 -#define RB_MASK 3 +#define RB_RED 0 +#define RB_BLACK 1 +#define RB_MASK 3 struct rb_node { - unsigned long rb_parent_color; /* Keeps both parent anc color */ - struct rb_node *rb_right; - struct rb_node *rb_left; + unsigned long rb_parent_color; /* Keeps both parent and color */ + struct rb_node *rb_right; + struct rb_node *rb_left; } __aligned(sizeof(long)); struct rb_root { - struct rb_node *rb_node; + struct rb_node *rb_node; }; -#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_MASK)) -#define rb_color(r) ((r)->rb_parent_color & RB_BLACK) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) (rb_color(r)) -#define rb_set_red(r) do { (r)->rb_parent_color &= ~RB_BLACK; } while (0) -#define rb_set_black(r) do { (r)->rb_parent_color |= RB_BLACK; } while (0) +#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_MASK)) +#define rb_color(r) ((r)->rb_parent_color & RB_BLACK) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) (rb_color(r)) +#define rb_set_red(r) \ + do { \ + (r)->rb_parent_color &= ~RB_BLACK; \ + } while (0) +#define rb_set_black(r) \ + do { \ + (r)->rb_parent_color |= RB_BLACK; \ + } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { @@ -40,16 +46,20 @@ static inline void rb_set_color(struct rb_node *rb, int color) rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color; } -#define RB_ROOT (struct rb_root){ NULL, } -#define rb_entry(ptr, type, member) container_of(ptr, type, member) +#define RB_ROOT \ + (struct rb_root) \ + { \ + NULL, \ + } +#define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#define 
RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_NODE(node) (rb_parent(node) == node) +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) static inline void rb_init_node(struct rb_node *node) { - *node = (struct rb_node){ }; + *node = (struct rb_node){}; RB_CLEAR_NODE(node); } @@ -64,11 +74,9 @@ extern struct rb_node *rb_next(const struct rb_node *node); extern struct rb_node *rb_prev(const struct rb_node *node); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ -extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, - struct rb_root *root); +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, - struct rb_node **rb_link) +static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; @@ -76,10 +84,8 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, *rb_link = node; } -static inline void rb_link_and_balance(struct rb_root *root, - struct rb_node *node, - struct rb_node *parent, - struct rb_node **rb_link) +static inline void rb_link_and_balance(struct rb_root *root, struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) { rb_link_node(node, parent, rb_link); rb_insert_color(node, root); diff --git a/criu/include/restore.h b/criu/include/restore.h index 8ef0dbddf..189051826 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -7,4 +7,57 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); +struct task_restore_args; +struct pstree_item; +struct rst_shstk_info; + +#ifndef arch_shstk_prepare +static inline int arch_shstk_prepare(struct pstree_item *item, + CoreEntry *core, + struct task_restore_args *ta) +{ + return 0; +} +#define arch_shstk_prepare arch_shstk_prepare +#endif + 
+#ifndef arch_shstk_unlock +static inline int arch_shstk_unlock(struct pstree_item *item, + CoreEntry *core, pid_t pid) +{ + return 0; +} +#define arch_shstk_unlock arch_shstk_unlock +#endif + +#ifndef arch_shstk_trampoline +static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + return func(arg); +} +#define arch_shstk_trampoline arch_shstk_trampoline +#endif + +#ifndef shstk_restorer_stack_size +static always_inline long shstk_restorer_stack_size(void) +{ + return 0; +} +#endif + +#ifndef shstk_set_restorer_stack +static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + return 0; +} +#endif + +#ifndef shstk_min_mmap_addr +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long def) +{ + return def; +} +#endif + #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index b93807f5f..14c0a3768 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -28,14 +28,14 @@ /* * These *must* be power of two values. 
*/ -#define RESTORE_ARGS_SIZE (512) -#define RESTORE_STACK_REDZONE (128) -#define RESTORE_STACK_SIZE (KILO(32)) +#define RESTORE_ARGS_SIZE (512) +#define RESTORE_STACK_REDZONE (128) +#define RESTORE_STACK_SIZE (KILO(32)) struct restore_mem_zone { - u8 redzone[RESTORE_STACK_REDZONE]; - u8 stack[RESTORE_STACK_SIZE]; - u8 rt_sigframe[RESTORE_STACK_SIGFRAME]; + u8 redzone[RESTORE_STACK_REDZONE]; + u8 stack[RESTORE_STACK_SIZE]; + u8 rt_sigframe[RESTORE_STACK_SIGFRAME]; } __stack_aligned__; struct rst_sched_param { @@ -44,12 +44,22 @@ struct rst_sched_param { int prio; }; +struct rst_rseq_param { + u64 rseq_abi_pointer; + u32 rseq_abi_size; + u32 signature; +}; + struct restore_posix_timer { struct str_posix_timer spt; struct itimerspec val; int overrun; }; +#ifndef rst_shstk_info +struct rst_shstk_info {}; +#endif + /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things @@ -57,66 +67,71 @@ struct restore_posix_timer { */ struct thread_creds_args { - CredsEntry creds; + CredsEntry creds; - unsigned int cap_last_cap; + unsigned int cap_last_cap; - u32 cap_inh[CR_CAP_SIZE]; - u32 cap_prm[CR_CAP_SIZE]; - u32 cap_eff[CR_CAP_SIZE]; - u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_inh[CR_CAP_SIZE]; + u32 cap_prm[CR_CAP_SIZE]; + u32 cap_eff[CR_CAP_SIZE]; + u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; - unsigned int secbits; - char *lsm_profile; - unsigned int *groups; - char *lsm_sockcreate; + char *lsm_profile; + unsigned int *groups; + char *lsm_sockcreate; - unsigned long mem_lsm_profile_pos; - unsigned long mem_lsm_sockcreate_pos; - unsigned long mem_groups_pos; + unsigned long mem_lsm_profile_pos; + unsigned long mem_lsm_sockcreate_pos; + unsigned long mem_groups_pos; - unsigned long mem_pos_next; + unsigned long mem_pos_next; }; struct thread_seccomp_filter { - struct sock_fprog sock_fprog; - unsigned int flags; + struct sock_fprog sock_fprog; + unsigned int flags; }; struct 
thread_restore_args { - struct restore_mem_zone *mz; + struct restore_mem_zone *mz; - int pid; - UserRegsEntry gpregs; - u64 clear_tid_addr; + int pid; + UserRegsEntry gpregs; + u64 clear_tid_addr; - u64 futex_rla; - u32 futex_rla_len; + u64 futex_rla; + u32 futex_rla_len; - struct rst_sched_param sp; + struct rst_sched_param sp; - struct task_restore_args *ta; + struct task_restore_args *ta; - tls_t tls; + tls_t tls; + struct rst_rseq_param rseq; - siginfo_t *siginfo; - unsigned int siginfo_n; + siginfo_t *siginfo; + unsigned int siginfo_n; - int pdeath_sig; + int pdeath_sig; - struct thread_creds_args *creds_args; + struct thread_creds_args *creds_args; - int seccomp_mode; - unsigned long seccomp_filters_pos; - struct thread_seccomp_filter *seccomp_filters; - void *seccomp_filters_data; - unsigned int seccomp_filters_n; - bool seccomp_force_tsync; + int seccomp_mode; + unsigned long seccomp_filters_pos; + struct thread_seccomp_filter *seccomp_filters; + void *seccomp_filters_data; + unsigned int seccomp_filters_n; + bool seccomp_force_tsync; - char comm[TASK_COMM_LEN]; + struct rst_shstk_info shstk; + + char comm[TASK_COMM_LEN]; + int cg_set; + int cgroupd_sk; } __aligned(64); -typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args); +typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); struct restore_vma_io { int nr_iovs; @@ -124,111 +139,124 @@ struct restore_vma_io { struct iovec iovs[0]; }; -#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) +#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) struct task_restore_args { - struct thread_restore_args *t; /* thread group leader */ + struct thread_restore_args *t; /* thread group leader */ - int fd_exe_link; /* opened self->exe file */ - int logfd; - unsigned int loglevel; - struct timeval logstart; + int fd_exe_link; /* opened self->exe file */ + int logfd; + unsigned int loglevel; + struct timeval 
logstart; - int uffd; - bool has_thp_enabled; + int uffd; + bool thp_disabled; /* threads restoration */ - int nr_threads; /* number of threads */ - thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */ - struct thread_restore_args *thread_args; /* array of thread arguments */ - struct task_entries *task_entries; - void *rst_mem; - unsigned long rst_mem_size; + int nr_threads; /* number of threads */ + thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */ + struct thread_restore_args *thread_args; /* array of thread arguments */ + struct task_entries *task_entries; + void *rst_mem; + unsigned long rst_mem_size; /* Below arrays get remapped from RM_PRIVATE in sigreturn_restore */ - VmaEntry *vmas; - unsigned int vmas_n; + VmaEntry *vmas; + unsigned int vmas_n; - int vma_ios_fd; - struct restore_vma_io *vma_ios; - unsigned int vma_ios_n; + int vma_ios_fd; + struct restore_vma_io *vma_ios; + unsigned int vma_ios_n; - struct restore_posix_timer *posix_timers; - unsigned int posix_timers_n; + struct restore_posix_timer *posix_timers; + unsigned int posix_timers_n; + bool posix_timer_cr_ids; - struct restore_timerfd *timerfd; - unsigned int timerfd_n; + struct restore_timerfd *timerfd; + unsigned int timerfd_n; - siginfo_t *siginfo; - unsigned int siginfo_n; + siginfo_t *siginfo; + unsigned int siginfo_n; - struct rst_tcp_sock *tcp_socks; - unsigned int tcp_socks_n; + struct rst_tcp_sock *tcp_socks; + unsigned int tcp_socks_n; - struct rst_aio_ring *rings; - unsigned int rings_n; + struct rst_aio_ring *rings; + unsigned int rings_n; - struct rlimit64 *rlims; - unsigned int rlims_n; + struct rlimit64 *rlims; + unsigned int rlims_n; - pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */; - unsigned int helpers_n; + pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */; + unsigned int helpers_n; - pid_t *zombies; - unsigned int zombies_n; + pid_t *zombies; + unsigned int zombies_n; - int 
*inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */ - unsigned int inotify_fds_n; + int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */ + unsigned int inotify_fds_n; /* * * * * * * * * * * * * * * * * * * * */ - unsigned long task_size; - unsigned long premmapped_addr; - unsigned long premmapped_len; - rt_sigaction_t sigchld_act; + unsigned long task_size; + unsigned long premmapped_addr; + unsigned long premmapped_len; + rt_sigaction_t sigchld_act; - void *bootstrap_start; - unsigned long bootstrap_len; + void *bootstrap_start; + unsigned long bootstrap_len; - struct itimerval itimers[3]; + struct itimerval itimers[3]; - MmEntry mm; - auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; - u32 mm_saved_auxv_size; - char comm[TASK_COMM_LEN]; + MmEntry mm; + auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; + u32 mm_saved_auxv_size; + char comm[TASK_COMM_LEN]; /* * proc_fd is a handle to /proc that the restorer blob can use to open * files there, because some of them can't be opened before the * restorer blob is called. 
*/ - int proc_fd; + int proc_fd; - int seccomp_mode; + int seccomp_mode; - bool compatible_mode; + bool compatible_mode; - bool can_map_vdso; - bool auto_dedup; - unsigned long vdso_rt_size; - struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ - unsigned long vdso_rt_parked_at; /* safe place to keep vdso */ - void **breakpoint; + bool can_map_vdso; + bool auto_dedup; + unsigned long vdso_rt_size; + struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ + unsigned long vdso_rt_parked_at; /* safe place to keep vdso */ + void **breakpoint; - enum faults fault_strategy; + enum faults fault_strategy; #ifdef ARCH_HAS_LONG_PAGES - unsigned page_size; + unsigned page_size; #endif - int lsm_type; - int child_subreaper; + int lsm_type; + int child_subreaper; + int membarrier_registration_mask; + bool has_clone3_set_tid; + + /* + * info about rseq from libc used to + * unregister it before memory restoration procedure + */ + struct rst_rseq_param libc_rseq; + + uid_t uid; + u32 cap_eff[CR_CAP_SIZE]; + + struct rst_shstk_info shstk; } __aligned(64); /* * For arm64 stack needs to aligned to 16 bytes. * Hence align to 16 bytes for all */ -#define RESTORE_ALIGN_STACK(start, size) \ - (ALIGN((start) + (size) - 16, 16)) +#define RESTORE_ALIGN_STACK(start, size) (ALIGN((start) + (size)-16, 16)) static inline unsigned long restorer_stack(struct restore_mem_zone *mz) { @@ -244,12 +272,12 @@ enum { * The first stated stage is CR_STATE_ROOT_TASK which is started * right before calling fork_with_pid() for the root_item. */ - CR_STATE_FAIL = -1, + CR_STATE_FAIL = -1, /* * Root task is created and does some pre-checks. * After the stage ACT_SETUP_NS scripts are performed. */ - CR_STATE_ROOT_TASK = 0, + CR_STATE_ROOT_TASK = 0, /* * The prepare_namespace() is called. 
* After the stage criu opens root task's mntns and @@ -303,14 +331,37 @@ enum { CR_STATE_COMPLETE }; -#define restore_finish_stage(__v, __stage) ({ \ - futex_dec_and_wake(&(__v)->nr_in_progress); \ - futex_wait_while(&(__v)->start, __stage); \ - (s32) futex_get(&(__v)->start); \ +#define restore_finish_stage(__v, __stage) \ + ({ \ + futex_dec_and_wake(&(__v)->nr_in_progress); \ + futex_wait_while(&(__v)->start, __stage); \ + (s32) futex_get(&(__v)->start); \ }) +#define __r_sym(name) restorer_sym##name +#define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name)) -#define __r_sym(name) restorer_sym ## name -#define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name)) +#ifndef arch_shstk_switch_to_restorer +static inline int arch_shstk_switch_to_restorer(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_switch_to_restorer arch_shstk_switch_to_restorer +#endif + +#ifndef arch_shstk_restore +static inline int arch_shstk_restore(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_restore arch_shstk_restore +#endif + +#ifndef shstk_vma_restore +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + return -1; +} +#endif #endif /* __CR_RESTORER_H__ */ diff --git a/criu/include/rst-malloc.h b/criu/include/rst-malloc.h index 67391ba77..918359a93 100644 --- a/criu/include/rst-malloc.h +++ b/criu/include/rst-malloc.h @@ -52,9 +52,10 @@ extern void rst_mem_switch_to_private(void); */ extern unsigned long rst_mem_align_cpos(int type); extern void *rst_mem_remap_ptr(unsigned long pos, int type); -#define RST_MEM_FIXUP_PPTR(ptr) do { \ - ptr = rst_mem_remap_ptr((unsigned long)ptr, RM_PRIVATE);\ -} while (0) +#define RST_MEM_FIXUP_PPTR(ptr) \ + do { \ + ptr = rst_mem_remap_ptr((unsigned long)ptr, RM_PRIVATE); \ + } while (0) /* * Allocate and free objects. 
We don't need to free arbitrary diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 07c634f4a..deb297e5f 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -1,9 +1,13 @@ #ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ +#include "asm/restore.h" #include "common/lock.h" #include "common/list.h" #include "vma.h" +#include "kerndat.h" +#include "images/mm.pb-c.h" +#include "images/core.pb-c.h" struct task_entries { int nr_threads, nr_tasks, nr_helpers; @@ -11,65 +15,79 @@ struct task_entries { futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; + mutex_t cgroupd_sync_lock; mutex_t last_pid_mutex; }; struct fdt { - int nr; /* How many tasks share this fd table */ - pid_t pid; /* Who should restore this fd table */ + int nr; /* How many tasks share this fd table */ + pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr - * The fdt table was restrored, if fdt_lock is equal to nr + 1 + * The fdt table was restored, if fdt_lock is equal to nr + 1 */ - futex_t fdt_lock; + futex_t fdt_lock; }; -struct _MmEntry; +struct rst_rseq { + uint64_t rseq_abi_pointer; + uint64_t rseq_cs_pointer; +}; + +#ifndef ARCH_RST_INFO +struct rst_arch_info { +}; +#endif struct rst_info { - struct list_head fds; + struct list_head fds; - void *premmapped_addr; - unsigned long premmapped_len; - unsigned long clone_flags; + void *premmapped_addr; + unsigned long premmapped_len; + unsigned long clone_flags; - void *munmap_restorer; + void *munmap_restorer; int service_fd_id; - struct fdt *fdt; + struct fdt *fdt; - struct vm_area_list vmas; - struct _MmEntry *mm; - struct list_head vma_io; - unsigned int pages_img_id; + struct vm_area_list vmas; + MmEntry *mm; + struct list_head vma_io; + unsigned int pages_img_id; - u32 cg_set; + u32 cg_set; union { - struct pstree_item *pgrp_leader; - futex_t pgrp_set; + struct pstree_item *pgrp_leader; + futex_t pgrp_set; }; - struct file_desc *cwd; - 
struct file_desc *root; - bool has_umask; - u32 umask; + struct file_desc *cwd; + struct file_desc *root; + bool has_umask; + u32 umask; /* * We set this flag when process has seccomp filters * so that we know to suspend them before we unmap the * restorer blob. */ - bool has_seccomp; + bool has_seccomp; /* * To be compatible with old images where filters * are bound to group leader and we need to use tsync flag. */ - bool has_old_seccomp_filter; + bool has_old_seccomp_filter; - bool has_thp_enabled; + struct rst_rseq *rseqe; - void *breakpoint; + futex_t shstk_enable; + futex_t shstk_unlock; + + void *breakpoint; + + struct rst_arch_info arch_info; }; extern struct task_entries *task_entries; diff --git a/criu/include/sched.h b/criu/include/sched.h new file mode 100644 index 000000000..9f9f993ce --- /dev/null +++ b/criu/include/sched.h @@ -0,0 +1,33 @@ +#ifndef __CR_SCHED_H__ +#define __CR_SCHED_H__ + +#include + +#ifndef ptr_to_u64 +#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) +#endif +#ifndef u64_to_ptr +#define u64_to_ptr(x) ((void *)(uintptr_t)x) +#endif + +/* + * This structure is needed by clone3(). The kernel + * calls it 'struct clone_args'. As CRIU will always + * need at least this part of the structure (VER1) + * to be able to test if clone3() with set_tid works, + * the structure is defined here as 'struct _clone_args'. + */ + +struct _clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; +}; +#endif /* __CR_SCHED_H__ */ diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h index 8e200124e..e20b9d655 100644 --- a/criu/include/seccomp.h +++ b/criu/include/seccomp.h @@ -45,19 +45,19 @@ struct rb_node; * which has no filters yet. 
*/ struct seccomp_filter_chain { - struct seccomp_filter_chain *prev; - SeccompFilter filter; + struct seccomp_filter_chain *prev; + SeccompFilter filter; }; struct seccomp_entry { - struct rb_node node; - struct seccomp_entry *next; - pid_t tid_real; - size_t img_filter_pos; - unsigned int mode; + struct rb_node node; + struct seccomp_entry *next; + pid_t tid_real; + size_t img_filter_pos; + unsigned int mode; - struct seccomp_filter_chain *chain; - size_t nr_chains; + struct seccomp_filter_chain *chain; + size_t nr_chains; }; extern struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory); diff --git a/criu/include/seize.h b/criu/include/seize.h index cf7366cb0..fc7facad3 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,8 +2,14 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +extern int checkpoint_devices(void); +struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); +extern char *task_comm_info(pid_t pid, char *comm, size_t size); +extern char *__task_comm_info(pid_t pid); +extern void set_compel_interrupt_only_mode(void); + #endif diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index 986c46af5..4265d94ed 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -14,17 +14,20 @@ enum sfd_type { LOG_FD_OFF, IMG_FD_OFF, - PROC_FD_OFF, /* fd with /proc for all proc_ calls */ + IMG_STREAMER_FD_OFF, + PROC_FD_OFF, /* fd with /proc for all proc_ calls */ PROC_PID_FD_OFF, - CR_PROC_FD_OFF, /* some other's proc fd: + PROC_SELF_FD_OFF, + CR_PROC_FD_OFF, /* some other's proc fd: * - For dump -- target ns' proc * - For restore -- CRIU ns' proc */ - ROOT_FD_OFF, /* Root of the namespace we dump/restore */ + ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, - USERNSD_SK, /* Socket for usernsd */ - NS_FD_OFF, /* Node's net namespace fd */ - 
TRANSPORT_FD_OFF, /* to transfer file descriptors */ + CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ + USERNSD_SK, /* Socket for usernsd */ + NS_FD_OFF, /* Node's net namespace fd */ + TRANSPORT_FD_OFF, /* to transfer file descriptors */ RPC_SK_OFF, FDSTORE_SK_OFF, @@ -34,7 +37,6 @@ enum sfd_type { struct pstree_item; extern bool sfds_protected; - extern const char *sfd_type_name(enum sfd_type type); extern int init_service_fd(void); extern int get_service_fd(enum sfd_type type); @@ -43,6 +45,7 @@ extern bool is_service_fd(int fd, enum sfd_type type); extern int service_fd_min_fd(struct pstree_item *item); extern int install_service_fd(enum sfd_type type, int fd); extern int close_service_fd(enum sfd_type type); +extern void __close_service_fd(enum sfd_type type); extern int clone_service_fd(struct pstree_item *me); #endif /* __CR_SERVICE_FD_H__ */ diff --git a/criu/include/setproctitle.h b/criu/include/setproctitle.h index bc634331b..a4873578a 100644 --- a/criu/include/setproctitle.h +++ b/criu/include/setproctitle.h @@ -1,19 +1,7 @@ #ifndef __CR_SETPROCTITLE_H__ #define __CR_SETPROCTITLE_H__ -#ifdef CONFIG_HAS_LIBBSD -#include -#else - -/* - * setproctitle_init is in the libbsd since v0.6.0. This macro allows to - * compile criu with libbsd<0.6.0. - */ -#ifndef CONFIG_HAS_SETPROCTITLE_INIT -#define setproctitle_init(argc, argv, envp) -#endif - -#define setproctitle(fmt, ...) 
-#endif +extern void __setproctitle_init(int argc, char *argv[], char *envp[]); +extern void __setproctitle(const char *fmt, ...); #endif /* __CR_SETPROCTITLE_H__ */ diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 04ab8d076..15cab1146 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -4,18 +4,20 @@ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" +#include "pagemap-cache.h" -struct _VmaEntry; struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); -extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc); extern int fixup_sysv_shmems(void); +extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); extern int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shmid); +extern int restore_memfd_shmem_content(int fd, unsigned long shmid, unsigned long size); -#define SYSV_SHMEM_SKIP_FD (0x7fffffff) +#define SYSV_SHMEM_SKIP_FD (0x7fffffff) #endif /* __CR_SHMEM_H__ */ diff --git a/criu/include/sigact.h b/criu/include/sigact.h new file mode 100644 index 000000000..4df011f96 --- /dev/null +++ b/criu/include/sigact.h @@ -0,0 +1,14 @@ +#ifndef __CR_SIGACT_H__ +#define __CR_SIGACT_H__ + +#include "images/core.pb-c.h" + +extern rt_sigaction_t sigchld_act; + +struct parasite_ctl; +struct pstree_item; + +extern int prepare_sigactions(CoreEntry *core); +extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); + +#endif diff --git a/criu/include/sigframe.h b/criu/include/sigframe.h index b63d9f0e5..539d7b17c 100644 --- a/criu/include/sigframe.h +++ b/criu/include/sigframe.h @@ -8,9 +8,7 @@ #include #include "images/core.pb-c.h" -extern int construct_sigframe(struct rt_sigframe 
*sigframe, - struct rt_sigframe *rsigframe, - k_rtsigset_t *blkset, +extern int construct_sigframe(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe, k_rtsigset_t *blkset, CoreEntry *core); #endif /* __CR_SIGFRAME_H__ */ diff --git a/criu/include/sizes.h b/criu/include/sizes.h new file mode 100644 index 000000000..0ec977fc0 --- /dev/null +++ b/criu/include/sizes.h @@ -0,0 +1,50 @@ +#ifndef __CR_SIZES_H__ +#define __CR_SIZES_H__ + +/* + * Copied from the Linux kernel header include/linux/sizes.h + */ + +#define SZ_1 0x00000001 +#define SZ_2 0x00000002 +#define SZ_4 0x00000004 +#define SZ_8 0x00000008 +#define SZ_16 0x00000010 +#define SZ_32 0x00000020 +#define SZ_64 0x00000040 +#define SZ_128 0x00000080 +#define SZ_256 0x00000100 +#define SZ_512 0x00000200 + +#define SZ_1K 0x00000400 +#define SZ_2K 0x00000800 +#define SZ_4K 0x00001000 +#define SZ_8K 0x00002000 +#define SZ_16K 0x00004000 +#define SZ_32K 0x00008000 +#define SZ_64K 0x00010000 +#define SZ_128K 0x00020000 +#define SZ_256K 0x00040000 +#define SZ_512K 0x00080000 + +#define SZ_1M 0x00100000 +#define SZ_2M 0x00200000 +#define SZ_4M 0x00400000 +#define SZ_8M 0x00800000 +#define SZ_16M 0x01000000 +#define SZ_32M 0x02000000 +#define SZ_64M 0x04000000 +#define SZ_128M 0x08000000 +#define SZ_256M 0x10000000 +#define SZ_512M 0x20000000 + +#define SZ_1G 0x40000000 +#define SZ_2G 0x80000000 + +#define SZ_4G 0x100000000ULL +#define SZ_8G 0x200000000ULL +#define SZ_16G 0x400000000ULL +#define SZ_32G 0x800000000ULL +#define SZ_64T 0x400000000000ULL + +#endif /* __CR_SIZES_H__ */ diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 79966517b..69ee8589e 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -8,39 +8,39 @@ #include "common/list.h" #include "images/sk-inet.pb-c.h" -#define INET_ADDR_LEN 48 /* max of INET_ADDRSTRLEN and INET6_ADDRSTRLEN */ +#define INET_ADDR_LEN 48 /* max of INET_ADDRSTRLEN and INET6_ADDRSTRLEN */ #ifndef TCP_REPAIR -#define TCP_REPAIR 19 /* TCP sock is 
under repair right now */ -#define TCP_REPAIR_QUEUE 20 -#define TCP_QUEUE_SEQ 21 -#define TCP_REPAIR_OPTIONS 22 +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR_QUEUE 20 +#define TCP_QUEUE_SEQ 21 +#define TCP_REPAIR_OPTIONS 22 #endif #ifndef IP_HDRINCL -# define IP_HDRINCL 3 +#define IP_HDRINCL 3 #endif #ifndef IP_NODEFRAG -# define IP_NODEFRAG 22 +#define IP_NODEFRAG 22 #endif #ifndef IPV6_HDRINCL -# define IPV6_HDRINCL 36 +#define IPV6_HDRINCL 36 #endif struct inet_sk_desc { - struct socket_desc sd; - unsigned int type; - unsigned int src_port; - unsigned int dst_port; - unsigned int state; - unsigned int rqlen; - unsigned int wqlen; /* sent + unsent data */ - unsigned int uwqlen; /* unsent data */ - unsigned int src_addr[4]; - unsigned int dst_addr[4]; - unsigned short shutdown; - bool cork; + struct socket_desc sd; + unsigned int type; + unsigned int src_port; + unsigned int dst_port; + unsigned int state; + unsigned int rqlen; + unsigned int wqlen; /* sent + unsent data */ + unsigned int uwqlen; /* unsent data */ + unsigned int src_addr[4]; + unsigned int dst_addr[4]; + unsigned short shutdown; + bool cork; int rfd; int cpt_reuseaddr; @@ -68,7 +68,8 @@ extern int inet_bind(int sk, struct inet_sk_info *); extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC -#define setsockopt sys_setsockopt +#define setsockopt sys_setsockopt +#define pr_perror(fmt, ...) 
pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { @@ -76,30 +77,32 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_err("Failed to turn off repair mode on socket: %m\n"); + pr_perror("Failed to turn off repair mode on socket %d", fd); } extern void tcp_locked_conn_add(struct inet_sk_info *); extern void rst_unlock_tcp_connections(void); extern void cpt_unlock_tcp_connections(void); -extern int dump_one_tcp(int sk, struct inet_sk_desc *sd); +extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); -#define SK_EST_PARAM "tcp-established" +extern int dump_tcp_opts(int sk, TcpOptsEntry *toe); +extern int restore_tcp_opts(int sk, TcpOptsEntry *toe); + +#define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" -#define SK_CLOSE_PARAM "tcp-close" +#define SK_CLOSE_PARAM "tcp-close" struct task_restore_args; int prepare_tcp_socks(struct task_restore_args *); struct rst_tcp_sock { - int sk; - bool reuseaddr; + int sk; + bool reuseaddr; }; union libsoccr_addr; -int restore_sockaddr(union libsoccr_addr *sa, - int family, u32 pb_port, u32 *pb_addr, u32 ifindex); +int restore_sockaddr(union libsoccr_addr *sa, int family, u32 pb_port, u32 *pb_addr, u32 ifindex); #endif /* __CR_SK_INET_H__ */ diff --git a/criu/include/sk-packet.h b/criu/include/sk-packet.h index a0738ae0c..5fe6d3d62 100644 --- a/criu/include/sk-packet.h +++ b/criu/include/sk-packet.h @@ -2,7 +2,7 @@ #define __CR_SK_PACKET_H__ #ifndef PACKET_TIMESTAMP -#define PACKET_TIMESTAMP 17 +#define PACKET_TIMESTAMP 17 #endif struct cr_imgset; @@ -22,7 +22,7 @@ extern int packet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg); #endif #ifndef PACKET_FANOUT -#define PACKET_FANOUT 18 +#define PACKET_FANOUT 18 #endif #ifndef TPACKET3_HDRLEN diff --git a/criu/include/sockets.h 
b/criu/include/sockets.h index cd98d18e0..6c81d3edd 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -17,16 +17,17 @@ struct nlmsghdr; struct cr_img; struct socket_desc { - unsigned int family; - unsigned int ino; - struct socket_desc *next; - struct ns_id *sk_ns; - int already_dumped; + unsigned int family; + unsigned int ino; + struct socket_desc *next; + struct ns_id *sk_ns; + int already_dumped; }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); -extern int dump_socket_opts(int sk, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); +extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); @@ -62,24 +63,27 @@ extern int unix_sk_id_add(unsigned int ino); extern int unix_sk_ids_parse(char *optarg); extern int unix_prepare_root_shared(void); -extern int do_dump_opt(int sk, int level, int name, void *val, int len); -#define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f)) -extern int do_restore_opt(int sk, int level, int name, void *val, int len); -#define restore_opt(s, l, n, f) do_restore_opt(s, l, n, f, sizeof(*f)) +extern void init_sk_info_hash(void); -#define sk_encode_shutdown(img, mask) do { \ +extern int do_dump_opt(int sk, int level, int name, void *val, int len); +#define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f)) +extern int do_restore_opt(int sk, int level, int name, void *val, int len); +#define restore_opt(s, l, n, f) do_restore_opt(s, l, n, f, sizeof(*f)) + +#define sk_encode_shutdown(img, mask) \ + do { \ /* \ * protobuf SK_SHUTDOWN__ bits match those \ * reported by kernel \ - */ \ - (img)->shutdown = mask; \ - if ((img)->shutdown != SK_SHUTDOWN__NONE) \ - (img)->has_shutdown = true; \ + */ \ + (img)->shutdown = mask; \ + if ((img)->shutdown != SK_SHUTDOWN__NONE) \ + 
(img)->has_shutdown = true; \ } while (0) static inline int sk_decode_shutdown(int val) { - static const int hows[] = {-1, SHUT_RD, SHUT_WR, SHUT_RDWR}; + static const int hows[] = { -1, SHUT_RD, SHUT_WR, SHUT_RDWR }; return hows[val]; } @@ -92,7 +96,7 @@ static inline int sk_decode_shutdown(int val) extern int set_netns(uint32_t ns_id); #ifndef SIOCGSKNS -#define SIOCGSKNS 0x894C /* get socket network namespace */ +#define SIOCGSKNS 0x894C /* get socket network namespace */ #endif extern int kerndat_socket_netns(void); @@ -105,19 +109,23 @@ extern const char *socket_proto_name(unsigned int proto, char *nm, size_t size); #define __tcp_state_name(state, a) tcp_state_name(state, a, sizeof(a)) #define __socket_type_name(type, a) socket_type_name(type, a, sizeof(a)) -#define __socket_family_name(family, a) socket_family_name(family, a, sizeof(a)) +#define __socket_family_name(family, a) socket_family_name(family, a, sizeof(a)) #define __socket_proto_name(proto, a) socket_proto_name(proto, a, sizeof(a)) -#define __socket_info_helper(__h, __v) \ - ({ \ - char *__nm = alloca(32); \ - const char *__r = __h(__v, __nm, 32); \ - __r; \ +#define __socket_info_helper(__h, __v) \ + ({ \ + char *__nm = alloca(32); \ + const char *__r = __h(__v, __nm, 32); \ + __r; \ }) -#define ___tcp_state_name(state) __socket_info_helper(tcp_state_name, state) -#define ___socket_type_name(type) __socket_info_helper(socket_type_name, type) -#define ___socket_family_name(family) __socket_info_helper(socket_family_name, family) -#define ___socket_proto_name(proto) __socket_info_helper(socket_proto_name, proto) +#define ___tcp_state_name(state) __socket_info_helper(tcp_state_name, state) +#define ___socket_type_name(type) __socket_info_helper(socket_type_name, type) +#define ___socket_family_name(family) __socket_info_helper(socket_family_name, family) +#define ___socket_proto_name(proto) __socket_info_helper(socket_proto_name, proto) + +#ifndef SO_BUF_LOCK +#define SO_BUF_LOCK 72 +#endif #endif /* 
__CR_SOCKETS_H__ */ diff --git a/criu/include/stats.h b/criu/include/stats.h index bab9a0507..d8dd15998 100644 --- a/criu/include/stats.h +++ b/criu/include/stats.h @@ -45,9 +45,10 @@ enum { }; extern void cnt_add(int c, unsigned long val); +extern void cnt_sub(int c, unsigned long val); -#define DUMP_STATS 1 -#define RESTORE_STATS 2 +#define DUMP_STATS 1 +#define RESTORE_STATS 2 extern int init_stats(int what); extern void write_stats(int what); diff --git a/criu/include/string.h b/criu/include/string.h index bc5f9d219..4c71d961c 100644 --- a/criu/include/string.h +++ b/criu/include/string.h @@ -3,18 +3,9 @@ #include -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - #include "common/config.h" -#ifndef CONFIG_HAS_STRLCPY -extern size_t strlcpy(char *dest, const char *src, size_t size); -#endif - -#ifndef CONFIG_HAS_STRLCAT -extern size_t strlcat(char *dest, const char *src, size_t count); -#endif +extern size_t __strlcpy(char *dest, const char *src, size_t size); +extern size_t __strlcat(char *dest, const char *src, size_t count); #endif /* __CR_STRING_H__ */ diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index e271f5ead..2d689a9a0 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -2,10 +2,10 @@ #define __CR_SYSCTL_H__ struct sysctl_req { - char *name; - void *arg; - int type; - int flags; + char *name; + void *arg; + int type; + int flags; }; extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns); @@ -15,27 +15,28 @@ enum { CTL_WRITE, }; -#define CTL_SHIFT 4 /* Up to 16 types */ +#define CTL_SHIFT 4 /* Up to 16 types */ -#define CTL_U32 1 /* Single u32 */ -#define CTL_U64 2 /* Single u64 */ -#define __CTL_U32A 3 /* Array of u32 */ -#define __CTL_U64A 4 /* Array of u64 */ -#define __CTL_STR 5 /* String */ -#define CTL_32 6 /* Single s32 */ +#define CTL_U32 1 /* Single u32 */ +#define CTL_U64 2 /* Single u64 */ +#define __CTL_U32A 3 /* Array of u32 */ +#define __CTL_U64A 4 /* Array of u64 */ +#define __CTL_STR 5 
/* String */ +#define CTL_32 6 /* Single s32 */ -#define CTL_U32A(n) (__CTL_U32A | ((n) << CTL_SHIFT)) -#define CTL_U64A(n) (__CTL_U64A | ((n) << CTL_SHIFT)) -#define CTL_STR(len) (__CTL_STR | ((len) << CTL_SHIFT)) +#define CTL_U32A(n) (__CTL_U32A | ((n) << CTL_SHIFT)) +#define CTL_U64A(n) (__CTL_U64A | ((n) << CTL_SHIFT)) +#define CTL_STR(len) (__CTL_STR | ((len) << CTL_SHIFT)) -#define CTL_LEN(t) ((t) >> CTL_SHIFT) -#define CTL_TYPE(t) ((t) & ((1 << CTL_SHIFT) - 1)) +#define CTL_LEN(t) ((t) >> CTL_SHIFT) +#define CTL_TYPE(t) ((t) & ((1 << CTL_SHIFT) - 1)) /* * Some entries might be missing mark them as optional. */ -#define CTL_FLAGS_OPTIONAL 1 -#define CTL_FLAGS_HAS 2 -#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_OPTIONAL 1 +#define CTL_FLAGS_HAS 2 +#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 #endif /* __CR_SYSCTL_H__ */ diff --git a/criu/include/sysfs_parse.h b/criu/include/sysfs_parse.h index 3ba06ed56..f987d622f 100644 --- a/criu/include/sysfs_parse.h +++ b/criu/include/sysfs_parse.h @@ -2,9 +2,9 @@ #define __CR_SYSFS_PARSE_H__ #define SYSFS_AUFS "/sys/fs/aufs/" -#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ -#define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/ */ -#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ +#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ +#define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/ */ +#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ struct mount_info; struct vma_area; @@ -14,4 +14,3 @@ extern int fixup_aufs_vma_fd(struct vma_area *vma, int vm_file_fd); extern void free_aufs_branches(void); #endif /* __CR_SYSFS_PARSE_H__ */ - diff --git a/criu/include/timens.h b/criu/include/timens.h new file mode 100644 index 000000000..0567c5828 --- /dev/null +++ b/criu/include/timens.h @@ -0,0 +1,10 @@ +#ifndef __CR_TIME_NS_H__ +#define __CR_TIME_NS_H__ + +extern int dump_time_ns(int ns_id); +extern int 
prepare_timens(int pid); + +extern struct ns_desc time_ns_desc; +extern struct ns_desc time_for_children_ns_desc; + +#endif /* __CR_TIME_NS_H__ */ diff --git a/criu/include/timer.h b/criu/include/timer.h new file mode 100644 index 000000000..d1deb6051 --- /dev/null +++ b/criu/include/timer.h @@ -0,0 +1,17 @@ +#ifndef __CR_TIMER_H__ +#define __CR_TIMER_H__ + +#include "images/core.pb-c.h" + +struct task_restore_args; +struct pstree_item; +struct parasite_ctl; +struct proc_posix_timers_stat; + +extern int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); +extern int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); + +extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); +extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item); +#endif diff --git a/criu/include/timerfd.h b/criu/include/timerfd.h index 2e42a74fa..866cb13c7 100644 --- a/criu/include/timerfd.h +++ b/criu/include/timerfd.h @@ -11,12 +11,12 @@ struct pstree_item; struct restore_timerfd { - int id; - int fd; - int clockid; - int settime_flags; - unsigned long ticks; - struct itimerspec val; + int id; + int fd; + int clockid; + int settime_flags; + unsigned long ticks; + struct itimerspec val; }; extern const struct fdtype_ops timerfd_dump_ops; @@ -29,18 +29,16 @@ extern int check_timerfd(void); extern int is_timerfd_link(char *link); #ifndef TFD_TIMER_ABSTIME -# define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_ABSTIME (1 << 0) #endif #ifndef TFD_IOC_SET_TICKS -# define TFD_IOC_SET_TICKS _IOW('T', 0, u64) +#define TFD_IOC_SET_TICKS _IOW('T', 0, u64) #endif static inline int verify_timerfd(TimerfdEntry *tfe) { - if (tfe->clockid != CLOCK_REALTIME && - tfe->clockid != CLOCK_BOOTTIME && - tfe->clockid != CLOCK_MONOTONIC) { + if (tfe->clockid != CLOCK_REALTIME && tfe->clockid != CLOCK_BOOTTIME && tfe->clockid != CLOCK_MONOTONIC) { 
pr_err("Unknown clock type %d for %#x\n", tfe->clockid, tfe->id); return -1; } @@ -48,5 +46,4 @@ static inline int verify_timerfd(TimerfdEntry *tfe) return 0; } - #endif /* __CR_TIMERFD_H__ */ diff --git a/criu/include/tls.h b/criu/include/tls.h index aa2517887..f563c092c 100644 --- a/criu/include/tls.h +++ b/criu/include/tls.h @@ -1,10 +1,10 @@ #ifndef __CR_TLS_H__ #define __CR_TLS_H__ -# ifdef CONFIG_GNUTLS +#ifdef CONFIG_GNUTLS int tls_x509_init(int sockfd, bool is_server); -void tls_terminate_session(); +void tls_terminate_session(bool async); ssize_t tls_send(const void *buf, size_t len, int flags); ssize_t tls_recv(void *buf, size_t len, int flags); @@ -12,14 +12,14 @@ ssize_t tls_recv(void *buf, size_t len, int flags); int tls_send_data_from_fd(int fd, unsigned long len); int tls_recv_data_to_fd(int fd, unsigned long len); -# else /* CONFIG_GNUTLS */ +#else /* CONFIG_GNUTLS */ #define tls_x509_init(sockfd, is_server) (0) -#define tls_send(buf, len, flags) (-1) -#define tls_recv(buf, len, flags) (-1) -#define tls_send_data_from_fd(fd, len) (-1) -#define tls_recv_data_to_fd(fd, len) (-1) -#define tls_terminate_session() +#define tls_send(buf, len, flags) (-1) +#define tls_recv(buf, len, flags) (-1) +#define tls_send_data_from_fd(fd, len) (-1) +#define tls_recv_data_to_fd(fd, len) (-1) +#define tls_terminate_session(async) #endif /* CONFIG_HAS_GNUTLS */ diff --git a/criu/include/tty.h b/criu/include/tty.h index 8419593e5..3f9c53116 100644 --- a/criu/include/tty.h +++ b/criu/include/tty.h @@ -7,11 +7,11 @@ #include "files.h" /* Kernel's limit */ -#define TERMIOS_NCC 19 +#define TERMIOS_NCC 19 /* Popular serial console's majors, which not defined in */ -#define USB_SERIAL_MAJOR 188 -#define LOW_DENSE_SERIAL_MAJOR 204 +#define USB_SERIAL_MAJOR 188 +#define LOW_DENSE_SERIAL_MAJOR 204 extern const struct fdtype_ops tty_dump_ops; @@ -36,6 +36,6 @@ extern int tty_init_restore(void); extern int devpts_check_bindmount(struct mount_info *m); -#define OPT_SHELL_JOB 
"shell-job" +#define OPT_SHELL_JOB "shell-job" #endif /* __CR_TTY_H__ */ diff --git a/criu/include/tun.h b/criu/include/tun.h index ce0b266a6..200ead22b 100644 --- a/criu/include/tun.h +++ b/criu/include/tun.h @@ -2,10 +2,10 @@ #define __CR_TUN_H__ #ifndef TUN_MINOR -#define TUN_MINOR 200 +#define TUN_MINOR 200 #endif -struct ns_id *ns; +extern struct ns_id *ns; #include diff --git a/criu/include/uffd.h b/criu/include/uffd.h index 814e60f33..102ff561c 100644 --- a/criu/include/uffd.h +++ b/criu/include/uffd.h @@ -3,7 +3,7 @@ struct task_restore_args; -extern int uffd_open(int flags, unsigned long *features); +extern int uffd_open(int flags, unsigned long *features, int *err); extern bool uffd_noncooperative(void); extern int setup_uffd(int pid, struct task_restore_args *task_args); extern int lazy_pages_setup_zombie(int pid); diff --git a/criu/include/unix_diag.h b/criu/include/unix_diag.h index d88d52fd3..cf612b248 100644 --- a/criu/include/unix_diag.h +++ b/criu/include/unix_diag.h @@ -2,30 +2,30 @@ #define __CR_UNIX_DIAG_H__ struct unix_diag_req { - u8 sdiag_family; - u8 sdiag_protocol; - u16 pad; - u32 udiag_states; - u32 udiag_ino; - u32 udiag_show; - u32 udiag_cookie[2]; + u8 sdiag_family; + u8 sdiag_protocol; + u16 pad; + u32 udiag_states; + u32 udiag_ino; + u32 udiag_show; + u32 udiag_cookie[2]; }; -#define UDIAG_SHOW_NAME 0x00000001 /* show name (not path) */ -#define UDIAG_SHOW_VFS 0x00000002 /* show VFS inode info */ -#define UDIAG_SHOW_PEER 0x00000004 /* show peer socket info */ -#define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */ -#define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */ -#define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */ +#define UDIAG_SHOW_NAME 0x00000001 /* show name (not path) */ +#define UDIAG_SHOW_VFS 0x00000002 /* show VFS inode info */ +#define UDIAG_SHOW_PEER 0x00000004 /* show peer socket info */ +#define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */ +#define 
UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */ +#define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */ struct unix_diag_msg { - u8 udiag_family; - u8 udiag_type; - u8 udiag_state; - u8 pad; + u8 udiag_family; + u8 udiag_type; + u8 udiag_state; + u8 pad; - u32 udiag_ino; - u32 udiag_cookie[2]; + u32 udiag_ino; + u32 udiag_cookie[2]; }; enum { @@ -53,13 +53,13 @@ enum { }; struct unix_diag_vfs { - u32 udiag_vfs_ino; - u32 udiag_vfs_dev; + u32 udiag_vfs_ino; + u32 udiag_vfs_dev; }; struct unix_diag_rqlen { - u32 udiag_rqueue; - u32 udiag_wqueue; + u32 udiag_rqueue; + u32 udiag_wqueue; }; #endif /* __CR_UNIX_DIAG_H__ */ diff --git a/criu/include/util-caps.h b/criu/include/util-caps.h new file mode 100644 index 000000000..7ccd162f5 --- /dev/null +++ b/criu/include/util-caps.h @@ -0,0 +1,58 @@ +#ifndef __CR_UTIL_CAPS_H__ +#define __CR_UTIL_CAPS_H__ + +#include + +#ifndef CAP_CHECKPOINT_RESTORE +#define CAP_CHECKPOINT_RESTORE 40 +#endif + +static inline bool has_capability(int cap, u32 *cap_eff) +{ + int mask = CAP_TO_MASK(cap); + int index = CAP_TO_INDEX(cap); + u32 effective; + + effective = cap_eff[index]; + + if (!(mask & effective)) { + pr_debug("Effective capability %d missing\n", cap); + return false; + } + + return true; +} + +static inline bool has_cap_checkpoint_restore(u32 *cap_eff) +{ + /* + * Everything guarded by CAP_CHECKPOINT_RESTORE is also + * guarded by CAP_SYS_ADMIN. Check for both capabilities. 
+ */ + if (has_capability(CAP_CHECKPOINT_RESTORE, cap_eff) || has_capability(CAP_SYS_ADMIN, cap_eff)) + return true; + + return false; +} + +static inline bool has_cap_net_admin(u32 *cap_eff) +{ + return has_capability(CAP_NET_ADMIN, cap_eff); +} + +static inline bool has_cap_sys_chroot(u32 *cap_eff) +{ + return has_capability(CAP_SYS_CHROOT, cap_eff); +} + +static inline bool has_cap_setuid(u32 *cap_eff) +{ + return has_capability(CAP_SETUID, cap_eff); +} + +static inline bool has_cap_sys_resource(u32 *cap_eff) +{ + return has_capability(CAP_SYS_RESOURCE, cap_eff); +} + +#endif /* __CR_UTIL_CAPS_H__ */ diff --git a/criu/include/util-pie.h b/criu/include/util-pie.h index a8137f441..4c622a440 100644 --- a/criu/include/util-pie.h +++ b/criu/include/util-pie.h @@ -5,12 +5,11 @@ #include #ifndef UNIX_PATH_MAX -#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \ - (size_t)((struct sockaddr_un *) 0)->sun_path) +#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif #ifndef SO_PEEK_OFF -#define SO_PEEK_OFF 42 +#define SO_PEEK_OFF 42 #endif #include "common/scm.h" diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h index 33b7411de..9fd9a6de4 100644 --- a/criu/include/util-vdso.h +++ b/criu/include/util-vdso.h @@ -23,74 +23,81 @@ #include "asm/vdso.h" struct vdso_symbol { - char name[32]; - unsigned long offset; + char name[32]; + unsigned long offset; }; struct vdso_symtable { - unsigned long vdso_size; - unsigned long vvar_size; - struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; - bool vdso_before_vvar; /* order of vdso/vvar pair */ + unsigned long vdso_size; + unsigned long vvar_size; + unsigned long vvar_vclock_size; + struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; + bool vdso_before_vvar; /* order of vdso/vvar pair */ }; struct vdso_maps { - unsigned long vdso_start; - unsigned long vvar_start; - struct vdso_symtable sym; - bool compatible; + unsigned long vdso_start; + unsigned long vvar_start; + struct 
vdso_symtable sym; + bool compatible; }; -#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, } +static inline bool vdso_is_present(struct vdso_maps *m) +{ + return m->vdso_start != VDSO_BAD_ADDR; +} -#define VDSO_SYMTABLE_INIT \ - { \ +#define VDSO_SYMBOL_INIT \ + { \ + .offset = VDSO_BAD_ADDR, \ + } + +#define VDSO_SYMTABLE_INIT \ + { \ .vdso_size = VDSO_BAD_SIZE, \ .vvar_size = VVAR_BAD_SIZE, \ .symbols = { \ [0 ... VDSO_SYMBOL_MAX - 1] = \ (struct vdso_symbol)VDSO_SYMBOL_INIT, \ }, \ - .vdso_before_vvar = false, \ + .vdso_before_vvar = false, \ } -#define VDSO_MAPS_INIT \ - { \ - .vdso_start = VDSO_BAD_ADDR, \ - .vvar_start = VVAR_BAD_ADDR, \ - .sym = VDSO_SYMTABLE_INIT, \ +#define VDSO_MAPS_INIT \ + { \ + .vdso_start = VDSO_BAD_ADDR, .vvar_start = VVAR_BAD_ADDR, .sym = VDSO_SYMTABLE_INIT, \ } #ifdef CONFIG_VDSO_32 -#define Ehdr_t Elf32_Ehdr -#define Sym_t Elf32_Sym -#define Phdr_t Elf32_Phdr -#define Word_t Elf32_Word -#define Dyn_t Elf32_Dyn +#define Ehdr_t Elf32_Ehdr +#define Sym_t Elf32_Sym +#define Phdr_t Elf32_Phdr +#define Word_t Elf32_Word +#define Dyn_t Elf32_Dyn #ifndef ELF_ST_TYPE -#define ELF_ST_TYPE ELF32_ST_TYPE +#define ELF_ST_TYPE ELF32_ST_TYPE #endif #ifndef ELF_ST_BIND -#define ELF_ST_BIND ELF32_ST_BIND +#define ELF_ST_BIND ELF32_ST_BIND #endif -# define vdso_fill_symtable vdso_fill_symtable_compat +#define vdso_fill_symtable vdso_fill_symtable_compat #else /* CONFIG_VDSO_32 */ -#define Ehdr_t Elf64_Ehdr -#define Sym_t Elf64_Sym -#define Phdr_t Elf64_Phdr -#define Word_t Elf64_Word -#define Dyn_t Elf64_Dyn +#define Ehdr_t Elf64_Ehdr +#define Sym_t Elf64_Sym +#define Phdr_t Elf64_Phdr +#define Word_t Elf64_Word +#define Dyn_t Elf64_Dyn #ifndef ELF_ST_TYPE -#define ELF_ST_TYPE ELF64_ST_TYPE +#define ELF_ST_TYPE ELF64_ST_TYPE #endif #ifndef ELF_ST_BIND -#define ELF_ST_BIND ELF64_ST_BIND +#define ELF_ST_BIND ELF64_ST_BIND #endif #endif /* CONFIG_VDSO_32 */ diff --git a/criu/include/util.h b/criu/include/util.h index 313aacd8c..55ad5b63c 100644 --- 
a/criu/include/util.h +++ b/criu/include/util.h @@ -21,50 +21,52 @@ #include "log.h" #include "common/err.h" -#define PREF_SHIFT_OP(pref, op, size) ((size) op (pref ##BYTES_SHIFT)) -#define KBYTES_SHIFT 10 -#define MBYTES_SHIFT 20 -#define GBYTES_SHIFT 30 +#include "compel/infect-util.h" -#define KBYTES(size) PREF_SHIFT_OP(K, >>, size) -#define MBYTES(size) PREF_SHIFT_OP(M, >>, size) -#define GBYTES(size) PREF_SHIFT_OP(G, >>, size) +#define PREF_SHIFT_OP(pref, op, size) ((size)op(pref##BYTES_SHIFT)) +#define KBYTES_SHIFT 10 +#define MBYTES_SHIFT 20 +#define GBYTES_SHIFT 30 -#define KILO(size) PREF_SHIFT_OP(K, <<, size) -#define MEGA(size) PREF_SHIFT_OP(M, <<, size) -#define GIGA(size) PREF_SHIFT_OP(G, <<, size) +#define KBYTES(size) PREF_SHIFT_OP(K, >>, size) +#define MBYTES(size) PREF_SHIFT_OP(M, >>, size) +#define GBYTES(size) PREF_SHIFT_OP(G, >>, size) + +#define KILO(size) PREF_SHIFT_OP(K, <<, size) +#define MEGA(size) PREF_SHIFT_OP(M, <<, size) +#define GIGA(size) PREF_SHIFT_OP(G, <<, size) struct vma_area; struct list_head; extern int service_fd_rlim_cur; -extern void pr_vma(unsigned int loglevel, const struct vma_area *vma_area); +extern void pr_vma(const struct vma_area *vma_area); -#define pr_info_vma(vma_area) pr_vma(LOG_INFO, vma_area) +#define pr_info_vma(vma_area) pr_vma(vma_area) -#define pr_vma_list(level, head) \ - do { \ - struct vma_area *vma; \ - list_for_each_entry(vma, head, list) \ - pr_vma(level, vma); \ +#define pr_vma_list(head) \ + do { \ + struct vma_area *vma; \ + list_for_each_entry(vma, head, list) \ + pr_vma(vma); \ } while (0) -#define pr_info_vma_list(head) pr_vma_list(LOG_INFO, head) +#define pr_info_vma_list(head) pr_vma_list(head) extern int move_fd_from(int *img_fd, int want_fd); extern int close_safe(int *fd); extern int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd); -#define reopen_fd_as(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, false) -#define 
reopen_fd_as_nocheck(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, true) +#define reopen_fd_as(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, false) +#define reopen_fd_as_nocheck(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, true) extern void close_proc(void); extern int open_pid_proc(pid_t pid); extern int close_pid_proc(void); extern int set_proc_fd(int fd); -extern pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid, - void *child_tid, unsigned long newtls); +extern pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, + unsigned long newtls); /* * Values for pid argument of the proc opening routines below. @@ -73,72 +75,66 @@ extern pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *par * NONE is internal, don't use it ;) */ -#define PROC_SELF 0 -#define PROC_GEN -1 -#define PROC_NONE -2 +#define PROC_SELF 0 +#define PROC_GEN -1 +#define PROC_NONE -2 -extern int do_open_proc(pid_t pid, int flags, const char *fmt, ...) - __attribute__ ((__format__ (__printf__, 3, 4))); +extern int do_open_proc(pid_t pid, int flags, const char *fmt, ...) __attribute__((__format__(__printf__, 3, 4))); -#define __open_proc(pid, ier, flags, fmt, ...) \ - ({ \ - int __fd = do_open_proc(pid, flags, \ - fmt, ##__VA_ARGS__); \ - if (__fd < 0 && (errno != (ier))) \ - pr_perror("Can't open %d/" fmt " on procfs", \ - pid, ##__VA_ARGS__); \ - \ - __fd; \ +#define __open_proc(pid, ier, flags, fmt, ...) \ + ({ \ + int __fd = do_open_proc(pid, flags, fmt, ##__VA_ARGS__); \ + if (__fd < 0 && (errno != (ier))) \ + pr_perror("Can't open %d/" fmt " on procfs", pid, ##__VA_ARGS__); \ + \ + __fd; \ }) /* int open_proc(pid_t pid, const char *fmt, ...); */ -#define open_proc(pid, fmt, ...) \ - __open_proc(pid, 0, O_RDONLY, fmt, ##__VA_ARGS__) +#define open_proc(pid, fmt, ...) 
__open_proc(pid, 0, O_RDONLY, fmt, ##__VA_ARGS__) /* int open_proc_rw(pid_t pid, const char *fmt, ...); */ -#define open_proc_rw(pid, fmt, ...) \ - __open_proc(pid, 0, O_RDWR, fmt, ##__VA_ARGS__) +#define open_proc_rw(pid, fmt, ...) __open_proc(pid, 0, O_RDWR, fmt, ##__VA_ARGS__) -#define open_proc_path(pid, fmt, ...) \ - __open_proc(pid, 0, O_PATH, fmt, ##__VA_ARGS__) +#define open_proc_path(pid, fmt, ...) __open_proc(pid, 0, O_PATH, fmt, ##__VA_ARGS__) /* DIR *opendir_proc(pid_t pid, const char *fmt, ...); */ -#define opendir_proc(pid, fmt, ...) \ - ({ \ - int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ - DIR *__d = NULL; \ - \ - if (__fd >= 0) { \ - __d = fdopendir(__fd); \ - if (__d == NULL) \ - pr_perror("Can't fdopendir %d " \ - "(%d/" fmt " on procfs)", \ - __fd, pid, ##__VA_ARGS__); \ - } \ - __d; \ - }) +#define opendir_proc(pid, fmt, ...) \ + ({ \ + int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ + DIR *__d = NULL; \ + \ + if (__fd >= 0) { \ + __d = fdopendir(__fd); \ + if (__d == NULL) \ + pr_perror("Can't fdopendir %d " \ + "(%d/" fmt " on procfs)", \ + __fd, pid, ##__VA_ARGS__); \ + } \ + __d; \ + }) /* FILE *fopen_proc(pid_t pid, const char *fmt, ...); */ -#define fopen_proc(pid, fmt, ...) \ - ({ \ - int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ - FILE *__f = NULL; \ - \ - if (__fd >= 0) { \ - __f = fdopen(__fd, "r"); \ - if (__f == NULL) \ - pr_perror("Can't fdopen %d " \ - "(%d/" fmt " on procfs)", \ - __fd, pid, ##__VA_ARGS__); \ - } \ - __f; \ - }) +#define fopen_proc(pid, fmt, ...) 
\ + ({ \ + int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \ + FILE *__f = NULL; \ + \ + if (__fd >= 0) { \ + __f = fdopen(__fd, "r"); \ + if (__f == NULL) \ + pr_perror("Can't fdopen %d " \ + "(%d/" fmt " on procfs)", \ + __fd, pid, ##__VA_ARGS__); \ + } \ + __f; \ + }) -#define DEVZERO (makedev(1, 5)) +#define DEVZERO (makedev(1, 5)) -#define KDEV_MINORBITS 20 -#define KDEV_MINORMASK ((1UL << KDEV_MINORBITS) - 1) -#define MKKDEV(ma, mi) (((ma) << KDEV_MINORBITS) | (mi)) +#define KDEV_MINORBITS 20 +#define KDEV_MINORMASK ((1UL << KDEV_MINORBITS) - 1) +#define MKKDEV(ma, mi) (((ma) << KDEV_MINORBITS) | (mi)) static inline u32 kdev_major(u32 kdev) { @@ -166,21 +162,19 @@ static inline dev_t kdev_to_odev(u32 kdev) extern int copy_file(int fd_in, int fd_out, size_t bytes); extern int is_anon_link_type(char *link, char *type); -#define is_hex_digit(c) \ - (((c) >= '0' && (c) <= '9') || \ - ((c) >= 'a' && (c) <= 'f') || \ - ((c) >= 'A' && (c) <= 'F')) +#define is_hex_digit(c) (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F')) -#define CRS_CAN_FAIL 0x1 /* cmd can validly exit with non zero code */ +#define CRS_CAN_FAIL 0x1 /* cmd can validly exit with non zero code */ extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags); -extern int cr_system_userns(int in, int out, int err, char *cmd, - char *const argv[], unsigned flags, int userns_pid); +extern int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid); +extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); extern int cr_daemon(int nochdir, int noclose, int close_fd); -extern int close_status_fd(void); +extern int status_ready(void); extern int is_root_user(void); +extern int close_fds(int minfd); -extern void set_proc_self_fd(int fd); +extern int set_proc_self_fd(int fd); static inline bool dir_dots(const struct dirent *de) { @@ -193,12 +187,12 @@ extern int is_empty_dir(int 
dirfd); * Size of buffer to carry the worst case or /proc/self/fd/N * path. Since fd is an integer, we can easily estimate one :) */ -#define PSFDS (sizeof("/proc/self/fd/2147483647")) +#define PSFDS (sizeof("/proc/self/fd/2147483647")) extern int read_fd_link(int lfd, char *buf, size_t size); -#define USEC_PER_SEC 1000000L -#define NSEC_PER_SEC 1000000000L +#define USEC_PER_SEC 1000000L +#define NSEC_PER_SEC 1000000000L int vaddr_to_pfn(int fd, unsigned long vaddr, u64 *pfn); @@ -211,7 +205,7 @@ static inline bool strstartswith2(const char *str, const char *sub, char *end) while (1) { if (*sub == '\0') /* end of sub -- match */ { if (end) { - if (*(sub-1) == '/') /* "/", "./" or "path/" */ + if (*(sub - 1) == '/') /* "/", "./" or "path/" */ *end = '/'; else *end = *str; @@ -248,10 +242,16 @@ static inline bool strstartswith(const char *str, const char *sub) static inline bool issubpath(const char *path, const char *sub_path) { char end; - return strstartswith2(path, sub_path, &end) && - (end == '/' || end == '\0'); + return strstartswith2(path, sub_path, &end) && (end == '/' || end == '\0'); } +extern char *get_relative_path(char *path, char *sub_path); +extern bool is_sub_path(char *path, char *sub_path); +extern bool is_same_path(char *path1, char *path2); + +int strip_deleted(char *path, int len); +int cut_path_ending(char *path, char *sub_path); + /* * mkdir -p */ @@ -266,58 +266,61 @@ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags); + int fd_has_data(int lfd); int make_yard(char *path); static inline int sk_wait_data(int sk) { - struct pollfd pfd = {sk, POLLIN, 0}; + struct pollfd pfd = { sk, POLLIN, 0 }; return 
poll(&pfd, 1, -1); } void fd_set_nonblocking(int fd, bool on); -void tcp_nodelay(int sk, bool on); -void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); int xatol(const char *string, long *number); int xatoi(const char *string, int *number); -char *xstrcat(char *str, const char *fmt, ...) - __attribute__ ((__format__ (__printf__, 2, 3))); -char *xsprintf(const char *fmt, ...) - __attribute__ ((__format__ (__printf__, 1, 2))); - -void print_data(unsigned long addr, unsigned char *data, size_t size); +char *xstrcat(char *str, const char *fmt, ...) __attribute__((__format__(__printf__, 2, 3))); +char *xsprintf(const char *fmt, ...) __attribute__((__format__(__printf__, 1, 2))); int setup_tcp_server(char *type, char *addr, unsigned short *port); int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk); int setup_tcp_client(char *hostname); -#define LAST_PID_PATH "sys/kernel/ns_last_pid" -#define PID_MAX_PATH "sys/kernel/pid_max" +/* path should be writable and no more than PATH_MAX long */ +int rmrf(char *path); -#define block_sigmask(saved_mask, sig_mask) ({ \ - sigset_t ___blocked_mask; \ - int ___ret = 0; \ - sigemptyset(&___blocked_mask); \ - sigaddset(&___blocked_mask, sig_mask); \ - if (sigprocmask(SIG_BLOCK, &___blocked_mask, saved_mask) == -1) { \ - pr_perror("Can not set mask of blocked signals"); \ - ___ret = -1; \ - } \ - ___ret; \ +#define LAST_PID_PATH "sys/kernel/ns_last_pid" +#define PID_MAX_PATH "sys/kernel/pid_max" + +#define block_sigmask(saved_mask, sig_mask) \ + ({ \ + sigset_t ___blocked_mask; \ + int ___ret = 0; \ + sigemptyset(&___blocked_mask); \ + sigaddset(&___blocked_mask, sig_mask); \ + if (sigprocmask(SIG_BLOCK, &___blocked_mask, saved_mask) == -1) { \ + pr_perror("Can not set mask of blocked signals"); \ + ___ret = -1; \ + } \ + ___ret; \ }) -#define restore_sigmask(saved_mask) ({ \ - int ___ret = 0; \ - if (sigprocmask(SIG_SETMASK, saved_mask, NULL) == -1) { \ - pr_perror("Can not unset mask of blocked 
signals"); \ - ___ret = -1; \ - } \ - ___ret; \ +#define restore_sigmask(saved_mask) \ + ({ \ + int ___ret = 0; \ + if (sigprocmask(SIG_SETMASK, saved_mask, NULL) == -1) { \ + pr_perror("Can not unset mask of blocked signals"); \ + ___ret = -1; \ + } \ + ___ret; \ }) /* @@ -356,28 +359,76 @@ extern int call_in_child_process(int (*fn)(void *), void *arg); #ifdef __GLIBC__ extern void print_stack_trace(pid_t pid); #else -static inline void print_stack_trace(pid_t pid) {} +static inline void print_stack_trace(pid_t pid) +{ +} #endif -#define block_sigmask(saved_mask, sig_mask) ({ \ - sigset_t ___blocked_mask; \ - int ___ret = 0; \ - sigemptyset(&___blocked_mask); \ - sigaddset(&___blocked_mask, sig_mask); \ - if (sigprocmask(SIG_BLOCK, &___blocked_mask, saved_mask) == -1) { \ - pr_perror("Can not set mask of blocked signals"); \ - ___ret = -1; \ - } \ - ___ret; \ +#define block_sigmask(saved_mask, sig_mask) \ + ({ \ + sigset_t ___blocked_mask; \ + int ___ret = 0; \ + sigemptyset(&___blocked_mask); \ + sigaddset(&___blocked_mask, sig_mask); \ + if (sigprocmask(SIG_BLOCK, &___blocked_mask, saved_mask) == -1) { \ + pr_perror("Can not set mask of blocked signals"); \ + ___ret = -1; \ + } \ + ___ret; \ }) -#define restore_sigmask(saved_mask) ({ \ - int ___ret = 0; \ - if (sigprocmask(SIG_SETMASK, saved_mask, NULL) == -1) { \ - pr_perror("Can not unset mask of blocked signals"); \ - ___ret = -1; \ - } \ - ___ret; \ +#define restore_sigmask(saved_mask) \ + ({ \ + int ___ret = 0; \ + if (sigprocmask(SIG_SETMASK, saved_mask, NULL) == -1) { \ + pr_perror("Can not unset mask of blocked signals"); \ + ___ret = -1; \ + } \ + ___ret; \ }) +extern int mount_detached_fs(const char *fsname); + +extern int cr_fsopen(const char *fsname, unsigned int flags); +extern int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux); +extern int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags); +extern void fsfd_dump_messages(int fd); + +extern char 
*get_legacy_iptables_bin(bool ipv6, bool restore); + +extern int set_opts_cap_eff(void); + +extern ssize_t read_all(int fd, void *buf, size_t size); +extern ssize_t write_all(int fd, const void *buf, size_t size); + +#define cleanup_free __attribute__((cleanup(cleanup_freep))) +static inline void cleanup_freep(void *p) +{ + void **pp = (void **)p; + free(*pp); +} + +#define cleanup_file __attribute__((cleanup(cleanup_filep))) +static inline void cleanup_filep(FILE **f) +{ + FILE *file = *f; + if (file) + (void)fclose(file); +} + +extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args); + +/* + * criu_run_id is a unique value of the current run. It can be used to + * generate resource ID-s to avoid conflicts with other CRIU processes. + */ +extern char criu_run_id[RUN_ID_HASH_LENGTH]; +extern void util_init(void); +#define NO_DUMP_CRIU_RUN_ID 0x7f +extern char dump_criu_run_id[RUN_ID_HASH_LENGTH]; + +extern char *resolve_mountpoint(char *path); + +extern int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/include/vdso.h b/criu/include/vdso.h index fd30772b4..83a04a39e 100644 --- a/criu/include/vdso.h +++ b/criu/include/vdso.h @@ -15,12 +15,10 @@ extern int vdso_init_restore(void); extern int kerndat_vdso_fill_symtable(void); extern int kerndat_vdso_preserves_hint(void); -extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, - struct vm_area_list *vma_area_list); +extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list *vma_area_list); #ifdef CONFIG_COMPAT -extern void compat_vdso_helper(struct vdso_maps *native, int pipe_fd, - int err_fd, void *vdso_buf, size_t buf_size); +extern void compat_vdso_helper(struct vdso_maps *native, int pipe_fd, int err_fd, void *vdso_buf, size_t buf_size); #endif #endif /* __CR_VDSO_H__ */ diff --git a/criu/include/vma.h b/criu/include/vma.h index 5e3f3527b..b8ddfc142 100644 --- 
a/criu/include/vma.h +++ b/criu/include/vma.h @@ -10,15 +10,15 @@ #include struct vm_area_list { - struct list_head h; /* list of VMAs */ - unsigned nr; /* nr of all VMAs in the list */ - unsigned int nr_aios; /* nr of AIOs VMAs in the list */ + struct list_head h; /* list of VMAs */ + unsigned nr; /* nr of all VMAs in the list */ + unsigned int nr_aios; /* nr of AIOs VMAs in the list */ union { - unsigned long nr_priv_pages; /* dmp: nr of pages in private VMAs */ - unsigned long rst_priv_size; /* rst: size of private VMAs */ + unsigned long nr_priv_pages; /* dmp: nr of pages in private VMAs */ + unsigned long rst_priv_size; /* rst: size of private VMAs */ }; - unsigned long nr_priv_pages_longest; /* nr of pages in longest private VMA */ - unsigned long nr_shared_pages_longest;/* nr of pages in longest shared VMA */ + unsigned long nr_priv_pages_longest; /* nr of pages in longest private VMA */ + unsigned long nr_shared_pages_longest; /* nr of pages in longest shared VMA */ }; static inline void vm_area_list_init(struct vm_area_list *vml) @@ -30,32 +30,32 @@ static inline void vm_area_list_init(struct vm_area_list *vml) struct file_desc; struct vma_area { - struct list_head list; - VmaEntry *e; + struct list_head list; + VmaEntry *e; union { struct /* for dump */ { - int vm_socket_id; + int vm_socket_id; - char *aufs_rpath; /* path from aufs root */ - char *aufs_fpath; /* full path from global root */ + char *aufs_rpath; /* path from aufs root */ + char *aufs_fpath; /* full path from global root */ /* * When several subsequent vmas have the same * dev:ino pair all 'tail' ones set this to true * and the vmst points to the head's stat buf. 
*/ - bool file_borrowed; - struct stat *vmst; - int mnt_id; + bool file_borrowed; + struct stat *vmst; + int mnt_id; }; struct /* for restore */ { int (*vm_open)(int pid, struct vma_area *vma); struct file_desc *vmfd; - struct vma_area *pvma; /* parent for inherited VMAs */ - unsigned long *page_bitmap; /* existent pages */ - unsigned long premmaped_addr; /* restore only */ + struct vma_area *pvma; /* parent for inherited VMAs */ + unsigned long *page_bitmap; /* existent pages */ + unsigned long premmaped_addr; /* restore only */ /* * Some notes about pvma, page_bitmap and premmaped_addr bits @@ -72,22 +72,21 @@ struct vma_area { }; }; -#define VMA_COW_ROOT ((struct vma_area *)1) +#define VMA_COW_ROOT ((struct vma_area *)1) typedef int (*dump_filemap_t)(struct vma_area *vma_area, int fd); extern struct vma_area *alloc_vma_area(void); -extern int collect_mappings(pid_t pid, - struct vm_area_list *vma_area_list, dump_filemap_t cb); +extern int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t cb); extern void free_mappings(struct vm_area_list *vma_area_list); extern int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t cb); extern int parse_self_maps_lite(struct vm_area_list *vms); -#define vma_area_is(vma_area, s) vma_entry_is((vma_area)->e, s) -#define vma_area_len(vma_area) vma_entry_len((vma_area)->e) -#define vma_entry_is(vma, s) (((vma)->status & (s)) == (s)) -#define vma_entry_len(vma) ((vma)->end - (vma)->start) +#define vma_area_is(vma_area, s) vma_entry_is((vma_area)->e, s) +#define vma_area_len(vma_area) vma_entry_len((vma_area)->e) +#define vma_entry_is(vma, s) (((vma)->status & (s)) == (s)) +#define vma_entry_len(vma) ((vma)->end - (vma)->start) /* * vma_premmaped_start() can be used only in restorer. 
@@ -95,26 +94,23 @@ extern int parse_self_maps_lite(struct vm_area_list *vms); * This hack is required, because vma_area isn't transferred in restorer and * shmid is used to determine which vma-s are cowed. */ -#define vma_premmaped_start(vma) ((vma)->shmid) +#define vma_premmaped_start(vma) ((vma)->shmid) static inline int in_vma_area(struct vma_area *vma, unsigned long addr) { - return addr >= (unsigned long)vma->e->start && - addr < (unsigned long)vma->e->end; + return addr >= (unsigned long)vma->e->start && addr < (unsigned long)vma->e->end; } -static inline bool vma_entry_is_private(VmaEntry *entry, - unsigned long task_size) +static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size) { - return (vma_entry_is(entry, VMA_AREA_REGULAR) && - (vma_entry_is(entry, VMA_ANON_PRIVATE) || - vma_entry_is(entry, VMA_FILE_PRIVATE)) && - (entry->end <= task_size)) || - vma_entry_is(entry, VMA_AREA_AIORING); + return (vma_entry_is(entry, VMA_AREA_REGULAR) && + (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && + (entry->end <= task_size)) || + vma_entry_is(entry, VMA_AREA_SHSTK) || + vma_entry_is(entry, VMA_AREA_AIORING); } -static inline bool vma_area_is_private(struct vma_area *vma, - unsigned long task_size) +static inline bool vma_area_is_private(struct vma_area *vma, unsigned long task_size) { return vma_entry_is_private(vma->e, task_size); } @@ -126,11 +122,9 @@ static inline struct vma_area *vma_next(struct vma_area *vma) static inline bool vma_entry_can_be_lazy(VmaEntry *e) { - return ((e->flags & MAP_ANONYMOUS) && - (e->flags & MAP_PRIVATE) && - !(e->flags & MAP_LOCKED) && - !(vma_entry_is(e, VMA_AREA_VDSO)) && - !(vma_entry_is(e, VMA_AREA_VSYSCALL))); + return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && + !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); } #endif /* 
__CR_VMA_H__ */ diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c index d8590fa39..7e95be8c5 100644 --- a/criu/ipc_ns.c +++ b/criu/ipc_ns.c @@ -15,6 +15,7 @@ #include "sysctl.h" #include "ipc_ns.h" #include "shmem.h" +#include "types.h" #include "protobuf.h" #include "images/ipc-var.pb-c.h" @@ -22,26 +23,25 @@ #include "images/ipc-sem.pb-c.h" #include "images/ipc-msg.pb-c.h" -#if defined (__GLIBC__) && __GLIBC__ >= 2 +#if defined(__GLIBC__) && __GLIBC__ >= 2 #define KEY __key #else #define KEY key #endif #ifndef MSGMAX -#define MSGMAX 8192 +#define MSGMAX 8192 #endif #ifndef MSG_COPY -#define MSG_COPY 040000 +#define MSG_COPY 040000 #endif -static void pr_ipc_desc_entry(unsigned int loglevel, const IpcDescEntry *desc) +static void pr_ipc_desc_entry(const IpcDescEntry *desc) { - print_on_level(loglevel, "id: %-10d key: %#08x uid: %-10d gid: %-10d " - "cuid: %-10d cgid: %-10d mode: %-10o ", - desc->id, desc->key, desc->uid, desc->gid, - desc->cuid, desc->cgid, desc->mode); + pr_info("id: %-10d key: %#08x uid: %-10d gid: %-10d " + "cuid: %-10d cgid: %-10d mode: %-10o ", + desc->id, desc->key, desc->uid, desc->gid, desc->cuid, desc->cgid, desc->mode); } static void fill_ipc_desc(int id, IpcDescEntry *desc, const struct ipc_perm *ipcp) @@ -55,19 +55,19 @@ static void fill_ipc_desc(int id, IpcDescEntry *desc, const struct ipc_perm *ipc desc->mode = ipcp->mode; } -static void pr_ipc_sem_array(unsigned int loglevel, int nr, u16 *values) +static void pr_ipc_sem_array(int nr, u16 *values) { while (nr--) - print_on_level(loglevel, " %-5d", values[nr]); - print_on_level(loglevel, "\n"); + pr_info(" %-5d", values[nr]); // no \n + pr_info("\n"); } -#define pr_info_ipc_sem_array(nr, values) pr_ipc_sem_array(LOG_INFO, nr, values) +#define pr_info_ipc_sem_array(nr, values) pr_ipc_sem_array(nr, values) static void pr_info_ipc_sem_entry(const IpcSemEntry *sem) { - pr_ipc_desc_entry(LOG_INFO, sem->desc); - print_on_level(LOG_INFO, "nsems: %-10d\n", sem->nsems); + 
pr_ipc_desc_entry(sem->desc); + pr_info("nsems: %-10d\n", sem->nsems); } static int dump_ipc_sem_set(struct cr_img *img, const IpcSemEntry *sem) @@ -160,19 +160,16 @@ static int dump_ipc_sem(struct cr_img *img) static void pr_info_ipc_msg(int nr, const IpcMsg *msg) { - print_on_level(LOG_INFO, " %-5d: type: %-20"PRId64" size: %-10d\n", - nr++, msg->mtype, msg->msize); + pr_info(" %-5d: type: %-20" PRId64 " size: %-10d\n", nr++, msg->mtype, msg->msize); } static void pr_info_ipc_msg_entry(const IpcMsgEntry *msg) { - pr_ipc_desc_entry(LOG_INFO, msg->desc); - print_on_level(LOG_INFO, "qbytes: %-10d qnum: %-10d\n", - msg->qbytes, msg->qnum); + pr_ipc_desc_entry(msg->desc); + pr_info("qbytes: %-10d qnum: %-10d\n", msg->qbytes, msg->qnum); } -static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq, - unsigned int msg_nr) +static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq, unsigned int msg_nr) { struct msgbuf *message = NULL; unsigned int msgmax; @@ -287,33 +284,35 @@ static int dump_ipc_msg(struct cr_img *img) static void pr_info_ipc_shm(const IpcShmEntry *shm) { - pr_ipc_desc_entry(LOG_INFO, shm->desc); - print_on_level(LOG_INFO, "size: %-10"PRIu64"\n", shm->size); + pr_ipc_desc_entry(shm->desc); + pr_info("size: %-10" PRIu64 "\n", shm->size); } #define NR_MANDATORY_IPC_SYSCTLS 9 static int ipc_sysctl_req(IpcVarEntry *e, int op) { + int i; + struct sysctl_req req[] = { - { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, - { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, - { "kernel/msgmnb", &e->msg_ctlmnb, CTL_U32 }, - { "kernel/auto_msgmni", &e->auto_msgmni, CTL_U32 }, - { "kernel/msgmni", &e->msg_ctlmni, CTL_U32 }, - { "kernel/shmmax", &e->shm_ctlmax, CTL_U64 }, - { "kernel/shmall", &e->shm_ctlall, CTL_U64 }, - { "kernel/shmmni", &e->shm_ctlmni, CTL_U32 }, - { "kernel/shm_rmid_forced", &e->shm_rmid_forced, CTL_U32 }, + { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, + { "kernel/msgmax", 
&e->msg_ctlmax, CTL_U32 }, + { "kernel/msgmnb", &e->msg_ctlmnb, CTL_U32 }, + { "kernel/auto_msgmni", &e->auto_msgmni, CTL_U32 }, + { "kernel/msgmni", &e->msg_ctlmni, CTL_U32 }, + { "kernel/shmmax", &e->shm_ctlmax, CTL_U64 }, + { "kernel/shmall", &e->shm_ctlall, CTL_U64 }, + { "kernel/shmmni", &e->shm_ctlmni, CTL_U32 }, + { "kernel/shm_rmid_forced", &e->shm_rmid_forced, CTL_U32 }, /* We have 9 mandatory sysctls above and 8 optional below */ - { "fs/mqueue/queues_max", &e->mq_queues_max, CTL_U32 }, - { "fs/mqueue/msg_max", &e->mq_msg_max, CTL_U32 }, - { "fs/mqueue/msgsize_max", &e->mq_msgsize_max, CTL_U32 }, - { "fs/mqueue/msg_default", &e->mq_msg_default, CTL_U32 }, - { "fs/mqueue/msgsize_default", &e->mq_msgsize_default, CTL_U32 }, - { "kernel/msg_next_id", &e->msg_next_id, CTL_U32 }, - { "kernel/sem_next_id", &e->sem_next_id, CTL_U32 }, - { "kernel/shm_next_id", &e->shm_next_id, CTL_U32 }, + { "fs/mqueue/queues_max", &e->mq_queues_max, CTL_U32 }, + { "fs/mqueue/msg_max", &e->mq_msg_max, CTL_U32 }, + { "fs/mqueue/msgsize_max", &e->mq_msgsize_max, CTL_U32 }, + { "fs/mqueue/msg_default", &e->mq_msg_default, CTL_U32 }, + { "fs/mqueue/msgsize_default", &e->mq_msgsize_default, CTL_U32 }, + { "kernel/msg_next_id", &e->msg_next_id, CTL_U32 }, + { "kernel/sem_next_id", &e->sem_next_id, CTL_U32 }, + { "kernel/shm_next_id", &e->shm_next_id, CTL_U32 }, }; int nr = NR_MANDATORY_IPC_SYSCTLS; @@ -335,6 +334,9 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op) if (e->has_shm_next_id) req[nr++] = req[16]; + for (i = 0; i < nr; i++) + req[i].flags = CTL_FLAGS_IPC_EACCES_SKIP; + return sysctl_op(req, nr, op, CLONE_NEWIPC); } @@ -358,6 +360,42 @@ static int dump_ipc_shm_pages(const IpcShmEntry *shm) return ret; } +static int dump_shm_hugetlb_flag(IpcShmEntry *shm, int id, unsigned long size) +{ + void *addr; + int ret, hugetlb_flag, exit_code = -1; + struct stat st; + char path[64]; + + addr = shmat(id, NULL, SHM_RDONLY); + if (addr == (void *)-1) { + pr_perror("Failed to attach 
shm"); + return -1; + } + + /* The shm segment size may not be aligned, + * we need to align it up to next page size + */ + size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + snprintf(path, sizeof(path), "/proc/self/map_files/%lx-%lx", (unsigned long)addr, (unsigned long)addr + size); + + ret = stat(path, &st); + if (ret < 0) { + pr_perror("Can't stat map_files"); + goto detach; + } + + if (is_hugetlb_dev(st.st_dev, &hugetlb_flag)) { + shm->has_hugetlb_flag = true; + shm->hugetlb_flag = hugetlb_flag | SHM_HUGETLB; + } + + exit_code = 0; +detach: + shmdt(addr); + return exit_code; +} + static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *ds) { IpcShmEntry shm = IPC_SHM_ENTRY__INIT; @@ -368,6 +406,10 @@ static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *d shm.size = ds->shm_segsz; shm.has_in_pagemaps = true; shm.in_pagemaps = true; + + if (dump_shm_hugetlb_flag(&shm, id, ds->shm_segsz)) + return -1; + fill_ipc_desc(id, shm.desc, &ds->shm_perm); pr_info_ipc_shm(&shm); @@ -409,8 +451,7 @@ static int dump_ipc_shm(struct cr_img *img) slot++; } if (slot != info.used_ids) { - pr_err("Failed to collect %d (only %d succeeded)\n", - info.used_ids, slot); + pr_err("Failed to collect %d (only %d succeeded)\n", info.used_ids, slot); return -EFAULT; } return 0; @@ -421,8 +462,8 @@ static int dump_ipc_var(struct cr_img *img) IpcVarEntry var = IPC_VAR_ENTRY__INIT; int ret = -1; - var.n_sem_ctls = 4; - var.sem_ctls = xmalloc(pb_repeated_size(&var, sem_ctls)); + var.n_sem_ctls = 4; + var.sem_ctls = xmalloc(pb_repeated_size(&var, sem_ctls)); if (!var.sem_ctls) goto err; var.has_mq_msg_default = true; @@ -534,7 +575,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { - { "kernel/sem_next_id", &sem->desc->id, CTL_U32 }, + { "kernel/sem_next_id", &sem->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct semid_ds semid; @@ -544,16 +585,14 @@ static int 
prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) return ret; } - id = semget(sem->desc->key, sem->nsems, - sem->desc->mode | IPC_CREAT | IPC_EXCL); + id = semget(sem->desc->key, sem->nsems, sem->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create sem set"); return -errno; } if (id != sem->desc->id) { - pr_err("Failed to restore sem id (%d instead of %d)\n", - id, sem->desc->id); + pr_err("Failed to restore sem id (%d instead of %d)\n", id, sem->desc->id); return -EFAULT; } @@ -640,8 +679,7 @@ static int prepare_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry if (msg->msize > MSGMAX) { ret = -1; - pr_err("Unsupported message size: %d (MAX: %d)\n", - msg->msize, MSGMAX); + pr_err("Unsupported message size: %d (MAX: %d)\n", msg->msize, MSGMAX); break; } @@ -670,7 +708,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { - { "kernel/msg_next_id", &msq->desc->id, CTL_U32 }, + { "kernel/msg_next_id", &msq->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct msqid_ds msqid; @@ -687,8 +725,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) } if (id != msq->desc->id) { - pr_err("Failed to restore msg id (%d instead of %d)\n", - id, msq->desc->id); + pr_err("Failed to restore msg id (%d instead of %d)\n", id, msq->desc->id); return -EFAULT; } @@ -761,6 +798,10 @@ static int restore_content(void *data, struct cr_img *img, const IpcShmEntry *sh ssize_t size, off; ifd = img_raw_fd(img); + if (ifd < 0) { + pr_err("Failed getting raw image fd\n"); + return -1; + } size = round_up(shm->size, sizeof(u32)); off = 0; do { @@ -803,9 +844,9 @@ static int prepare_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm) static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) { - int ret, id; + int ret, id, hugetlb_flag = 0; struct sysctl_req req[] = { - { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, + 
{ "kernel/shm_next_id", &shm->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct shmid_ds shmid; @@ -818,16 +859,17 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) return ret; } - id = shmget(shm->desc->key, shm->size, - shm->desc->mode | IPC_CREAT | IPC_EXCL); + if (shm->has_hugetlb_flag) + hugetlb_flag = shm->hugetlb_flag; + + id = shmget(shm->desc->key, shm->size, hugetlb_flag | shm->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create shm set"); return -errno; } if (id != shm->desc->id) { - pr_err("Failed to restore shm id (%d instead of %d)\n", - id, shm->desc->id); + pr_err("Failed to restore shm id (%d instead of %d)\n", id, shm->desc->id); return -EFAULT; } diff --git a/criu/irmap.c b/criu/irmap.c index e72984216..d2c5d588a 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -30,12 +30,12 @@ #include "images/fsnotify.pb-c.h" #include "images/fh.pb-c.h" -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "irmap: " -#define IRMAP_CACHE_BITS 5 -#define IRMAP_CACHE_SIZE (1 << IRMAP_CACHE_BITS) -#define IRMAP_CACHE_MASK (IRMAP_CACHE_SIZE - 1) +#define IRMAP_CACHE_BITS 5 +#define IRMAP_CACHE_SIZE (1 << IRMAP_CACHE_BITS) +#define IRMAP_CACHE_MASK (IRMAP_CACHE_SIZE - 1) static inline int irmap_hashfn(unsigned int s_dev, unsigned long i_ino) { @@ -55,16 +55,35 @@ struct irmap { static struct irmap *cache[IRMAP_CACHE_SIZE]; static struct irmap hints[] = { - { .path = "/etc", .nr_kids = -1, }, - { .path = "/var/spool", .nr_kids = -1, }, - { .path = "/var/log", .nr_kids = -1, }, + { + .path = "/etc", + .nr_kids = -1, + }, + { + .path = "/var/spool", + .nr_kids = -1, + }, + { + .path = "/var/log", + .nr_kids = -1, + }, + { .path = "/usr/share/dbus-1/services", .nr_kids = -1 }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, - { .path = "/lib/udev", .nr_kids = -1, }, - { 
.path = "/.", .nr_kids = 0, }, - { .path = "/no-such-path", .nr_kids = -1, }, - { }, + { + .path = "/lib/udev", + .nr_kids = -1, + }, + { + .path = "/.", + .nr_kids = 0, + }, + { + .path = "/no-such-path", + .nr_kids = -1, + }, + {}, }; /* @@ -83,7 +102,7 @@ static int irmap_update_stat(struct irmap *i) pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { - pr_perror("Can't stat %s", i->path); + pr_pwarn("Can't stat %s", i->path); return -1; } @@ -118,13 +137,14 @@ static int irmap_update_dir(struct irmap *t) pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { - pr_perror("Can't open %s", t->path); + pr_pwarn("Can't open %s", t->path); return -1; } dfd = fdopendir(fd); if (!dfd) { pr_perror("Can't opendir %s", t->path); + close(fd); return -1; } @@ -155,14 +175,12 @@ static int irmap_update_dir(struct irmap *t) } closedir(dfd); - close(fd); t->nr_kids = nr; return 0; out_err: xfree(t->kids); closedir(dfd); - close(fd); return -1; } @@ -236,14 +254,13 @@ char *irmap_lookup(unsigned int s_dev, unsigned long i_ino) * But the root service fd is already set by the * irmap_predump_prep, so we just go ahead and scan. 
*/ - if (!doing_predump && - __mntns_get_root_fd(root_item->pid->real) < 0) + if (!doing_predump && __mntns_get_root_fd(root_item->pid->real) < 0) goto out; timing_start(TIME_IRMAP_RESOLVE); hv = irmap_hashfn(s_dev, i_ino); - for (p = &cache[hv]; *p; ) { + for (p = &cache[hv]; *p;) { c = *p; if (!(c->dev == s_dev && c->ino == i_ino)) { p = &(*p)->next; @@ -299,8 +316,7 @@ struct irmap_predump { static struct irmap_predump *predump_queue; -int irmap_queue_cache(unsigned int dev, unsigned long ino, - FhEntry *fh) +int irmap_queue_cache(unsigned int dev, unsigned long ino, FhEntry *fh) { struct irmap_predump *ip; @@ -311,8 +327,7 @@ int irmap_queue_cache(unsigned int dev, unsigned long ino, ip->dev = dev; ip->ino = ino; ip->fh = *fh; - ip->fh.handle = xmemdup(fh->handle, - FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); + ip->fh.handle = xmemdup(fh->handle, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); if (!ip->fh.handle) { xfree(ip); return -1; @@ -426,12 +441,10 @@ in: close_image(*img); if (dir == AT_FDCWD) { pr_info("Searching irmap cache in parent\n"); - dir = openat(get_service_fd(IMG_FD_OFF), - CR_PARENT_LINK, O_RDONLY); + if (open_parent(get_service_fd(IMG_FD_OFF), &dir)) + return -1; if (dir >= 0) goto in; - if (errno != ENOENT) - return -1; } pr_info("No irmap cache\n"); @@ -487,8 +500,13 @@ int irmap_scan_path_add(char *path) return -1; } - o->ir->path = path; + o->ir->path = xstrdup(path); + if (!o->ir->path) { + xfree(o->ir); + xfree(o); + return -1; + } o->ir->nr_kids = -1; - list_add(&o->node, &opts.irmap_scan_paths); + list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; } diff --git a/criu/kcmp-ids.c b/criu/kcmp-ids.c index 4fde10e67..1569b9d40 100644 --- a/criu/kcmp-ids.c +++ b/criu/kcmp-ids.c @@ -52,13 +52,13 @@ */ struct kid_entry { - struct rb_node node; + struct rb_node node; - struct rb_root subtree_root; - struct rb_node subtree_node; + struct rb_root subtree_root; + struct rb_node subtree_node; - uint32_t subid; /* subid is always 
unique */ - struct kid_elem elem; + uint32_t subid; /* subid is always unique */ + struct kid_elem elem; } __aligned(sizeof(long)); static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem *elem) @@ -69,8 +69,8 @@ static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem if (!e) goto err; - e->subid = tree->subid++; - e->elem = *elem; + e->subid = tree->subid++; + e->elem = *elem; /* Make sure no overflow here */ BUG_ON(!e->subid); @@ -78,14 +78,12 @@ static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem rb_init_node(&e->node); rb_init_node(&e->subtree_node); e->subtree_root = RB_ROOT; - rb_link_and_balance(&e->subtree_root, &e->subtree_node, - NULL, &e->subtree_root.rb_node); + rb_link_and_balance(&e->subtree_root, &e->subtree_node, NULL, &e->subtree_root.rb_node); err: return e; } -static uint32_t kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, - struct kid_elem *elem, int *new_id) +static uint32_t kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, struct kid_elem *elem, int *new_id) { struct rb_node *node = e->subtree_root.rb_node; struct kid_entry *sub = NULL; @@ -97,8 +95,7 @@ static uint32_t kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node); - int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, tree->kcmp_type, - this->elem.idx, elem->idx); + int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, tree->kcmp_type, this->elem.idx, elem->idx); parent = *new; if (ret == 1) @@ -108,9 +105,8 @@ static uint32_t kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, else if (ret == 0) return this->subid; else { - pr_perror("kcmp failed: pid (%d %d) type %u idx (%u %u)", - this->elem.pid, elem->pid, tree->kcmp_type, - this->elem.idx, elem->idx); + pr_perror("kcmp failed: pid (%d %d) type %u idx (%u %u)", this->elem.pid, elem->pid, + tree->kcmp_type, this->elem.idx, 
elem->idx); return 0; } } @@ -124,8 +120,7 @@ static uint32_t kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, return sub->subid; } -uint32_t kid_generate_gen(struct kid_tree *tree, - struct kid_elem *elem, int *new_id) +uint32_t kid_generate_gen(struct kid_tree *tree, struct kid_elem *elem, int *new_id) { struct rb_node *node = tree->root.rb_node; struct kid_entry *e = NULL; @@ -154,9 +149,7 @@ uint32_t kid_generate_gen(struct kid_tree *tree, return e->subid; } -static struct kid_elem *kid_lookup_epoll_tfd_sub(struct kid_tree *tree, - struct kid_entry *e, - struct kid_elem *elem, +static struct kid_elem *kid_lookup_epoll_tfd_sub(struct kid_tree *tree, struct kid_entry *e, struct kid_elem *elem, kcmp_epoll_slot_t *slot) { struct rb_node *node = e->subtree_root.rb_node; @@ -166,8 +159,7 @@ static struct kid_elem *kid_lookup_epoll_tfd_sub(struct kid_tree *tree, while (node) { struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node); - int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, KCMP_EPOLL_TFD, - this->elem.idx, slot); + int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, KCMP_EPOLL_TFD, this->elem.idx, slot); if (ret == 1) node = node->rb_left, new = &((*new)->rb_left); @@ -176,9 +168,8 @@ static struct kid_elem *kid_lookup_epoll_tfd_sub(struct kid_tree *tree, else if (ret == 0) return &this->elem; else { - pr_perror("kcmp-epoll failed: pid (%d %d) type %u idx (%u %u)", - this->elem.pid, elem->pid, KCMP_EPOLL_TFD, - this->elem.idx, elem->idx); + pr_perror("kcmp-epoll failed: pid (%d %d) type %u idx (%u %u)", this->elem.pid, elem->pid, + KCMP_EPOLL_TFD, this->elem.idx, elem->idx); return NULL; } } @@ -186,9 +177,7 @@ static struct kid_elem *kid_lookup_epoll_tfd_sub(struct kid_tree *tree, return NULL; } -struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, - struct kid_elem *elem, - kcmp_epoll_slot_t *slot) +struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, struct kid_elem *elem, kcmp_epoll_slot_t *slot) { 
struct rb_node *node = tree->root.rb_node; struct rb_node **new = &tree->root.rb_node; diff --git a/criu/kerndat.c b/criu/kerndat.c index 39cacb8fe..2dc2f77d5 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -4,16 +4,25 @@ #include #include #include +#include +#include #include #include #include #include #include #include -#include /* for sockaddr_in and inet_ntoa() */ +#include #include #include +#include +#include +#include +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +#include +#endif +#include #include "common/config.h" #include "int.h" @@ -22,6 +31,7 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" +#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -32,8 +42,8 @@ #include "sockets.h" #include "net.h" #include "tun.h" +#include #include -#include #include "netfilter.h" #include "fsnotify.h" #include "linux/userfaultfd.h" @@ -41,19 +51,32 @@ #include "uffd.h" #include "vdso.h" #include "kcmp.h" +#include "sched.h" +#include "memfd.h" +#include "mount-v2.h" +#include "util-caps.h" +#include "pagemap_scan.h" -struct kerndat_s kdat = { -}; +struct kerndat_s kdat = {}; +volatile int dummy_var; static int check_pagemap(void) { - int ret, fd; + int ret, fd, retry; u64 pfn = 0; + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { if (errno == EPERM) { - pr_info("Pagemap disabled"); + pr_info("Pagemap disabled\n"); kdat.pmap = PM_DISABLED; return 0; } @@ -61,11 +84,44 @@ static int check_pagemap(void) return -1; } - /* Get the PFN of some present page. 
Stack is here, so try it :) */ - ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); - if (ret != sizeof(pfn)) { - pr_perror("Can't read pagemap"); - return -1; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; + + args.return_mask |= PAGE_IS_GUARD; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) + kdat.has_pagemap_scan_guard_pages = true; + } else { + switch (errno) { + case EINVAL: + case ENOTTY: + pr_debug("PAGEMAP_SCAN isn't supported\n"); + break; + default: + pr_perror("PAGEMAP_SCAN failed with unexpected errno"); + return -1; + } + } + + retry = 3; + while (retry--) { + ++dummy_var; + /* Get the PFN of a page likely to be present. */ + ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + close(fd); + return -1; + } + /* The page can be swapped out by the time the read occurs, + * in which case the rest of the bits are a swap type + offset + * (which could be zero even if not hidden). + * Retry if this happens. */ + if (pfn & PME_PRESENT) + break; + pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, + retry ? 
"retrying" : "giving up"); + pfn = 0; } close(fd); @@ -132,29 +188,26 @@ static void kerndat_mmap_min_addr(void) struct sysctl_req req[] = { { - .name = "vm/mmap_min_addr", - .arg = &value, - .type = CTL_U64, + .name = "vm/mmap_min_addr", + .arg = &value, + .type = CTL_U64, }, }; if (sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0)) { - pr_warn("Can't fetch %s value, use default %#lx\n", - req[0].name, (unsigned long)default_mmap_min_addr); + pr_warn("Can't fetch %s value, use default %#lx\n", req[0].name, (unsigned long)default_mmap_min_addr); kdat.mmap_min_addr = default_mmap_min_addr; return; } if (value < default_mmap_min_addr) { - pr_debug("Adjust mmap_min_addr %#lx -> %#lx\n", - (unsigned long)value, + pr_debug("Adjust mmap_min_addr %#lx -> %#lx\n", (unsigned long)value, (unsigned long)default_mmap_min_addr); kdat.mmap_min_addr = default_mmap_min_addr; } else kdat.mmap_min_addr = value; - pr_debug("Found mmap_min_addr %#lx\n", - (unsigned long)kdat.mmap_min_addr); + pr_debug("Found mmap_min_addr %#lx\n", (unsigned long)kdat.mmap_min_addr); } static int kerndat_files_stat(void) @@ -164,9 +217,9 @@ static int kerndat_files_stat(void) struct sysctl_req req[] = { { - .name = "fs/nr_open", - .arg = &nr_open, - .type = CTL_U32, + .name = "fs/nr_open", + .arg = &nr_open, + .type = CTL_U32, }, }; @@ -177,28 +230,17 @@ static int kerndat_files_stat(void) kdat.sysctl_nr_open = nr_open; - pr_debug("files stat: %s %u\n", - req[0].name, kdat.sysctl_nr_open); + pr_debug("files stat: %s %u\n", req[0].name, kdat.sysctl_nr_open); return 0; } -static int kerndat_get_shmemdev(void) +static int kerndat_get_dev(dev_t *dev, char *map, size_t size) { - void *map; char maps[128]; struct stat buf; - dev_t dev; - map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, 0, 0); - if (map == MAP_FAILED) { - pr_perror("Can't mmap memory for shmemdev test"); - return -1; - } - - sprintf(maps, "/proc/self/map_files/%lx-%lx", - (unsigned long)map, (unsigned long)map + 
page_size()); + sprintf(maps, "/proc/self/map_files/%lx-%lx", (unsigned long)map, (unsigned long)map + size); if (stat(maps, &buf) < 0) { int e = errno; if (errno == EPERM) { @@ -207,20 +249,38 @@ static int kerndat_get_shmemdev(void) * OK, let's go the slower route. */ - if (parse_self_maps((unsigned long)map, &dev) < 0) { + if (parse_self_maps((unsigned long)map, dev) < 0) { pr_err("Can't read self maps\n"); - goto err; + return -1; } } else { pr_perror("Can't stat self map_files %d", e); - goto err; + return -1; } - } else - dev = buf.st_dev; + } else { + *dev = buf.st_dev; + } + + return 0; +} + +static int kerndat_get_shmemdev(void) +{ + void *map; + dev_t dev; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap memory for shmemdev test"); + return -1; + } + + if (kerndat_get_dev(&dev, map, PAGE_SIZE)) + goto err; munmap(map, PAGE_SIZE); kdat.shmem_dev = dev; - pr_info("Found anon-shmem device at %"PRIx64"\n", kdat.shmem_dev); + pr_info("Found anon-shmem device at %" PRIx64 "\n", kdat.shmem_dev); return 0; err: @@ -228,13 +288,67 @@ err: return -1; } +/* Return -1 -- error + * Return 0 -- successful but can't get any new device's numbers + * Return 1 -- successful and get new device's numbers + * + * At first, all kdat.hugetlb_dev elements are initialized to 0. 
+ * When the function finishes, + * kdat.hugetlb_dev[i] == -1 -- this hugetlb page size is not supported + * kdat.hugetlb_dev[i] == 0 -- this hugetlb page size is supported but can't collect device's number + * Otherwise, kdat.hugetlb_dev[i] contains the corresponding device's number + * + * Next time the function is called, it only tries to collect the device's number of hugetlb page size + * that is supported but can't be collected in the previous call (kdat.hugetlb_dev[i] == 0) + */ +static int kerndat_get_hugetlb_dev(void) +{ + void *map; + int i, flag, ret = 0; + unsigned long long size; + dev_t dev; + + for (i = 0; i < HUGETLB_MAX; i++) { + /* Skip if this hugetlb size is not supported or the device's number has been collected */ + if (kdat.hugetlb_dev[i]) + continue; + + size = hugetlb_info[i].size; + flag = hugetlb_info[i].flag; + map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | flag, 0, 0); + if (map == MAP_FAILED) { + if (errno == EINVAL) { + kdat.hugetlb_dev[i] = (dev_t)-1; + continue; + } else if (errno == ENOMEM) { + pr_info("Hugetlb size %llu Mb is supported but cannot get dev's number\n", size >> 20); + continue; + } else { + pr_perror("Unexpected result when get hugetlb dev"); + return -1; + } + } + + if (kerndat_get_dev(&dev, map, size)) { + munmap(map, size); + return -1; + } + + munmap(map, size); + kdat.hugetlb_dev[i] = dev; + ret = 1; + pr_info("Found hugetlb device at %" PRIx64 "\n", kdat.hugetlb_dev[i]); + } + return ret; +} + static dev_t get_host_dev(unsigned int which) { static struct kst { - const char *name; - const char *path; - unsigned int magic; - dev_t fs_dev; + const char *name; + const char *path; + unsigned int magic; + dev_t fs_dev; } kstat[KERNDAT_FS_STAT_MAX] = { [KERNDAT_FS_STAT_DEVPTS] = { .name = "devpts", @@ -313,8 +427,7 @@ static int kerndat_get_dirty_track(void) u64 pmap = 0; int ret = -1; - map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 
0); + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (map == MAP_FAILED) { pr_perror("Can't mmap memory for pagemap test"); return ret; @@ -352,19 +465,15 @@ static int kerndat_get_dirty_track(void) pr_info("Dirty track supported on kernel\n"); kdat.has_dirty_track = true; } else { -no_dt: + no_dt: pr_info("Dirty tracking support is OFF\n"); - if (opts.track_mem) { - pr_err("Tracking memory is not available\n"); - return -1; - } } return 0; } /* The page frame number (PFN) is constant for the zero page */ -static int init_zero_page_pfn() +static int init_zero_page_pfn(void) { void *addr; int ret = 0; @@ -381,7 +490,7 @@ static int init_zero_page_pfn() return 0; } - if (*((int *) addr) != 0) { + if (*((int *)addr) != 0) { BUG(); return -1; } @@ -389,9 +498,10 @@ static int init_zero_page_pfn() ret = vaddr_to_pfn(-1, (unsigned long)addr, &kdat.zero_page_pfn); munmap(addr, PAGE_SIZE); - if (kdat.zero_page_pfn == 0) + if (kdat.zero_page_pfn == 0) { + pr_err("vaddr_to_pfn succeeded but kdat.zero_page_pfn is invalid.\n"); ret = -1; - + } return ret; } @@ -400,22 +510,52 @@ static int get_last_cap(void) struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; + int ret; - return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) + return ret; + + pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", + kdat.last_cap, 32 * CR_CAP_SIZE - 1); + return -1; } static bool kerndat_has_memfd_create(void) { int ret; - ret = syscall(SYS_memfd_create, NULL, 0); + ret = memfd_create(NULL, 0); if (ret == -1 && errno == ENOSYS) kdat.has_memfd = false; else if (ret == -1 && errno == EFAULT) kdat.has_memfd = true; else { - pr_err("Unexpected error from memfd_create(NULL, 0): %m\n"); + pr_perror("Unexpected error from memfd_create(NULL, 0)"); + return -1; + } + + return 0; +} + +static bool 
kerndat_has_memfd_hugetlb(void) +{ + int ret; + + if (!kdat.has_memfd) { + kdat.has_memfd_hugetlb = false; + return 0; + } + + ret = memfd_create("", MFD_HUGETLB); + if (ret >= 0) { + kdat.has_memfd_hugetlb = true; + close(ret); + } else if (ret == -1 && (errno == EINVAL || errno == ENOENT || errno == ENOSYS)) { + kdat.has_memfd_hugetlb = false; + } else { + pr_perror("Unexpected error from memfd_create(\"\", MFD_HUGETLB)"); return -1; } @@ -429,7 +569,7 @@ static int get_task_size(void) return 0; } -static int kerndat_fdinfo_has_lock() +static int kerndat_fdinfo_has_lock(void) { int fd, pfd = -1, exit_code = -1, len; char buf[PAGE_SIZE]; @@ -458,13 +598,13 @@ static int kerndat_fdinfo_has_lock() exit_code = 0; out: - close(pfd); + close_safe(&pfd); close(fd); return exit_code; } -static int get_ipv6() +static int get_ipv6(void) { if (access("/proc/sys/net/ipv6", F_OK) < 0) { if (errno == ENOENT) { @@ -498,10 +638,10 @@ static int kerndat_loginuid(void) * on that rely dump/restore code. * See also: marc.info/?l=git-commits-head&m=138509506407067 */ - if (prepare_loginuid(INVALID_UID, LOG_WARN) < 0) + if (prepare_loginuid(INVALID_UID) < 0) return 0; /* Cleaning value back as it was */ - if (prepare_loginuid(saved_loginuid, LOG_WARN) < 0) + if (prepare_loginuid(saved_loginuid) < 0) return 0; kdat.luid = LUID_FULL; @@ -511,7 +651,7 @@ static int kerndat_loginuid(void) static int kerndat_iptables_has_xtlocks(void) { int fd; - char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; + char *argv[4] = { "sh", "-c", "iptables -n -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { @@ -527,29 +667,52 @@ static int kerndat_iptables_has_xtlocks(void) return 0; } -int kerndat_tcp_repair(void) -{ - int sock, clnt = -1, yes = 1, exit_code = -1; - struct sockaddr_in addr; - socklen_t aux; +/* + * Unfortunately in C htonl() is not constexpr and cannot be used in a static + * initialization below. + */ +#define constant_htonl(x) \ + (__BYTE_ORDER == __BIG_ENDIAN ? 
(x) : \ + (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) - memset(&addr,0,sizeof(addr)); - addr.sin_family = AF_INET; - inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); - addr.sin_port = 0; +static int kerndat_tcp_repair(void) +{ + static const struct sockaddr_in loopback_ip4 = { + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, + }; + static const struct sockaddr_in6 loopback_ip6 = { + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + int sock, clnt = -1, yes = 1, exit_code = -1; + const struct sockaddr *addr; + struct sockaddr_storage listener_addr; + socklen_t addrlen; + + addr = (const struct sockaddr *)&loopback_ip4; + addrlen = sizeof(loopback_ip4); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) { + addr = (const struct sockaddr *)&loopback_ip6; + addrlen = sizeof(loopback_ip6); + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); + } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } - if (bind(sock, (struct sockaddr *) &addr, sizeof(addr))) { + if (bind(sock, addr, addrlen)) { pr_perror("Unable to bind a socket"); goto err; } - aux = sizeof(addr); - if (getsockname(sock, (struct sockaddr *) &addr, &aux)) { + addrlen = sizeof(listener_addr); + if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { pr_perror("Unable to get a socket name"); goto err; } @@ -559,13 +722,13 @@ int kerndat_tcp_repair(void) goto err; } - clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } - if (connect(clnt, (struct sockaddr *) &addr, sizeof(addr))) { + if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { pr_perror("Unable to connect a socket"); goto err; } @@ -576,8 +739,10 @@ int kerndat_tcp_repair(void) } if 
(setsockopt(clnt, SOL_TCP, TCP_REPAIR, &yes, sizeof(yes))) { - if (errno != EPERM) + if (errno != EPERM) { + pr_perror("Unable to set TCP_REPAIR with setsockopt"); goto err; + } kdat.has_tcp_half_closed = false; } else kdat.has_tcp_half_closed = true; @@ -590,20 +755,22 @@ err: return exit_code; } -int kerndat_nsid(void) +static int kerndat_nsid(void) { int nsid, sk; + kdat.has_nsid = false; + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { - pr_perror("Unable to create a netlink socket"); - return -1; + pr_pwarn("Unable to create a netlink socket: NSID can't be used."); + return 0; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { - pr_err("NSID is not supported\n"); + pr_warn("NSID is not supported\n"); close(sk); - return -1; + return 0; } kdat.has_nsid = true; @@ -616,8 +783,10 @@ static int kerndat_compat_restore(void) int ret; ret = kdat_can_map_vdso(); - if (ret < 0) + if (ret < 0) { + pr_err("kdat_can_map_vdso failed\n"); return ret; + } kdat.can_map_vdso = !!ret; /* depends on kdat.can_map_vdso result */ @@ -635,8 +804,7 @@ static int kerndat_detect_stack_guard_gap(void) FILE *maps; void *mem; - mem = mmap(NULL, (3ul << 20), PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0); + mem = mmap(NULL, (3ul << 20), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0); if (mem == MAP_FAILED) { pr_perror("Can't mmap stack area"); return -1; @@ -652,13 +820,13 @@ static int kerndat_detect_stack_guard_gap(void) maps = fopen("/proc/self/maps", "r"); if (maps == NULL) { + pr_perror("Could not open /proc/self/maps"); munmap(mem, 4096); return -1; } while (fgets(buf, sizeof(buf), maps)) { - num = sscanf(buf, "%lx-%lx %c%c%c%c", - &start, &end, &r, &w, &x, &s); + num = sscanf(buf, "%lx-%lx %c%c%c%c", &start, &end, &r, &w, &x, &s); if (num < 6) { pr_err("Can't parse: %s\n", buf); goto err; @@ -666,14 +834,14 @@ static int kerndat_detect_stack_guard_gap(void) /* * When reading /proc/$pid/[s]maps the - * start/end 
addresses might be cutted off + * start/end addresses might be cut off * with PAGE_SIZE on kernels prior 4.12 * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete - * patch released which hitted a number + * patch released which hit a number * of repos (Ubuntu, Fedora) where instead - * of PAGE_SIZE the 1M gap is cutted off. + * of PAGE_SIZE the 1M gap is cut off. */ if (start == (unsigned long)mem) { kdat.stack_guard_gap_hidden = false; @@ -723,9 +891,23 @@ static int kerndat_has_inotify_setnextwd(void) return ret; } +static int kerndat_has_fsopen(void) +{ + if (syscall(__NR_fsopen, NULL, -1) != -1) { + pr_err("fsopen should fail\n"); + return -1; + } + if (errno == ENOSYS) + pr_info("The new mount API (fsopen, fsmount) isn't supported\n"); + else + kdat.has_fsopen = true; + + return 0; +} + static int has_kcmp_epoll_tfd(void) { - kcmp_epoll_slot_t slot = { }; + kcmp_epoll_slot_t slot = {}; int ret = -1, efd, tfd; pid_t pid = getpid(); struct epoll_event ev; @@ -768,6 +950,21 @@ out: return ret; } +static int has_time_namespace(void) +{ + if (access("/proc/self/timens_offsets", F_OK) < 0) { + if (errno == ENOENT) { + pr_debug("Time namespaces are not supported.\n"); + kdat.has_timens = false; + return 0; + } + pr_perror("Unable to access /proc/self/timens_offsets"); + return -1; + } + kdat.has_timens = true; + return 0; +} + int __attribute__((weak)) kdat_x86_has_ptrace_fpu_xsave_bug(void) { return 0; @@ -777,26 +974,266 @@ static int kerndat_x86_has_ptrace_fpu_xsave_bug(void) { int ret = kdat_x86_has_ptrace_fpu_xsave_bug(); - if (ret < 0) + if (ret < 0) { + pr_err("kdat_x86_has_ptrace_fpu_xsave_bug failed\n"); return ret; + } kdat.x86_has_ptrace_fpu_xsave_bug = !!ret; return 0; } -#define KERNDAT_CACHE_FILE KDAT_RUNDIR"/criu.kdat" -#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR"/.criu.kdat" +static int kerndat_has_rseq(void) +{ + if (syscall(__NR_rseq, NULL, 0, 0, 0) != -1) { + pr_err("rseq should fail\n"); + return -1; + } + if (errno == 
ENOSYS) + pr_info("rseq syscall isn't supported\n"); + else + kdat.has_rseq = true; + return 0; +} + +static int kerndat_has_ptrace_get_rseq_conf(void) +{ + pid_t pid; + int len; + struct __ptrace_rseq_configuration rseq; + int ret = 0; + + pid = fork_and_ptrace_attach(NULL); + if (pid < 0) + return -1; + + len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); + if (len != sizeof(rseq)) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + + kdat.has_ptrace_get_rseq_conf = false; + pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); + goto out; + } + + /* + * flags is always zero from the kernel side, if it will be changed + * we need to pay attention to that and, possibly, make changes on the CRIU side. + */ + if (rseq.flags != 0) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + + kdat.has_ptrace_get_rseq_conf = false; + pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); + } else { + if (!kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + + kdat.has_ptrace_get_rseq_conf = true; + + if (memcmp(&kdat.libc_rseq_conf, &rseq, sizeof(rseq))) + ret = 1; /* we should update kdat */ + + kdat.libc_rseq_conf = rseq; + } + +out: + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + return ret; +} + +int kerndat_sockopt_buf_lock(void) +{ + int exit_code = -1; + socklen_t len; + u32 buf_lock; + int sock; + + sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0) { + pr_perror("Unable to create a socket"); + return -1; + } + + len = sizeof(buf_lock); + if (getsockopt(sock, SOL_SOCKET, SO_BUF_LOCK, &buf_lock, &len)) { + if (errno != ENOPROTOOPT) { + pr_perror("Unable to get SO_BUF_LOCK with getsockopt"); + goto err; + } + kdat.has_sockopt_buf_lock = false; + } else + kdat.has_sockopt_buf_lock = true; + + exit_code = 0; +err: + close(sock); + return 
exit_code; +} + +static int kerndat_has_move_mount_set_group(void) +{ + char tmpdir[] = "/tmp/.criu.move_mount_set_group.XXXXXX"; + char subdir[64]; + int exit_code = -1; + + if (mkdtemp(tmpdir) == NULL) { + pr_perror("Fail to make dir %s", tmpdir); + return -1; + } + + if (mount("criu.move_mount_set_group", tmpdir, "tmpfs", 0, NULL)) { + pr_perror("Fail to mount tmfps to %s", tmpdir); + rmdir(tmpdir); + return -1; + } + + if (mount(NULL, tmpdir, NULL, MS_PRIVATE, NULL)) { + pr_perror("Fail to make %s private", tmpdir); + goto out; + } + + if (snprintf(subdir, sizeof(subdir), "%s/subdir", tmpdir) >= sizeof(subdir)) { + pr_err("Fail to snprintf subdir\n"); + goto out; + } + + if (mkdir(subdir, 0700)) { + pr_perror("Fail to make dir %s", subdir); + goto out; + } + + if (mount(subdir, subdir, NULL, MS_BIND, NULL)) { + pr_perror("Fail to make bind-mount %s", subdir); + goto out; + } + + if (mount(NULL, tmpdir, NULL, MS_SHARED, NULL)) { + pr_perror("Fail to make %s private", tmpdir); + goto out; + } + + if (sys_move_mount(AT_FDCWD, tmpdir, AT_FDCWD, subdir, MOVE_MOUNT_SET_GROUP)) { + if (errno == EINVAL || errno == ENOSYS) { + pr_debug("No MOVE_MOUNT_SET_GROUP kernel feature\n"); + kdat.has_move_mount_set_group = false; + exit_code = 0; + goto out; + } + pr_perror("Fail to MOVE_MOUNT_SET_GROUP"); + goto out; + } + + kdat.has_move_mount_set_group = true; + exit_code = 0; +out: + if (umount2(tmpdir, MNT_DETACH)) + pr_warn("Fail to umount2 %s: %s\n", tmpdir, strerror(errno)); + if (rmdir(tmpdir)) + pr_warn("Fail to rmdir %s: %s\n", tmpdir, strerror(errno)); + return exit_code; +} + +static int kerndat_has_openat2(void) +{ + if (sys_openat2(AT_FDCWD, ".", NULL, 0) != -1) { + pr_err("openat2 should fail\n"); + return -1; + } + if (errno == ENOSYS) { + pr_debug("No openat2 syscall support\n"); + kdat.has_openat2 = false; + } else { + kdat.has_openat2 = true; + } + + return 0; +} + +int __attribute__((weak)) kdat_has_shstk(void) +{ + return 0; +} + +static int 
kerndat_has_shstk(void) +{ + int ret = kdat_has_shstk(); + + if (ret < 0) { + pr_err("kdat_has_shstk failed\n"); + return ret; + } + + kdat.has_shstk = !!ret; + return 0; +} + +#define KERNDAT_CACHE_NAME "criu.kdat" +#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME + +/* + * Returns: + * -1 if kdat_file was not written due to error + * 0 if kdat_file was written + * 1 if kdat_file was not written because cache directory undefined in env (non-root mode) + */ +static int get_kerndat_filename(char **kdat_file) +{ + int ret; + + /* + * Running as non-root, even with CAP_CHECKPOINT_RESTORE, does not + * allow to write to KDAT_RUNDIR which usually is only writable by root. + * Let's write criu.kdat file to XDG_RUNTIME_DIR for non-root cases. + * Note that XDG_RUNTIME_DIR is not always defined (e.g. when executing + * via su/sudo). + */ + if (opts.unprivileged) { + const char *cache_dir = getenv("XDG_RUNTIME_DIR"); + if (!cache_dir) { + pr_warn("$XDG_RUNTIME_DIR not set. Cannot find location for kerndat file\n"); + return 1; + } + ret = asprintf(kdat_file, "%s/%s", cache_dir, KERNDAT_CACHE_NAME); + } else { + ret = asprintf(kdat_file, "%s", KERNDAT_CACHE_FILE); + } + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return -1; + } + + return 0; +} + +/* + * Returns: + * -1 if error + * 0 if cache was loaded + * 1 if cache does not exist or is stale or cache directory undefined in env (non-root mode) + */ static int kerndat_try_load_cache(void) { + cleanup_free char *kdat_file = NULL; int fd, ret; - fd = open(KERNDAT_CACHE_FILE, O_RDONLY); + ret = get_kerndat_filename(&kdat_file); + if (ret) + return ret; + + fd = open(kdat_file, O_RDONLY); if (fd < 0) { - if(ENOENT == errno) - pr_debug("File %s does not exist\n", KERNDAT_CACHE_FILE); + if (ENOENT == errno) + pr_debug("File %s does not exist\n", kdat_file); else - pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); + pr_warn("Can't load %s\n", kdat_file); return 1; } @@ 
-809,15 +1246,13 @@ static int kerndat_try_load_cache(void) close(fd); - if (ret != sizeof(kdat) || - kdat.magic1 != KDAT_MAGIC || - kdat.magic2 != KDAT_MAGIC_2) { - pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); - unlink(KERNDAT_CACHE_FILE); + if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { + pr_warn("Stale %s file\n", kdat_file); + unlink(kdat_file); return 1; } - pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); + pr_info("Loaded kdat cache from %s\n", kdat_file); return 0; } @@ -825,8 +1260,20 @@ static void kerndat_save_cache(void) { int fd, ret; struct statfs s; + cleanup_free char *kdat_file = NULL; + cleanup_free char *kdat_file_tmp = NULL; - fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (get_kerndat_filename(&kdat_file)) + return; + + ret = asprintf(&kdat_file_tmp, "%s.tmp", kdat_file); + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return; + } + + fd = open(kdat_file_tmp, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some other criu @@ -835,6 +1282,10 @@ static void kerndat_save_cache(void) */ return; + /* + * If running as root we store the cache file on a tmpfs (/run), + * because the file should be gone after reboot. 
+ */ if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); @@ -848,40 +1299,54 @@ static void kerndat_save_cache(void) */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; + ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) - ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); + ret = rename(kdat_file_tmp, kdat_file); else { ret = -1; errno = EIO; } if (ret < 0) { - pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); -unl: - unlink(KERNDAT_CACHE_FILE_TMP); + pr_perror("Couldn't save %s", kdat_file); + unl: + unlink(kdat_file); } } static int kerndat_uffd(void) { - int uffd; + int uffd, err = 0; + + if (opts.unprivileged) + /* + * If running as non-root uffd_open() fails with + * 'Operation not permitted'. Just ignore uffd for + * non-root for now. + */ + return 0; kdat.uffd_features = 0; - uffd = uffd_open(0, &kdat.uffd_features); + uffd = uffd_open(0, &kdat.uffd_features, &err); /* - * uffd == -ENOSYS means userfaultfd is not supported on this - * system and we just happily return with kdat.has_uffd = false. - * Error other than -ENOSYS would mean "Houston, Houston, we + * err == ENOSYS means userfaultfd is not supported on this system and + * we just happily return with kdat.has_uffd = false. + * err == EPERM means that userfaultfd is not allowed as we are + * non-root user, so we also return with kdat.has_uffd = false. + * Errors other than ENOSYS and EPERM would mean "Houston, Houston, we * have a problem!" 
*/ if (uffd < 0) { - if (uffd == -ENOSYS) + if (err == ENOSYS) return 0; - + if (err == EPERM) { + pr_info("Lazy pages are not permitted\n"); + return 0; + } pr_err("Lazy pages are not available\n"); return -1; } @@ -906,21 +1371,24 @@ int kerndat_has_thp_disable(void) bool vma_match = false; if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { - if (errno != EINVAL) + if (errno != EINVAL) { + pr_perror("prctl PR_SET_THP_DISABLE failed"); return -1; + } pr_info("PR_SET_THP_DISABLE is not available\n"); return 0; } - addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (addr == MAP_FAILED) { pr_perror("Can't mmap memory for THP disable test"); return -1; } - if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) - return -1; + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { + pr_perror("prctl PR_SET_THP_DISABLE failed"); + goto out_unmap; + } f.fd = open("/proc/self/smaps", O_RDONLY); if (f.fd < 0) { @@ -953,6 +1421,8 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); + if (!kdat.has_thp_disable) + pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } @@ -972,77 +1442,690 @@ static int kerndat_tun_netns(void) return check_tun_netns_cr(&kdat.tun_ns); } +static bool kerndat_has_clone3_set_tid(void) +{ + pid_t pid; + struct _clone_args args = {}; + +#if defined(CONFIG_MIPS) + /* + * Currently the CRIU PIE assembler clone3() wrapper is + * not implemented for MIPS. + */ + kdat.has_clone3_set_tid = false; + return 0; +#endif + + args.set_tid = -1; + /* + * On a system without clone3() this will return ENOSYS. + * On a system with clone3() but without set_tid this + * will return E2BIG. + * On a system with clone3() and set_tid it will return + * EINVAL. 
+ */ + pid = syscall(__NR_clone3, &args, sizeof(args)); + + if (pid != -1) { + pr_err("Unexpected success: clone3() returned %d\n", pid); + return -1; + } + + if (errno == ENOSYS || errno == E2BIG) + return 0; + + if (errno != EINVAL) { + pr_pwarn("Unexpected error from clone3"); + return 0; + } + + kdat.has_clone3_set_tid = true; + return 0; +} + +static void kerndat_has_pidfd_open(void) +{ + int pidfd; + + pidfd = syscall(SYS_pidfd_open, getpid(), 0); + if (pidfd == -1) + kdat.has_pidfd_open = false; + else + kdat.has_pidfd_open = true; + + close_safe(&pidfd); +} + +static int kerndat_has_pidfd_getfd(void) +{ + int ret; + int fds[2]; + int val_a, val_b; + int pidfd, stolen_fd; + + ret = 0; + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, fds)) { + pr_perror("Can't open unix socket pair"); + ret = -1; + goto out; + } + + val_a = 1984; + if (write(fds[0], &val_a, sizeof(val_a)) != sizeof(val_a)) { + pr_perror("Can't write to socket"); + ret = -1; + goto close_pair; + } + + pidfd = syscall(SYS_pidfd_open, getpid(), 0); + if (pidfd == -1) { + pr_warn("Can't get pidfd\n"); + /* + * If pidfd_open is not supported then pidfd_getfd + * will not be supported as well. + */ + kdat.has_pidfd_getfd = false; + goto close_pair; + } + + stolen_fd = syscall(SYS_pidfd_getfd, pidfd, fds[1], 0); + if (stolen_fd == -1) { + kdat.has_pidfd_getfd = false; + goto close_all; + } + + if (read(fds[1], &val_b, sizeof(val_b)) != sizeof(val_b)) { + pr_perror("Can't read from socket"); + ret = -1; + goto close_all; + } + + if (val_b == val_a) { + kdat.has_pidfd_getfd = true; + } else { + /* If val_b != val_a, something unexpected happened. 
*/ + pr_err("Unexpected value read from socket\n"); + ret = -1; + } + +close_all: + close_safe(&stolen_fd); + close_safe(&pidfd); +close_pair: + close(fds[0]); + close(fds[1]); +out: + return ret; +} + +int kerndat_has_nspid(void) +{ + struct bfd f; + int ret = -1; + char *str; + + f.fd = open("/proc/self/status", O_RDONLY); + if (f.fd < 0) { + pr_perror("Can't open /proc/self/status"); + return -1; + } + if (bfdopenr(&f)) + return -1; + while ((str = breadline(&f)) != NULL) { + if (IS_ERR(str)) + goto close; + if (!strncmp(str, "NSpid:", 6)) { + kdat.has_nspid = true; + break; + } + } + ret = 0; +close: + bclose(&f); + return ret; +} + +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +static int __has_nftables_concat(void *arg) +{ + bool *has = (bool *)arg; + struct nft_ctx *nft; + int ret = 1; + + /* + * Create a separate network namespace to avoid + * collisions between two CRIU instances. + */ + if (unshare(CLONE_NEWNET)) { + pr_perror("Unable create a network namespace"); + return 1; + } + + nft = nft_ctx_new(NFT_CTX_DEFAULT); + if (!nft) + return 1; + + if (NFT_RUN_CMD(nft, "create table inet CRIU")) { + pr_warn("Can't create nftables table\n"); + *has = false; /* kdat.has_nftables_concat = false */ + ret = 0; + goto nft_ctx_free_out; + } + + if (NFT_RUN_CMD(nft, "add set inet CRIU conn { type ipv4_addr . 
inet_service ;}")) + *has = false; /* kdat.has_nftables_concat = false */ + else + *has = true; /* kdat.has_nftables_concat = true */ + + /* Clean up */ + NFT_RUN_CMD(nft, "delete table inet CRIU"); + + ret = 0; +nft_ctx_free_out: + nft_ctx_free(nft); + return ret; +} +#endif + +static int kerndat_has_nftables_concat(void) +{ +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + bool has; + + if (call_in_child_process(__has_nftables_concat, (void *)&has)) + return -1; + + kdat.has_nftables_concat = has; + return 0; +#else + pr_warn("CRIU was built without libnftables support\n"); + kdat.has_nftables_concat = false; + return 0; +#endif +} + +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + +static int __kerndat_has_ipv6_freebind(int sk) +{ + int val = 1; + + if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { + if (errno == ENOPROTOOPT) { + kdat.has_ipv6_freebind = false; + return 0; + } + pr_perror("Unable to setsockopt ipv6_freebind"); + return -1; + } + + kdat.has_ipv6_freebind = true; + return 0; +} + +static int kerndat_has_ipv6_freebind(void) +{ + int sk, ret; + + if (!kdat.ipv6) { + kdat.has_ipv6_freebind = false; + return 0; + } + + sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (sk == -1) { + pr_perror("Unable to create a ipv6 dgram socket"); + return -1; + } + + ret = __kerndat_has_ipv6_freebind(sk); + close(sk); + return ret; +} + +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int kerndat_has_membarrier_get_registrations(void) +{ + int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); + if (ret < 0) { + if (errno != EINVAL) { + return ret; + } + + kdat.has_membarrier_get_registrations = false; + } else { + kdat.has_membarrier_get_registrations = true; + } + + return 0; +} + +static int kerndat_has_close_range(void) +{ + /* fd is greater than max_fd, so close_range should return EINVAL. 
*/ + if (cr_close_range(2, 1, 0) == 0) { + pr_err("close_range succeeded unexpectedly\n"); + return -1; + } + + if (errno == ENOSYS) { + pr_debug("close_range isn't supported\n"); + return 0; + } + if (errno != EINVAL) { + pr_perror("close_range returned unexpected error code"); + return -1; + } + + kdat.has_close_range = true; + return 0; +} + +static int kerndat_has_timer_cr_ids(void) +{ + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { + if (errno == EINVAL) { + pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); + return 0; + } + pr_perror("prctl returned unexpected error code"); + return -1; + } + + kdat.has_timer_cr_ids = true; + return 0; +} + +static void breakpoint_func(void) +{ + if (raise(SIGSTOP)) + pr_perror("Unable to kill itself with SIGSTOP"); + exit(1); +} + +/* + * kerndat_breakpoints checks that hardware breakpoints work as they should. + * In some cases, they might not work in virtual machines if the hypervisor + * doesn't virtualize them. For example, they don't work in AMD SEV virtual + * machines if the Debug Virtualization extension isn't supported or isn't + * enabled in SEV_FEATURES. 
+ */ +static int kerndat_breakpoints(void) +{ + int status, ret, exit_code = -1; + pid_t pid; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return -1; + } + if (pid == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("ptrace(PTRACE_TRACEME)"); + exit(1); + } + raise(SIGSTOP); + breakpoint_func(); + exit(1); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for initial stop"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child didn't stop as expected: status=%x\n", status); + goto err; + } + ret = ptrace_set_breakpoint(pid, &breakpoint_func); + if (ret < 0) { + pr_err("Failed to set breakpoint\n"); + goto err; + } + if (ret == 0) { + pr_debug("Hardware breakpoints appear to be disabled\n"); + goto out; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for breakpoint trigger"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { + pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); + goto out; + } + kdat.has_breakpoints = true; +out: + exit_code = 0; +err: + if (kill(pid, SIGKILL)) { + pr_perror("Failed to kill the child process"); + exit_code = -1; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Failed to wait for the child process"); + exit_code = -1; + } + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("The child exited with unexpected code: %x\n", status); + exit_code = -1; + } + return exit_code; +} + +static int kerndat_has_madv_guard(void) +{ + void *map; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap a page for has_madv_guard feature test"); + return -1; + } + + if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { + if (errno != EINVAL) { + pr_perror("madvise failed (has_madv_guard check)"); + goto mmap_cleanup; + } + } else { + kdat.has_madv_guard = true; + } + + munmap(map, PAGE_SIZE); + return 0; 
+ +mmap_cleanup: + munmap(map, PAGE_SIZE); + return -1; +} + +void kerndat_warn_about_madv_guards(void) +{ + if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) + pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " + "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " + "Please, consider updating your kernel.\n"); +} + +/* + * Some features depend on resource that can be dynamically changed + * at the OS runtime. There are cases that we cannot determine the + * availability of those features at the first time we run kerndat + * check. So in later kerndat checks, we need to retry to get those + * information. This function contains calls to those kerndat checks. + * + * Those kerndat checks must + * Return -1 on error + * Return 0 when the check is successful but no new information + * Return 1 when the check is successful and there is new information + */ +int kerndat_try_load_new(void) +{ + int ret; + + ret = kerndat_get_hugetlb_dev(); + if (ret < 0) + return ret; + + ret = kerndat_has_ptrace_get_rseq_conf(); + if (ret < 0) { + pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); + return ret; + } + + ret = kerndat_has_shstk(); + if (ret < 0) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + return ret; + } + + /* New information is found, we need to save to the cache */ + if (ret) + kerndat_save_cache(); + return 0; +} + +static int root_only_init(void) +{ + int ret = 0; + + if (opts.unprivileged) + return 0; + + if (!ret && kerndat_loginuid()) { + pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_tun_netns()) { + pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_unix_file()) { + pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_link_nsid()) { + pr_err("kerndat_link_nsid failed when initializing 
kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_netns()) { + pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_nftables_concat()) { + pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_move_mount_set_group()) { + pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); + ret = -1; + } + + return ret; +} + int kerndat_init(void) { int ret; ret = kerndat_try_load_cache(); - if (ret <= 0) + if (ret < 0) return ret; + if (ret == 0) + return kerndat_try_load_new(); + + ret = 0; + /* kerndat_try_load_cache can leave some trash in kdat */ memset(&kdat, 0, sizeof(kdat)); preload_socket_modules(); - preload_netfilter_modules(); + if (!opts.unprivileged) + /* + * This uses 'iptables -L' to implicitly load necessary modules. + * If the non nft backed iptables is used it does a + * openat(AT_FDCWD, "/run/xtables.lock", O_RDONLY|O_CREAT, 0600) = -1 EACCES + * which will fail as non-root. There are no capabilities to + * change this. 
The iptables nft backend fails with + * openat(AT_FDCWD, "/proc/net/ip_tables_names", O_RDONLY) = -1 EACCES + */ + preload_netfilter_modules(); - ret = check_pagemap(); - if (!ret) - ret = kerndat_get_shmemdev(); - if (!ret) - ret = kerndat_get_dirty_track(); - if (!ret) - ret = init_zero_page_pfn(); - if (!ret) - ret = get_last_cap(); - if (!ret) - ret = kerndat_fdinfo_has_lock(); - if (!ret) - ret = get_task_size(); - if (!ret) - ret = get_ipv6(); - if (!ret) - ret = kerndat_loginuid(); - if (!ret) - ret = kerndat_iptables_has_xtlocks(); - if (!ret) - ret = kerndat_tcp_repair(); - if (!ret) - ret = kerndat_compat_restore(); - if (!ret) - ret = kerndat_socket_netns(); - if (!ret) - ret = kerndat_tun_netns(); - if (!ret) - ret = kerndat_socket_unix_file(); - if (!ret) - ret = kerndat_nsid(); - if (!ret) - ret = kerndat_link_nsid(); - if (!ret) - ret = kerndat_has_memfd_create(); - if (!ret) - ret = kerndat_detect_stack_guard_gap(); - if (!ret) - ret = kerndat_uffd(); - if (!ret) - ret = kerndat_has_thp_disable(); + if (check_pagemap()) { + pr_err("check_pagemap failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_get_shmemdev()) { + pr_err("kerndat_get_shmemdev failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_get_hugetlb_dev() < 0) { + pr_err("kerndat_get_hugetlb_dev failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_get_dirty_track()) { + pr_err("kerndat_get_dirty_track failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && init_zero_page_pfn()) { + pr_err("init_zero_page_pfn failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && get_last_cap()) { + pr_err("get_last_cap failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_fdinfo_has_lock()) { + pr_err("kerndat_fdinfo_has_lock failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && get_task_size()) { + pr_err("get_task_size failed when initializing kerndat.\n"); + ret = -1; 
+ } + if (!ret && get_ipv6()) { + pr_err("get_ipv6 failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_nsid()) { + pr_err("kerndat_nsid failed when initializing kerndat.\n"); + ret = -1; + } + + if (!ret && root_only_init()) + ret = -1; + + if (!ret && kerndat_iptables_has_xtlocks()) { + pr_err("kerndat_iptables_has_xtlocks failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_tcp_repair()) { + pr_err("kerndat_tcp_repair failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_compat_restore()) { + pr_err("kerndat_compat_restore failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_memfd_create()) { + pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_memfd_hugetlb()) { + pr_err("kerndat_has_memfd_hugetlb failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_detect_stack_guard_gap()) { + pr_err("kerndat_detect_stack_guard_gap failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_uffd()) { + pr_err("kerndat_uffd failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_thp_disable()) { + pr_err("kerndat_has_thp_disable failed when initializing kerndat.\n"); + ret = -1; + } /* Needs kdat.compat_cr filled before */ - if (!ret) - ret = kerndat_vdso_fill_symtable(); + if (!ret && kerndat_vdso_fill_symtable()) { + pr_err("kerndat_vdso_fill_symtable failed when initializing kerndat.\n"); + ret = -1; + } /* Depends on kerndat_vdso_fill_symtable() */ + if (!ret && kerndat_vdso_preserves_hint()) { + pr_err("kerndat_vdso_preserves_hint failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_x86_has_ptrace_fpu_xsave_bug()) { + pr_err("kerndat_x86_has_ptrace_fpu_xsave_bug failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_inotify_setnextwd()) { + pr_err("kerndat_has_inotify_setnextwd failed when 
initializing kerndat.\n"); + ret = -1; + } + if (!ret && has_kcmp_epoll_tfd()) { + pr_err("has_kcmp_epoll_tfd failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_fsopen()) { + pr_err("kerndat_has_fsopen failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_clone3_set_tid()) { + pr_err("kerndat_has_clone3_set_tid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && has_time_namespace()) { + pr_err("has_time_namespace failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) && kerndat_has_newifindex()) { + pr_err("kerndat_has_newifindex failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_pidfd_getfd()) { + pr_err("kerndat_has_pidfd_getfd failed when initializing kerndat.\n"); + ret = -1; + } if (!ret) - ret = kerndat_vdso_preserves_hint(); - if (!ret) - ret = kerndat_socket_netns(); - if (!ret) - ret = kerndat_nsid(); - if (!ret) - ret = kerndat_x86_has_ptrace_fpu_xsave_bug(); - if (!ret) - ret = kerndat_has_inotify_setnextwd(); - if (!ret) - ret = has_kcmp_epoll_tfd(); + kerndat_has_pidfd_open(); + if (!ret && kerndat_has_nspid()) { + pr_err("kerndat_has_nspid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_sockopt_buf_lock()) { + pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_openat2()) { + pr_err("kerndat_has_openat2 failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_rseq()) { + pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && (kerndat_has_ptrace_get_rseq_conf() < 0)) { + pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && (kerndat_has_ipv6_freebind() < 0)) { + pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && 
kerndat_has_membarrier_get_registrations()) { + pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_shstk()) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_close_range()) { + pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_timer_cr_ids()) { + pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_breakpoints()) { + pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_madv_guard()) { + pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/libnetlink.c b/criu/libnetlink.c index 18a323b8d..c7a84a44d 100644 --- a/criu/libnetlink.c +++ b/criu/libnetlink.c @@ -10,9 +10,8 @@ #include "libnetlink.h" #include "util.h" -static int nlmsg_receive(char *buf, int len, - int (*cb)(struct nlmsghdr *, struct ns_id *ns, void *), - int (*err_cb)(int, struct ns_id *, void *), struct ns_id *ns, void *arg) +static int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *, struct ns_id *ns, void *), + int (*err_cb)(int, struct ns_id *, void *), struct ns_id *ns, void *arg) { struct nlmsghdr *hdr; @@ -20,9 +19,9 @@ static int nlmsg_receive(char *buf, int len, if (hdr->nlmsg_seq != CR_NLMSG_SEQ) continue; if (hdr->nlmsg_type == NLMSG_DONE) { - int *len = (int *)NLMSG_DATA(hdr); - if (*len < 0) - return err_cb(*len, ns, arg); + int *length = (int *)NLMSG_DATA(hdr); + if (*length < 0) + return err_cb(*length, ns, arg); return 0; } if (hdr->nlmsg_type == NLMSG_ERROR) { @@ -46,7 +45,7 @@ static int nlmsg_receive(char *buf, int len, } /* - * Default errror handler: just point our an error + * Default error handler: just point our an error * and pass up to caller. 
*/ static int rtnl_return_err(int err, struct ns_id *ns, void *arg) @@ -56,8 +55,7 @@ static int rtnl_return_err(int err, struct ns_id *ns, void *arg) return err; } -int do_rtnl_req(int nl, void *req, int size, - int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), +int do_rtnl_req(int nl, void *req, int size, int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), int (*error_callback)(int err, struct ns_id *ns, void *arg), struct ns_id *ns, void *arg) { struct msghdr msg; @@ -70,16 +68,16 @@ int do_rtnl_req(int nl, void *req, int size, error_callback = rtnl_return_err; memset(&msg, 0, sizeof(msg)); - msg.msg_name = &nladdr; - msg.msg_namelen = sizeof(nladdr); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; + msg.msg_name = &nladdr; + msg.msg_namelen = sizeof(nladdr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; memset(&nladdr, 0, sizeof(nladdr)); nladdr.nl_family = AF_NETLINK; - iov.iov_base = req; - iov.iov_len = size; + iov.iov_base = req; + iov.iov_len = size; if (sendmsg(nl, &msg, 0) < 0) { err = -errno; @@ -87,16 +85,15 @@ int do_rtnl_req(int nl, void *req, int size, goto err; } - iov.iov_base = buf; - iov.iov_len = sizeof(buf); + iov.iov_base = buf; + iov.iov_len = sizeof(buf); while (1) { - memset(&msg, 0, sizeof(msg)); - msg.msg_name = &nladdr; - msg.msg_namelen = sizeof(nladdr); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; + msg.msg_name = &nladdr; + msg.msg_namelen = sizeof(nladdr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; err = recvmsg(nl, &msg, 0); if (err < 0) { @@ -130,8 +127,7 @@ err: return err; } -int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, - int alen) +int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, int alen) { int len = nla_attr_size(alen); struct rtattr *rta; @@ -172,8 +168,7 @@ int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, * @see nla_validate * @return 0 on success or a negative error code. 
*/ -int __wrap_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, - struct nla_policy *policy) +int __wrap_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, struct nla_policy *policy) { struct nlattr *nla; int rem; @@ -188,14 +183,16 @@ int __wrap_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int if (tb[type]) pr_warn("Attribute of type %#x found multiple times in message, " - "previous attribute is being ignored.\n", type); + "previous attribute is being ignored.\n", + type); tb[type] = nla; } if (rem > 0) pr_warn("netlink: %d bytes leftover after parsing " - "attributes.\n", rem); + "attributes.\n", + rem); return 0; } @@ -210,17 +207,10 @@ int __wrap_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int * * See nla_parse() */ -int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], - int maxtype, struct nla_policy *policy) +int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, struct nla_policy *policy) { if (!nlmsg_valid_hdr(nlh, hdrlen)) return -NLE_MSG_TOOSHORT; - return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), policy); -} - -int32_t nla_get_s32(const struct nlattr *nla) -{ - return *(const int32_t *) nla_data(nla); + return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } diff --git a/criu/log.c b/criu/log.c index 8bdf83534..bf6f657f2 100644 --- a/criu/log.c +++ b/criu/log.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -26,14 +27,14 @@ #include "../soccr/soccr.h" #include "compel/log.h" - -#define DEFAULT_LOGFD STDERR_FILENO +#define DEFAULT_LOGFD STDERR_FILENO /* Enable timestamps if verbosity is increased from default */ -#define LOG_TIMESTAMP (DEFAULT_LOGLEVEL + 1) -#define LOG_BUF_LEN (8*1024) -#define EARLY_LOG_BUF_LEN 1024 +#define LOG_TIMESTAMP (DEFAULT_LOGLEVEL + 1) +#define LOG_BUF_LEN (8 * 1024) +#define 
EARLY_LOG_BUF_LEN 1024 static unsigned int current_loglevel = DEFAULT_LOGLEVEL; +static void vprint_on_level(unsigned int, const char *, va_list); static char buffer[LOG_BUF_LEN]; static char buf_off = 0; @@ -52,7 +53,7 @@ static struct timeval start; * Manual buf len as sprintf will _always_ put '\0' at the end, * but we want a "constant" pid to be there on restore */ -#define TS_BUF_OFF 12 +#define TS_BUF_OFF 12 static void timediff(struct timeval *from, struct timeval *to) { @@ -71,8 +72,8 @@ static void print_ts(void) gettimeofday(&t, NULL); timediff(&start, &t); - snprintf(buffer, TS_BUF_OFF, - "(%02u.%06u)", (unsigned)t.tv_sec, (unsigned)t.tv_usec); + snprintf(buffer, TS_BUF_OFF, "(%02u.%06u", (unsigned)t.tv_sec, (unsigned)t.tv_usec); + buffer[TS_BUF_OFF - 2] = ')'; /* this will overwrite the last digit if tv_sec>=100 */ buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */ } @@ -114,6 +115,9 @@ static struct str_and_lock *first_err; int log_keep_err(void) { + if (first_err) + return 0; + first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; @@ -132,10 +136,11 @@ static void log_note_err(char *msg) * anyway, so it doesn't make much sense to try hard * and optimize this out. */ - mutex_lock(&first_err->l); - if (first_err->s[0] == '\0') - strlcpy(first_err->s, msg, sizeof(first_err->s)); - mutex_unlock(&first_err->l); + if (mutex_trylock(&first_err->l)) { + if (first_err->s[0] == '\0') + __strlcpy(first_err->s, msg, sizeof(first_err->s)); + mutex_unlock(&first_err->l); + } } } @@ -161,8 +166,7 @@ static void print_versions(void) return; } - pr_info("Running on %s %s %s %s %s\n", buf.nodename, buf.sysname, - buf.release, buf.version, buf.machine); + pr_info("Running on %s %s %s %s %s\n", buf.nodename, buf.sysname, buf.release, buf.version, buf.machine); } struct early_log_hdr { @@ -186,12 +190,11 @@ void flush_early_log_buffer(int fd) * with reading the log_level. 
*/ struct early_log_hdr *hdr = (void *)early_log_buffer + pos; - pos += sizeof(hdr); + pos += sizeof(*hdr); if (hdr->level <= current_loglevel) { size_t size = 0; while (size < hdr->len) { - ret = write(fd, early_log_buffer + pos + size, - hdr->len - size); + ret = write(fd, early_log_buffer + pos + size, hdr->len - size); if (ret <= 0) break; size += ret; @@ -199,8 +202,8 @@ void flush_early_log_buffer(int fd) } pos += hdr->len; } - if (early_log_buf_off) - pr_warn("The early log isn't empty\n"); + if ((early_log_buf_off + sizeof(struct early_log_hdr)) >= EARLY_LOG_BUF_LEN) + pr_warn("The early log buffer is full, some messages may have been lost\n"); early_log_buf_off = 0; } @@ -218,7 +221,7 @@ int log_init(const char *output) return -1; } } else if (output) { - new_logfd = open(output, O_CREAT|O_TRUNC|O_WRONLY|O_APPEND, 0600); + new_logfd = open(output, O_CREAT | O_TRUNC | O_WRONLY | O_APPEND, 0600); if (new_logfd < 0) { pr_perror("Can't create log file %s", output); return -1; @@ -317,18 +320,19 @@ unsigned int log_get_loglevel(void) static void early_vprint(const char *format, unsigned int loglevel, va_list params) { - unsigned int log_size = 0; + int log_size = 0, log_space; struct early_log_hdr *hdr; - if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(*hdr)) >= EARLY_LOG_BUF_LEN) return; /* Save loglevel */ hdr = (void *)early_log_buffer + early_log_buf_off; - hdr->level = loglevel; + hdr->level = loglevel; /* Skip the log entry size */ - early_log_buf_off += sizeof(hdr); + early_log_buf_off += sizeof(*hdr); + log_space = EARLY_LOG_BUF_LEN - early_log_buf_off; if (loglevel >= LOG_TIMESTAMP) { /* * If logging is not yet setup we just write zeros @@ -336,21 +340,24 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para * keep the same format as the other messages on * log levels with timestamps (>=LOG_TIMESTAMP). 
*/ - log_size = snprintf(early_log_buffer + early_log_buf_off, - sizeof(early_log_buffer) - early_log_buf_off, - "(00.000000) "); + log_size = snprintf(early_log_buffer + early_log_buf_off, log_space, + "(00.000000) "); } - log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, - sizeof(early_log_buffer) - early_log_buf_off - log_size, - format, params); + if (log_size < log_space) + log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, + log_space - log_size, format, params); + if (log_size > log_space) { + /* vsnprintf always add the terminating null byte. */ + log_size = log_space - 1; + } /* Save log entry size */ hdr->len = log_size; early_log_buf_off += log_size; } -void vprint_on_level(unsigned int loglevel, const char *format, va_list params) +static void vprint_on_level(unsigned int loglevel, const char *format, va_list params) { int fd, size, ret, off = 0; int _errno = errno; @@ -374,7 +381,7 @@ void vprint_on_level(unsigned int loglevel, const char *format, va_list params) print_ts(); } - size = vsnprintf(buffer + buf_off, sizeof buffer - buf_off, format, params); + size = vsnprintf(buffer + buf_off, sizeof buffer - buf_off, format, params); size += buf_off; while (off < size) { @@ -388,7 +395,7 @@ void vprint_on_level(unsigned int loglevel, const char *format, va_list params) if (loglevel == LOG_ERROR) log_note_err(buffer + buf_off); - errno = _errno; + errno = _errno; } void print_on_level(unsigned int loglevel, const char *format, ...) @@ -402,15 +409,28 @@ void print_on_level(unsigned int loglevel, const char *format, ...) 
int write_pidfile(int pid) { - int fd; + int fd, ret, exit_code = -1; fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600); if (fd == -1) { - pr_perror("Can't open %s", opts.pidfile); + pr_perror("pidfile: Can't open %s", opts.pidfile); return -1; } - dprintf(fd, "%d", pid); + ret = dprintf(fd, "%d", pid); + if (ret < 0) { + pr_perror("pidfile: Can't write pid %d to %s", pid, opts.pidfile); + goto close; + } + + if (ret == 0) { + pr_err("pidfile: Can't write pid %d to %s\n", pid, opts.pidfile); + goto close; + } + + pr_debug("pidfile: Wrote pid %d to %s (%d bytes)\n", pid, opts.pidfile, ret); + exit_code = 0; +close: close(fd); - return 0; + return exit_code; } diff --git a/criu/lsm.c b/criu/lsm.c index 9d7e55c11..5faf3e5b2 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -13,6 +13,7 @@ #include "cr_options.h" #include "lsm.h" #include "fdstore.h" +#include "apparmor.h" #include "protobuf.h" #include "images/inventory.pb-c.h" @@ -28,7 +29,9 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/apparmor/current"); + if (!f) + f = fopen_proc(pid, "attr/current"); if (!f) return -1; @@ -58,15 +61,48 @@ static int apparmor_get_label(pid_t pid, char **profile_name) *profile_name = NULL; } + if (*profile_name && collect_aa_namespace(*profile_name) < 0) { + free(*profile_name); + *profile_name = NULL; + pr_err("failed to collect AA namespace\n"); + return -1; + } + return 0; } #ifdef CONFIG_HAS_SELINUX -static int selinux_get_label(pid_t pid, char **output) +static int verify_selinux_label(char *ctx) { - security_context_t ctx; char *pos; int i; + + /* + * There are SELinux setups where SELinux seems to be enabled, + * but the returned labels are not really valid. 
See also + * https://github.com/torvalds/linux/blob/master/security/selinux/include/initial_sid_to_string.h + * + * CRIU tells the user that such labels are invalid + * and CRIU expects a SELinux label to contain three ':'. + * + * A label should look like this: + * + * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 + */ + pos = (char *)ctx; + for (i = 0; i < 3; i++) { + pos = strstr(pos, ":"); + if (!pos) + return -1; + pos++; + } + + return 0; +} + +static int selinux_get_label(pid_t pid, char **output) +{ + char *ctx; int ret = -1; if (getpidcon_raw(pid, &ctx) < 0) { @@ -74,29 +110,15 @@ static int selinux_get_label(pid_t pid, char **output) return -1; } + if (verify_selinux_label(ctx)) { + pr_err("Invalid selinux context %s\n", (char *)ctx); + goto err; + } + *output = xstrdup((char *)ctx); if (!*output) goto err; - /* - * Make sure it is a valid SELinux label. It should look like this: - * - * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 - */ - pos = (char*)ctx; - for (i = 0; i < 3; i++) { - pos = strstr(pos, ":"); - if (!pos) { - pr_err("Invalid selinux context %s\n", (char *)ctx); - xfree(*output); - *output = NULL; - goto err; - } - - *pos = 0; - pos++; - } - ret = 0; err: freecon(ctx); @@ -133,7 +155,7 @@ static int selinux_get_sockcreate_label(pid_t pid, char **output) return 0; } -int reset_setsockcreatecon() +int reset_setsockcreatecon(void) { /* Currently this only works for SELinux. */ if (kdat.lsm != LSMTYPE__SELINUX) @@ -203,19 +225,32 @@ void kerndat_lsm(void) { if (access(AA_SECURITYFS_PATH, F_OK) == 0) { kdat.lsm = LSMTYPE__APPARMOR; + kdat.apparmor_ns_dumping_enabled = check_aa_ns_dumping(); return; } #ifdef CONFIG_HAS_SELINUX - /* - * This seems to be the canonical place to mount this fs if it is - * enabled, although we may (?) want to check /selinux for posterity as - * well. 
- */ - if (access("/sys/fs/selinux", F_OK) == 0) { + if (is_selinux_enabled()) { + char *ctx; + + /* + * CRIU used to only check if /sys/fs/selinux is mounted, but that does not + * seem to be enough for CRIU's use case. CRIU actually needs to look if + * a valid label is returned. + */ + if (getpidcon_raw(getpid(), &ctx) < 0) + goto no_lsm; + + if (verify_selinux_label(ctx)) { + freecon(ctx); + goto no_lsm; + } + kdat.lsm = LSMTYPE__SELINUX; + freecon(ctx); return; } +no_lsm: #endif kdat.lsm = LSMTYPE__NO_LSM; @@ -226,26 +261,23 @@ Lsmtype host_lsm_type(void) return kdat.lsm; } -int collect_lsm_profile(pid_t pid, CredsEntry *ce) +static int collect_lsm_profile(pid_t pid, struct thread_lsm *lsm) { int ret; - ce->lsm_profile = NULL; - ce->lsm_sockcreate = NULL; - switch (kdat.lsm) { case LSMTYPE__NO_LSM: ret = 0; break; case LSMTYPE__APPARMOR: - ret = apparmor_get_label(pid, &ce->lsm_profile); + ret = apparmor_get_label(pid, &lsm->profile); break; #ifdef CONFIG_HAS_SELINUX case LSMTYPE__SELINUX: - ret = selinux_get_label(pid, &ce->lsm_profile); + ret = selinux_get_label(pid, &lsm->profile); if (ret) break; - ret = selinux_get_sockcreate_label(pid, &ce->lsm_sockcreate); + ret = selinux_get_sockcreate_label(pid, &lsm->sockcreate); break; #endif default: @@ -254,14 +286,63 @@ int collect_lsm_profile(pid_t pid, CredsEntry *ce) break; } - if (ce->lsm_profile) - pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile); - if (ce->lsm_sockcreate) - pr_info("%d has lsm sockcreate label %s\n", pid, ce->lsm_sockcreate); + if (lsm->profile) + pr_info("%d has lsm profile %s\n", pid, lsm->profile); + if (lsm->sockcreate) + pr_info("%d has lsm sockcreate label %s\n", pid, lsm->sockcreate); return ret; } +int collect_and_suspend_lsm(void) +{ + struct pstree_item *item; + + for_each_pstree_item(item) { + struct thread_lsm **thread_lsms; + int i; + + thread_lsms = xzalloc((item->nr_threads + 1) * sizeof(thread_lsms)); + if (!thread_lsms) + return -1; + dmpi(item)->thread_lsms = 
thread_lsms; + + for (i = 0; i < item->nr_threads; i++) { + thread_lsms[i] = xzalloc(sizeof(**thread_lsms)); + if (!thread_lsms[i]) + return -1; + + if (collect_lsm_profile(item->threads[i].real, thread_lsms[i]) < 0) + return -1; + } + } + + /* now, suspend the LSM; this is where code that implements something + * like PTRACE_O_SUSPEND_LSM should live. */ + switch (kdat.lsm) { + case LSMTYPE__APPARMOR: + if (suspend_aa() < 0) + return -1; + break; + case LSMTYPE__SELINUX: + break; + case LSMTYPE__NO_LSM: + break; + default: + pr_debug("don't know how to suspend LSM %d\n", kdat.lsm); + } + + return 0; +} + +int unsuspend_lsm(void) +{ + if (kdat.lsm == LSMTYPE__APPARMOR && unsuspend_aa()) + return -1; + + return 0; +} + // in inventory.c extern Lsmtype image_lsm; @@ -289,14 +370,9 @@ int render_lsm_profile(char *profile, char **val) switch (kdat.lsm) { case LSMTYPE__APPARMOR: - if (strcmp(profile, "unconfined") != 0 && asprintf(val, "changeprofile %s", profile) < 0) { - pr_err("allocating lsm profile failed\n"); - *val = NULL; - return -1; - } - break; + return render_aa_profile(val, profile); case LSMTYPE__SELINUX: - if (asprintf(val, "%s", profile) < 0) { + if (asprintf(val, "%s", opts.lsm_supplied ? 
opts.lsm_profile : profile) < 0) { *val = NULL; return -1; } diff --git a/criu/mem.c b/criu/mem.c index de66a6210..9e8740c07 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,6 +10,7 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" +#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -29,7 +30,8 @@ #include "pagemap-cache.h" #include "fault-injection.h" #include "prctl.h" -#include +#include "compel/infect-util.h" +#include "pidfd-store.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" @@ -64,7 +66,7 @@ int do_task_reset_dirty_track(int pid) if (errno == EINVAL) /* No clear-soft-dirty in kernel */ ret = 1; else { - pr_perror("Can't reset %d's dirty memory tracker (%d)", pid, errno); + pr_perror("Can't reset %d's dirty memory tracker", pid); ret = -1; } } else { @@ -79,9 +81,8 @@ int do_task_reset_dirty_track(int pid) unsigned long dump_pages_args_size(struct vm_area_list *vmas) { /* In the worst case I need one iovec for each page */ - return sizeof(struct parasite_dump_pages_args) + - vmas->nr * sizeof(struct parasite_vma_entry) + - (vmas->nr_priv_pages + 1) * sizeof(struct iovec); + return sizeof(struct parasite_dump_pages_args) + vmas->nr * sizeof(struct parasite_vma_entry) + + (vmas->nr_priv_pages + 1) * sizeof(struct iovec); } static inline bool __page_is_zero(u64 pme) @@ -99,7 +100,7 @@ static inline bool __page_in_parent(bool dirty) return opts.track_mem && opts.img_parent && !dirty; } -bool should_dump_page(VmaEntry *vmae, u64 pme) +static bool should_dump_entire_vma(VmaEntry *vmae) { /* * vDSO area must be always dumped because on restore @@ -107,30 +108,83 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; - /* - * In turn VVAR area is special and referenced from - * vDSO area by IP addressing (at least on x86) thus - * never ever dump its content but always use one provided - * by the kernel on restore, ie runtime VVAR area must - * 
be remapped into proper place.. - */ - if (vma_entry_is(vmae, VMA_AREA_VVAR)) - return false; - - /* - * Optimisation for private mapping pages, that haven't - * yet being COW-ed - */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) - return true; return false; } +/* + * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. + * Otherwise, it writes an address that has to be inspected next. + */ +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) +{ + if (!page_info) + goto err; + + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) + goto err; + + if (pmc->regs) { + while (1) { + if (pmc->regs_idx == pmc->regs_len) { + page_info->next = pmc->end; + return 0; + } + + if (vaddr < pmc->regs[pmc->regs_idx].end) + break; + pmc->regs_idx++; + } + + if (vaddr < pmc->regs[pmc->regs_idx].start) { + page_info->next = pmc->regs[pmc->regs_idx].start; + return 0; + } + + if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) + goto skip_guard_page; + + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + page_info->next = vaddr; + return 0; + } else { + u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + + if (pme & PME_GUARD_REGION) + goto skip_guard_page; + + /* + * Optimisation for private mapping pages, that haven't + * yet being COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { + page_info->next = vaddr + PAGE_SIZE; + return 0; + } + + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + page_info->softdirty = pme & PME_SOFT_DIRTY; + page_info->next = vaddr; + return 0; + } + + page_info->next = vaddr + PAGE_SIZE; + return 0; + } + +err: + pr_err("should_dump_page failed on vma " + "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", + vmae->start, vmae->end, vaddr); + 
return -1; + +skip_guard_page: + page_info->next = vaddr + PAGE_SIZE; + return 0; +} + bool page_is_zero(u64 pme) { return __page_is_zero(pme); @@ -161,27 +215,34 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but - * the memory contents is present in the pagent image set. + * the memory contents is present in the parent image set. */ -static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent) +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr, + bool has_parent) { - u64 *at = &map[PAGE_PFN(*off)]; - unsigned long pfn, nr_to_scan; + unsigned long nr_scanned; unsigned long pages[3] = {}; + unsigned long vaddr; + bool dump_all_pages; int ret = 0; - nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; + dump_all_pages = should_dump_entire_vma(vma->e); - for (pfn = 0; pfn < nr_to_scan; pfn++) { - unsigned long vaddr; + nr_scanned = 0; + for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; + struct page_info page_info = {}; int st; - if (!should_dump_page(vma->e, at[pfn])) - continue; + /* If dump_all_pages is true, should_dump_page is called to get pme. */ + if (should_dump_page(pmc, vma->e, vaddr, &page_info)) + return -1; - vaddr = vma->e->start + *off + pfn * PAGE_SIZE; + if (!dump_all_pages && page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; + continue; + } if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; @@ -193,7 +254,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. 
*/ - if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { + if (has_parent && page_in_parent(page_info.softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -213,20 +274,18 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct pages[st]++; } - *off += pfn * PAGE_SIZE; - - cnt_add(CNT_PAGES_SCANNED, nr_to_scan); + *pvaddr = vaddr; + cnt_add(CNT_PAGES_SCANNED, nr_scanned); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); - pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes\n", - pages[2] + pages[1], pages[1], pages[0]); + pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes\n", pages[2] + pages[1], pages[1], pages[0]); return ret; } -static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl *ctl, - struct vm_area_list *vma_area_list, bool skip_non_trackable) +static struct parasite_dump_pages_args * +prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_list, bool skip_non_trackable) { struct parasite_dump_pages_args *args; struct parasite_vma_entry *p_vma; @@ -246,6 +305,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; + /* + * We totally ignore MAP_HUGETLB on pre-dump. + * See also generate_vma_iovs() comment. 
+ */ + if ((vma->e->flags & MAP_HUGETLB) && skip_non_trackable) + continue; if (vma->e->prot & PROT_READ) continue; @@ -260,8 +325,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl return args; } -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, - struct parasite_dump_pages_args *args) +static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct parasite_dump_pages_args *args) { struct page_pipe_buf *ppb; int ret = 0; @@ -272,8 +336,8 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %d pages %d segs %u pipe %d off\n", - args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); + pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); if (ret < 0) @@ -307,17 +371,19 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer) return ret; } -static int detect_pid_reuse(struct pstree_item *item, - struct proc_pid_stat* pps, - InventoryEntry *parent_ie) +static int detect_pid_reuse(struct pstree_item *item, struct proc_pid_stat *pps, InventoryEntry *parent_ie) { unsigned long long dump_ticks; struct proc_pid_stat pps_buf; unsigned long long tps; /* ticks per second */ int ret; + /* Check pid reuse using pidfds */ + if (pidfd_store_ready()) + return pidfd_store_check_pid_reuse(item->pid->real); + if (!parent_ie) { - pr_err("Pid-reuse detection failed: no parent inventory, " \ + pr_err("Pid-reuse detection failed: no parent inventory, " "check warnings in get_parent_inventory\n"); return -1; } @@ -335,47 +401,112 @@ static int detect_pid_reuse(struct pstree_item *item, return -1; } - dump_ticks = parent_ie->dump_uptime/(USEC_PER_SEC/tps); + dump_ticks = parent_ie->dump_uptime / (USEC_PER_SEC / tps); if (pps->start_time >= dump_ticks) { 
/* Print "*" if unsure */ - pr_warn("Pid reuse%s detected for pid %d\n", - pps->start_time == dump_ticks ? "*" : "", - item->pid->real); + pr_warn("Pid reuse%s detected for pid %d\n", pps->start_time == dump_ticks ? "*" : "", item->pid->real); return 1; } return 0; } -static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, - struct page_pipe *pp, struct page_xfer *xfer, - struct parasite_dump_pages_args *args, - struct parasite_ctl *ctl, pmc_t *pmc, - bool has_parent, bool pre_dump) +static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, + struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, + pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { - u64 off = 0; - u64 *map; + u64 vaddr; int ret; - if (!vma_area_is_private(vma, kdat.task_size) && - !vma_area_is(vma, VMA_ANON_SHARED)) + if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) + return 0; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. + */ + if (vma_entry_is(vma->e, VMA_AREA_VVAR)) return 0; - if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { + /* + * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") + * tells us that: + * Under memory pressure, mm can just drop the pages (so that they're + * zero when read back again). + * + * Let's just skip MAP_DROPPABLE mappings pages dump logic. + */ + if (vma->e->flags & MAP_DROPPABLE) + return 0; + + /* + * To facilitate any combination of pre-dump modes to run after + * one another, we need to take extra care as discussed below. 
+ * + * The SPLICE mode pre-dump, processes all type of memory regions, + * whereas READ mode pre-dump skips processing those memory regions + * which lacks PROT_READ flag. + * + * Now on mixing pre-dump modes: + * If SPLICE mode follows SPLICE mode : no issue + * -> everything dumped both the times + * + * If READ mode follows READ mode : no issue + * -> non-PROT_READ skipped both the time + * + * If READ mode follows SPLICE mode : no issue + * -> everything dumped at first, + * the non-PROT_READ skipped later + * + * If SPLICE mode follows READ mode : Need special care + * + * If READ pre-dump happens first, then it has skipped processing + * non-PROT_READ regions. Following SPLICE pre-dump expects pagemap + * entries for all mappings in parent pagemap, but last READ mode + * pre-dump cycle has skipped processing & pagemap generation for + * non-PROT_READ regions. So SPLICE mode throws error of missing + * pagemap entry for encountered non-PROT_READ mapping. + * + * To resolve this, the pre-dump-mode is stored in current pre-dump's + * inventoy file. This pre-dump mode is read back from this file + * (present in parent pre-dump dir) as parent-pre-dump-mode during + * next pre-dump. + * + * If parent-pre-dump-mode and next-pre-dump-mode are in READ-mode -> + * SPLICE-mode order, then SPLICE mode doesn't expect mappings for + * non-PROT_READ regions in parent-image and marks "has_parent=false". + */ + + if (!(vma->e->prot & PROT_READ)) { + if (opts.pre_dump_mode == PRE_DUMP_READ && pre_dump) + return 0; + if ((parent_predump_mode == PRE_DUMP_READ && opts.pre_dump_mode == PRE_DUMP_SPLICE) || !pre_dump) + has_parent = false; + } + + /* + * We want to completely ignore these VMA types on the pre-dump: + * 1. VMA_AREA_AIORING because it is not soft-dirty trackable (kernel writes) + * 2. MAP_HUGETLB mappings because they are not premapped and we can't use + * parent images from pre-dump stages. 
Instead, the content is restored from + * the parasite context using full memory image. + */ + if (vma_entry_is(vma->e, VMA_AREA_AIORING) || vma->e->flags & MAP_HUGETLB) { if (pre_dump) return 0; has_parent = false; } - map = pmc_get_map(pmc, vma); - if (!map) + if (pmc_get_map(pmc, vma)) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) - return add_shmem_area(item->pid->real, vma->e, map); - + return add_shmem_area(item->pid->real, vma->e, pmc); + vaddr = vma->e->start; again: - ret = generate_iovs(item,vma, pp, map, &off, has_parent); + ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); @@ -391,11 +522,9 @@ again: return ret; } -static int __parasite_dump_pages_seized(struct pstree_item *item, - struct parasite_dump_pages_args *args, - struct vm_area_list *vma_area_list, - struct mem_dump_ctl *mdc, - struct parasite_ctl *ctl) +static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasite_dump_pages_args *args, + struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, + struct parasite_ctl *ctl) { pmc_t pmc = PMC_INIT; struct page_pipe *pp; @@ -406,6 +535,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, unsigned long pmc_size; int possible_pid_reuse = 0; bool has_parent; + int parent_predump_mode = -1; pr_info("\n"); pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real); @@ -413,17 +543,14 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, timing_start(TIME_MEMDUMP); - pr_debug(" Private vmas %lu/%lu pages\n", - vma_area_list->nr_priv_pages_longest, vma_area_list->nr_priv_pages); + pr_debug(" Private vmas %lu/%lu pages\n", vma_area_list->nr_priv_pages_longest, vma_area_list->nr_priv_pages); /* * Step 0 -- prepare */ - pmc_size = max(vma_area_list->nr_priv_pages_longest, - vma_area_list->nr_shared_pages_longest); - if (pmc_init(&pmc, item->pid->real, &vma_area_list->h, - pmc_size * PAGE_SIZE)) + pmc_size = 
max(vma_area_list->nr_priv_pages_longest, vma_area_list->nr_shared_pages_longest); + if (pmc_init(&pmc, item->pid->real, &vma_area_list->h, pmc_size * PAGE_SIZE)) return -1; if (!(mdc->pre_dump || mdc->lazy)) @@ -433,9 +560,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, * use, i.e. on non-lazy non-predump. */ cpp_flags |= PP_CHUNK_MODE; - pp = create_page_pipe(vma_area_list->nr_priv_pages, - mdc->lazy ? NULL : pargs_iovs(args), - cpp_flags); + pp = create_page_pipe(vma_area_list->nr_priv_pages, mdc->lazy ? NULL : pargs_iovs(args), cpp_flags); if (!pp) goto out; @@ -460,29 +585,43 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, } if (xfer.parent) { - possible_pid_reuse = detect_pid_reuse(item, mdc->stat, - mdc->parent_ie); + possible_pid_reuse = detect_pid_reuse(item, mdc->stat, mdc->parent_ie); if (possible_pid_reuse == -1) goto out_xfer; } - /* * Step 1 -- generate the pagemap */ args->off = 0; has_parent = !!xfer.parent && !possible_pid_reuse; + if (mdc->parent_ie) + parent_predump_mode = mdc->parent_ie->pre_dump_mode; + list_for_each_entry(vma_area, &vma_area_list->h, list) { - ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, - &pmc, has_parent, mdc->pre_dump); + if (vma_area_is(vma_area, VMA_AREA_GUARD)) + continue; + + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, + parent_predump_mode); if (ret < 0) goto out_xfer; } if (mdc->lazy) - memcpy(pargs_iovs(args), pp->iovs, - sizeof(struct iovec) * pp->nr_iovs); - ret = drain_pages(pp, ctl, args); + memcpy(pargs_iovs(args), pp->iovs, sizeof(struct iovec) * pp->nr_iovs); + + /* + * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump + * will happen after task unfreezing in cr_pre_dump_finish(). This is + * actual optimization which reduces time for which process was frozen + * during pre-dump. 
+ */ + if (mdc->pre_dump && opts.pre_dump_mode == PRE_DUMP_READ) + ret = 0; + else + ret = drain_pages(pp, ctl, args); + if (!ret && !mdc->pre_dump) ret = xfer_pages(pp, &xfer); if (ret) @@ -512,10 +651,8 @@ out: return exit_code; } -int parasite_dump_pages_seized(struct pstree_item *item, - struct vm_area_list *vma_area_list, - struct mem_dump_ctl *mdc, - struct parasite_ctl *ctl) +int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, + struct parasite_ctl *ctl) { int ret; struct parasite_dump_pages_args *pargs; @@ -528,13 +665,47 @@ int parasite_dump_pages_seized(struct pstree_item *item, * able to read the memory contents. * * Afterwards -- reprotect memory back. + * + * This step is required for "splice" mode pre-dump and dump. + * Skip this step for "read" mode pre-dump. + * "read" mode pre-dump delegates processing of non-PROT_READ + * regions to dump stage. Adding PROT_READ works fine for + * static processing (target process frozen during pre-dump) + * and fails for dynamic as explained below. + * + * Consider following sequence of instances to reason, why + * not to add PROT_READ in "read" mode pre-dump ? + * + * CRIU- "read" pre-dump Target Process + * + * 1. Creates mapping M + * without PROT_READ + * 2. CRIU freezes target + * process + * 3. Collect the mappings + * 4. Add PROT_READ to M + * (non-PROT_READ region) + * 5. CRIU unfreezes target + * process + * 6. Add flag PROT_READ + * to mapping M + * 7. Revoke flag PROT_READ + * from mapping M + * 8. process_vm_readv tries + * to copy mapping M + * (believing M have + * PROT_READ flag) + * 9. 
syscall fails to copy + * data from M */ - pargs->add_prot = PROT_READ; - ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); - if (ret) { - pr_err("Can't dump unprotect vmas with parasite\n"); - return ret; + if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + pargs->add_prot = PROT_READ; + ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); + if (ret) { + pr_err("Can't dump unprotect vmas with parasite\n"); + return ret; + } } if (fault_injected(FI_DUMP_PAGES)) { @@ -549,10 +720,12 @@ int parasite_dump_pages_seized(struct pstree_item *item, return ret; } - pargs->add_prot = 0; - if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { - pr_err("Can't rollback unprotected vmas with parasite\n"); - ret = -1; + if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + pargs->add_prot = 0; + if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { + pr_err("Can't rollback unprotected vmas with parasite\n"); + ret = -1; + } } return ret; @@ -588,7 +761,6 @@ int prepare_mm_pid(struct pstree_item *i) return -1; } - while (vn < ri->mm->n_vmas || img != NULL) { struct vma_area *vma; @@ -617,12 +789,11 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += PAGE_SIZE; } - pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); + pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); if (vma_area_is(vma, VMA_ANON_SHARED)) ret = collect_shmem(pid, vma); - else if (vma_area_is(vma, VMA_FILE_PRIVATE) || - vma_area_is(vma, VMA_FILE_SHARED)) + else if (vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED)) ret = collect_filemap(vma); else if (vma_area_is(vma, VMA_AREA_SOCKET)) ret = collect_socket_map(vma); @@ -655,6 +826,9 @@ static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma) return false; if (!vma_area_is_private(pvma, kdat.task_size)) return false; + /* ... 
but not hugetlb mappings */ + if (vma->e->flags & MAP_HUGETLB || pvma->e->flags & MAP_HUGETLB) + return false; /* ... have growsdown and anon flags coincide */ if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS)) return false; @@ -662,7 +836,7 @@ static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma) if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid) return false; - pr_debug("Found two COW VMAs @0x%"PRIx64"-0x%"PRIx64"\n", vma->e->start, pvma->e->end); + pr_debug("Found two COW VMAs @0x%" PRIx64 "-0x%" PRIx64 "\n", vma->e->start, pvma->e->end); return true; } @@ -688,14 +862,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if (&vma->list == &vmas->h) + if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) return; } /* ... no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if (&pvma->list == &pvmas->h) + if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) return; } } @@ -754,6 +928,7 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); + if (!vma_inherited(vma)) { int flag = 0; /* @@ -782,10 +957,8 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void * bits there. Ideally we'd check for the whole COW-chain * having any data in. 
*/ - addr = mmap(*tgt_addr, size, - vma->e->prot | PROT_WRITE, - vma->e->flags | MAP_FIXED | flag, - vma->e->fd, vma->e->pgoff); + addr = mmap(*tgt_addr, size, vma->e->prot | PROT_WRITE, vma->e->flags | MAP_FIXED | flag, vma->e->fd, + vma->e->pgoff); if (addr == MAP_FAILED) { pr_perror("Unable to map ANON_VMA"); @@ -805,8 +978,7 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void if (vma_has_guard_gap_hidden(vma)) paddr -= PAGE_SIZE; - addr = mremap(paddr, size, size, - MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr); + addr = mremap(paddr, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr); if (addr != *tgt_addr) { pr_perror("Unable to remap a private vma"); return -1; @@ -814,9 +986,9 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void } vma->e->status |= VMA_PREMMAPED; - vma->premmaped_addr = (unsigned long) addr; - pr_debug("\tpremap %#016"PRIx64"-%#016"PRIx64" -> %016lx\n", - vma->e->start, vma->e->end, (unsigned long)addr); + vma->premmaped_addr = (unsigned long)addr; + pr_debug("\tpremap %#016" PRIx64 "-%#016" PRIx64 " -> %016lx\n", vma->e->start, vma->e->end, + (unsigned long)addr); if (vma_has_guard_gap_hidden(vma)) { /* Skip guard page */ vma->e->start += PAGE_SIZE; @@ -832,6 +1004,15 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { + /* + * Shadow stack VMAs cannot be mmap()ed, they must be created using + * map_shadow_stack() system call. + * Premap them to reserve virtual address space and populate them + * to have there contents available for later copying. 
+ */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + return true; + /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the @@ -847,8 +1028,7 @@ static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head prev = list_entry(vma->list.prev, struct vma_area, list); if (prev->e->end == vma->e->start) { - pr_debug("Force premmap for 0x%"PRIx64":0x%"PRIx64"\n", - vma->e->start, vma->e->end); + pr_debug("Force premmap for 0x%" PRIx64 ":0x%" PRIx64 "\n", vma->e->start, vma->e->end); return true; } } @@ -865,15 +1045,15 @@ static int task_size_check(pid_t pid, VmaEntry *entry) #ifdef __s390x__ if (entry->end <= kdat.task_size) return 0; - pr_err("Can't restore high memory region %lx-%lx because kernel does only support vmas up to %lx\n", entry->start, entry->end, kdat.task_size); + pr_err("Can't restore high memory region %lx-%lx because kernel does only support vmas up to %lx\n", + entry->start, entry->end, kdat.task_size); return -1; #else return 0; #endif } -static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, - void **at, struct page_read *pr) +static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, void **at, struct page_read *pr) { struct vma_area *vma; unsigned long pstart = 0; @@ -883,6 +1063,9 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -897,6 +1080,13 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, if (!vma_area_is_private(vma, kdat.task_size)) continue; + if (vma->e->flags & MAP_HUGETLB) + continue; + + /* VMA offset may change due to plugin so we cannot premap */ + if (vma->e->status & VMA_EXT_PLUGIN) + continue; + if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h)) { /* * VMA in question is not shared 
with anyone. We'll @@ -907,7 +1097,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; - if (pr->pe->vaddr > vma->e->end) + if (pr->pe->vaddr >= vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); @@ -937,6 +1127,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned int nr_shared = 0; unsigned int nr_dropped = 0; unsigned int nr_compared = 0; + unsigned int nr_enqueued = 0; unsigned int nr_lazy = 0; unsigned long va; @@ -995,13 +1186,11 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) } if (!vma_area_is(vma, VMA_PREMMAPED)) { - unsigned long len = min_t(unsigned long, - (nr_pages - i) * PAGE_SIZE, - vma->e->end - va); + unsigned long len = min_t(unsigned long, (nr_pages - i) * PAGE_SIZE, vma->e->end - va); if (vma->e->status & VMA_NO_PROT_WRITE) { - pr_debug("VMA 0x%"PRIx64":0x%"PRIx64" RO %#lx:%lu IO\n", - vma->e->start, vma->e->end, va, nr_pages); + pr_debug("VMA 0x%" PRIx64 ":0x%" PRIx64 " RO %#lx:%lu IO\n", vma->e->start, + vma->e->end, va, nr_pages); BUG(); } @@ -1014,7 +1203,8 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; - pr_debug("Enqueue page-read\n"); + + nr_enqueued++; continue; } @@ -1023,8 +1213,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) */ off = (va - vma->e->start) / PAGE_SIZE; - p = decode_pointer((off) * PAGE_SIZE + - vma->premmaped_addr); + p = decode_pointer((off)*PAGE_SIZE + vma->premmaped_addr); set_bit(off, vma->page_bitmap); if (vma_inherited(vma)) { @@ -1068,7 +1257,6 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) bitmap_set(vma->page_bitmap, off + 1, nr - 1); } - } } @@ -1085,6 +1273,9 @@ err_read: unsigned long size, i = 0; void *addr = 
decode_pointer(vma->premmaped_addr); + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!vma_inherited(vma)) continue; @@ -1093,11 +1284,10 @@ err_read: /* Find all pages, which are not shared with this child */ i = find_next_bit(vma->pvma->page_bitmap, size, i); - if ( i >= size) + if (i >= size) break; - ret = madvise(addr + PAGE_SIZE * i, - PAGE_SIZE, MADV_DONTNEED); + ret = madvise(addr + PAGE_SIZE * i, PAGE_SIZE, MADV_DONTNEED); if (ret < 0) { pr_perror("madvise failed"); return -1; @@ -1113,21 +1303,19 @@ err_read: pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); - pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_enqueued: %d\n", nr_enqueued); pr_info("nr_lazy: %d\n", nr_lazy); return 0; err_addr: - pr_err("Page entry address %lx outside of VMA %lx-%lx\n", - va, (long)vma->e->start, (long)vma->e->end); + pr_err("Page entry address %lx outside of VMA %lx-%lx\n", va, (long)vma->e->start, (long)vma->e->end); return -1; } static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { - struct _MmEntry *mm = rsti(t)->mm; - /* * There is no need to disable it if the page read doesn't * have parent. 
In this case VMA will be empty until @@ -1150,8 +1338,6 @@ static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) pr_perror("Cannot disable THP"); return -1; } - if (!(mm->has_thp_disabled && mm->thp_disabled)) - rsti(t)->has_thp_enabled = true; return 0; } @@ -1205,8 +1391,7 @@ int prepare_mappings(struct pstree_item *t) if (old_premmapped_addr) { ret = munmap(old_premmapped_addr, old_premmapped_len); if (ret < 0) - pr_perror("Unable to unmap %p(%lx)", - old_premmapped_addr, old_premmapped_len); + pr_perror("Unable to unmap %p(%lx)", old_premmapped_addr, old_premmapped_len); } /* @@ -1222,8 +1407,7 @@ int prepare_mappings(struct pstree_item *t) if (ret < 0) pr_perror("Unable to unmap %p(%lx)", addr, tail); rsti(t)->premmapped_len = old_premmapped_len; - pr_info("Shrunk premap area to %p(%lx)\n", - rsti(t)->premmapped_addr, rsti(t)->premmapped_len); + pr_info("Shrunk premap area to %p(%lx)\n", rsti(t)->premmapped_addr, rsti(t)->premmapped_len); } out: @@ -1276,9 +1460,8 @@ int open_vmas(struct pstree_item *t) if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) continue; - pr_info("Opening %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" (%x) vma\n", - vma->e->start, vma->e->end, - vma->e->pgoff, vma->e->status); + pr_info("Opening %#016" PRIx64 "-%#016" PRIx64 " %#016" PRIx64 " (%x) vma\n", vma->e->start, + vma->e->end, vma->e->pgoff, vma->e->status); if (vma->vm_open(pid, vma)) { pr_err("`- Can't open vma\n"); @@ -1290,8 +1473,7 @@ int open_vmas(struct pstree_item *t) * turn, puts the VMA_CLOSE bit itself. 
For all the rest we * need to put it by hands, so that the restorer closes the fd */ - if (!(vma_area_is(vma, VMA_FILE_PRIVATE) || - vma_area_is(vma, VMA_FILE_SHARED))) + if (!(vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED))) vma->e->status |= VMA_CLOSE; } @@ -1304,12 +1486,25 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta) { struct cr_img *pages; + /* + * We optimize the case when rsti(t)->vma_io is empty. + * + * This is useful when using the image streamer, where all VMAs are + * premapped (pr->pieok is false). This avoids re-opening the + * CR_FD_PAGES file, which may be readable only once. + */ + if (list_empty(&rsti(t)->vma_io)) { + ta->vma_ios = NULL; + ta->vma_ios_n = 0; + ta->vma_ios_fd = -1; + return 0; + } + /* * If auto-dedup is on we need RDWR mode to be able to punch holes in * the input files (in restorer.c) */ - pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR, - rsti(t)->pages_img_id); + pages = open_image(CR_FD_PAGES, opts.auto_dedup ?
O_RDWR : O_RSTR, rsti(t)->pages_img_id); if (!pages) return -1; @@ -1344,3 +1539,72 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } + +int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) +{ + int pagemap_fd = -1; + struct page_region *regs = NULL; + long regs_len = 0; + int i, ret = -1; + + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = 0, + .end = kdat.task_size, + .walk_end = 0, + .vec_len = 1000, /* this should be enough for most cases */ + .max_pages = 0, + .category_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + }; + + if (!kdat.has_pagemap_scan_guard_pages) { + ret = 0; + goto out; + } + + pagemap_fd = open_proc(pid, "pagemap"); + if (pagemap_fd < 0) + goto out; + + regs = xmalloc(args.vec_len * sizeof(struct page_region)); + if (!regs) + goto out; + args.vec = (long)regs; + + do { + /* start from where we finished the last time */ + args.start = args.walk_end; + regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, &args); + if (regs_len == -1) { + pr_perror("PAGEMAP_SCAN"); + goto out; + } + + for (i = 0; i < regs_len; i++) { + struct vma_area *vma; + + BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); + + vma = alloc_vma_area(); + if (!vma) + goto out; + + vma->e->start = regs[i].start; + vma->e->end = regs[i].end; + vma->e->status = VMA_AREA_GUARD; + + list_add_tail(&vma->list, &vma_area_list->h); + vma_area_list->nr++; + } + } while (args.walk_end != kdat.task_size); + + ret = 0; + +out: + xfree(regs); + if (pagemap_fd >= 0) + close(pagemap_fd); + return ret; +} diff --git a/criu/memfd.c b/criu/memfd.c new file mode 100644 index 000000000..9d9f0621f --- /dev/null +++ b/criu/memfd.c @@ -0,0 +1,492 @@ +#include +#include + +#include "common/compiler.h" +#include "common/lock.h" +#include "memfd.h" +#include "fdinfo.h" +#include "imgset.h" +#include "image.h" +#include "util.h" +#include "log.h" +#include "files.h" +#include "fs-magic.h" 
+#include "kerndat.h" +#include "files-reg.h" +#include "rst-malloc.h" +#include "fdstore.h" +#include "file-ids.h" +#include "namespaces.h" +#include "shmem.h" +#include "hugetlb.h" + +#include "protobuf.h" +#include "images/memfd.pb-c.h" + +#define MEMFD_PREFIX "/memfd:" +#define MEMFD_PREFIX_LEN (sizeof(MEMFD_PREFIX) - 1) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +/* Linux 5.1+ */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ + +struct memfd_dump_inode { + struct list_head list; + u32 id; + u32 dev; + u32 ino; +}; + +struct memfd_restore_inode { + struct list_head list; + mutex_t lock; + int fdstore_id; + unsigned int pending_seals; + MemfdInodeEntry *mie; + bool was_opened_rw; +}; + +static LIST_HEAD(memfd_inodes); + +/* + * Dump only + */ + +static u32 memfd_inode_ids = 1; + +int is_memfd(dev_t dev) +{ + return dev == kdat.shmem_dev; +} + +static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char *name, const struct stat *st) +{ + MemfdInodeEntry mie = MEMFD_INODE_ENTRY__INIT; + int ret = -1, flag; + u32 shmid; + + /* + * shmids are chosen as the inode number of the corresponding mmapped + * file. See handle_vma() in proc_parse.c. + * It works for memfd too, because we share the same device as the + * shmem device. 
+ */ + shmid = inode->ino; + + pr_info("Dumping memfd:%s contents (id %#x, shmid: %#x, size: %" PRIu64 ")\n", name, inode->id, shmid, + st->st_size); + + if (dump_one_memfd_shmem(fd, shmid, st->st_size) < 0) + goto out; + + mie.inode_id = inode->id; + mie.uid = userns_uid(st->st_uid); + mie.gid = userns_gid(st->st_gid); + mie.name = (char *)name; + mie.size = st->st_size; + mie.shmid = shmid; + if (is_hugetlb_dev(inode->dev, &flag)) { + mie.has_hugetlb_flag = true; + mie.hugetlb_flag = flag | MFD_HUGETLB; + } + mie.mode = st->st_mode; + mie.has_mode = true; + + mie.seals = fcntl(fd, F_GET_SEALS); + if (mie.seals == -1) { + if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { + pr_perror("fcntl(F_GET_SEALS)"); + goto out; + } + /* Kernels before 4.16 don't allow MFD_HUGETLB | + * MFD_ALLOW_SEALING and return EINVAL for + * fcntl(MFD_HUGETLB-enabled fd). + */ + mie.seals = F_SEAL_SEAL; + } + + if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) + goto out; + + ret = 0; + +out: + return ret; +} + +static struct memfd_dump_inode *dump_unique_memfd_inode(int lfd, const char *name, const struct stat *st) +{ + struct memfd_dump_inode *inode; + int fd; + + list_for_each_entry(inode, &memfd_inodes, list) + if ((inode->dev == st->st_dev) && (inode->ino == st->st_ino)) + return inode; + + inode = xmalloc(sizeof(*inode)); + if (inode == NULL) + return NULL; + + inode->dev = st->st_dev; + inode->ino = st->st_ino; + inode->id = memfd_inode_ids++; + + fd = open_proc(PROC_SELF, "fd/%d", lfd); + if (fd < 0) { + xfree(inode); + return NULL; + } + + if (dump_memfd_inode(fd, inode, name, st)) { + close(fd); + xfree(inode); + return NULL; + } + close(fd); + + list_add_tail(&inode->list, &memfd_inodes); + + return inode; +} + +static int dump_one_memfd(int lfd, u32 id, const struct fd_parms *p) +{ + MemfdFileEntry mfe = MEMFD_FILE_ENTRY__INIT; + FileEntry fe = FILE_ENTRY__INIT; + struct memfd_dump_inode *inode; + struct fd_link _link, *link; + 
const char *name; + + if (!p->link) { + if (fill_fdlink(lfd, p, &_link)) + return -1; + link = &_link; + } else + link = p->link; + + link_strip_deleted(link); + /* link->name is always started with "." which has to be skipped. */ + if (strncmp(link->name + 1, MEMFD_PREFIX, MEMFD_PREFIX_LEN) == 0) + name = &link->name[1 + MEMFD_PREFIX_LEN]; + else + name = link->name + 1; + + inode = dump_unique_memfd_inode(lfd, name, &p->stat); + if (!inode) + return -1; + + mfe.id = id; + mfe.flags = p->flags; + mfe.pos = p->pos; + mfe.fown = (FownEntry *)&p->fown; + mfe.inode_id = inode->id; + + fe.type = FD_TYPES__MEMFD; + fe.id = mfe.id; + fe.memfd = &mfe; + + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms) +{ + if (fd_id_generate_special(parms, id)) + return dump_one_memfd(lfd, *id, parms); + return 0; +} + +const struct fdtype_ops memfd_dump_ops = { + .type = FD_TYPES__MEMFD, + .dump = dump_one_memfd, +}; + +/* + * Restore only + */ + +struct memfd_info { + MemfdFileEntry *mfe; + struct file_desc d; + struct memfd_restore_inode *inode; +}; + +static struct memfd_restore_inode *memfd_alloc_inode(int id) +{ + struct memfd_restore_inode *inode; + + list_for_each_entry(inode, &memfd_inodes, list) + if (inode->mie->inode_id == id) + return inode; + + pr_err("Unable to find the %d memfd inode\n", id); + return NULL; +} + +static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + MemfdInodeEntry *mie = pb_msg(base, MemfdInodeEntry); + struct memfd_restore_inode *inode = o; + + inode->mie = mie; + mutex_init(&inode->lock); + inode->fdstore_id = -1; + inode->pending_seals = 0; + inode->was_opened_rw = false; + + list_add_tail(&inode->list, &memfd_inodes); + + return 0; +} + +static struct collect_image_info memfd_inode_cinfo = { + .fd_type = CR_FD_MEMFD_INODE, + .pb_type = PB_MEMFD_INODE, + .priv_size = sizeof(struct memfd_restore_inode), + .collect = 
collect_one_memfd_inode, + .flags = COLLECT_SHARED | COLLECT_NOFREE, +}; + +int prepare_memfd_inodes(void) +{ + return collect_image(&memfd_inode_cinfo); +} + +/* + * Create the memfd described by inode->mie, restore its contents and + * ownership (plus mode when recorded) and add it to the fdstore. + * + * Seals other than a lone F_SEAL_SEAL are only recorded in + * inode->pending_seals here; they are not applied by this function. + * + * Returns the new file descriptor on success and -1 on any failure + * (the descriptor is closed and a failed fdstore_add() result is not + * kept in inode->fdstore_id). + */ +static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) +{ + MemfdInodeEntry *mie = NULL; + int fd = -1; + int ret = -1; + int flags; + + mie = inode->mie; + if (mie->seals == F_SEAL_SEAL) { + inode->pending_seals = 0; + flags = 0; + } else { + /* Seals are applied later due to F_SEAL_FUTURE_WRITE */ + inode->pending_seals = mie->seals; + flags = MFD_ALLOW_SEALING; + } + + if (mie->has_hugetlb_flag) + flags |= mie->hugetlb_flag; + + fd = memfd_create(mie->name, flags); + if (fd < 0) { + pr_perror("Can't create memfd:%s", mie->name); + goto out; + } + + if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) + goto out; + + if (mie->has_mode) + ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); + else + ret = cr_fchown(fd, mie->uid, mie->gid); + if (ret) { + pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, + (int)mie->gid, mie->has_mode ?
(int)mie->mode : -1, mie->name); + ret = -1; + goto out; + } + + inode->fdstore_id = fdstore_add(fd); + if (inode->fdstore_id < 0) { + /* + * ret is 0 here after the successful chown/chmod, so it must be + * set explicitly or the caller would take fd 0 as a valid memfd. + * Restore the -1 "not cached" marker memfd_open_inode() tests for. + */ + inode->fdstore_id = -1; + ret = -1; + goto out; + } + + ret = fd; + fd = -1; + +out: + if (fd != -1) + close(fd); + return ret; +} + +static int memfd_open_inode(struct memfd_restore_inode *inode) +{ + int fd; + + if (inode->fdstore_id != -1) + return fdstore_get(inode->fdstore_id); + + mutex_lock(&inode->lock); + if (inode->fdstore_id != -1) + fd = fdstore_get(inode->fdstore_id); + else + fd = memfd_open_inode_nocache(inode); + mutex_unlock(&inode->lock); + + return fd; +} + +int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) +{ + struct memfd_info *mfi; + MemfdFileEntry *mfe; + int fd, _fd; + u32 flags; + + mfi = container_of(d, struct memfd_info, d); + mfe = mfi->mfe; + + pr_info("Restoring memfd id=%d\n", mfe->id); + + fd = memfd_open_inode(mfi->inode); + if (fd < 0) + return -1; + + /* Reopen the fd with original permissions */ + flags = fdflags ? *fdflags : mfe->flags; + + if (filemap && (flags & O_ACCMODE) == O_RDWR) + return fd; + + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { + /* + * If there is only a single RW-opened fd for a memfd, it can + * be used to pass it to execveat() with AT_EMPTY_PATH to have + * its contents executed. This currently works only for the + * original fd from memfd_create() so return the original fd + * once -- in case the caller expects to be the sole opener + * and does execveat() from this memfd. + */ + if (!fcntl(fd, F_SETFL, flags)) { + mfi->inode->was_opened_rw = true; + return fd; + } + + pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); + } + + /* + * Ideally we should call compat version open() to not force the + * O_LARGEFILE file flag with regular open(). It doesn't seem that + * important though.
+ */ + _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); + if (_fd < 0) + pr_perror("Can't reopen memfd id=%d", mfe->id); + else if (!filemap && (flags & O_ACCMODE) == O_RDWR) + pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); + + close(fd); + return _fd; +} + +static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) +{ + MemfdFileEntry *mfe; + int fd; + + if (inherited_fd(d, new_fd)) + return 0; + + fd = memfd_open(d, NULL, false); + if (fd < 0) + return -1; + + mfe = container_of(d, struct memfd_info, d)->mfe; + + if (restore_fown(fd, mfe->fown) < 0) + goto err; + + if (lseek(fd, mfe->pos, SEEK_SET) < 0) { + pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); + goto err; + } + + *new_fd = fd; + return 0; + +err: + close(fd); + return -1; +} + +static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) +{ + MemfdInodeEntry *mie = NULL; + struct memfd_info *mfi; + + mfi = container_of(d, struct memfd_info, d); + + mie = mfi->inode->mie; + if (snprintf(buf, s, "%s%s", MEMFD_PREFIX, mie->name) >= s) { + pr_err("Buffer too small for memfd name %s\n", mie->name); + return NULL; + } + + return buf; +} + +static struct file_desc_ops memfd_desc_ops = { + .type = FD_TYPES__MEMFD, + .open = memfd_open_fe_fd, + .name = memfd_d_name, +}; + +static int collect_one_memfd(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + struct memfd_info *info = o; + + info->mfe = pb_msg(msg, MemfdFileEntry); + info->inode = memfd_alloc_inode(info->mfe->inode_id); + if (!info->inode) + return -1; + + return file_desc_add(&info->d, info->mfe->id, &memfd_desc_ops); +} + +struct collect_image_info memfd_cinfo = { + .fd_type = CR_FD_MEMFD_FILE, + .pb_type = PB_MEMFD_FILE, + .priv_size = sizeof(struct memfd_info), + .collect = collect_one_memfd, +}; + +struct file_desc *collect_memfd(u32 id) +{ + struct file_desc *fdesc; + + fdesc = find_file_desc_raw(FD_TYPES__MEMFD, id); + if (fdesc == NULL) + 
pr_err("No entry for memfd %#x\n", id); + + return fdesc; +} + +int apply_memfd_seals(void) +{ + /* + * We apply the seals after all the mappings are done because the seal + * F_SEAL_FUTURE_WRITE prevents future write access (added in + * Linux 5.1). Thus we must make sure all writable mappings are opened + * before applying this seal. + */ + + int ret, fd; + struct memfd_restore_inode *inode; + + list_for_each_entry(inode, &memfd_inodes, list) { + if (!inode->pending_seals) + continue; + + fd = memfd_open_inode(inode); + if (fd < 0) + return -1; + + ret = fcntl(fd, F_ADD_SEALS, inode->pending_seals); + close(fd); + + if (ret < 0) { + pr_perror("Cannot apply seals on memfd"); + return -1; + } + } + + return 0; +} diff --git a/criu/mount-v2.c b/criu/mount-v2.c new file mode 100644 index 000000000..1e33ac12a --- /dev/null +++ b/criu/mount-v2.c @@ -0,0 +1,1321 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "kerndat.h" +#include "log.h" +#include "cr_options.h" +#include "xmalloc.h" +#include "util.h" +#include "filesystems.h" +#include "mount.h" +#include "mount-v2.h" +#include "namespaces.h" +#include "fs-magic.h" +#include "path.h" +#include "files-reg.h" +#include "fdstore.h" +#include "common/list.h" +#include "common/bug.h" +#include "common/compiler.h" + +#include "images/mnt.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "mnt-v2: " + +LIST_HEAD(sharing_groups); + +int check_mount_v2(void) +{ + if (!kdat.has_move_mount_set_group) { + pr_debug("Mounts-v2 requires MOVE_MOUNT_SET_GROUP support\n"); + return -1; + } + + if (!kdat.has_openat2) { + pr_debug("Mounts-v2 requires openat2 support\n"); + return -1; + } + + return 0; +} + +static struct sharing_group *get_sharing_group(int shared_id, int master_id) +{ + struct sharing_group *sg; + + list_for_each_entry(sg, &sharing_groups, list) { + if (sg->shared_id == shared_id && sg->master_id == master_id) + return sg; + } + + return NULL; +} + +static struct sharing_group 
*alloc_sharing_group(int shared_id, int master_id) +{ + struct sharing_group *sg; + + sg = xzalloc(sizeof(struct sharing_group)); + if (!sg) + return NULL; + + sg->shared_id = shared_id; + sg->master_id = master_id; + + INIT_LIST_HEAD(&sg->list); + INIT_LIST_HEAD(&sg->mnt_list); + INIT_LIST_HEAD(&sg->children); + INIT_LIST_HEAD(&sg->siblings); + + list_add(&sg->list, &sharing_groups); + + return sg; +} + +int resolve_shared_mounts_v2(void) +{ + struct sharing_group *sg; + struct mount_info *mi; + + /* + * Create sharing groups for each unique shared_id+master_id pair and + * link each mount to the corresponding sharing group. + */ + for (mi = mntinfo; mi; mi = mi->next) { + if (!mi->shared_id && !mi->master_id) + continue; + + pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", mi->mnt_id, mi->shared_id, + mi->master_id, mi->ns_mountpoint); + + sg = get_sharing_group(mi->shared_id, mi->master_id); + if (!sg) { + sg = alloc_sharing_group(mi->shared_id, mi->master_id); + if (!sg) + return -1; + } + + list_add(&mi->mnt_sharing, &sg->mnt_list); + mi->sg = sg; + } + + /* + * Collect sharing groups tree. Mount propagation between sharing + * groups only goes down this tree, meaning that only mounts of same or + * descendant sharing groups receive mount propagation. + */ + list_for_each_entry(sg, &sharing_groups, list) { + if (sg->master_id) { + struct sharing_group *p; + + /* + * Lookup parent sharing group. If one sharing group + * has master_id equal to shared_id of another sharing + * group then the former is a child (slave) of the + * latter. Also sharing groups should not have two + * parents so we check this here too.
+ */ + list_for_each_entry(p, &sharing_groups, list) { + if (p->shared_id != sg->master_id) + continue; + + if (sg->parent) { + pr_err("Sharing group (%d, %d) parent collision (%d, %d) (%d, %d)\n", + sg->shared_id, sg->master_id, p->shared_id, p->master_id, + sg->parent->shared_id, sg->parent->master_id); + return -1; + } + sg->parent = p; + + if (!list_empty(&sg->siblings)) { + pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", + sg->shared_id, sg->master_id, p->shared_id, p->master_id); + return -1; + } + list_add(&sg->siblings, &p->children); + /* Don't break to check for parent collision */ + } + + /* + * If sharing group has master_id but we didn't find + * parent for it inside the dumped container yet, this + * means that the master_id is external and a mount on + * host should exist with corresponding shared_id. + */ + if (!sg->parent && list_empty(&sg->siblings)) { + struct mount_info *ext; + struct sharing_group *s; + char *source = NULL; + + /* + * Though we don't have parent sharing group + * (inaccessible sharing), we can still have + * siblings, sharing groups with same master_id + * but different shared_id, let's collect them + * to the list. + */ + list_for_each_entry(s, &sharing_groups, list) { + if (s->master_id != sg->master_id) + continue; + + if (s->parent) { + pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", + sg->shared_id, sg->master_id, s->parent->shared_id, + s->parent->master_id); + return -1; + } + + if (!list_empty(&s->siblings)) { + pr_err("External slavery sharing group collision (%d, %d) (%d, %d)\n", + sg->shared_id, sg->master_id, s->shared_id, s->master_id); + return -1; + } + list_add(&s->siblings, &sg->siblings); + } + + BUG_ON(list_empty(&sg->mnt_list)); + mi = list_entry(sg->mnt_list.next, struct mount_info, mnt_sharing); + + /* + * We need to know from which mount on host we + * can get this external master_id.
There are + * two options: mountpoint external mount or + * root mount of container. + */ + if ((ext = mnt_get_external_bind_nodev(mi))) + source = ext->external; + else if (mnt_is_root_bind(mi)) + source = opts.root; + + if (!source) { + pr_err("Sharing group (%d, %d) " + "has unreachable sharing. Try --enable-external-masters.\n", + sg->shared_id, sg->master_id); + return -1; + } + + sg->source = source; + list_for_each_entry(s, &sg->siblings, siblings) + s->source = sg->source; + + pr_debug("Detected external slavery for shared group (%d, %d) with source %s\n", + sg->shared_id, sg->master_id, source); + } + } + } + + return 0; +} + +/* + * When first mount from superblock is mounted, give other mounts + * a hint that they can now just bindmount from the first one. + */ +static int propagate_mount_v2(struct mount_info *mi) +{ + struct mount_info *t; + + list_for_each_entry(t, &mi->mnt_bind, mnt_bind) { + if (t->mounted) + continue; + if (t->bind) + continue; + if (!issubpath(t->root, mi->root)) + continue; + pr_debug("\t\tPropagate %d to %d\n", mi->mnt_id, t->mnt_id); + t->bind = mi; + t->s_dev_rt = mi->s_dev_rt; + } + + return 0; +} + +/* + * Mounts first mount of superblock + */ +static int do_new_mount_v2(struct mount_info *mi) +{ + unsigned long sflags = mi->sb_flags; + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + char *src; + struct fstype *tp = mi->fstype; + bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY); + mount_fn_t do_mount = (tp->mount) ? 
tp->mount : do_simple_mount; + + src = resolve_source(mi); + if (!src) + return -1; + + /* Merge superblock and mount flags if it's possible */ + if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) { + sflags |= mflags; + mflags = 0; + } + + if (remount_ro) + sflags &= ~MS_RDONLY; + + if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) { + pr_perror("Can't mount at %s", mi->plain_mountpoint); + return -1; + } + + /* + * Mount-v2 relies that before mount tree is constructed all mounts + * should remain private. Newly created mounts can become non-private + * initially depending on parent/source sharing, let's be as explicit + * as possible here and make it obvious that mount becomes private. + */ + if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { + pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); + return -1; + } + + if (tp->restore && tp->restore(mi)) + return -1; + + if (remount_ro) { + int fd; + + fd = open(mi->plain_mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Unable to open %s", mi->plain_mountpoint); + return -1; + } + sflags |= MS_RDONLY | MS_REMOUNT; + if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { + pr_perror("Unable to apply mount flags %d for %s", mi->sb_flags, mi->plain_mountpoint); + close(fd); + return -1; + } + close(fd); + } + + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply bind-mount options"); + return -1; + } + + mi->mounted = true; + return 0; +} + +/* + * Does simple bindmount, but via new kernel mount api, + * which also handles autofs and symlink without resolving. 
+ */ +static int __do_bind_mount_v2(char *from, char *to) +{ + int detached_fd; + + detached_fd = sys_open_tree(AT_FDCWD, from, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); + if (detached_fd == -1) { + pr_perror("Failed to open_tree %s", from); + return -1; + } + + if (sys_move_mount(detached_fd, "", AT_FDCWD, to, MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Failed to move_mount from %s to %s", from, to); + close(detached_fd); + return -1; + } + close(detached_fd); + + return 0; +} + +LIST_HEAD(deleted_mounts); + +/* + * Bind-mounts all later mounts of superblock from first one, + * also handles first mounts of mountpoint external mounts. + */ +static int do_bind_mount_v2(struct mount_info *mi) +{ + char *root = NULL, *cut_root, rpath[PATH_MAX]; + unsigned long mflags; + int exit_code = -1; + char *mnt_path = NULL; + int level = 0; + + if (mi->need_plugin) { + if (restore_ext_mount(mi)) + return -1; + goto out; + } + + if (mnt_is_nodev_external(mi)) { + root = mi->external; + goto do_bind; + } + + cut_root = get_relative_path(mi->root, mi->bind->root); + if (!cut_root) { + pr_err("Failed to find root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); + return -1; + } + + /* + * Mount ->private can be initialized on fstype->mount() callback, + * which is called for first mount of superblock in do_new_mount(). + * Also ->private have to be copied to all other mounts of superblock + * to provide users of it with actual data. 
+ */ + mi->private = mi->bind->private; + + mnt_path = mi->bind->plain_mountpoint; + + if (cut_root[0]) { + snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root); + root = rpath; + } else { + root = mnt_path; + } +do_bind: + pr_info("\tBind %s to %s\n", root, mi->plain_mountpoint); + + if (unlikely(mi->deleted)) { + level = make_parent_dirs_if_need(-1, root); + if (level < 0) + goto err; + + if (mi->is_dir) { + if (mkdir(root, 0600)) { + pr_perror("Can't re-create deleted directory %s", root); + goto err; + } + } else { + int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, 0600); + if (fd < 0) { + pr_perror("Can't re-create deleted file %s", root); + goto err; + } + close(fd); + } + } + + if (__do_bind_mount_v2(root, mi->plain_mountpoint)) + goto err; + + /* + * Mount-v2 relies that before mount tree is constructed all mounts + * should remain private. Newly created mounts can become non-private + * initially depending on parent/source sharing, let's be as explicit + * as possible here and make it obvious that mount becomes private. + */ + if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { + pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); + goto err; + } + + mflags = mi->flags & (~MS_PROPAGATE); + if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) + if (mount(NULL, mi->plain_mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { + pr_perror("Can't bind remount 0x%lx at %s", mflags, mi->plain_mountpoint); + goto err; + } + + if (mi->deleted) { + /* + * Deleted mounts can't be moved, will delete source after + * moving to proper position in the mount tree FIXME. + */ + mi->deleted_level = level; + level = 0; + list_add(&mi->deleted_list, &deleted_mounts); + } +out: + mi->mounted = true; + exit_code = 0; +err: + if (level) + rm_parent_dirs(-1, root, level); + + return exit_code; +} + +/* Mounts root container mount. 
*/ +static int do_mount_root_v2(struct mount_info *mi) +{ + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + unsigned long flags = MS_BIND; + int fd; + + if (root_ns_mask & CLONE_NEWUSER) { + fd = open(mi->plain_mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Unable to open %s", mi->plain_mountpoint); + return -1; + } + + if (userns_call(mount_root, 0, &flags, sizeof(flags), fd)) { + pr_err("Unable to mount %s\n", mi->plain_mountpoint); + close(fd); + return -1; + } + close(fd); + } else { + if (mount(opts.root, mi->plain_mountpoint, NULL, flags, NULL)) { + pr_perror("Unable to mount %s %s (id=%d)", opts.root, mi->plain_mountpoint, mi->mnt_id); + return -1; + } + } + + /* + * Mount-v2 relies that before mount tree is constructed all mounts + * should remain private. Newly created mounts can become non-private + * initially depending on parent/source sharing, let's be as explicit + * as possible here and make it obvious that mount becomes private. + */ + if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { + pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); + return -1; + } + + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + + mi->mounted = true; + + return 0; +} + +/* Check if mount is ready to be mounted. 
*/ +static bool can_mount_now_v2(struct mount_info *mi) +{ + struct mount_info *root, *ext; + + /* Parent should be mounted already, that's how mnt_tree_for_each works */ + BUG_ON(mi->parent && !mi->parent->mounted); + + /* Root mounts can be mounted at any moment */ + if (rst_mnt_is_root(mi)) { + pr_debug("%s: true as %d is global root\n", __func__, mi->mnt_id); + return true; + } + + /* External mounts can be mounted at any moment */ + if (mi->external) { + pr_debug("%s: true as %d is external\n", __func__, mi->mnt_id); + return true; + } + + /* + * Container root and external mounts should go before + * anything which should be bindmounted from them. + */ + if (!mi->bind) { + root = mnt_get_root_bind(mi); + if (root) { + pr_debug("%s: false as %d is bind of not mounted global root %d\n", __func__, mi->mnt_id, + root->mnt_id); + return false; + } + + ext = mnt_get_external_bind(mi); + if (ext) { + pr_debug("%s: false as %d is a bind of not mounted external %d\n", __func__, mi->mnt_id, + ext->mnt_id); + return false; + } + } + + /* Non fsroot mounts can not be mounted without bind-mount */ + if (!fsroot_mounted(mi) && !mi->bind && !mi->need_plugin) { + pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id); + return false; + } + + return true; +} + +static int __set_unbindable_v2(struct mount_info *mi) +{ + if (mi->flags & MS_UNBINDABLE) { + if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { + pr_perror("Failed to set mount %d unbindable", mi->mnt_id); + return -1; + } + } + return 0; +} + +/* + * Setting MS_UNBINDABLE flag is slightly delayed, + * obviously until we finish bind-mounting everything.
+ */
+static int set_unbindable_v2(void)
+{
+	int orig_nsfd = -1, nsfd = -1, exit_code = -1;
+	struct mount_info *mi;
+	struct ns_id *nsid;
+
+	/* Visit every mount namespace that contains at least one unbindable mount */
+	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
+		bool ns_has_unbindable = false;
+
+		if (nsid->nd != &mnt_ns_desc)
+			continue;
+
+		/* One match is enough, no need to scan the rest of the list */
+		for (mi = mntinfo; mi != NULL; mi = mi->next) {
+			if (mi->nsid == nsid && (mi->flags & MS_UNBINDABLE)) {
+				ns_has_unbindable = true;
+				break;
+			}
+		}
+
+		if (!ns_has_unbindable)
+			continue;
+
+		nsfd = fdstore_get(nsid->mnt.nsfd_id);
+		if (nsfd < 0)
+			goto err;
+
+		/* &orig_nsfd is passed only while it is still unset (-1) */
+		if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? &orig_nsfd : NULL))
+			goto err;
+		close_safe(&nsfd);
+
+		if (mnt_tree_for_each(nsid->mnt.mntinfo_tree, __set_unbindable_v2))
+			goto err;
+	}
+
+	exit_code = 0;
+err:
+	/* Go back to the saved namespace; failing to do so is an error too */
+	if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc))
+		exit_code = -1;
+	close_safe(&nsfd);
+	return exit_code;
+}
+
+/*
+ * Detects if mount is a directory mount or file mount based on stat on
+ * its mountpoint inside already mounted parent mount. This is deeply
+ * integrated in plain mount creation process because before mounting
+ * something plain we need to create right type of mountpoint for it.
+ */ +static int detect_is_dir(struct mount_info *mi) +{ + static char mountpoint[PATH_MAX]; + char *rel_path; + struct stat st; + + if (mi->is_dir != -1) + return 0; + + if (mi->mnt_id == HELPER_MNT_ID) { + pr_err("Helper %s should have is_dir pre-set\n", mi->ns_mountpoint); + return -1; + } + + if (!mi->parent || mi->parent == root_yard_mp) { + pr_err("Mount namespace root mount %d should have is_dir pre-set\n", mi->mnt_id); + return -1; + } + + if (!mi->parent->mounted) { + pr_err("Parent mount %d of %d should be mounted\n", mi->parent->mnt_id, mi->mnt_id); + return -1; + } + + rel_path = get_relative_path(mi->ns_mountpoint, mi->parent->ns_mountpoint); + if (!rel_path) { + pr_err("Child-parent mountpoint mismatch %d:%s %d:%s\n", mi->mnt_id, mi->ns_mountpoint, + mi->parent->mnt_id, mi->parent->ns_mountpoint); + return -1; + } + + snprintf(mountpoint, sizeof(mountpoint), "%s%s%s", mi->parent->plain_mountpoint, rel_path[0] ? "/" : "", + rel_path); + if (stat(mountpoint, &st)) { + pr_perror("Can't stat mountpoint %s", mountpoint); + return -1; + } + + if (S_ISDIR(st.st_mode)) + mi->is_dir = true; + else + mi->is_dir = false; + + pr_debug("Mount %d is detected as %s-mount\n", mi->mnt_id, mi->is_dir ? "dir" : "file"); + return 0; +} + +static int create_plain_mountpoint(struct mount_info *mi) +{ + BUG_ON(mi->is_dir == -1); + + pr_debug("Create plain mountpoint %s for %d\n", mi->plain_mountpoint, mi->mnt_id); + if (mi->is_dir) { + if (mkdir(mi->plain_mountpoint, 0600)) { + pr_perror("Unable to mkdir mountpoint %s", mi->plain_mountpoint); + return -1; + } + } else { + int fd; + + fd = creat(mi->plain_mountpoint, 0600); + if (fd < 0) { + pr_perror("Unable to create mountpoint %s", mi->plain_mountpoint); + return -1; + } + close(fd); + } + + return 0; +} + +/* + * At this point we already have a mount in service mount namespace now we + * bind-mount it to the final restored mount namespace via new kernel mount + * API. 
+ */ +static int do_mount_in_right_mntns(struct mount_info *mi) +{ + int nsfd = -1, orig_nsfd = -1, detached_fd = -1, exit_code = -1; + + if (!mi->nsid) + return 0; + + detached_fd = + sys_open_tree(AT_FDCWD, mi->plain_mountpoint, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); + if (detached_fd == -1) { + pr_perror("Failed to open_tree %s", mi->plain_mountpoint); + goto err; + } + + nsfd = fdstore_get(mi->nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) + goto err; + + if (create_plain_mountpoint(mi)) + goto err; + + if (sys_move_mount(detached_fd, "", AT_FDCWD, mi->plain_mountpoint, MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Failed to cross-mntns move_mount plain mount %d", mi->mnt_id); + goto err; + } + + exit_code = 0; +err: + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + close_safe(&nsfd); + close_safe(&detached_fd); + return exit_code; +} + +static int do_mount_one_v2(struct mount_info *mi) +{ + int ret; + + if (mi->mounted) + return 0; + + if (!can_mount_now_v2(mi)) { + pr_debug("Postpone mount %d\n", mi->mnt_id); + return 1; + } + + if (detect_is_dir(mi)) + return -1; + + if (create_plain_mountpoint(mi)) + return -1; + + pr_debug("\tMounting %s @%d (%d)\n", mi->fstype->name, mi->mnt_id, mi->need_plugin); + + if (rst_mnt_is_root(mi)) { + if (opts.root == NULL) { + pr_err("The --root option is required to restore a mount namespace\n"); + return -1; + } + ret = do_mount_root_v2(mi); + } else if (!mi->bind && !mi->need_plugin && (!mi->external || !strcmp(mi->external, EXTERNAL_DEV_MOUNT))) { + ret = do_new_mount_v2(mi); + } else { + ret = do_bind_mount_v2(mi); + } + + if (ret == 0 && fetch_rt_stat(mi, mi->plain_mountpoint)) + return -1; + + if (ret == 0 && propagate_mount_v2(mi)) + return -1; + + if (mi->fstype->code == FSTYPE__UNSUPPORTED) { + struct statfs st; + + if (statfs(mi->plain_mountpoint, &st)) { + pr_perror("Unable to statfs %s", mi->plain_mountpoint); + 
return -1; + } + if (st.f_type == BTRFS_SUPER_MAGIC) + mi->fstype = find_fstype_by_name("btrfs"); + } + + if (ret == 0 && do_mount_in_right_mntns(mi)) + return -1; + + return ret; +} + +static int populate_mnt_ns_v2(void) +{ + if (make_yard(mnt_roots)) + return -1; + + if (mnt_tree_for_each(root_yard_mp, do_mount_one_v2)) + return -1; + + return set_unbindable_v2(); +} + +/* + * This function moves plain mounts into actual mount tree. + * + * Mounts in children list are sorted the way that sibling overmount goes after + * all siblings which it overmounts (see __mnt_resort_children). The function + * mnt_tree_for_each is effectively DFS (in case we don't postpone), thus all + * descendants of all mounts which we sibling-overmount are mounted before us. + * Be careful, we can't postpone (return >0) from this function because of it. + */ +static int move_mount_to_tree(struct mount_info *mi) +{ + int fd; + + fd = open(mi->mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); + return -1; + } + + mi->mp_fd_id = fdstore_add(fd); + close(fd); + if (mi->mp_fd_id < 0) { + pr_err("Can't add mountpoint of mount %d to fdstore\n", mi->mnt_id); + return -1; + } + + pr_info("Move mount %d from %s to %s\n", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); + if (sys_move_mount(AT_FDCWD, mi->plain_mountpoint, AT_FDCWD, mi->mountpoint, 0)) { + pr_perror("Failed to move mount %d from %s to %s", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); + return -1; + } + + fd = open(mi->mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); + return -1; + } + + mi->mnt_fd_id = fdstore_add(fd); + close(fd); + if (mi->mnt_fd_id < 0) { + pr_err("Can't add mount %d fd to fdstore\n", mi->mnt_id); + return -1; + } + + return 0; +} + +static int assemble_tree_from_plain_mounts(struct ns_id *nsid) +{ + return mnt_tree_for_each(nsid->mnt.mntinfo_tree, move_mount_to_tree); +} + +/* + * With 
MOVE_MOUNT_SET_GROUP source mount should have wider root than + * destination, thus let's choose widest mount from group as first. + */ +static struct mount_info *get_first_mount(struct sharing_group *sg) +{ + struct mount_info *first = NULL, *tmp; + int min_len = 0; + + list_for_each_entry(tmp, &sg->mnt_list, mnt_sharing) { + int len = strlen(tmp->root); + + if (!first || len < min_len) { + first = tmp; + min_len = len; + } + } + + return first; +} + +struct set_group_arg { + int src_id; + char source[PATH_MAX]; + int dst_id; +}; + +static int __move_mount_set_group(void *arg, int dfd, int pid) +{ + struct set_group_arg *sga = (struct set_group_arg *)arg; + int src_fd, dst_fd, exit_code = -1; + + if (sga->src_id != -1) { + src_fd = fdstore_get(sga->src_id); + BUG_ON(src_fd < 0); + } else { + char *source_mp; + + BUG_ON(sga->source[0] == '\0'); + /* + * Source path should not always be a mountpoint as we + * automatically resolve it to mountpoint below. + */ + source_mp = resolve_mountpoint(sga->source); + if (!source_mp) { + pr_err("Failed to find %s mountpoint\n", sga->source); + return -1; + } + + src_fd = open(source_mp, O_PATH); + if (src_fd < 0) { + pr_perror("Failed to open %s mountpoint", source_mp); + xfree(source_mp); + return -1; + } + xfree(source_mp); + } + + dst_fd = fdstore_get(sga->dst_id); + BUG_ON(dst_fd < 0); + + /* Copy shared_id of the source */ + if (sys_move_mount(src_fd, "", dst_fd, "", + MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH)) { + pr_perror("Failed to copy sharing from %d:%s to %d", sga->src_id, sga->source ?: "", sga->dst_id); + goto err; + } + + exit_code = 0; +err: + close(src_fd); + close(dst_fd); + return exit_code; +} + +/* + * Copy sharing between mounts passing mountpoint fds via fdstore ids. Also it + * is possible (for external mounts) to pass path on mountpoint via source path, + * it would resolve to mountpoint automatically. 
+ */ +static int move_mount_set_group(int src_id, char *source, int dst_id) +{ + struct set_group_arg sga = { + .src_id = src_id, + .dst_id = dst_id, + }; + + sga.source[0] = '\0'; + if (source) { + if (snprintf(sga.source, sizeof(sga.source), "%s", source) >= sizeof(sga.source)) { + pr_err("Source %s is too long\n", source); + return -1; + } + } + + if (userns_call(__move_mount_set_group, 0, &sga, sizeof(sga), -1)) + return -1; + + return 0; +} + +static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) +{ + int nsfd = -1, orig_nsfd = -1, exit_code = -1; + char target_path[PATH_MAX]; + int target_fd = -1; + + if (!sg->master_id && !sg->shared_id) + return 0; + + target_fd = fdstore_get(target->mnt_fd_id); + BUG_ON(target_fd < 0); + snprintf(target_path, sizeof(target_path), "/proc/self/fd/%d", target_fd); + + /* Restore target's master_id from shared_id of the source */ + if (sg->master_id) { + if (sg->parent) { + struct mount_info *first; + + /* Get shared_id from parent sharing group */ + first = get_first_mount(sg->parent); + if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); + goto err; + } + } else { + /* + * External slavery. We rely on the user to give us the + * right source for external mount with all proper + * sharing options setup (it should be either shared + * or non-shared slave). If source is a private mount + * we would fail. 
+ */ + if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { + pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); + goto err; + } + } + } + + nsfd = fdstore_get(target->nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) + goto err; + + if (sg->master_id) { + /* Convert shared_id to master_id */ + if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { + pr_perror("Failed to make mount %d slave", target->mnt_id); + goto err; + } + } + + /* Restore target's shared_id */ + if (sg->shared_id) { + if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { + pr_perror("Failed to make mount %d shared", target->mnt_id); + goto err; + } + } + exit_code = 0; +err: + close_safe(&target_fd); + close_safe(&nsfd); + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + return exit_code; +} + +static int restore_one_sharing_group(struct sharing_group *sg) +{ + struct mount_info *first, *other; + + first = get_first_mount(sg); + + if (restore_one_sharing(sg, first)) + return -1; + + /* Restore sharing for other mounts from the sharing group */ + list_for_each_entry(other, &sg->mnt_list, mnt_sharing) { + if (other == first) + continue; + + if (is_sub_path(other->root, first->root)) { + if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); + return -1; + } + } else { + /* + * Case where mounts of this sharing group don't have common root. + * For instance we can create two sub-directories .a and .b in some + * shared mount, bindmount them separately somethere and umount the + * original mount. Now we have both bindmounts shared between each + * other. Kernel only allows to copy sharing between mounts when + * source root contains destination root, which is not true for + * these two, so we can't just copy from first to other. 
+ * + * For external sharing (!sg->parent) with only master_id (shared_id + * == 0) we can workaround this by copying from their external source + * instead (same as we did for a first mount). + * + * This is a w/a runc usecase, see https://github.com/opencontainers/runc/pull/3442 + */ + if (!sg->parent && !sg->shared_id) { + if (restore_one_sharing(sg, other)) + return -1; + } else { + pr_err("Can't copy sharing from %d[%s] to %d[%s]\n", first->mnt_id, first->root, + other->mnt_id, other->root); + return -1; + } + } + } + + return 0; +} + +static struct sharing_group *sharing_group_next(struct sharing_group *sg) +{ + if (!list_empty(&sg->children)) + return list_entry(sg->children.next, struct sharing_group, siblings); + + while (sg->parent) { + if (sg->siblings.next == &sg->parent->children) + sg = sg->parent; + else + return list_entry(sg->siblings.next, struct sharing_group, siblings); + } + + return NULL; +} + +static int restore_mount_sharing_options(void) +{ + struct sharing_group *sg; + + list_for_each_entry(sg, &sharing_groups, list) { + struct sharing_group *t; + + if (sg->parent) + continue; + + /* Handle dependent sharing groups in tree order */ + for (t = sg; t != NULL; t = sharing_group_next(t)) { + if (restore_one_sharing_group(t)) + return -1; + } + } + + return 0; +} + +static int remove_source_of_deleted_mount(struct mount_info *mi) +{ + char *cut_root, path[PATH_MAX], *root; + + BUG_ON(!mi->deleted || !mi->bind); + + cut_root = get_relative_path(mi->root, mi->bind->root); + if (!cut_root) { + pr_err("Failed to find root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); + return -1; + } + + if (cut_root[0]) { + snprintf(path, sizeof(path), "%s/%s", mi->bind->plain_mountpoint, cut_root); + root = path; + } else { + root = mi->bind->plain_mountpoint; + } + + if (mi->is_dir) { + if (rmdir(root)) { + pr_perror("Can't remove deleted directory %s", root); + return -1; + } + } else { + if (unlink(root)) { + pr_perror("Can't unlink deleted 
file %s", root); + return -1; + } + } + + if (mi->deleted_level) + rm_parent_dirs(-1, root, mi->deleted_level); + + return 0; +} + +/* Delay making mounts deleted until we've restored sharing groups */ +static int remove_sources_of_deleted_mounts(void) +{ + struct mount_info *mi; + int ret = 0; + + list_for_each_entry(mi, &deleted_mounts, deleted_list) { + if (remove_source_of_deleted_mount(mi)) + ret = -1; + } + + return ret; +} + +static int get_empty_mntns(void) +{ + int orig_nsfd, nsfd = -1; + + orig_nsfd = open_proc(PROC_SELF, "ns/mnt"); + if (orig_nsfd < 0) + return -1; + + /* Create the new mount namespace */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to create a new mntns"); + close(orig_nsfd); + return -1; + } + + if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { + pr_perror("Can't remount \"/\" with MS_PRIVATE"); + goto err; + } + + if (make_yard(mnt_roots)) + goto err; + + if (cr_pivot_root(mnt_roots)) + goto err; + + if (mkdirpat(AT_FDCWD, mnt_roots, 0777)) { + pr_err("Failed to setup root yard in empty mntns\n"); + goto err; + } + + nsfd = open_proc(PROC_SELF, "ns/mnt"); +err: + if (restore_ns(orig_nsfd, &mnt_ns_desc)) + close_safe(&nsfd); + return nsfd; +} + +/* Create almost empty mount namespaces only with root yard precreated */ +static int pre_create_mount_namespaces(void) +{ + int orig_nsfd = -1, nsfd = -1, empty_mntns, exit_code = -1; + char path[PATH_MAX]; + struct ns_id *nsid; + + empty_mntns = get_empty_mntns(); + if (empty_mntns == -1) { + pr_err("Failed to create empty mntns\n"); + goto err; + } + + /* restore mount namespaces */ + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + + if (switch_ns_by_fd(empty_mntns, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) + goto err; + + /* Create the new mount namespace */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to create a new mntns"); + goto err; + } + + nsfd = open_proc(PROC_SELF, "ns/mnt"); + if (nsfd < 0) + goto err; + + /* Pin new mntns with a file descriptor */ + nsid->mnt.nsfd_id = fdstore_add(nsfd); + close(nsfd); + if (nsid->mnt.nsfd_id < 0) { + pr_err("Can't add mntns fd to fdstore\n"); + goto err; + } + + if (make_yard(mnt_roots)) + goto err; + + print_ns_root(nsid, 0, path, sizeof(path)); + if (mkdir(path, 0600)) { + pr_perror("Unable to create %s", path); + goto err; + } + } + + exit_code = 0; +err: + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + close_safe(&empty_mntns); + return exit_code; +} + +/* + * Assemble the mount tree for each restored mount namespace + * from pre-created plain mounts. + */ +static int assemble_mount_namespaces(void) +{ + int orig_nsfd = -1, nsfd = -1, rootfd = -1, exit_code = -1; + char path[PATH_MAX]; + struct ns_id *nsid; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + + nsfd = fdstore_get(nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) { + close(nsfd); + goto err; + } + close(nsfd); + + if (assemble_tree_from_plain_mounts(nsid)) + goto err; + + /* Set its root */ + print_ns_root(nsid, 0, path, sizeof(path) - 1); + if (cr_pivot_root(path)) + goto err; + + /* root fd is used to restore file mappings */ + rootfd = open_proc(PROC_SELF, "root"); + if (rootfd < 0) + goto err; + nsid->mnt.root_fd_id = fdstore_add(rootfd); + if (nsid->mnt.root_fd_id < 0) { + pr_err("Can't add root fd to fdstore\n"); + close(rootfd); + goto err; + } + close(rootfd); + } + + exit_code = 0; +err: + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + return exit_code; +} + +/* The main entry point of mount-v2 for creating mounts */ +int prepare_mnt_ns_v2(void) +{ + if (!(root_ns_mask & CLONE_NEWNS)) + return 0; + +#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED + if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { + /* + * Add to root yard along with other plain mounts and mntns + * directories. This mount would be created and restored by + * generic mount creation code, but it would never be moved to + * any restored mount namespaces. 
+ */ + if (!add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true)) + return -1; + } +#endif + + if (validate_mounts(mntinfo, false)) + return -1; + + if (pre_create_mount_namespaces()) + return -1; + + if (populate_mnt_ns_v2()) + return -1; + + if (assemble_mount_namespaces()) + return -1; + + if (restore_mount_sharing_options()) + return -1; + + return remove_sources_of_deleted_mounts(); +} diff --git a/criu/mount.c b/criu/mount.c index 486d01719..b643a7f26 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -17,6 +17,7 @@ #include "plugin.h" #include "filesystems.h" #include "mount.h" +#include "mount-v2.h" #include "pstree.h" #include "image.h" #include "namespaces.h" @@ -27,30 +28,32 @@ #include "external.h" #include "clone-noasan.h" #include "fdstore.h" +#include "rst-malloc.h" #include "images/mnt.pb-c.h" -/* - * Put a : in here since those are invalid on - * the cli, so we know it's autogenerated in - * debugging. - */ -#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED" -#define NO_ROOT_MOUNT "CRIU:NO_ROOT" -#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE) - -#undef LOG_PREFIX +#undef LOG_PREFIX #define LOG_PREFIX "mnt: " -#define BINFMT_MISC_HOME "proc/sys/fs/binfmt_misc" -#define CRTIME_MNT_ID 0 +#define CONTEXT_OPT "context=" /* A helper mount_info entry for the roots yard */ -static struct mount_info *root_yard_mp = NULL; +struct mount_info *root_yard_mp = NULL; + +static LIST_HEAD(delayed_unbindable); + +char *service_mountpoint(const struct mount_info *mi) +{ + if (!opts.mntns_compat_mode && opts.mode == CR_RESTORE) { + BUG_ON(!mi->plain_mountpoint); + return mi->plain_mountpoint; + } + return mi->mountpoint; +} int ext_mount_add(char *key, char *val) { - char *e_str; + cleanup_free char *e_str = NULL; e_str = xmalloc(strlen(key) + strlen(val) + 8); if (!e_str) @@ -95,7 +98,7 @@ static char *ext_mount_lookup(char *key) int len = strlen(key); char mkey[len + 6]; - sprintf(mkey, "mnt[%s]", key); + snprintf(mkey, 
sizeof(mkey), "mnt[%s]", key); v = external_lookup_by_key(mkey); if (IS_ERR(v)) v = NULL; @@ -122,9 +125,14 @@ static void mntinfo_add_list(struct mount_info *new) } } -static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath, - unsigned int st_dev, unsigned int st_ino, - unsigned int mnt_id) +void mntinfo_add_list_before(struct mount_info **head, struct mount_info *new) +{ + new->next = *head; + *head = new; +} + +static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath, unsigned int st_dev, + unsigned int st_ino, unsigned int mnt_id) { /* * Goes through all entries in the mountinfo table @@ -155,15 +163,18 @@ static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpat } } - /* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */ + /* + * Concatenates m->ns_mountpoint with rpath and attempts + * to stat the resulting path at mntns_root + */ if (is_root_mount(m)) { ret_stat = fstatat(mntns_root, rpath, &f_stat, 0); } else { char _full_path[PATH_MAX]; - int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->mountpoint, rpath); + int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->ns_mountpoint, rpath); if (n >= PATH_MAX) { - pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath); + pr_err("Not enough space to concatenate %s and %s\n", m->ns_mountpoint, rpath); return ERR_PTR(-ENOSPC); } ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0); @@ -192,8 +203,7 @@ static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpat * correct device id and node number. If that is the case, we update the * mount id and link variables with the correct values. 
*/ -struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev, - unsigned int st_ino, unsigned int mnt_id) +struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev, unsigned int st_ino, unsigned int mnt_id) { struct mount_info *m; @@ -233,6 +243,7 @@ struct mount_info *lookup_mnt_sdev(unsigned int s_dev) if (m->s_dev == s_dev && mnt_is_dir(m)) return m; + pr_err("Unable to find suitable mount point for s_dev %x\n", s_dev); return NULL; } @@ -245,11 +256,11 @@ static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, co list_for_each_entry(c, &m->children, siblings) { size_t n; - n = strlen(c->mountpoint + 1); + n = strlen(c->ns_mountpoint + 1); if (n > pathlen) continue; - if (strncmp(c->mountpoint + 1, path, min(n, pathlen))) + if (strncmp(c->ns_mountpoint + 1, path, min(n, pathlen))) continue; if (n < pathlen && path[n] != '/') continue; @@ -261,7 +272,7 @@ static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, co break; } - pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint); + pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->ns_mountpoint); return m; } @@ -275,12 +286,10 @@ dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path) * superblock dev-id, in such case return device * obtained from mountinfo (ie subvolume0). */ - return strcmp(m->fstype->name, "btrfs") ? - MKKDEV(major(st_dev), minor(st_dev)) : m->s_dev; + return strcmp(m->fstype->name, "btrfs") ? 
MKKDEV(major(st_dev), minor(st_dev)) : m->s_dev; } -bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, - struct ns_id *ns, const char *path) +bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, struct ns_id *ns, const char *path) { if (st_dev == kdev_to_odev(phys_dev)) return true; @@ -293,17 +302,30 @@ bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, */ static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b) { - if (a->fstype != b->fstype) - return false; - if (a->s_dev != b->s_dev) return false; - if (strcmp(a->source, b->source) != 0) - return false; + /* + * If one of compared mounts is external its mount info can have fstype + * and source fields changed by resolve_external_mounts() or + * try_resolve_ext_mount(), but we still want to detect bindmounts of + * this external mount, so let's skip source and fstype checks for it. + */ + if (!a->external && !b->external) { + if (strcmp(a->source, b->source) != 0) + return false; - if (a->fstype->sb_equal) /* :) */ - return b->fstype->sb_equal(a, b); + if (a->fstype != b->fstype) + return false; + + if (a->fstype->sb_equal) + return a->fstype->sb_equal(a, b); + } else { + if (a->fstype->sb_equal) + return a->fstype->sb_equal(a, b); + else if (b->fstype->sb_equal) + return b->fstype->sb_equal(a, b); + } if (strcmp(a->options, b->options)) return false; @@ -328,9 +350,9 @@ static bool mounts_equal(struct mount_info *a, struct mount_info *b) * mnt_roots is a temporary directory for restoring sub-trees of * non-root namespaces. 
*/ -static char *mnt_roots; +char *mnt_roots; -static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mount_info *yard_mount) +static struct mount_info *mnt_build_ids_tree(struct mount_info *list) { struct mount_info *m, *root = NULL; @@ -351,41 +373,13 @@ static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mou if (!parent) { /* Only a root mount can be without parent */ - if (root == NULL && m->is_ns_root) { + if (!root && m->is_ns_root) { root = m; - if (!yard_mount) - continue; + continue; } - if (!root) { - pr_err("No parent found for mountpoint %d (@%s)\n", - m->mnt_id, m->mountpoint); - return NULL; - } - - pr_debug("Mountpoint %d (@%s) w/o parent %d\n", - m->mnt_id, m->mountpoint, m->parent_mnt_id); - - if (!mounts_sb_equal(root, m) || - strcmp(root->root, m->root)) { - pr_err("Nested mount namespaces with different " - "roots %d (@%s %s) %d (@%s %s) are not supported yet\n", - root->mnt_id, root->mountpoint, root->root, - m->mnt_id, m->mountpoint, m->root); - return NULL; - } - - /* Mount all namespace roots into the roots yard. 
*/ - parent = yard_mount; - if (unlikely(!yard_mount)) { - pr_err("Nested mount %d (@%s %s) w/o root insertion detected\n", - m->mnt_id, m->mountpoint, m->root); - return NULL; - } - - pr_debug("Mountpoint %d (@%s) get parent %d (@%s)\n", - m->mnt_id, m->mountpoint, - parent->mnt_id, parent->mountpoint); + pr_err("No parent found for mountpoint %d (@%s)\n", m->mnt_id, m->ns_mountpoint); + return NULL; } m->parent = parent; @@ -397,9 +391,6 @@ static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mou return NULL; } - if (yard_mount) - return yard_mount; - return root; } @@ -408,20 +399,19 @@ static unsigned int mnt_depth(struct mount_info *m) unsigned int depth = 0; char *c; - for (c = m->mountpoint; *c != '\0'; c++) + for (c = m->ns_mountpoint; *c != '\0'; c++) if (*c == '/') depth++; return depth; } -static void mnt_resort_siblings(struct mount_info *tree) +static void __mnt_resort_children(struct mount_info *parent) { - struct mount_info *m, *p; LIST_HEAD(list); /* - * Put siblings of each node in an order they can be (u)mounted + * Put children mounts in an order they can be (u)mounted * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/ * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order. * Otherwise we will not be able to (u)mount them in a sequence. @@ -433,11 +423,12 @@ static void mnt_resort_siblings(struct mount_info *tree) * to contain hundreds (or more) elements. 
*/ - pr_info("\tResorting siblings on %d\n", tree->mnt_id); - while (!list_empty(&tree->children)) { + pr_info("\tResorting children of %d in mount order\n", parent->mnt_id); + while (!list_empty(&parent->children)) { + struct mount_info *m, *p; unsigned int depth; - m = list_first_entry(&tree->children, struct mount_info, siblings); + m = list_first_entry(&parent->children, struct mount_info, siblings); list_del(&m->siblings); depth = mnt_depth(m); @@ -446,18 +437,37 @@ static void mnt_resort_siblings(struct mount_info *tree) break; list_add_tail(&m->siblings, &p->siblings); - mnt_resort_siblings(m); } - list_splice(&list, &tree->children); + list_splice(&list, &parent->children); +} + +static struct mount_info *mnt_subtree_next(struct mount_info *mi, struct mount_info *root); + +static void resort_siblings(struct mount_info *root, void (*resort_children)(struct mount_info *)) +{ + struct mount_info *mi = root; + while (1) { + /* + * Explanation: sorting the children of the tree like these is + * safe and does not break the tree search in mnt_subtree_next + * (DFS-next search), as we sort children before calling next + * on parent and thus before DFS-next ever touches them, so + * from the perspective of DFS-next all children look like they + * are already sorted. 
+ */ + resort_children(mi); + mi = mnt_subtree_next(mi, root); + if (!mi) + break; + } } static void mnt_tree_show(struct mount_info *tree, int off) { struct mount_info *m; - pr_info("%*s[%s](%d->%d)\n", off, "", - tree->mountpoint, tree->mnt_id, tree->parent_mnt_id); + pr_info("%*s[%s](%d->%d)\n", off, "", tree->ns_mountpoint, tree->mnt_id, tree->parent_mnt_id); list_for_each_entry(m, &tree->children, siblings) mnt_tree_show(m, off + 1); @@ -468,21 +478,28 @@ static void mnt_tree_show(struct mount_info *tree, int off) /* Returns -1 on error, 1 if external mount resolved, 0 otherwise */ static int try_resolve_ext_mount(struct mount_info *info) { - char *ext; char devstr[64]; - ext = ext_mount_lookup(info->mountpoint + 1 /* trim the . */); - if (ext) { - pr_info("Found %s mapping for %s mountpoint\n", - ext, info->mountpoint); - info->external = ext; - return 1; + /* + * Only allow mountpoint-external mounts in root mntns. Their lookup is + * based on mountpoint path, but in nested mntns we can have completely + * different mount tree and at same mountpoint we can have completely + * different mount. + */ + if (info->nsid->type == NS_ROOT) { + char *ext; + + ext = ext_mount_lookup(info->ns_mountpoint + 1 /* trim the . 
*/); + if (ext) { + pr_info("Found %s mapping for %s mountpoint\n", ext, info->ns_mountpoint); + info->external = ext; + return 1; + } } - snprintf(devstr, sizeof(devstr), "dev[%d/%d]", - kdev_major(info->s_dev), kdev_minor(info->s_dev)); + snprintf(devstr, sizeof(devstr), "dev[%d/%d]", kdev_major(info->s_dev), kdev_minor(info->s_dev)); - if (info->fstype->code == FSTYPE__UNSUPPORTED) { + if (info->fstype->code == FSTYPE__UNSUPPORTED && fsroot_mounted(info)) { char *val; val = external_lookup_by_key(devstr); @@ -490,6 +507,9 @@ static int try_resolve_ext_mount(struct mount_info *info) char *source; int len; + pr_info("Found %s dev-mapping for %s(%d) mountpoint\n", val, info->ns_mountpoint, info->mnt_id); + info->external = EXTERNAL_DEV_MOUNT; + len = strlen(val) + sizeof("dev[]"); source = xrealloc(info->source, len); if (source == NULL) @@ -518,9 +538,7 @@ static struct mount_info *find_fsroot_mount_for(struct mount_info *bm) struct mount_info *sm; list_for_each_entry(sm, &bm->mnt_bind, mnt_bind) - if (fsroot_mounted(sm) || - (sm->parent == root_yard_mp && - strstartswith(bm->root, sm->root))) + if (fsroot_mounted(sm) || (sm->parent == root_yard_mp && strstartswith(bm->root, sm->root))) return sm; return NULL; @@ -530,13 +548,13 @@ static bool mnt_needs_remap(struct mount_info *m) { struct mount_info *t; - if (!m->parent) + if (!m->parent || m->parent == root_yard_mp) return false; list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; - if (issubpath(t->mountpoint, m->mountpoint)) + if (issubpath(t->ns_mountpoint, m->ns_mountpoint)) return true; } @@ -545,40 +563,107 @@ static bool mnt_needs_remap(struct mount_info *m) * remapped too, else fixup_remap_mounts() won't be able to move parent * to it's real place, it will move child instead. 
*/ - if (!strcmp(m->parent->mountpoint, m->mountpoint)) + if (!strcmp(m->parent->ns_mountpoint, m->ns_mountpoint)) return mnt_needs_remap(m->parent); return false; } -/* - * Say mount is external if it was explicitly specified as an - * external or it will be bind from such an explicit external - * mount, we set bind in propagate_mount and propagate_siblings - */ - -static bool mnt_is_external(struct mount_info *m) +static bool __mnt_is_external_bind(struct mount_info *mi, struct mount_info *bind) { - struct mount_info *t; + if (bind->external && is_sub_path(mi->root, bind->root)) + return true; - while (m) { - if (m->external) - return 1; + return false; +} - if (!list_empty(&m->mnt_share)) - list_for_each_entry(t, &m->mnt_share, mnt_share) - if (t->external) - return 1; +/* + * Say mount is external if it was explicitly specified as an external or it + * can be bind-mounted from such an explicit external mount. + */ +struct mount_info *mnt_get_external_bind(struct mount_info *mi) +{ + return mnt_bind_pick(mi, __mnt_is_external_bind); +} - if (m->master_id <= 0 && !list_empty(&m->mnt_bind)) - list_for_each_entry(t, &m->mnt_bind, mnt_bind) - if (issubpath(m->root, t->root) && t->external) - return 1; +bool mnt_is_external_bind(struct mount_info *mi) +{ + return mnt_get_external_bind(mi); +} - m = m->mnt_master; - } +static bool __can_receive_master_from_external(struct mount_info *mi, struct mount_info *bind) +{ + if (mnt_is_nodev_external(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root)) + return true; - return 0; + return false; +} + +static struct mount_info *can_receive_master_from_external(struct mount_info *mi) +{ + return mnt_bind_pick(mi, __can_receive_master_from_external); +} + +static bool __has_mounted_external_bind(struct mount_info *mi, struct mount_info *bind) +{ + if (bind->external && bind->mounted && is_sub_path(mi->root, bind->root)) + return true; + + return false; +} + +bool has_mounted_external_bind(struct 
mount_info *mi) +{ + return mnt_bind_pick(mi, __has_mounted_external_bind); +} + +bool rst_mnt_is_root(struct mount_info *mi) +{ + return (mi->is_ns_root && mi->nsid->id == root_item->ids->mnt_ns_id); +} + +static bool __mnt_is_root_bind(struct mount_info *mi, struct mount_info *bind) +{ + if (rst_mnt_is_root(bind) && is_sub_path(mi->root, bind->root)) + return true; + + return false; +} + +struct mount_info *mnt_get_root_bind(struct mount_info *mi) +{ + return mnt_bind_pick(mi, __mnt_is_root_bind); +} + +bool mnt_is_root_bind(struct mount_info *mi) +{ + return mnt_get_root_bind(mi); +} + +static bool __can_receive_master_from_root(struct mount_info *mi, struct mount_info *bind) +{ + if (rst_mnt_is_root(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root)) + return true; + + return false; +} + +static struct mount_info *can_receive_master_from_root(struct mount_info *mi) +{ + return mnt_bind_pick(mi, __can_receive_master_from_root); +} + +static bool __mnt_is_external_bind_nodev(struct mount_info *mi, struct mount_info *bind) +{ + if (bind->external && !mnt_is_dev_external(bind) && is_sub_path(mi->root, bind->root)) + return true; + + return false; +} + +struct mount_info *mnt_get_external_bind_nodev(struct mount_info *mi) +{ + return mnt_bind_pick(mi, __mnt_is_external_bind_nodev); } /* @@ -598,7 +683,7 @@ static int validate_children_collision(struct mount_info *mnt) list_for_each_entry(chj, &mnt->children, siblings) { if (chj == chi) break; - if (!strcmp(chj->mountpoint, chi->mountpoint)) { + if (!strcmp(chj->ns_mountpoint, chi->ns_mountpoint)) { pr_err("Mount %d has two children with same " "mountpoint: %d %d\n", mnt->mnt_id, chj->mnt_id, chi->mnt_id); @@ -609,19 +694,18 @@ static int validate_children_collision(struct mount_info *mnt) return 0; } -static int validate_mounts(struct mount_info *info, bool for_dump) +int validate_mounts(struct mount_info *info, bool for_dump) { struct mount_info *m, *t; for (m = info; m; m = m->next) { - 
if (m->parent == NULL || m->is_ns_root) - /* root mount can be any */ - continue; - if (validate_children_collision(m)) return -1; - if (mnt_is_external(m)) + if (mnt_is_external_bind(m)) + continue; + + if (mnt_is_root_bind(m)) continue; /* @@ -637,8 +721,8 @@ static int validate_mounts(struct mount_info *info, bool for_dump) if (fsroot_mounted(m)) { if (m->fstype->code == FSTYPE__UNSUPPORTED) { - pr_err("FS mnt %s dev %#x root %s unsupported id %d\n", - m->mountpoint, m->s_dev, m->root, m->mnt_id); + pr_err("FS mnt %s dev %#x root %s unsupported id %d\n", m->ns_mountpoint, m->s_dev, + m->root, m->mnt_id); return -1; } } else { @@ -653,7 +737,7 @@ static int validate_mounts(struct mount_info *info, bool for_dump) */ if (for_dump) { - ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id); + ret = run_plugins(DUMP_EXT_MOUNT, m->ns_mountpoint, m->mnt_id); if (ret == 0) m->need_plugin = true; } else @@ -666,8 +750,8 @@ static int validate_mounts(struct mount_info *info, bool for_dump) if (ret < 0) { if (ret == -ENOTSUP) - pr_err("%d:%s doesn't have a proper root mount\n", - m->mnt_id, m->mountpoint); + pr_err("%d:%s doesn't have a proper root mount\n", m->mnt_id, + m->ns_mountpoint); return -1; } } @@ -742,8 +826,7 @@ static struct ns_id *find_ext_ns_id(void) for (ns = ns_ids; ns->next; ns = ns->next) if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) { - if (!ns->mnt.mntinfo_list && - !collect_mntinfo(ns, true)) + if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, false)) break; return ns; } @@ -805,7 +888,11 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - p = xsprintf("%s/%s", match->mountpoint + 1, cut_root); + if (cut_root[0] == '\0') { + p = xstrdup(match->ns_mountpoint + 1); + } else { + p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + } if (!p) return -1; @@ -818,7 +905,7 @@ static int resolve_external_mounts(struct mount_info *info) xfree(m->source); m->source = p; - 
pr_info("autodetected external mount %s for %s\n", p, m->mountpoint); + pr_info("autodetected external mount %s for %s(%d)\n", p, m->ns_mountpoint, m->mnt_id); } return 0; @@ -829,18 +916,18 @@ static int root_path_from_parent(struct mount_info *m, char *buf, int size) bool head_slash = false, tail_slash = false; int p_len, m_len, len; - if (!m->parent) + if (!m->parent || m->parent == root_yard_mp) return -1; - p_len = strlen(m->parent->mountpoint); - m_len = strlen(m->mountpoint); + p_len = strlen(m->parent->ns_mountpoint); + m_len = strlen(m->ns_mountpoint); len = snprintf(buf, size, "%s", m->parent->root); if (len >= size) return -1; BUG_ON(len <= 0); - if (buf[len-1] == '/') + if (buf[len - 1] == '/') tail_slash = true; size -= len; @@ -849,12 +936,11 @@ static int root_path_from_parent(struct mount_info *m, char *buf, int size) len = m_len - p_len; BUG_ON(len < 0); if (len) { - if (m->mountpoint[p_len] == '/') + if (m->ns_mountpoint[p_len] == '/') head_slash = true; - len = snprintf(buf, size, "%s%s", - (!tail_slash && !head_slash) ? "/" : "", - m->mountpoint + p_len + (tail_slash && head_slash)); + len = snprintf(buf, size, "%s%s", (!tail_slash && !head_slash) ? 
"/" : "", + m->ns_mountpoint + p_len + (tail_slash && head_slash)); if (len >= size) return -1; } @@ -862,7 +948,8 @@ static int root_path_from_parent(struct mount_info *m, char *buf, int size) return 0; } -static int same_propagation_group(struct mount_info *a, struct mount_info *b) { +static int same_propagation_group(struct mount_info *a, struct mount_info *b) +{ char root_path_a[PATH_MAX], root_path_b[PATH_MAX]; /* @@ -870,8 +957,7 @@ static int same_propagation_group(struct mount_info *a, struct mount_info *b) { * 1) Their parents should be different * 2) Their parents should be together in same shared group */ - if (!a->parent || !b->parent || a->parent == b->parent || - a->parent->shared_id != b->parent->shared_id) + if (!a->parent || !b->parent || a->parent == b->parent || a->parent->shared_id != b->parent->shared_id) return 0; if (root_path_from_parent(a, root_path_a, PATH_MAX)) { @@ -893,7 +979,66 @@ static int same_propagation_group(struct mount_info *a, struct mount_info *b) { return 0; } -static int resolve_shared_mounts(struct mount_info *info, int root_master_id) +/* + * Note: Only valid if called consequently on all mounts in mntinfo list. + * + * Note: We may want to iterate over all bindmounts of some mount, and we would + * use ->mnt_bind list for this, but iterating over ->mnt_bind list is + * obviously meaningless before search_bindmounts had actually put bindmounts + * in it. That's why we have ->mnt_bind_is_populated to protect from misuse of + * ->mnt_bind. (As ->mnt_bind list can validly be empty when mount has no + * bindmounts we need separate field to indicate population.) 
+ */ +static void __search_bindmounts(struct mount_info *mi) +{ + struct mount_info *t; + + if (mi->mnt_bind_is_populated) + return; + + for (t = mi->next; t; t = t->next) { + if (mounts_sb_equal(mi, t)) { + list_add(&t->mnt_bind, &mi->mnt_bind); + t->mnt_bind_is_populated = true; + pr_debug("\t" + "The mount %3d is bind for %3d (@%s -> @%s)\n", + t->mnt_id, mi->mnt_id, t->ns_mountpoint, mi->ns_mountpoint); + } + } + + mi->mnt_bind_is_populated = true; +} + +static void search_bindmounts(void) +{ + struct mount_info *mi; + + for (mi = mntinfo; mi; mi = mi->next) + __search_bindmounts(mi); +} + +struct mount_info *mnt_bind_pick(struct mount_info *mi, bool (*pick)(struct mount_info *mi, struct mount_info *bind)) +{ + struct mount_info *bind; + + BUG_ON(!mi); + + if (pick(mi, mi)) + return mi; + + /* + * Shouldn't use mnt_bind list before it was populated in search_bindmounts + */ + BUG_ON(!mi->mnt_bind_is_populated); + + list_for_each_entry(bind, &mi->mnt_bind, mnt_bind) + if (pick(mi, bind)) + return bind; + + return NULL; +} + +static int resolve_shared_mounts(struct mount_info *info) { struct mount_info *m, *t; @@ -906,23 +1051,19 @@ static int resolve_shared_mounts(struct mount_info *info, int root_master_id) for (m = info; m; m = m->next) { bool need_share, need_master; - /* the root master_id can be ignored, because it's already created */ - if (root_master_id && root_master_id == m->master_id) - m->master_id = -1; - need_share = m->shared_id && list_empty(&m->mnt_share); - need_master = m->master_id > 0; + need_master = m->master_id; - pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", - m->mnt_id, m->shared_id, m->master_id, m->mountpoint); + pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", m->mnt_id, m->shared_id, + m->master_id, m->ns_mountpoint); for (t = info; t && (need_share || need_master); t = t->next) { if (t == m) continue; if (need_master && t->shared_id == m->master_id) { - pr_debug("\tThe mount %3d is 
slave for %3d (@%s -> @%s)\n", - m->mnt_id, t->mnt_id, - m->mountpoint, t->mountpoint); + pr_debug("\t" + "The mount %3d is slave for %3d (@%s -> @%s)\n", + m->mnt_id, t->mnt_id, m->ns_mountpoint, t->ns_mountpoint); list_add(&m->mnt_slave, &t->mnt_slave_list); m->mnt_master = t; need_master = false; @@ -930,38 +1071,28 @@ static int resolve_shared_mounts(struct mount_info *info, int root_master_id) /* Collect all mounts from this group */ if (need_share && t->shared_id == m->shared_id) { - pr_debug("\tMount %3d is shared with %3d group %3d (@%s -> @%s)\n", - m->mnt_id, t->mnt_id, m->shared_id, - t->mountpoint, m->mountpoint); + pr_debug("\t" + "Mount %3d is shared with %3d group %3d (@%s -> @%s)\n", + m->mnt_id, t->mnt_id, m->shared_id, t->ns_mountpoint, m->ns_mountpoint); list_add(&t->mnt_share, &m->mnt_share); } } /* - * If we haven't already determined this mount is external, - * or bind of external, then we don't know where it came from. + * External master detected */ - if (need_master && m->parent && !mnt_is_external(m)) { - pr_err("Mount %d %s (master_id: %d shared_id: %d) " - "has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id, - m->mountpoint, m->master_id, m->shared_id); - return -1; - } - - /* Search bind-mounts */ - if (list_empty(&m->mnt_bind)) { - /* - * A first mounted point will be set up as a source point - * for others. Look at propagate_mount() - */ - for (t = m->next; t; t = t->next) { - if (mounts_sb_equal(m, t)) { - list_add(&t->mnt_bind, &m->mnt_bind); - pr_debug("\tThe mount %3d is bind for %3d (@%s -> @%s)\n", - t->mnt_id, m->mnt_id, - t->mountpoint, m->mountpoint); - } + if (need_master) { + if ((t = can_receive_master_from_external(m)) || (t = can_receive_master_from_root(m))) { + pr_debug("Detected external slavery for %d via %d\n", m->mnt_id, t->mnt_id); + if (m != t) + list_add(&m->mnt_ext_slave, &t->mnt_ext_slave); + continue; } + + pr_err("Mount %d %s (master_id: %d shared_id: %d) " + "has unreachable sharing. 
Try --enable-external-masters.\n", + m->mnt_id, m->ns_mountpoint, m->master_id, m->shared_id); + return -1; } } @@ -987,7 +1118,7 @@ static int resolve_shared_mounts(struct mount_info *info, int root_master_id) else if (ret) { BUG_ON(!mounts_equal(m, schild)); pr_debug("\tMount %3d is in same propagation group with %3d (@%s ~ @%s)\n", - m->mnt_id, schild->mnt_id, m->mountpoint, schild->mountpoint); + m->mnt_id, schild->mnt_id, m->ns_mountpoint, schild->ns_mountpoint); list_add(&schild->mnt_propagate, &m->mnt_propagate); } } @@ -997,8 +1128,7 @@ static int resolve_shared_mounts(struct mount_info *info, int root_master_id) return 0; } -static struct mount_info *mnt_build_tree(struct mount_info *list, - struct mount_info *root_mp) +static struct mount_info *mnt_build_tree(struct mount_info *list) { struct mount_info *tree; @@ -1007,11 +1137,11 @@ static struct mount_info *mnt_build_tree(struct mount_info *list, */ pr_info("Building mountpoints tree\n"); - tree = mnt_build_ids_tree(list, root_mp); + tree = mnt_build_ids_tree(list); if (!tree) return NULL; - mnt_resort_siblings(tree); + resort_siblings(tree, __mnt_resort_children); pr_info("Done:\n"); mnt_tree_show(tree, 0); return tree; @@ -1024,12 +1154,12 @@ int mnt_is_dir(struct mount_info *pm) mntns_root = mntns_get_root_fd(pm->nsid); if (mntns_root < 0) { - pr_perror("Can't get root fd of mntns for %d", pm->mnt_id); + pr_warn("Can't get root fd of mntns for %d: %s\n", pm->mnt_id, strerror(errno)); return 0; } if (fstatat(mntns_root, pm->ns_mountpoint, &st, 0)) { - pr_perror("Can't fstatat on %s", pm->ns_mountpoint); + pr_warn("Can't fstatat on %s: %s\n", pm->ns_mountpoint, strerror(errno)); return 0; } @@ -1038,40 +1168,21 @@ int mnt_is_dir(struct mount_info *pm) return 0; } -/* - * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. - * If mnt_fd is -1, the mountpoint will be opened by this function. 
- */ -int __open_mountpoint(struct mount_info *pm, int mnt_fd) +int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo) { struct stat st; - int dev; + unsigned int dev; int ret; - if (mnt_fd == -1) { - int mntns_root; - - mntns_root = mntns_get_root_fd(pm->nsid); - if (mntns_root < 0) - return -1; - - mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); - if (mnt_fd < 0) { - pr_perror("Can't open %s", pm->ns_mountpoint); - return -1; - } - } - ret = fstat(mnt_fd, &st); if (ret < 0) { pr_perror("fstat(%s) failed", pm->ns_mountpoint); - goto err; + return -1; } if (pm->s_dev_rt == MOUNT_INVALID_DEV) { - pr_err("Resolving over invalid device for %#x %s %s\n", - pm->s_dev, pm->fstype->name, pm->ns_mountpoint); - goto err; + pr_err("Resolving over invalid device for %#x %s %s\n", pm->s_dev, pm->fstype->name, pm->ns_mountpoint); + return -1; } dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); @@ -1082,27 +1193,66 @@ int __open_mountpoint(struct mount_info *pm, int mnt_fd) * allocates new device ID). */ if (dev != pm->s_dev_rt) { - pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", - pm->s_dev, pm->s_dev_rt, dev, - pm->fstype->name, pm->ns_mountpoint); - goto err; + /* + * For btrfs device numbers in stat and mountinfo can be + * different, fallback to get_sdev_from_fd to get right dev. + */ + if (!strcmp(pm->fstype->name, "btrfs") && !get_sdev_from_fd(mnt_fd, &dev, parse_mountinfo) && + dev == pm->s_dev_rt) + return 0; + + pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); + return -1; + } + + return 0; +} + +int check_mountpoint_fd(struct mount_info *pm, int mnt_fd) +{ + return __check_mountpoint_fd(pm, mnt_fd, false); +} + +/* + * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. + * If mnt_fd is -1, the mountpoint will be opened by this function. 
+ */ +int __open_mountpoint(struct mount_info *pm) +{ + int mntns_root, mnt_fd; + + mntns_root = mntns_get_root_fd(pm->nsid); + if (mntns_root < 0) + return -1; + + mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); + if (mnt_fd < 0) { + pr_perror("Can't open %s", pm->ns_mountpoint); + return -1; + } + + if (check_mountpoint_fd(pm, mnt_fd)) { + close(mnt_fd); + return -1; } return mnt_fd; -err: - close(mnt_fd); - return -1; } int open_mount(unsigned int s_dev) { struct mount_info *m; + int mnt_fd; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; - return __open_mountpoint(m, -1); + mnt_fd = __open_mountpoint(m); + if (mnt_fd < 0) + pr_err("Can't open mount %#x\n", s_dev); + return mnt_fd; } /* Bind-mount a mount point in a temporary place without children */ @@ -1114,13 +1264,12 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_ if (mnt_path == NULL && errno == ENOENT) mnt_path = mkdtemp(mnt_path_root); if (mnt_path == NULL) { - pr_perror("Can't create a temporary directory"); + pr_warn("Can't create a temporary directory: %s\n", strerror(errno)); return NULL; } - if (mount(mi->mountpoint, mnt_path, NULL, MS_BIND, NULL)) { - pr_perror("Can't bind-mount %d:%s to %s", - mi->mnt_id, mi->mountpoint, mnt_path); + if (mount(mi->ns_mountpoint, mnt_path, NULL, MS_BIND, NULL)) { + pr_perror("Can't bind-mount %d:%s to %s", mi->mnt_id, mi->ns_mountpoint, mnt_path); rmdir(mnt_path); return NULL; } @@ -1133,12 +1282,34 @@ static int get_clean_fd(struct mount_info *mi) char *mnt_path = NULL; char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX"; char mnt_path_root[] = "/cr-tmpfs.XXXXXX"; + int fd; mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root); if (!mnt_path) return -1; - return open_detach_mount(mnt_path); + fd = open(mnt_path, O_RDONLY | O_DIRECTORY, 0); + if (fd < 0) { + pr_perror("Can't open directory %s", mnt_path); + } else { + if (__check_mountpoint_fd(mi, fd, true)) + goto err_close; + } + + if (umount2(mnt_path, MNT_DETACH)) 
{ + pr_perror("Can't detach mount %s", mnt_path); + goto err_close; + } + + if (rmdir(mnt_path)) { + pr_perror("Can't remove tmp dir %s", mnt_path); + goto err_close; + } + + return fd; +err_close: + close_safe(&fd); + return -1; } /* @@ -1169,7 +1340,7 @@ bool mnt_is_overmounted(struct mount_info *mi) list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; - if (issubpath(m->mountpoint, t->mountpoint)) { + if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) { mi->is_overmounted = 1; goto exit; } @@ -1185,7 +1356,7 @@ bool mnt_is_overmounted(struct mount_info *mi) /* Check there is no children-overmount */ list_for_each_entry(c, &mi->children, siblings) - if (!strcmp(c->mountpoint, mi->mountpoint)) { + if (!strcmp(c->ns_mountpoint, mi->ns_mountpoint)) { mi->is_overmounted = 1; goto exit; } @@ -1194,12 +1365,35 @@ exit: return mi->is_overmounted; } -static int set_is_overmounted(struct mount_info *mi) +static int __set_is_overmounted(struct mount_info *mi) { + /* coverity[check_return] */ mnt_is_overmounted(mi); return 0; } +/* + * mnt_is_overmounted is intended to detect overmounts in original dumped mount + * tree, so we pre-save it just after loading mount tree from images, so that + * it does not mess up with any helper mounts or tree changes we can do. 
+ */ +static void prepare_is_overmounted(void) +{ + struct ns_id *nsid; + + for (nsid = ns_ids; nsid; nsid = nsid->next) { + struct mount_info *root; + + if (nsid->nd != &mnt_ns_desc) + continue; + + root = nsid->mnt.mntinfo_tree; + + BUG_ON(root->parent); + mnt_tree_for_each(root, __set_is_overmounted); + } +} + /* * __umount_children_overmounts() assumes that the mountpoint and * it's ancestors have no sibling-overmounts, so we can see children @@ -1216,7 +1410,7 @@ static int __umount_children_overmounts(struct mount_info *mi) */ again: list_for_each_entry(c, &m->children, siblings) { - if (!strcmp(c->mountpoint, m->mountpoint)) { + if (!strcmp(c->ns_mountpoint, m->ns_mountpoint)) { m = c; goto again; } @@ -1224,8 +1418,8 @@ again: /* Unmout children-overmounts in the order of visibility */ while (m != mi) { - if (umount2(m->mountpoint, MNT_DETACH)) { - pr_perror("Unable to umount child-overmount %s", m->mountpoint); + if (umount2(m->ns_mountpoint, MNT_DETACH)) { + pr_perror("Unable to umount child-overmount %s", m->ns_mountpoint); return -1; } BUG_ON(!m->parent); @@ -1255,12 +1449,12 @@ static int __umount_overmounts(struct mount_info *m) /* Unmount sibling-overmounts in visibility order */ next: ovm = NULL; - ovm_len = strlen(m->mountpoint) + 1; + ovm_len = strlen(m->ns_mountpoint) + 1; list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; - if (issubpath(m->mountpoint, t->mountpoint)) { - int t_len = strlen(t->mountpoint); + if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) { + int t_len = strlen(t->ns_mountpoint); if (t_len < ovm_len && t_len > ovm_len_min) { ovm = t; @@ -1276,8 +1470,8 @@ next: if (__umount_children_overmounts(ovm)) return -1; - if (umount2(ovm->mountpoint, MNT_DETACH)) { - pr_perror("Unable to umount %s", ovm->mountpoint); + if (umount2(ovm->ns_mountpoint, MNT_DETACH)) { + pr_perror("Unable to umount %s", ovm->ns_mountpoint + 1); return -1; } @@ -1321,8 +1515,7 @@ int ns_open_mountpoint(void *arg) * unmount them 
(see CL_UNPRIVILEGED in sys_umount(), clone_mnt() and * copy_mnt_ns() in linux kernel code). */ - if ((root_ns_mask & CLONE_NEWUSER) && - switch_ns(root_item->pid->real, &user_ns_desc, NULL) < 0) + if ((root_ns_mask & CLONE_NEWUSER) && switch_ns(root_item->pid->real, &user_ns_desc, NULL) < 0) goto err; /* @@ -1335,8 +1528,10 @@ int ns_open_mountpoint(void *arg) } /* Remount all mounts as private to disable propagation */ - if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) + if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { + pr_perror("Unable to remount"); goto err; + } if (umount_overmounts(mi)) goto err; @@ -1348,9 +1543,14 @@ int ns_open_mountpoint(void *arg) * explicitly as when last process exits mntns all mounts in it are * cleaned from their children, and we are exactly the last process. */ - *fd = open(mi->mountpoint, O_DIRECTORY|O_RDONLY); + *fd = open(mi->ns_mountpoint, O_DIRECTORY | O_RDONLY); if (*fd < 0) { - pr_perror("Unable to open %s", mi->mountpoint); + pr_perror("Unable to open %s(%d)", mi->ns_mountpoint, mi->mnt_id); + goto err; + } + + if (__check_mountpoint_fd(mi, *fd, true)) { + close(*fd); goto err; } @@ -1365,9 +1565,9 @@ int open_mountpoint(struct mount_info *pm) /* No overmounts and children - the entire mount is visible */ if (list_empty(&pm->children) && !mnt_is_overmounted(pm)) - return __open_mountpoint(pm, -1); + return __open_mountpoint(pm); - pr_info("Mount is not fully visible %s\n", pm->mountpoint); + pr_info("Mount is not fully visible %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); /* * We do two things below: @@ -1380,17 +1580,12 @@ int open_mountpoint(struct mount_info *pm) * In both cases we can't do the thing from criu's mount namespace, so * we need to switch to mount's mount namespace, and later switch back. 
*/ - cwd_fd = open(".", O_DIRECTORY); - if (cwd_fd < 0) { - pr_perror("Unable to open cwd"); - return -1; - } - if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0) + if (switch_mnt_ns(pm->nsid->ns_pid, &ns_old, &cwd_fd) < 0) goto err; if (!mnt_is_overmounted(pm)) { - pr_info("\tmount has children %s\n", pm->mountpoint); + pr_info("\tmount has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); fd = get_clean_fd(pm); } @@ -1400,13 +1595,9 @@ int open_mountpoint(struct mount_info *pm) */ if (fd < 0) { int pid, status; - struct clone_arg ca = { - .mi = pm, - .fd = &fd - }; + struct clone_arg ca = { .mi = pm, .fd = &fd }; - pr_info("\tmount is overmounted or has children %s\n", - pm->mountpoint); + pr_info("\tmount is overmounted or has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); /* * We are overmounted - not accessible in a regular way. We @@ -1417,69 +1608,67 @@ int open_mountpoint(struct mount_info *pm) * to create helper process here as entering user namespace is * irreversible operation. 
*/ - pid = clone_noasan(ns_open_mountpoint, CLONE_VFORK | CLONE_VM - | CLONE_FILES | CLONE_IO | CLONE_SIGHAND - | CLONE_SYSVSEM, &ca); + pid = clone_noasan(ns_open_mountpoint, + CLONE_VFORK | CLONE_VM | CLONE_FILES | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, + &ca); if (pid == -1) { pr_perror("Can't clone helper process"); goto err; } errno = 0; - if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status) - || WEXITSTATUS(status)) { - pr_err("Can't wait or bad status: errno=%d, status=%d\n", - errno, status); + if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status) || WEXITSTATUS(status)) { + pr_err("Can't wait or bad status: errno=%d, status=%d\n", errno, status); goto err; } } - if (restore_ns(ns_old, &mnt_ns_desc)) { + if (restore_mnt_ns(ns_old, &cwd_fd)) { ns_old = -1; goto err; } - if (fchdir(cwd_fd)) { - pr_perror("Unable to restore cwd"); - close(cwd_fd); - close(fd); - return -1; - } - close(cwd_fd); - - return __open_mountpoint(pm, fd); + return fd < 0 ? __open_mountpoint(pm) : fd; err: if (ns_old >= 0) - restore_ns(ns_old, &mnt_ns_desc); + /* coverity[check_return] */ + restore_mnt_ns(ns_old, &cwd_fd); close_safe(&fd); - if (fchdir(cwd_fd)) - pr_perror("Unable to restore cwd"); - close(cwd_fd); return -1; } -static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev) +/* + * Helper for getting a path to mount's plain mountpoint + */ +char *get_plain_mountpoint(int mnt_id, char *name) +{ + static char tmp[PATH_MAX]; + int ret; + + if (!mnt_roots) + return NULL; + + if (name) + ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%s", mnt_roots, name); + else + ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%010d", mnt_roots, mnt_id); + + if (ret >= sizeof(tmp)) + return NULL; + + return xstrdup(tmp); +} + +struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, + unsigned int s_dev, bool rst) { struct mount_info *mi, *t, *parent; bool add_slash = 
false; int len; - if (!root->nsid) { - /* On restore we have fake top mount_info. Find real NS_ROOT */ - list_for_each_entry(t, &root->children, siblings) - if (t->nsid->type == NS_ROOT) { - root = t; - break; - } - if (!root->nsid) { - pr_err("Can't find NS_ROOT\n"); - return -1; - } - } - - mi = mnt_entry_alloc(); + mi = mnt_entry_alloc(rst); if (!mi) - return -1; + return NULL; len = strlen(root->mountpoint); /* It may be "./" or "./path/to/dir" */ @@ -1490,20 +1679,27 @@ static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsnam mi->mountpoint = xmalloc(len + strlen(path) + 1); if (!mi->mountpoint) - return -1; - mi->ns_mountpoint = mi->mountpoint; + goto err; + if (!rst) + mi->ns_mountpoint = mi->mountpoint; if (!add_slash) sprintf(mi->mountpoint, "%s%s", root->mountpoint, path); else sprintf(mi->mountpoint, "%s/%s", root->mountpoint, path); - mi->mnt_id = CRTIME_MNT_ID; + if (rst) { + mi->plain_mountpoint = get_plain_mountpoint(-1, "crtime"); + if (!mi->plain_mountpoint) + goto err; + } + mi->mnt_id = HELPER_MNT_ID; + mi->is_dir = true; mi->flags = mi->sb_flags = 0; mi->root = xstrdup("/"); mi->fsname = xstrdup(fsname); mi->source = xstrdup(fsname); mi->options = xstrdup(""); if (!mi->root || !mi->fsname || !mi->source || !mi->options) - return -1; + goto err; mi->fstype = find_fstype_by_name(fsname); mi->s_dev = mi->s_dev_rt = s_dev; @@ -1511,7 +1707,7 @@ static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsnam parent = root; while (1) { list_for_each_entry(t, &parent->children, siblings) { - if (strstartswith(mi->mountpoint, t->mountpoint)) { + if (strstartswith(service_mountpoint(mi), service_mountpoint(t))) { parent = t; break; } @@ -1520,64 +1716,77 @@ static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsnam break; } + mi->mnt_bind_is_populated = true; + mi->is_overmounted = false; mi->nsid = parent->nsid; mi->parent = parent; mi->parent_mnt_id = parent->mnt_id; - mi->next = 
parent->next; - parent->next = mi; list_add(&mi->siblings, &parent->children); - pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", - mi->mountpoint, parent->mountpoint, parent->mnt_id); - return 0; + pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", service_mountpoint(mi), service_mountpoint(parent), + parent->mnt_id); + return mi; + +err: + mnt_entry_free(mi); + return NULL; } -/* Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors */ +/* + * Returns: + * 0 - success + * -1 - error + * 1 - skip + */ static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source, - const char *target, const char *type) + const char *target, const char *type) { - int mnt_fd, ret, exit_code = 0; + int mnt_fd, cwd_fd, exit_code = -1; struct stat st; - ret = switch_ns(ns->ns_pid, &mnt_ns_desc, &mnt_fd); - if (ret < 0) { + if (switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd)) { pr_err("Can't switch mnt_ns\n"); - goto out; + return -1; } - ret = mount(source, target, type, 0, NULL); - if (ret < 0) { - exit_code = -errno; - goto restore_ns; - } else { - if (stat(target, &st) < 0) { - pr_perror("Can't stat %s", target); - exit_code = 0; - } else { - *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + if (mount(source, target, type, 0, NULL)) { + switch (errno) { + case EPERM: + case EBUSY: + case ENODEV: + case ENOENT: + pr_debug("Skipping %s as was unable to mount it: %s\n", type, strerror(errno)); exit_code = 1; + break; + default: + pr_perror("Unable to mount %s %s %s", type, source, target); } + goto restore_ns; } + if (stat(target, &st)) { + pr_perror("Can't stat %s", target); + goto restore_ns; + } + + *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + exit_code = 0; restore_ns: - ret = restore_ns(mnt_fd, &mnt_ns_desc); -out: - return ret < 0 ? 
0 : exit_code; + if (restore_mnt_ns(mnt_fd, &cwd_fd)) + exit_code = -1; + return exit_code; } - - static int dump_one_fs(struct mount_info *mi) { struct mount_info *pm = mi; struct mount_info *t; bool first = true; - if (mi->is_ns_root || mi->need_plugin || mnt_is_external(mi) || !mi->fstype->dump) + if (mnt_is_root_bind(mi) || mi->need_plugin || mnt_is_external_bind(mi) || !mi->fstype->dump) return 0; /* mnt_bind is a cycled list, so list_for_each can't be used here. */ - for (; &pm->mnt_bind != &mi->mnt_bind || first; - pm = list_entry(pm->mnt_bind.next, typeof(*pm), mnt_bind)) { + for (; &pm->mnt_bind != &mi->mnt_bind || first; pm = list_entry(pm->mnt_bind.next, typeof(*pm), mnt_bind)) { int ret; first = false; @@ -1591,13 +1800,13 @@ static int dump_one_fs(struct mount_info *mi) if (ret < 0) return ret; + pm->dumped = true; list_for_each_entry(t, &pm->mnt_bind, mnt_bind) t->dumped = true; return 0; } - pr_err("Unable to dump a file system for %d:%s\n", - mi->mnt_id, mi->mountpoint); + pr_err("Unable to dump a file system for %d:%s\n", mi->mnt_id, mi->ns_mountpoint); return -1; } @@ -1605,48 +1814,45 @@ static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img) { MntEntry me = MNT_ENTRY__INIT; - pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev, - pm->root, pm->mountpoint); + pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev, pm->root, pm->ns_mountpoint); - me.fstype = pm->fstype->code; + me.fstype = pm->fstype->code; if (me.fstype == FSTYPE__AUTO) me.fsname = pm->fsname; - if (!pm->external) { - if (!pm->dumped && dump_one_fs(pm)) - return -1; + if (!pm->dumped && dump_one_fs(pm)) + return -1; - if (!fsroot_mounted(pm) && - pm->fstype->check_bindmount && pm->fstype->check_bindmount(pm)) - return -1; - } + if (!mnt_is_external_bind(pm) && !fsroot_mounted(pm) && pm->fstype->check_bindmount && + pm->fstype->check_bindmount(pm)) + return -1; - if (pm->mnt_id == CRTIME_MNT_ID) { - pr_info("Skip dumping cr-time mountpoint: %s\n", pm->mountpoint); + 
if (pm->mnt_id == HELPER_MNT_ID) { + pr_info("Skip dumping helper mountpoint: %s\n", pm->ns_mountpoint); return 0; } - me.mnt_id = pm->mnt_id; - me.root_dev = pm->s_dev; - me.parent_mnt_id = pm->parent_mnt_id; - me.flags = pm->flags; - me.sb_flags = pm->sb_flags; - me.has_sb_flags = true; - me.mountpoint = pm->mountpoint + 1; - me.source = pm->source; - me.options = pm->options; - me.shared_id = pm->shared_id; - me.has_shared_id = true; - me.master_id = pm->master_id; - me.has_master_id = true; + me.mnt_id = pm->mnt_id; + me.root_dev = pm->s_dev; + me.parent_mnt_id = pm->parent_mnt_id; + me.flags = pm->flags; + me.sb_flags = pm->sb_flags; + me.has_sb_flags = true; + me.mountpoint = pm->ns_mountpoint + 1; + me.source = pm->source; + me.options = pm->options; + me.shared_id = pm->shared_id; + me.has_shared_id = true; + me.master_id = pm->master_id; + me.has_master_id = true; if (pm->need_plugin) { me.has_with_plugin = true; me.with_plugin = true; } if (pm->deleted) { - me.has_deleted = true; - me.deleted = true; + me.has_deleted = true; + me.deleted = true; } if (pm->internal_sharing) { @@ -1690,7 +1896,7 @@ struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump) return NULL; } - ns->mnt.mntinfo_tree = mnt_build_tree(pm, NULL); + ns->mnt.mntinfo_tree = mnt_build_tree(pm); if (ns->mnt.mntinfo_tree == NULL) goto err; @@ -1730,55 +1936,52 @@ err: * _plist - a postpone list. _el is added to this list, if _fn_f returns * a positive value, and all lower elements are not enumerated. 
*/ -#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do { \ - struct mount_info *_mi = _r; \ - \ - while (1) { \ - int ret; \ - \ - list_del_init(&_mi->postpone); \ - \ - ret = _fn_f(_mi); \ - if (ret < 0) \ - return -1; \ - else if (ret > 0) { \ - list_add_tail(&_mi->postpone, _plist); \ - goto up; \ - } \ - \ - _prgs++; \ - \ - if (!list_empty(&_mi->children)) { \ - _mi = list_entry(_mi->children._el, \ - struct mount_info, siblings); \ - continue; \ - } \ - up: \ - if (_fn_r(_mi)) \ - return -1; \ - if (_mi == _r) \ - break; \ - if (_mi->siblings._el == &_mi->parent->children) { \ - _mi = _mi->parent; \ - goto up; \ - } \ - _mi = list_entry(_mi->siblings._el, \ - struct mount_info, siblings); \ - } \ +#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) \ + do { \ + struct mount_info *_mi = _r; \ + \ + while (1) { \ + int ret; \ + \ + list_del_init(&_mi->postpone); \ + \ + ret = _fn_f(_mi); \ + if (ret < 0) \ + return -1; \ + else if (ret > 0) { \ + list_add_tail(&_mi->postpone, _plist); \ + goto up; \ + } \ + \ + _prgs++; \ + \ + if (!list_empty(&_mi->children)) { \ + _mi = list_entry(_mi->children._el, struct mount_info, siblings); \ + continue; \ + } \ + up: \ + if (_fn_r(_mi)) \ + return -1; \ + if (_mi == _r) \ + break; \ + if (_mi->siblings._el == &_mi->parent->children) { \ + _mi = _mi->parent; \ + goto up; \ + } \ + _mi = list_entry(_mi->siblings._el, struct mount_info, siblings); \ + } \ } while (0) -#define MNT_WALK_NONE 0 && +#define MNT_WALK_NONE 0 && - -static int mnt_tree_for_each(struct mount_info *start, - int (*fn)(struct mount_info *)) +int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *)) { struct mount_info *tmp; LIST_HEAD(postpone); LIST_HEAD(postpone2); int progress; - pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint); + pr_debug("Start with %d:%s\n", start->mnt_id, start->ns_mountpoint); list_add(&start->postpone, &postpone); again: @@ -1792,9 +1995,7 @@ again: pr_err("A few mount 
points can't be mounted\n"); list_for_each_entry(m, &postpone2, postpone) { - pr_err("%d:%d %s %s %s\n", m->mnt_id, - m->parent_mnt_id, m->root, - m->mountpoint, m->source); + pr_err("%d:%d %s %s %s\n", m->mnt_id, m->parent_mnt_id, m->root, m->ns_mountpoint, m->source); } return -1; } @@ -1805,20 +2006,19 @@ again: goto again; return 0; - } -static int mnt_tree_for_each_reverse(struct mount_info *m, - int (*fn)(struct mount_info *)) +static int mnt_tree_for_each_reverse(struct mount_info *m, int (*fn)(struct mount_info *)) { int progress = 0; - MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, progress); + MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *)NULL, progress); + (void)progress; // Suppress -Wused-but-unset-variable for clang>=15 return 0; } -static char *resolve_source(struct mount_info *mi) +char *resolve_source(struct mount_info *mi) { if (kdev_major(mi->s_dev) == 0) /* @@ -1827,7 +2027,11 @@ static char *resolve_source(struct mount_info *mi) */ return mi->source; - if (mi->fstype->code == FSTYPE__AUTO) { + /* + * FSTYPE__AUTO check is a fallback for old images which do not have + * explicit EXTERNAL_DEV_MOUNT mark, but still have "dev[key]" in source. 
+ */ + if (mnt_is_dev_external(mi) || mi->fstype->code == FSTYPE__AUTO) { struct stat st; char *val; @@ -1835,38 +2039,51 @@ static char *resolve_source(struct mount_info *mi) if (!IS_ERR_OR_NULL(val)) return val; - if (!stat(mi->source, &st) && S_ISBLK(st.st_mode) && - major(st.st_rdev) == kdev_major(mi->s_dev) && + if (!stat(mi->source, &st) && S_ISBLK(st.st_mode) && major(st.st_rdev) == kdev_major(mi->s_dev) && minor(st.st_rdev) == kdev_minor(mi->s_dev)) return mi->source; } - pr_err("No device for %s mount\n", mi->mountpoint); + pr_err("No device for %s(%d) mount\n", mi->ns_mountpoint, mi->mnt_id); return NULL; } static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave) { - pr_debug("%d:%s private %d shared %d slave %d\n", - mi->mnt_id, mi->mountpoint, private, shared, slave); + pr_debug("%d:%s private %d shared %d slave %d\n", mi->mnt_id, service_mountpoint(mi), private, shared, slave); if (mi->flags & MS_UNBINDABLE) { - if (shared || slave) - pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint); - else - return mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL); + if (shared || slave) { + pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", service_mountpoint(mi)); + } else { + if (!mnt_is_overmounted(mi)) { + /* Someone may still want to bind from us, let them do it. 
*/ + pr_debug("Temporary leave unbindable mount %s as private\n", service_mountpoint(mi)); + if (mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to make %d private", mi->mnt_id); + return -1; + } + list_add(&mi->mnt_unbindable, &delayed_unbindable); + return 0; + } + if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { + pr_perror("Unable to make %d unbindable", mi->mnt_id); + return -1; + } + return 0; + } } - if (private && mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) { - pr_perror("Unable to make %s private", mi->mountpoint); + if (private && mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to make %d private", mi->mnt_id); return -1; } - if (slave && mount(NULL, mi->mountpoint, NULL, MS_SLAVE, NULL)) { - pr_perror("Unable to make %s slave", mi->mountpoint); + if (slave && mount(NULL, service_mountpoint(mi), NULL, MS_SLAVE, NULL)) { + pr_perror("Unable to make %d slave", mi->mnt_id); return -1; } - if (shared && mount(NULL, mi->mountpoint, NULL, MS_SHARED, NULL)) { - pr_perror("Unable to make %s shared", mi->mountpoint); + if (shared && mount(NULL, service_mountpoint(mi), NULL, MS_SHARED, NULL)) { + pr_perror("Unable to make %d shared", mi->mnt_id); return -1; } @@ -1882,6 +2099,8 @@ static int umount_from_slaves(struct mount_info *mi) struct mount_info *t; char *mpath, buf[PATH_MAX]; + BUG_ON(mi->parent == root_yard_mp); + list_for_each_entry(t, &mi->parent->mnt_slave_list, mnt_slave) { if (!t->mounted) continue; @@ -1919,7 +2138,7 @@ static int propagate_siblings(struct mount_info *mi) continue; if (t->bind && t->bind->shared_id == t->shared_id) continue; - pr_debug("\t\tBind share %s\n", t->mountpoint); + pr_debug("\t\tBind share %s(%d)\n", t->ns_mountpoint, t->mnt_id); t->bind = mi; t->s_dev_rt = mi->s_dev_rt; } @@ -1927,7 +2146,15 @@ static int propagate_siblings(struct mount_info *mi) list_for_each_entry(t, &mi->mnt_slave_list, mnt_slave) { if (t->mounted || 
t->bind) continue; - pr_debug("\t\tBind slave %s\n", t->mountpoint); + pr_debug("\t\tBind slave %s(%d)\n", t->ns_mountpoint, t->mnt_id); + t->bind = mi; + t->s_dev_rt = mi->s_dev_rt; + } + + list_for_each_entry(t, &mi->mnt_ext_slave, mnt_ext_slave) { + if (t->mounted || t->bind) + continue; + pr_debug("\t\tBind ext-slave %s(%d)\n", t->ns_mountpoint, t->mnt_id); t->bind = mi; t->s_dev_rt = mi->s_dev_rt; } @@ -1941,7 +2168,7 @@ static int propagate_mount(struct mount_info *mi) propagate_siblings(mi); - if (!mi->parent) + if (!mi->parent || mi->parent == root_yard_mp) goto skip_parent; umount_from_slaves(mi); @@ -1950,7 +2177,7 @@ static int propagate_mount(struct mount_info *mi) list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { /* Should not propagate the same mount twice */ BUG_ON(p->mounted); - pr_debug("\t\tPropagate %s\n", p->mountpoint); + pr_debug("\t\tPropagate %s(%d)\n", p->ns_mountpoint, p->mnt_id); /* * When a mount is propagated, the result mount @@ -1976,11 +2203,11 @@ skip_parent: continue; if (t->bind) continue; - if (t->master_id > 0) + if (t->master_id) continue; if (!issubpath(t->root, mi->root)) continue; - pr_debug("\t\tBind private %s\n", t->mountpoint); + pr_debug("\t\tBind private %s(%d)\n", t->ns_mountpoint, t->mnt_id); t->bind = mi; t->s_dev_rt = mi->s_dev_rt; } @@ -1989,7 +2216,7 @@ skip_parent: return 0; } -static int fetch_rt_stat(struct mount_info *m, const char *where) +int fetch_rt_stat(struct mount_info *m, const char *where) { struct stat st; @@ -2002,49 +2229,52 @@ static int fetch_rt_stat(struct mount_info *m, const char *where) return 0; } -/* - * Here are a set of flags which we know how to handle for the one mount call. - * All of them except MS_RDONLY are set only as mnt flags. - * MS_RDONLY is set for both mnt ans sb flags, so we can restore it for one - * mount call only if it set for both masks. 
- */ -#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | \ - MS_NODIRATIME | MS_RELATIME | MS_RDONLY) - -static int do_simple_mount(struct mount_info *mi, const char *src, const - char *fstype, unsigned long mountflags) +int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags) { - return mount(src, mi->mountpoint, fstype, mountflags, mi->options); + int ret = mount(src, service_mountpoint(mi), fstype, mountflags, mi->options); + if (ret) + pr_perror("Unable to mount %s %s (id=%d)", src, service_mountpoint(mi), mi->mnt_id); + return ret; } -static char *mnt_fsname(struct mount_info *mi) +char *mnt_fsname(struct mount_info *mi) { if (mi->fstype->code == FSTYPE__AUTO) return mi->fsname; return mi->fstype->name; } -static int apply_sb_flags(void *args, int fd, pid_t pid) +static int userns_mount(char *src, void *args, int fd, pid_t pid) { - unsigned long flags = *(unsigned long *) args; + unsigned long flags = *(unsigned long *)args; int rst = -1, err = -1; - char path[PSFDS]; + char target[PSFDS]; - snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); + snprintf(target, sizeof(target), "/proc/self/fd/%d", fd); if (pid != getpid() && switch_ns(pid, &mnt_ns_desc, &rst)) return -1; - err = mount(NULL, path, NULL, MS_REMOUNT | flags, NULL); + err = mount(src, target, NULL, flags, NULL); if (err) - pr_perror("Unable to remount %s", path); + pr_perror("Unable to mount %s", target); - if (rst >= 0 && restore_ns(rst, &mnt_ns_desc)) + if (rst >= 0 && restore_ns(rst, &mnt_ns_desc)) return -1; return err; } +int apply_sb_flags(void *args, int fd, pid_t pid) +{ + return userns_mount(NULL, args, fd, pid); +} + +int mount_root(void *args, int fd, pid_t pid) +{ + return userns_mount(opts.root, args, fd, pid); +} + static int do_new_mount(struct mount_info *mi) { unsigned long sflags = mi->sb_flags; @@ -2068,43 +2298,31 @@ static int do_new_mount(struct mount_info *mi) sflags &= ~MS_RDONLY; if (do_mount(mi, 
src, mnt_fsname(mi), sflags) < 0) { - pr_perror("Can't mount at %s", mi->mountpoint); + pr_perror("Can't mount at %s", service_mountpoint(mi)); return -1; } if (tp->restore && tp->restore(mi)) return -1; - if (mi->mnt_id == CRTIME_MNT_ID) { - /* C-r time mountpoint, umount it */ - if (umount(mi->mountpoint) < 0) { - pr_perror("Can't umount %s", mi->mountpoint); - return -1; - } - goto out; - } - - if (!mi->is_ns_root && remount_ro) { + if (remount_ro) { int fd; - fd = open(mi->mountpoint, O_PATH); + fd = open(service_mountpoint(mi), O_PATH); if (fd < 0) { - pr_perror("Unable to open %s", mi->mountpoint); + pr_perror("Unable to open %s", service_mountpoint(mi)); return -1; } - sflags |= MS_RDONLY; - if (userns_call(apply_sb_flags, 0, - &sflags, sizeof(sflags), fd)) { - pr_perror("Unable to apply mount flags %d for %s", - mi->sb_flags, mi->mountpoint); + sflags |= MS_RDONLY | MS_REMOUNT; + if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { + pr_err("Unable to apply mount flags %d for %s\n", mi->sb_flags, service_mountpoint(mi)); close(fd); return -1; } close(fd); } - if (mflags && mount(NULL, mi->mountpoint, NULL, - MS_REMOUNT | MS_BIND | mflags, NULL)) { + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { pr_perror("Unable to apply bind-mount options"); return -1; } @@ -2116,18 +2334,18 @@ static int do_new_mount(struct mount_info *mi) BUG_ON(mi->master_id); if (restore_shared_options(mi, !mi->shared_id, mi->shared_id, 0)) return -1; -out: + mi->mounted = true; return 0; } -static int restore_ext_mount(struct mount_info *mi) +int restore_ext_mount(struct mount_info *mi) { int ret; - pr_debug("Restoring external bind mount %s\n", mi->mountpoint); - ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL); + pr_debug("Restoring external bind mount %s\n", service_mountpoint(mi)); + ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, service_mountpoint(mi), "/", NULL); if (ret) pr_err("Can't 
restore ext mount (%d)\n", ret); return ret; @@ -2135,7 +2353,7 @@ static int restore_ext_mount(struct mount_info *mi) static char mnt_clean_path[] = "/tmp/cr-tmpfs.XXXXXX"; -static int mount_clean_path() +static int mount_clean_path(void) { /* * To make a bind mount, we need to have access to a source directory, @@ -2162,7 +2380,7 @@ static int mount_clean_path() return 0; } -static int umount_clean_path() +static int umount_clean_path(void) { if (umount2(mnt_clean_path, MNT_DETACH)) { pr_perror("Unable to umount %s", mnt_clean_path); @@ -2184,7 +2402,7 @@ static int do_bind_mount(struct mount_info *mi) int exit_code = -1, mp_len; bool shared = false; bool master = false; - bool private = false; + bool priv = false; char *mnt_path = NULL; struct stat st; bool umount_mnt_path = false; @@ -2196,7 +2414,7 @@ static int do_bind_mount(struct mount_info *mi) goto out; } - if (mi->external) { + if (mnt_is_nodev_external(mi)) { /* * We have / pointing to criu's ns root still, * so just use the mapping's path. The mountpoint @@ -2204,13 +2422,13 @@ static int do_bind_mount(struct mount_info *mi) * to proper location in the namespace we restore. 
*/ root = mi->external; - private = !mi->master_id && (mi->internal_sharing || !mi->shared_id); + priv = !mi->master_id && (mi->internal_sharing || !mi->shared_id); goto do_bind; } shared = mi->shared_id && mi->shared_id == mi->bind->shared_id; master = mi->master_id && mi->master_id == mi->bind->master_id; - private = !mi->master_id && !shared; + priv = !mi->master_id && !shared; cut_root = cut_root_for_bind(mi->root, mi->bind->root); /* Mount private can be initialized on mount() callback, which is @@ -2220,12 +2438,11 @@ static int do_bind_mount(struct mount_info *mi) */ mi->private = mi->bind->private; - mnt_path = mi->bind->mountpoint; + mnt_path = service_mountpoint(mi->bind); - /* Access a mount by fd if mi->bind->mountpoint is overmounted */ + /* Access a mount by fd if service_mountpoint(mi->bind) is overmounted */ if (mi->bind->fd >= 0) { - snprintf(mnt_fd_path, sizeof(mnt_fd_path), - "/proc/self/fd/%d", mi->bind->fd); + snprintf(mnt_fd_path, sizeof(mnt_fd_path), "/proc/self/fd/%d", mi->bind->fd); mnt_path = mnt_fd_path; } @@ -2236,22 +2453,21 @@ static int do_bind_mount(struct mount_info *mi) * The target path may be over-mounted by one of child mounts * and we need to create a new bind-mount to get access to the path. 
*/ - mp_len = strlen(mi->bind->mountpoint); - if (mp_len > 1) /* skip a joining / if mi->bind->mountpoint isn't "/" */ + mp_len = strlen(service_mountpoint(mi->bind)); + if (mp_len > 1) /* skip a joining / if service_mountpoint(mi->bind) isn't "/" */ mp_len++; list_for_each_entry(c, &mi->bind->children, siblings) { if (!c->mounted) continue; - if (issubpath(cut_root, c->mountpoint + mp_len)) + if (issubpath(cut_root, service_mountpoint(c) + mp_len)) break; /* a source path is overmounted */ } if (&c->siblings != &mi->bind->children) { /* Get a copy of mi->bind without child mounts */ if (mount(mnt_path, mnt_clean_path, NULL, MS_BIND, NULL)) { - pr_perror("Unable to bind-mount %s to %s", - mnt_path, mnt_clean_path); + pr_perror("Unable to bind-mount %s to %s", mnt_path, mnt_clean_path); return -1; } mnt_path = mnt_clean_path; @@ -2262,15 +2478,14 @@ static int do_bind_mount(struct mount_info *mi) return -1; skip_overmount_check: - snprintf(rpath, sizeof(rpath), "%s/%s", - mnt_path, cut_root); + snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root); root = rpath; do_bind: - pr_info("\tBind %s to %s\n", root, mi->mountpoint); + pr_info("\tBind %s to %s\n", root, service_mountpoint(mi)); if (unlikely(mi->deleted)) { - if (stat(mi->mountpoint, &st)) { - pr_perror("Can't fetch stat on %s", mi->mountpoint); + if (stat(service_mountpoint(mi), &st)) { + pr_perror("Can't fetch stat on %s", service_mountpoint(mi)); goto err; } @@ -2280,29 +2495,27 @@ do_bind: goto err; } } else if (S_ISREG(st.st_mode)) { - int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, - st.st_mode & ~S_IFMT); + int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, st.st_mode & ~S_IFMT); if (fd < 0) { pr_perror("Can't re-create deleted file %s", root); goto err; } close(fd); } else { - pr_err("Unsupported st_mode 0%o deleted root %s\n", - (int)st.st_mode, root); + pr_err("Unsupported st_mode 0%o deleted root %s\n", (int)st.st_mode, root); goto err; } } - if (mount(root, mi->mountpoint, NULL, MS_BIND | 
(mi->flags & MS_REC), NULL) < 0) { - pr_perror("Can't mount at %s", mi->mountpoint); + if (mount(root, service_mountpoint(mi), NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) { + pr_perror("Can't bind-mount at %s", service_mountpoint(mi)); goto err; } mflags = mi->flags & (~MS_PROPAGATE); if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) - if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { - pr_perror("Can't mount at %s", mi->mountpoint); + if (mount(NULL, service_mountpoint(mi), NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { + pr_perror("Can't re-mount at %s", service_mountpoint(mi)); goto err; } @@ -2324,9 +2537,7 @@ out: * shared - the mount is in the same shared group with mi->bind * mi->shared_id && !shared - create a new shared group */ - if (restore_shared_options(mi, private, - mi->shared_id && !shared, - mi->master_id && !master)) + if (restore_shared_options(mi, priv, mi->shared_id && !shared, mi->master_id && !master)) goto err; mi->mounted = true; @@ -2349,22 +2560,26 @@ err: return exit_code; } -static bool rst_mnt_is_root(struct mount_info *m) -{ - return (m->is_ns_root && m->nsid->id == root_item->ids->mnt_ns_id); -} - static bool can_mount_now(struct mount_info *mi) { - if (rst_mnt_is_root(mi)) + struct mount_info *ext; + + if (rst_mnt_is_root(mi)) { + pr_debug("%s: true as %d is mntns root\n", __func__, mi->mnt_id); return true; + } /* Parent should be mounted already, that's how mnt_tree_for_each works */ BUG_ON(mi->parent && !mi->parent->mounted); - if (mi->external) + if (mnt_is_nodev_external(mi)) goto shared; + if (!mi->bind && !mi->external && (ext = mnt_get_external_bind(mi)) && !has_mounted_external_bind(mi)) { + pr_debug("%s: false as %d's external %d is not mounted\n", __func__, mi->mnt_id, ext->mnt_id); + return false; + } + /* * We're the slave peer: * - Make sure the master peer is already mounted @@ -2374,21 +2589,35 @@ static bool can_mount_now(struct mount_info *mi) if (mi->mnt_master) { 
struct mount_info *c, *s; - if (mi->bind == NULL) + if (mi->bind == NULL) { + pr_debug("%s: false as %d is slave with unmounted master %d\n", __func__, mi->mnt_id, + mi->mnt_master->mnt_id); return false; + } - list_for_each_entry(c, &mi->mnt_master->children, siblings) - if (!c->mounted) + list_for_each_entry(c, &mi->mnt_master->children, siblings) { + if (!c->mounted) { + pr_debug("%s: false as %d is slave with unmounted master's children %d\n", __func__, + mi->mnt_id, c->mnt_id); return false; + } + } - list_for_each_entry(s, &mi->mnt_master->mnt_share, mnt_share) - list_for_each_entry(c, &s->children, siblings) - if (!c->mounted) + list_for_each_entry(s, &mi->mnt_master->mnt_share, mnt_share) { + list_for_each_entry(c, &s->children, siblings) { + if (!c->mounted) { + pr_debug("%s: false as %d is slave with unmounted children of master's share\n", + __func__, mi->mnt_id); return false; + } + } + } } - if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) + if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) { + pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id); return false; + } shared: /* Mount only after all parents of our propagation group mounted */ @@ -2397,8 +2626,11 @@ shared: list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { BUG_ON(!p->parent); - if (!p->parent->mounted) + if (!p->parent->mounted) { + pr_debug("%s: false as %d has unmounted parent %d of its propagation group\n", __func__, + mi->mnt_id, p->parent->mnt_id); return false; + } } } @@ -2445,8 +2677,11 @@ shared: /* Check not propagated mounts mounted and cleanup list */ list_for_each_entry_safe(p, t, &mi_notprop, mnt_notprop) { - if (!p->mounted) + if (!p->mounted) { + pr_debug("%s: false as %d has unmounted 'anti'-propagation mount %d\n", __func__, + mi->mnt_id, p->mnt_id); can = false; + } list_del_init(&p->mnt_notprop); } @@ -2459,11 +2694,17 @@ shared: static int do_mount_root(struct mount_info *mi) { - if 
(restore_shared_options(mi, !mi->shared_id && !mi->master_id, - mi->shared_id, mi->master_id)) + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + + if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; - return fetch_rt_stat(mi, mi->mountpoint); + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + + return fetch_rt_stat(mi, service_mountpoint(mi)); } static int do_close_one(struct mount_info *mi) @@ -2472,6 +2713,16 @@ static int do_close_one(struct mount_info *mi) return 0; } +static int set_unbindable(struct mount_info *mi) +{ + if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { + pr_perror("Failed setting unbindable flag on %d", mi->mnt_id); + return -1; + } + + return 0; +} + static int do_mount_one(struct mount_info *mi) { int ret; @@ -2480,39 +2731,62 @@ static int do_mount_one(struct mount_info *mi) return 0; if (!can_mount_now(mi)) { - pr_debug("Postpone slave %s\n", mi->mountpoint); + pr_debug("Postpone mount %s(%d)\n", mi->ns_mountpoint, mi->mnt_id); return 1; } - if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) { - mi->parent->fd = open(mi->parent->mountpoint, O_PATH); + if ((mi->parent && mi->parent != root_yard_mp) && !strcmp(mi->parent->ns_mountpoint, mi->ns_mountpoint)) { + mi->parent->fd = open(service_mountpoint(mi->parent), O_PATH); if (mi->parent->fd < 0) { - pr_perror("Unable to open %s", mi->mountpoint); + pr_perror("Unable to open %s", service_mountpoint(mi)); return -1; } } - pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin); + pr_debug("\tMounting %s %d@%s (%d)\n", mi->fstype->name, mi->mnt_id, service_mountpoint(mi), mi->need_plugin); if (rst_mnt_is_root(mi)) { + int fd; + unsigned long flags = MS_BIND | MS_REC; + if (opts.root == NULL) { pr_err("The --root option is required to restore a mount namespace\n"); 
return -1; } /* do_mount_root() is called from populate_mnt_ns() */ - if (mount(opts.root, mi->mountpoint, NULL, MS_BIND | MS_REC, NULL)) - return -1; + if (root_ns_mask & CLONE_NEWUSER) { + fd = open(service_mountpoint(mi), O_PATH); + if (fd < 0) { + pr_perror("Unable to open %s", service_mountpoint(mi)); + return -1; + } + + if (userns_call(mount_root, 0, &flags, sizeof(flags), fd)) { + pr_err("Unable to mount %s\n", service_mountpoint(mi)); + close(fd); + return -1; + } + close(fd); + } else { + if (mount(opts.root, service_mountpoint(mi), NULL, flags, NULL)) { + pr_perror("Unable to mount %s %s (id=%d)", opts.root, service_mountpoint(mi), + mi->mnt_id); + return -1; + } + } + if (do_mount_root(mi)) return -1; mi->mounted = true; ret = 0; - } else if (!mi->bind && !mi->need_plugin && !mi->external) + } else if (!mi->bind && !mi->need_plugin && !mnt_is_nodev_external(mi)) { ret = do_new_mount(mi); - else + } else { ret = do_bind_mount(mi); + } - if (ret == 0 && fetch_rt_stat(mi, mi->mountpoint)) + if (ret == 0 && fetch_rt_stat(mi, service_mountpoint(mi))) return -1; if (ret == 0 && propagate_mount(mi)) @@ -2521,8 +2795,8 @@ static int do_mount_one(struct mount_info *mi) if (mi->fstype->code == FSTYPE__UNSUPPORTED) { struct statfs st; - if (statfs(mi->mountpoint, &st)) { - pr_perror("Unable to statfs %s", mi->mountpoint); + if (statfs(service_mountpoint(mi), &st)) { + pr_perror("Unable to statfs %s", service_mountpoint(mi)); return -1; } if (st.f_type == BTRFS_SUPER_MAGIC) @@ -2537,17 +2811,17 @@ static int do_umount_one(struct mount_info *mi) if (!mi->parent) return 0; - if (mount("none", mi->parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL)) { - pr_perror("Can't mark %s as private", mi->parent->mountpoint); + if (mount("none", service_mountpoint(mi->parent), "none", MS_REC | MS_PRIVATE, NULL)) { + pr_perror("Can't mark %s as private", service_mountpoint(mi->parent)); return -1; } - if (umount(mi->mountpoint)) { - pr_perror("Can't umount at %s", 
mi->mountpoint); + if (umount(service_mountpoint(mi))) { + pr_perror("Can't umount at %s", service_mountpoint(mi)); return -1; } - pr_info("Umounted at %s\n", mi->mountpoint); + pr_info("Umounted at %s\n", service_mountpoint(mi)); return 0; } @@ -2560,15 +2834,11 @@ static int do_umount_one(struct mount_info *mi) * roots_yard where it will be restored. The remapped mount will be * moved to the right places after restoring all mounts. */ - -static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs); -static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len); - static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *mi; /* child is remapped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; @@ -2633,7 +2903,7 @@ static int find_remap_mounts(struct mount_info *root) } /* Move remapped mounts to places where they have to be */ -static int fixup_remap_mounts() +static int fixup_remap_mounts(void) { struct mnt_remap_entry *r; @@ -2662,7 +2932,7 @@ static int fixup_remap_mounts() return 0; } -static int cr_pivot_root(char *root) +int cr_pivot_root(char *root) { char tmp_dir_tmpl[] = "crtools-put-root.XXXXXX"; bool tmp_dir = false; @@ -2670,7 +2940,7 @@ static int cr_pivot_root(char *root) int exit_code = -1; struct stat st; - pr_info("Move the root to %s\n", root ? 
: "."); + pr_info("Move the root to %s\n", root ?: "."); if (root) { if (chdir(root)) { @@ -2703,7 +2973,7 @@ static int cr_pivot_root(char *root) goto err_tmpfs; } - if (mount("none", put_root, "none", MS_REC|MS_SLAVE, NULL)) { + if (mount("none", put_root, "none", MS_REC | MS_SLAVE, NULL)) { pr_perror("Can't remount root with MS_PRIVATE"); return -1; } @@ -2730,7 +3000,7 @@ err_root: return exit_code; } -struct mount_info *mnt_entry_alloc() +struct mount_info *mnt_entry_alloc(bool rst) { struct mount_info *new; @@ -2741,16 +3011,30 @@ struct mount_info *mnt_entry_alloc() new = xzalloc(sizeof(struct mount_info)); if (new) { + if (rst) { + new->rmi = shmalloc(sizeof(struct rst_mount_info)); + if (!new->rmi) { + xfree(new); + return NULL; + } + memset(new->rmi, 0, sizeof(struct rst_mount_info)); + } + new->mp_fd_id = -1; + new->mnt_fd_id = -1; + new->is_dir = -1; new->fd = -1; new->is_overmounted = -1; INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->siblings); INIT_LIST_HEAD(&new->mnt_slave_list); + INIT_LIST_HEAD(&new->mnt_ext_slave); INIT_LIST_HEAD(&new->mnt_share); INIT_LIST_HEAD(&new->mnt_bind); INIT_LIST_HEAD(&new->mnt_propagate); INIT_LIST_HEAD(&new->mnt_notprop); + INIT_LIST_HEAD(&new->mnt_unbindable); INIT_LIST_HEAD(&new->postpone); + INIT_LIST_HEAD(&new->deleted_list); } return new; } @@ -2760,6 +3044,7 @@ void mnt_entry_free(struct mount_info *mi) if (mi) { xfree(mi->root); xfree(mi->mountpoint); + xfree(mi->plain_mountpoint); xfree(mi->source); xfree(mi->options); xfree(mi->fsname); @@ -2771,7 +3056,7 @@ void mnt_entry_free(struct mount_info *mi) * Helper for getting a path to where the namespace's root * is re-constructed. 
*/ -static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs) +int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs) { return snprintf(buf, bs, "%s/%d-%010d", mnt_roots, ns->id, remap_id); } @@ -2810,7 +3095,7 @@ static int get_mp_root(MntEntry *me, struct mount_info *mi) me->ext_key = me->root; /* * Putting the id of external mount which is provided by user, - * to ->root can confuse mnt_is_external and other functions + * to ->root can confuse mnt_is_external_bind and other functions * which expect to see the path in the file system to the root * of these mount (mounts_equal, mnt_build_ids_tree, * find_fsroot_mount_for, find_best_external_match, etc.) @@ -2830,10 +3115,11 @@ static int get_mp_root(MntEntry *me, struct mount_info *mi) * from the command line and put into root's place */ - ext = ext_mount_lookup(me->ext_key); - if (!ext) { + if (!strcmp(me->ext_key, AUTODETECTED_MOUNT)) { if (!opts.autodetect_ext_mounts) { - pr_err("No mapping for %s mountpoint\n", me->mountpoint); + pr_err("Mount %d:%s is autodetected external mount. " + "Try \"--ext-mount-map auto\" to allow them.\n", + mi->mnt_id, mi->ns_mountpoint); return -1; } @@ -2847,12 +3133,19 @@ static int get_mp_root(MntEntry *me, struct mount_info *mi) */ ext = mi->source; + } else if (!strcmp(me->ext_key, EXTERNAL_DEV_MOUNT)) { + ext = EXTERNAL_DEV_MOUNT; + } else { + ext = ext_mount_lookup(me->ext_key); + if (!ext) { + pr_err("No mapping for %d:%s mountpoint\n", mi->mnt_id, mi->ns_mountpoint); + return -1; + } } mi->external = ext; out: - pr_debug("\t\tWill mount %d from %s%s\n", - mi->mnt_id, ext ? : mi->root, ext ? " (E)" : ""); + pr_debug("\t\tWill mount %d from %s%s\n", mi->mnt_id, ext ?: mi->root, ext ? 
" (E)" : ""); return 0; } @@ -2860,7 +3153,7 @@ static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root { int len; - len = strlen(mountpoint) + root_len + 1; + len = strlen(mountpoint) + root_len + 1; mi->mountpoint = xmalloc(len); if (!mi->mountpoint) return -1; @@ -2877,11 +3170,75 @@ static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root mi->ns_mountpoint = mi->mountpoint + root_len; - pr_debug("\t\tWill mount %d @ %s\n", mi->mnt_id, mi->mountpoint); + mi->plain_mountpoint = get_plain_mountpoint(mi->mnt_id, NULL); + if (!mi->plain_mountpoint) + return -1; + + pr_debug("\t\tWill mount %d @ %s %s\n", mi->mnt_id, service_mountpoint(mi), mi->ns_mountpoint); return 0; } -static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) +static char *mount_update_lsm_context(char *mount_opts) +{ + cleanup_free char *before_context = NULL; + char *other_options; + char *context_start; + char *context_end; + char *old_context; + char *new_options; + int ret; + + old_context = strstr(mount_opts, CONTEXT_OPT); + + if (!old_context || !opts.lsm_mount_context) + return xstrdup(mount_opts); + + /* + * If the user specified a different mount_context we need + * to replace the existing mount context in the mount + * options with the one specified by the user. + * + * The original mount options will be something like: + * + * context="system_u:object_r:container_file_t:s0:c82,c137",inode64 + * + * and it needs to be replaced with opts.lsm_mount_context. + * + * The content between 'context=' and ',inode64' will be replaced + * with opts.lsm_mount_context in quotes. 
+ */ + + /* Skip 'context=' */ + context_start = old_context + strlen(CONTEXT_OPT); + if (context_start[0] == '"' && context_start + 1 < mount_opts + strlen(mount_opts)) { + /* Skip quotes */ + context_end = strchr(context_start + 1, '"'); + if (!context_end) { + pr_err("Failed parsing mount option 'context'\n"); + return NULL; + } + } else { + context_end = context_start; + } + + /* Find next after optionally skipping quotes. */ + other_options = strchr(context_end, ','); + + before_context = xstrdup(mount_opts); + if (unlikely(!before_context)) + return NULL; + before_context[context_start - mount_opts] = 0; + + ret = asprintf(&new_options, "%s\"%s\"%s", before_context, opts.lsm_mount_context, + other_options ? other_options : ""); + if (unlikely(ret < 0)) + return NULL; + pr_debug("\t\tChanged mount 'context=' to %s\n", new_options); + + return new_options; +} + +static int collect_mnt_from_image(struct mount_info **head, struct mount_info **tail, struct ns_id *nsid) { MntEntry *me = NULL; int ret, root_len = 1; @@ -2894,8 +3251,7 @@ static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) root_len = print_ns_root(nsid, 0, root, sizeof(root)); - pr_debug("Reading mountpoint images (id %d pid %d)\n", - nsid->id, (int)nsid->ns_pid); + pr_debug("Reading mountpoint images (id %d pid %d)\n", nsid->id, (int)nsid->ns_pid); while (1) { struct mount_info *pm; @@ -2904,24 +3260,23 @@ static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) if (ret <= 0) break; - pm = mnt_entry_alloc(); + pm = mnt_entry_alloc(true); if (!pm) goto err; pm->nsid = nsid; - pm->next = *pms; - *pms = pm; + mntinfo_add_list_before(head, pm); + if (!*tail) + *tail = pm; - pm->mnt_id = me->mnt_id; - pm->parent_mnt_id = me->parent_mnt_id; - pm->s_dev = me->root_dev; - pm->flags = me->flags; - pm->sb_flags = me->sb_flags; + pm->mnt_id = me->mnt_id; + pm->parent_mnt_id = me->parent_mnt_id; + pm->s_dev = me->root_dev; + pm->flags = me->flags; + pm->sb_flags 
= me->sb_flags; if (!me->has_sb_flags) { - const unsigned int mflags = MS_SHARED | MS_PRIVATE | - MS_SLAVE | MS_UNBINDABLE | - MS_NOSUID | MS_NODEV | MS_NOEXEC | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME; + const unsigned int mflags = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE | MS_NOSUID | + MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME | MS_RELATIME; /* * In old images mnt and sb flags are saved together. @@ -2931,11 +3286,11 @@ static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) pm->sb_flags = pm->flags & ~mflags; pm->flags = pm->flags & mflags; } - pm->shared_id = me->shared_id; - pm->master_id = me->master_id; - pm->need_plugin = me->with_plugin; - pm->deleted = me->deleted; - pm->is_ns_root = is_root(me->mountpoint); + pm->shared_id = me->shared_id; + pm->master_id = me->master_id; + pm->need_plugin = me->with_plugin; + pm->deleted = me->deleted; + pm->is_ns_root = is_root(me->mountpoint); if (me->has_internal_sharing) pm->internal_sharing = me->internal_sharing; @@ -2943,8 +3298,8 @@ static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) if (!pm->source) goto err; - pm->options = xstrdup(me->options); - if (!pm->options) + pm->options = mount_update_lsm_context(me->options); + if (unlikely(!pm->options)) goto err; if (me->fstype != FSTYPE__AUTO && me->fsname) { @@ -2957,7 +3312,6 @@ static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) if (pm->fstype->collect && (pm->fstype->collect(pm) < 0)) goto err; - if (me->fsname) { pm->fsname = xstrdup(me->fsname); if (!pm->fsname) @@ -2970,7 +3324,9 @@ static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) if (get_mp_mountpoint(me->mountpoint, pm, root, root_len)) goto err; - pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint); + pr_debug("\t" + "Read %d mp @ %s\n", + pm->mnt_id, pm->ns_mountpoint); } if (me) @@ -2984,20 +3340,82 @@ err: return -1; } +static int merge_mount_trees(void) +{ + struct 
ns_id *nsid; + + root_yard_mp = mnt_entry_alloc(true); + if (!root_yard_mp) + return -1; + + root_yard_mp->mountpoint = mnt_roots; + root_yard_mp->plain_mountpoint = xstrdup(mnt_roots); + if (!root_yard_mp->plain_mountpoint) + return -1; + root_yard_mp->is_dir = true; + root_yard_mp->mounted = true; + root_yard_mp->mnt_bind_is_populated = true; + root_yard_mp->is_overmounted = false; + root_yard_mp->mnt_id = HELPER_MNT_ID; + + /* Merge mount trees together under root_yard_mp */ + for (nsid = ns_ids; nsid; nsid = nsid->next) { + struct mount_info *root; + + if (nsid->nd != &mnt_ns_desc) + continue; + + root = nsid->mnt.mntinfo_tree; + + pr_debug("Mountpoint %d (@%s) moved to the root yard\n", root->mnt_id, root->ns_mountpoint); + root->parent = root_yard_mp; + list_add(&root->siblings, &root_yard_mp->children); + } + + return 0; +} + int read_mnt_ns_img(void) { struct mount_info *pms = NULL; struct ns_id *nsid; + if (!(root_ns_mask & CLONE_NEWNS)) { + mntinfo = NULL; + return 0; + } + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + struct mount_info *head = NULL, *tail = NULL; + if (nsid->nd != &mnt_ns_desc) continue; - if (collect_mnt_from_image(&pms, nsid)) + if (collect_mnt_from_image(&head, &tail, nsid)) return -1; + + nsid->mnt.mntinfo_tree = mnt_build_tree(head); + if (!nsid->mnt.mntinfo_tree) + return -1; + + /* mntns root mounts are always directories */ + nsid->mnt.mntinfo_tree->is_dir = true; + + tail->next = pms; + pms = head; } mntinfo = pms; + + search_bindmounts(); + prepare_is_overmounted(); + + if (!opts.mntns_compat_mode && resolve_shared_mounts_v2()) + return -1; + + if (merge_mount_trees()) + return -1; + return 0; } @@ -3052,19 +3470,19 @@ int restore_task_mnt_ns(struct pstree_item *current) return 0; if (current->ids && current->ids->has_mnt_ns_id) { + struct pstree_item *parent = current->parent; unsigned int id = current->ids->mnt_ns_id; struct ns_id *nsid; - /* - * Regardless of the namespace a task wants to - * live in, by that 
point they all will live in - * root's one (see prepare_pstree_kobj_ids() + - * get_clone_mask()). So if the current task's - * target namespace is the root's one -- it's - * already there, otherwise it will have to do - * setns(). + /* Zombies and helpers can have ids == 0 so we skip them */ + while (parent && !parent->ids) + parent = parent->parent; + + /** + * Our parent had restored the mount namespace before forking + * us and if we have the same mntns we just stay there. */ - if (current->parent && id == current->parent->ids->mnt_ns_id) + if (parent && id == parent->ids->mnt_ns_id) return 0; nsid = lookup_ns_by_id(id, &mnt_ns_desc); @@ -3099,7 +3517,7 @@ void fini_restore_mntns(void) /* * All nested mount namespaces are restore as sub-trees of the root namespace. */ -static int populate_roots_yard(void) +static int populate_roots_yard(struct mount_info *cr_time) { struct mnt_remap_entry *r; char path[PATH_MAX]; @@ -3124,75 +3542,64 @@ static int populate_roots_yard(void) * contains mounts which has to be restored separately */ list_for_each_entry(r, &mnt_remap_list, node) { - if (mkdirpat(AT_FDCWD, r->mi->mountpoint, 0755)) { - pr_perror("Unable to create %s", r->mi->mountpoint); + if (mkdirpat(AT_FDCWD, service_mountpoint(r->mi), 0755)) { + pr_perror("Unable to create %s", service_mountpoint(r->mi)); return -1; } } + if (cr_time && mkdirpat(AT_FDCWD, service_mountpoint(cr_time), 0755)) { + pr_perror("Unable to create %s", service_mountpoint(cr_time)); + return -1; + } + return 0; } static int populate_mnt_ns(void) { - struct mount_info *pms; - struct ns_id *nsid; + struct mount_info *cr_time = NULL; int ret; - if (mnt_roots) { - /* mnt_roots is a tmpfs mount and it's private */ - root_yard_mp = mnt_entry_alloc(); - if (!root_yard_mp) - return -1; - - root_yard_mp->mountpoint = mnt_roots; - root_yard_mp->mounted = true; - } - - pms = mnt_build_tree(mntinfo, root_yard_mp); - if (!pms) - return -1; - #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if 
(!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* Add to mount tree. Generic code will mount it later */ - ret = add_cr_time_mount(pms, "binfmt_misc", BINFMT_MISC_HOME, 0); - if (ret) + cr_time = add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true); + if (!cr_time) return -1; } #endif - if (resolve_shared_mounts(mntinfo, pms->master_id)) + if (resolve_shared_mounts(mntinfo)) return -1; - for (nsid = ns_ids; nsid; nsid = nsid->next) { - if (nsid->nd != &mnt_ns_desc) - continue; - - /* - * Make trees of all namespaces look the - * same, so that manual paths resolution - * works on them. - */ - nsid->mnt.mntinfo_tree = pms; - } - if (validate_mounts(mntinfo, false)) return -1; - mnt_tree_for_each(pms, set_is_overmounted); - - if (find_remap_mounts(pms)) + if (find_remap_mounts(root_yard_mp)) return -1; - if (populate_roots_yard()) + if (populate_roots_yard(cr_time)) return -1; if (mount_clean_path()) return -1; - ret = mnt_tree_for_each(pms, do_mount_one); - mnt_tree_for_each(pms, do_close_one); + ret = mnt_tree_for_each(root_yard_mp, do_mount_one); + mnt_tree_for_each(root_yard_mp, do_close_one); + + if (ret == 0) { + struct mount_info *mi; + + /* + * Mounts in delayed_unbindable list were temporary mounted as + * private instead of unbindable so that do_mount_one can bind + * from them, now we are ready to fix it. 
+ */ + list_for_each_entry(mi, &delayed_unbindable, mnt_unbindable) + if (set_unbindable(mi)) + return -1; + } if (ret == 0 && fixup_remap_mounts()) return -1; @@ -3209,7 +3616,7 @@ static int __depopulate_roots_yard(void) if (mnt_roots == NULL) return 0; - if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL)) { + if (mount("none", mnt_roots, "none", MS_REC | MS_PRIVATE, NULL)) { pr_perror("Can't remount root with MS_PRIVATE"); ret = 1; } @@ -3329,6 +3736,9 @@ int prepare_mnt_ns(void) free_mntinfo(old); } + if (!opts.mntns_compat_mode) + return prepare_mnt_ns_v2(); + ret = populate_mnt_ns(); if (ret) return -1; @@ -3405,6 +3815,7 @@ int prepare_mnt_ns(void) return ret; err: if (rst >= 0) + /* coverity[check_return] */ restore_ns(rst, &mnt_ns_desc); return -1; } @@ -3423,7 +3834,6 @@ static int mntns_set_root_fd(pid_t pid, int fd) int __mntns_get_root_fd(pid_t pid) { - int fd, pfd; int ret; char path[PATH_MAX + 1]; @@ -3482,6 +3892,10 @@ int mntns_get_root_fd(struct ns_id *mntns) { if (!(root_ns_mask & CLONE_NEWNS)) return __mntns_get_root_fd(0); + + if (!mntns) + return -1; + /* * All namespaces are restored from the root task and during the * CR_STATE_FORKING stage the root task has two file descriptors for @@ -3545,7 +3959,6 @@ int mntns_get_root_by_mnt_id(int mnt_id) struct collect_mntns_arg { bool need_to_validate; bool for_dump; - int root_master_id; }; static int collect_mntns(struct ns_id *ns, void *__arg) @@ -3562,9 +3975,6 @@ static int collect_mntns(struct ns_id *ns, void *__arg) mntinfo_add_list(pms); - if (arg->need_to_validate && ns->id == root_item->ids->mnt_ns_id) - arg->root_master_id = ns->mnt.mntinfo_tree->master_id; - return 0; } @@ -3580,6 +3990,8 @@ int collect_mnt_namespaces(bool for_dump) if (ret) goto err; + search_bindmounts(); + #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (for_dump && !opts.has_binfmt_misc) { unsigned int s_dev = 0; @@ -3591,18 +4003,11 @@ int collect_mnt_namespaces(bool for_dump) } if (ns) { - ret = 
mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, - "binfmt_misc"); - if (ret == -EPERM) - pr_info("Can't mount binfmt_misc: EPERM. Running in user_ns?\n"); - else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) { - pr_err("Can't mount binfmt_misc: %d %s\n", ret, strerror(-ret)); + ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, "binfmt_misc"); + if (ret == -1) { goto err; - } else if (ret == 0) { - ret = -1; - goto err; - } else if (ret > 0 && add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", - BINFMT_MISC_HOME, s_dev) < 0) { + } else if (ret == 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, + s_dev, false)) { ret = -1; goto err; } @@ -3617,7 +4022,7 @@ int collect_mnt_namespaces(bool for_dump) if (arg.need_to_validate) { ret = -1; - if (resolve_shared_mounts(mntinfo, arg.root_master_id)) + if (resolve_shared_mounts(mntinfo)) goto err; if (validate_mounts(mntinfo, true)) goto err; @@ -3641,7 +4046,7 @@ int dump_mnt_namespaces(void) if ((nsid->type == NS_OTHER) && check_mnt_id()) { pr_err("Nested mount namespaces are not supported " - "without mnt_id in fdinfo\n"); + "without mnt_id in fdinfo\n"); return -1; } @@ -3655,21 +4060,23 @@ int dump_mnt_namespaces(void) void clean_cr_time_mounts(void) { struct mount_info *mi; - int mnt_fd, ret; + int ns_old, ret; for (mi = mntinfo; mi; mi = mi->next) { - if (mi->mnt_id != CRTIME_MNT_ID) + int cwd_fd; + + if (mi->mnt_id != HELPER_MNT_ID) continue; - ret = switch_ns(mi->nsid->ns_pid, &mnt_ns_desc, &mnt_fd); + ret = switch_mnt_ns(mi->nsid->ns_pid, &ns_old, &cwd_fd); if (ret) { pr_err("Can't switch to pid's %u mnt_ns\n", mi->nsid->ns_pid); continue; } - if (umount(mi->mountpoint) < 0) - pr_perror("Can't umount forced mount %s", mi->mountpoint); + if (umount(mi->ns_mountpoint) < 0) + pr_perror("Can't umount forced mount %s", mi->ns_mountpoint); - if (restore_ns(mnt_fd, &mnt_ns_desc)) { + if (restore_mnt_ns(ns_old, &cwd_fd)) { 
pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n"); return; } @@ -3680,27 +4087,37 @@ struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt"); static int call_helper_process(int (*call)(void *), void *arg) { - int pid, status; + int pid, status, exit_code = -1; - pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES | - CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg); + /* + * Running new helper process on the restore must be + * done under last_pid mutex: other tasks may be restoring + * threads and the PID we need there might be occupied by + * this clone() call. + */ + lock_last_pid(); + + pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg); if (pid == -1) { pr_perror("Can't clone helper process"); - return -1; + goto out; } errno = 0; if (waitpid(pid, &status, __WALL) != pid) { pr_perror("Unable to wait %d", pid); - return -1; + goto out; } if (status) { pr_err("Bad child exit status: %d\n", status); - return -1; + goto out; } - return 0; + exit_code = 0; +out: + unlock_last_pid(); + return exit_code; } static int ns_remount_writable(void *arg) @@ -3710,11 +4127,11 @@ static int ns_remount_writable(void *arg) if (do_restore_task_mnt_ns(ns)) return 1; - pr_debug("Switched to mntns %u:%u/n", ns->id, ns->kid); + pr_debug("Switched to mntns %u:%u\n", ns->id, ns->kid); - if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | - (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { - pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint); + if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), + NULL) == -1) { + pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->ns_mountpoint); return 1; } return 0; @@ -3731,7 +4148,10 @@ int try_remount_writable(struct mount_info *mi, bool ns) if (!ns) remounted = REMOUNTED_RW_SERVICE; - if (mi->flags & MS_RDONLY && !(mi->remounted_rw & remounted)) { + /* All 
mounts in mntinfo list should have it on restore */ + BUG_ON(mi->rmi == NULL); + + if (mi->flags & MS_RDONLY && !(mi->rmi->remounted_rw & remounted)) { if (mnt_is_overmounted(mi)) { pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id); return -1; @@ -3743,18 +4163,18 @@ int try_remount_writable(struct mount_info *mi, bool ns) return -1; } - pr_info("Remount %d:%s writable\n", mi->mnt_id, mi->mountpoint); + pr_info("Remount %d:%s writable\n", mi->mnt_id, service_mountpoint(mi)); if (!ns) { - if (mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND | - (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { - pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint); + if (mount(NULL, service_mountpoint(mi), NULL, + MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { + pr_perror("Failed to remount %d:%s writable", mi->mnt_id, service_mountpoint(mi)); return -1; } } else { if (call_helper_process(ns_remount_writable, mi)) return -1; } - mi->remounted_rw |= remounted; + mi->rmi->remounted_rw |= remounted; } return 0; @@ -3769,7 +4189,7 @@ static int __remount_readonly_mounts(struct ns_id *ns) if (ns && mi->nsid != ns) continue; - if (!(mi->remounted_rw && REMOUNTED_RW)) + if (!(mi->rmi->remounted_rw & REMOUNTED_RW)) continue; /* @@ -3781,15 +4201,12 @@ static int __remount_readonly_mounts(struct ns_id *ns) if (do_restore_task_mnt_ns(ns)) return -1; mntns_set = true; - pr_debug("Switched to mntns %u:%u/n", ns->id, ns->kid); + pr_debug("Switched to mntns %u:%u\n", ns->id, ns->kid); } - pr_info("Remount %d:%s back to readonly\n", mi->mnt_id, mi->mountpoint); - if (mount(NULL, mi->ns_mountpoint, NULL, - MS_REMOUNT | MS_BIND | (mi->flags & ~MS_PROPAGATE), - NULL)) { - pr_perror("Failed to restore %d:%s mount flags %x", - mi->mnt_id, mi->mountpoint, mi->flags); + pr_info("Remount %d:%s back to readonly\n", mi->mnt_id, mi->ns_mountpoint); + if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | 
(mi->flags & ~MS_PROPAGATE), NULL)) { + pr_perror("Failed to restore %d:%s mount flags %x", mi->mnt_id, mi->ns_mountpoint, mi->flags); return -1; } } @@ -3820,3 +4237,18 @@ int remount_readonly_mounts(void) */ return call_helper_process(ns_remount_readonly_mounts, NULL); } + +static struct mount_info *mnt_subtree_next(struct mount_info *mi, struct mount_info *root) +{ + if (!list_empty(&mi->children)) + return list_entry(mi->children.next, struct mount_info, siblings); + + while (mi->parent && mi != root) { + if (mi->siblings.next == &mi->parent->children) + mi = mi->parent; + else + return list_entry(mi->siblings.next, struct mount_info, siblings); + } + + return NULL; +} diff --git a/criu/namespaces.c b/criu/namespaces.c index a228737ee..0c9b16a87 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -20,30 +19,31 @@ #include "imgset.h" #include "uts_ns.h" #include "ipc_ns.h" +#include "timens.h" #include "mount.h" #include "pstree.h" #include "namespaces.h" #include "net.h" #include "cgroup.h" #include "fdstore.h" +#include "kerndat.h" +#include "util-caps.h" #include "protobuf.h" #include "util.h" #include "images/ns.pb-c.h" #include "images/userns.pb-c.h" +#include "images/pidns.pb-c.h" static struct ns_desc *ns_desc_array[] = { - &net_ns_desc, - &uts_ns_desc, - &ipc_ns_desc, - &pid_ns_desc, - &user_ns_desc, - &mnt_ns_desc, - &cgroup_ns_desc, + &net_ns_desc, &uts_ns_desc, &ipc_ns_desc, &pid_ns_desc, + &user_ns_desc, &mnt_ns_desc, &time_ns_desc, &cgroup_ns_desc, }; static unsigned int join_ns_flags; +static int collect_pid_namespaces(bool); + int check_namespace_opts(void) { errno = EINVAL; @@ -73,9 +73,7 @@ static int check_int_str(char *str) errno = EINVAL; val = strtol(str, &endptr, 10); - if ((errno == ERANGE) || (endptr == str) - || (*endptr != '\0') - || (val < 0) || (val > 65535)) { + if ((errno == ERANGE) || (endptr == str) || (*endptr != '\0') || (val < 0) || (val > 
65535)) { str = NULL; return -1; } @@ -96,8 +94,7 @@ static int check_ns_file(char *ns_file) } proc_dir = open_pid_proc(pid); if (proc_dir < 0) { - pr_err("Invalid join_ns pid: /proc/%s not found\n", - ns_file); + pr_err("Invalid join_ns pid: /proc/%s not found\n", ns_file); return -1; } return 0; @@ -150,13 +147,21 @@ int join_ns_add(const char *type, char *ns_file, char *extra_opts) if (!jn) return -1; - jn->ns_file = ns_file; + jn->ns_file = xstrdup(ns_file); + if (!jn->ns_file) { + xfree(jn); + return -1; + } + if (!strncmp(type, "net", 4)) { jn->nd = &net_ns_desc; join_ns_flags |= CLONE_NEWNET; } else if (!strncmp(type, "uts", 4)) { jn->nd = &uts_ns_desc; join_ns_flags |= CLONE_NEWUTS; + } else if (!strncmp(type, "time", 5)) { + jn->nd = &time_ns_desc; + join_ns_flags |= CLONE_NEWTIME; } else if (!strncmp(type, "ipc", 4)) { jn->nd = &ipc_ns_desc; join_ns_flags |= CLONE_NEWIPC; @@ -182,6 +187,7 @@ int join_ns_add(const char *type, char *ns_file, char *extra_opts) pr_info("Added %s:%s join namespace\n", type, ns_file); return 0; err: + xfree(jn->ns_file); xfree(jn); return -1; } @@ -239,11 +245,11 @@ int switch_ns(int pid, struct ns_desc *nd, int *rst) int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst) { - int ret = -1; + int ret = -1, old_ns = -1; if (rst) { - *rst = open_proc(PROC_SELF, "ns/%s", nd->str); - if (*rst < 0) + old_ns = open_proc(PROC_SELF, "ns/%s", nd->str); + if (old_ns < 0) goto err_ns; } @@ -253,11 +259,12 @@ int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst) goto err_set; } + if (rst) + *rst = old_ns; return 0; err_set: - if (rst) - close(*rst); + close_safe(&old_ns); err_ns: return -1; } @@ -275,6 +282,48 @@ int restore_ns(int rst, struct ns_desc *nd) return ret; } +int switch_mnt_ns(int pid, int *rst, int *cwd_fd) +{ + int fd; + + if (!cwd_fd) + return switch_ns(pid, &mnt_ns_desc, rst); + + fd = open(".", O_PATH); + if (fd < 0) { + pr_perror("unable to open current directory"); + return -1; + } + + if (switch_ns(pid, 
&mnt_ns_desc, rst)) { + close(fd); + return -1; + } + + *cwd_fd = fd; + return 0; +} + +int restore_mnt_ns(int rst, int *cwd_fd) +{ + int exit_code = -1; + + if (restore_ns(rst, &mnt_ns_desc)) + goto err_restore; + + if (cwd_fd && fchdir(*cwd_fd)) { + pr_perror("Unable to restore current directory"); + goto err_restore; + } + + exit_code = 0; +err_restore: + if (cwd_fd) + close_safe(cwd_fd); + + return exit_code; +} + struct ns_id *ns_ids = NULL; static unsigned int ns_next_id = 1; unsigned long root_ns_mask = 0; @@ -290,8 +339,7 @@ static void nsid_add(struct ns_id *ns, struct ns_desc *nd, unsigned int id, pid_ pr_info("Add %s ns %d pid %d\n", nd->str, ns->id, ns->ns_pid); } -struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, - struct ns_desc *nd, enum ns_type type) +static struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct ns_desc *nd, enum ns_type type) { struct ns_id *nsid; @@ -323,8 +371,7 @@ int rst_add_ns_id(unsigned int id, struct pstree_item *i, struct ns_desc *nd) return 0; } - nsid = rst_new_ns_id(id, pid, nd, - i == root_item ? NS_ROOT : NS_OTHER); + nsid = rst_new_ns_id(id, pid, nd, i == root_item ? 
NS_ROOT : NS_OTHER); if (nsid == NULL) return -1; @@ -336,7 +383,7 @@ struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd) struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) - if (nsid->kid == kid && nsid->nd == nd) + if (nsid->kid == kid && nsid->nd->cflag == nd->cflag) return nsid; return NULL; @@ -393,8 +440,7 @@ int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void return ret; } -static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd, - struct ns_id **ns_ret) +static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd, struct ns_id **ns_ret) { struct ns_id *nsid; enum ns_type type; @@ -411,8 +457,7 @@ static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd root_ns_mask |= nd->cflag; type = NS_ROOT; } else if (nd->cflag & ~CLONE_SUBNS) { - pr_err("Can't dump nested %s namespace for %d\n", - nd->str, pid); + pr_err("Can't dump nested %s namespace for %d\n", nd->str, pid); return 0; } } else @@ -442,7 +487,7 @@ static unsigned int __get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean { int proc_dir; unsigned int kid; - char ns_path[10]; + char ns_path[32]; struct stat st; proc_dir = open_pid_proc(pid); @@ -488,10 +533,10 @@ int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p) return -1; } - nfe.id = id; - nfe.ns_id = nsid->id; - nfe.ns_cflag = link->ns_d->cflag; - nfe.flags = p->flags; + nfe.id = id; + nfe.ns_id = nsid->id; + nfe.ns_cflag = link->ns_d->cflag; + nfe.flags = p->flags; fe.type = FD_TYPES__NS; fe.id = nfe.id; @@ -502,13 +547,13 @@ int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p) } const struct fdtype_ops nsfile_dump_ops = { - .type = FD_TYPES__NS, - .dump = dump_one_ns_file, + .type = FD_TYPES__NS, + .dump = dump_one_ns_file, }; struct ns_file_info { - struct file_desc d; - NsFileEntry *nfe; + struct file_desc d; + NsFileEntry *nfe; }; static int open_ns_fd(struct file_desc *d, int 
*new_fd) @@ -529,7 +574,10 @@ static int open_ns_fd(struct file_desc *d, int *new_fd) else break; fd = fdstore_get(nsfd_id); - goto check_open; + if (fd < 0) { + return -1; + } + goto out; } /* @@ -568,6 +616,10 @@ static int open_ns_fd(struct file_desc *d, int *new_fd) item = t; nd = &cgroup_ns_desc; break; + } else if (ids->time_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &time_ns_desc; + break; } } @@ -585,12 +637,11 @@ static int open_ns_fd(struct file_desc *d, int *new_fd) path[sizeof(path) - 1] = '\0'; fd = open(path, nfi->nfe->flags); -check_open: if (fd < 0) { pr_perror("Can't open file %s on restore", path); return fd; } - +out: *new_fd = fd; return 0; } @@ -671,6 +722,25 @@ int dump_task_ns_ids(struct pstree_item *item) return -1; } + ids->time_ns_id = get_ns_id(pid, &time_ns_desc, &ids->has_time_ns_id); + if (!ids->time_ns_id) { + pr_err("Can't make timens id\n"); + return -1; + } + if (ids->has_time_ns_id) { + unsigned int id; + protobuf_c_boolean supported = false; + id = get_ns_id(pid, &time_for_children_ns_desc, &supported); + if (!supported || !id) { + pr_err("Can't make timens id\n"); + return -1; + } + if (id != ids->time_ns_id) { + pr_err("Can't dump nested time namespace for %d\n", pid); + return -1; + } + } + ids->has_mnt_ns_id = true; ids->mnt_ns_id = get_ns_id(pid, &mnt_ns_desc, NULL); if (!ids->mnt_ns_id) { @@ -705,8 +775,7 @@ static unsigned int userns_id(unsigned int id, UidGidExtent **map, int n) return id; for (i = 0; i < n; i++) { - if (map[i]->lower_first <= id && - map[i]->lower_first + map[i]->count > id) + if (map[i]->lower_first <= id && map[i]->lower_first + map[i]->count > id) return map[i]->first + (id - map[i]->lower_first); } @@ -721,8 +790,7 @@ static unsigned int host_id(unsigned int id, UidGidExtent **map, int n) return id; for (i = 0; i < n; i++) { - if (map[i]->first <= id && - map[i]->first + map[i]->count > id) + if (map[i]->first <= id && map[i]->first + map[i]->count > id) return map[i]->lower_first + (id - 
map[i]->first); } @@ -780,8 +848,7 @@ static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts) ext = &extents[len]; uid_gid_extent__init(ext); - ret = fscanf(f, "%d %d %d", &ext->first, - &ext->lower_first, &ext->count); + ret = fscanf(f, "%d %d %d", &ext->first, &ext->lower_first, &ext->count); if (ret != 3) { if (ferror(f)) { pr_perror("Unable to parse extents: %d", ret); @@ -821,7 +888,7 @@ int collect_user_ns(struct ns_id *ns, void *oarg) { /* * User namespace is dumped before files to get uid and gid - * mappings, which are used for convirting local id-s to + * mappings, which are used for converting local id-s to * userns id-s (userns_uid(), userns_gid()) */ if (dump_user_ns(root_item->pid->real, root_item->ids->user_ns_id)) @@ -908,17 +975,15 @@ static int check_user_ns(int pid) if (switch_ns(pid, &user_ns_desc, NULL)) exit(1); - if ((root_ns_mask & CLONE_NEWNET) && - switch_ns(pid, &net_ns_desc, NULL)) + if ((root_ns_mask & CLONE_NEWNET) && switch_ns(pid, &net_ns_desc, NULL)) exit(1); - if ((root_ns_mask & CLONE_NEWUTS) && - switch_ns(pid, &uts_ns_desc, NULL)) + if ((root_ns_mask & CLONE_NEWUTS) && switch_ns(pid, &uts_ns_desc, NULL)) exit(1); - if ((root_ns_mask & CLONE_NEWIPC) && - switch_ns(pid, &ipc_ns_desc, NULL)) + if ((root_ns_mask & CLONE_NEWTIME) && switch_ns(pid, &time_ns_desc, NULL)) exit(1); - if ((root_ns_mask & CLONE_NEWNS) && - switch_ns(pid, &mnt_ns_desc, NULL)) + if ((root_ns_mask & CLONE_NEWIPC) && switch_ns(pid, &ipc_ns_desc, NULL)) + exit(1); + if ((root_ns_mask & CLONE_NEWNS) && switch_ns(pid, &mnt_ns_desc, NULL)) exit(1); exit(0); } @@ -938,18 +1003,23 @@ static int check_user_ns(int pid) int dump_user_ns(pid_t pid, int ns_id) { - int ret, exit_code = -1; UsernsEntry *e = &userns_entry; struct cr_img *img; + int ret; ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) - goto err; + /* + * The uid_map and gid_map is clean up in free_userns_maps + * later, so we don't need to clean these up in error cases. 
+ */ + return -1; + e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) - goto err; + return -1; e->n_gid_map = ret; if (check_user_ns(pid)) @@ -957,26 +1027,16 @@ int dump_user_ns(pid_t pid, int ns_id) img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) - goto err; + return -1; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) - goto err; + return -1; return 0; -err: - if (e->uid_map) { - xfree(e->uid_map[0]); - xfree(e->uid_map); - } - if (e->gid_map) { - xfree(e->gid_map[0]); - xfree(e->gid_map); - } - return exit_code; } -void free_userns_maps() +void free_userns_maps(void) { if (userns_entry.n_uid_map > 0) { xfree(userns_entry.uid_map[0]); @@ -998,18 +1058,19 @@ static int do_dump_namespaces(struct ns_id *ns) switch (ns->nd->cflag) { case CLONE_NEWUTS: - pr_info("Dump UTS namespace %d via %d\n", - ns->id, ns->ns_pid); + pr_info("Dump UTS namespace %d via %d\n", ns->id, ns->ns_pid); ret = dump_uts_ns(ns->id); break; + case CLONE_NEWTIME: + pr_info("Dump TIME namespace %d via %d\n", ns->id, ns->ns_pid); + ret = dump_time_ns(ns->id); + break; case CLONE_NEWIPC: - pr_info("Dump IPC namespace %d via %d\n", - ns->id, ns->ns_pid); + pr_info("Dump IPC namespace %d via %d\n", ns->id, ns->ns_pid); ret = dump_ipc_ns(ns->id); break; case CLONE_NEWNET: - pr_info("Dump NET namespace info %d via %d\n", - ns->id, ns->ns_pid); + pr_info("Dump NET namespace info %d via %d\n", ns->id, ns->ns_pid); ret = dump_net_ns(ns); break; default: @@ -1018,7 +1079,6 @@ static int do_dump_namespaces(struct ns_id *ns) } return ret; - } int dump_namespaces(struct pstree_item *item, unsigned int ns_flags) @@ -1041,9 +1101,22 @@ int dump_namespaces(struct pstree_item *item, unsigned int ns_flags) pr_info("Dumping %d(%d)'s namespaces\n", ns_pid->ns[0].virt, ns_pid->real); - if ((ns_flags & CLONE_NEWPID) && ns_pid->ns[0].virt != 1) { - pr_err("Can't dump a pid namespace without the process init\n"); - return -1; + if ((ns_flags & CLONE_NEWPID) 
&& ns_pid->ns[0].virt != INIT_PID) { + char *val = NULL; + + ns = lookup_ns_by_id(item->ids->pid_ns_id, &pid_ns_desc); + if (ns) { + char id[64]; + snprintf(id, sizeof(id), "pid[%u]", ns->kid); + val = external_lookup_by_key(id); + if (IS_ERR_OR_NULL(val)) + val = NULL; + } + + if (!val) { + pr_err("Can't dump a pid namespace without the process init\n"); + return -1; + } } for (ns = ns_ids; ns; ns = ns->next) { @@ -1052,15 +1125,15 @@ int dump_namespaces(struct pstree_item *item, unsigned int ns_flags) continue; switch (ns->nd->cflag) { - /* No data for pid namespaces to dump */ - case CLONE_NEWPID: - /* Dumped explicitly with dump_mnt_namespaces() */ - case CLONE_NEWNS: - /* Userns is dumped before dumping tasks */ - case CLONE_NEWUSER: - /* handled separately in cgroup dumping code */ - case CLONE_NEWCGROUP: - continue; + /* No data for pid namespaces to dump */ + case CLONE_NEWPID: + /* Dumped explicitly with dump_mnt_namespaces() */ + case CLONE_NEWNS: + /* Userns is dumped before dumping tasks */ + case CLONE_NEWUSER: + /* handled separately in cgroup dumping code */ + case CLONE_NEWCGROUP: + continue; } pid = fork(); @@ -1108,11 +1181,20 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) * We can perform only a single write (that may contain multiple * newline-delimited records) to a uid_map and a gid_map files. 
*/ - for (i = 0; i < n; i++) - off += snprintf(buf + off, sizeof(buf) - off, - "%u %u %u\n", extents[i]->first, - extents[i]->lower_first, - extents[i]->count); + for (i = 0; i < n; i++) { + int len; + + len = snprintf(buf + off, sizeof(buf) - off, "%u %u %u\n", extents[i]->first, extents[i]->lower_first, + extents[i]->count); + if (len < 0) { + pr_perror("Unable to form the user/group mappings buffer"); + return -1; + } else if (len >= sizeof(buf) - off) { + pr_err("The user/group mappings buffer truncated\n"); + return -1; + } + off += len; + } fd = open_proc_rw(pid, "%s", id_map); if (fd < 0) @@ -1127,21 +1209,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) return 0; } -struct unsc_msg { - struct msghdr h; - /* - * 0th is the call address - * 1st is the flags - * 2nd is the optional (NULL in response) arguments - */ - struct iovec iov[3]; - char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; -}; - static int usernsd_pid; -static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, - int *x, void *arg, size_t asize, int fd) +inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) { struct cmsghdr *ch; struct ucred *ucred; @@ -1178,8 +1248,11 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, ch->cmsg_level = SOL_SOCKET; ch->cmsg_type = SCM_CREDENTIALS; - ucred = (struct ucred *) CMSG_DATA(ch); - ucred->pid = getpid(); + ucred = (struct ucred *)CMSG_DATA(ch); + if (pid) + ucred->pid = *pid; + else + ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); @@ -1194,7 +1267,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, } } -static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1206,7 +1279,7 @@ static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) 
BUG_ON(ch->cmsg_type != SCM_CREDENTIALS); if (pid) { - ucred = (struct ucred *) CMSG_DATA(ch); + ucred = (struct ucred *)CMSG_DATA(ch); *pid = ucred->pid; } @@ -1232,7 +1305,7 @@ static int usernsd(int sk) int flags, fd, ret; pid_t pid; - unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0); + unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); if (recvmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: recv req error"); return -1; @@ -1241,11 +1314,6 @@ static int usernsd(int sk) unsc_msg_pid_fd(&um, &pid, &fd); pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); - if (fd < 0 && flags & UNS_FDOUT) { - pr_err("uns: bad flags/fd %p %d %x\n", call, fd, flags); - BUG(); - } - /* * Caller has sent us bare address of the routine it * wants to call. Since the caller is fork()-ed from the @@ -1282,7 +1350,7 @@ static int usernsd(int sk) else fd = -1; - unsc_msg_init(&um, &call, &ret, NULL, 0, fd); + unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; @@ -1293,8 +1361,7 @@ static int usernsd(int sk) } } -int __userns_call(const char *func_name, uns_call_t call, int flags, - void *arg, size_t arg_size, int fd) +int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, size_t arg_size, int fd) { int ret, res, sk; bool async = flags & UNS_ASYNC; @@ -1309,6 +1376,10 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, return call(arg, fd, getpid()); sk = get_service_fd(USERNSD_SK); + if (sk < 0) { + pr_err("Cannot get USERNSD_SK fd\n"); + return -1; + } pr_debug("uns: calling %s (%d, %x)\n", func_name, fd, flags); if (!async) @@ -1330,7 +1401,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, /* Send the request */ - unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); @@ 
-1345,7 +1416,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, /* Get the response back */ - unsc_msg_init(&um, &call, &res, NULL, 0, 0); + unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); @@ -1366,14 +1437,11 @@ out: return ret; } -static int start_usernsd(void) +int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) { int sk[2]; int one = 1; - if (!(root_ns_mask & CLONE_NEWUSER)) - return 0; - /* * Seqpacket to * @@ -1381,7 +1449,7 @@ static int start_usernsd(void) * each other easily. Stream socket require manual * messages boundaries. * - * b) Make callers note the damon death by seeing the + * b) Make callers note the daemon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. @@ -1402,24 +1470,39 @@ static int start_usernsd(void) return -1; } - usernsd_pid = fork(); - if (usernsd_pid < 0) { - pr_perror("Can't fork usernsd"); + *pid = fork(); + if (*pid < 0) { + pr_perror("Can't unix daemon"); close(sk[0]); close(sk[1]); return -1; } - if (usernsd_pid == 0) { + if (*pid == 0) { int ret; - close(sk[0]); - ret = usernsd(sk[1]); + ret = daemon_func(sk[1]); exit(ret); } - close(sk[1]); - if (install_service_fd(USERNSD_SK, sk[0]) < 0) { + + return sk[0]; +} + +static int start_usernsd(void) +{ + int sk; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + sk = start_unix_cred_daemon(&usernsd_pid, usernsd); + if (sk < 0) { + pr_err("failed to start usernsd\n"); + return -1; + } + + if (install_service_fd(USERNSD_SK, sk) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); return -1; @@ -1527,15 +1610,21 @@ int collect_namespaces(bool for_dump) if (ret < 0) return ret; + ret = collect_pid_namespaces(for_dump); + if (ret < 0) + return ret; + return 0; } int prepare_userns_creds(void) { - /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ - if 
(setuid(0) || setgid(0) || setgroups(0, NULL)) { - pr_perror("Unable to initialize id-s"); - return -1; + if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; + } } /* @@ -1678,8 +1767,7 @@ int prepare_namespace(struct pstree_item *item, unsigned long clone_flags) sigset_t sig_mask; int id, ret = -1; - pr_info("Restoring namespaces %d flags 0x%lx\n", - vpid(item), clone_flags); + pr_info("Restoring namespaces %d flags 0x%lx\n", vpid(item), clone_flags); if (block_sigmask(&sig_mask, SIGCHLD) < 0) return -1; @@ -1718,6 +1806,35 @@ out: return ret; } +static int read_pid_ns_img(void) +{ + struct ns_id *ns; + PidnsEntry *e; + + for (ns = ns_ids; ns != NULL; ns = ns->next) { + struct cr_img *img; + int ret; + + if (ns->nd != &pid_ns_desc) + continue; + + img = open_image(CR_FD_PIDNS, O_RSTR, ns->id); + if (!img) + return -1; + + ret = pb_read_one_eof(img, &e, PB_PIDNS); + close_image(img); + if (ret < 0) { + pr_err("Can not read pidns object\n"); + return -1; + } + if (ret > 0) + ns->ext_key = e->ext_key; + } + + return 0; +} + int prepare_namespace_before_tasks(void) { if (start_usernsd()) @@ -1735,6 +1852,9 @@ int prepare_namespace_before_tasks(void) if (read_net_ns_img()) goto err_img; + if (read_pid_ns_img()) + goto err_img; + return 0; err_img: @@ -1752,3 +1872,43 @@ err_unds: struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid"); struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user"); + +static int collect_pid_ns(struct ns_id *ns, void *oarg) +{ + PidnsEntry e = PIDNS_ENTRY__INIT; + struct cr_img *img; + int ret; + char id[64], *val; + + pr_info("Collecting pidns %d/%d\n", ns->id, ns->ns_pid); + + snprintf(id, sizeof(id), "pid[%u]", ns->kid); + val = external_lookup_by_key(id); + if (PTR_RET(val)) + return 0; + + /* + * Only if the user marked the PID 
namespace as external + * via --external pid[]: