criu: Introduce new device file plugin hooks

Currently CRIU cannot handle Checkpoint Restore operations when a device
file is involved in a process. CRIU allows flexible extensions via special
plugins, but for certain complex devices such as a GPU the existing hooks
are not sufficient. This introduces a few new hooks that will be used to
support Checkpoint Restore operations with AMD GPU devices and potentially
with other similar devices too.

 - HANDLE_DEVICE_VMA
 - UPDATE_VMA_MAP
 - RESUME_DEVICES_LATE

 *HANDLE_DEVICE_VMA:
	Hook to detect a suitable plugin to handle device file VMA with
	PF | IO mappings.

 *UPDATE_VMA_MAP:
	Hook to handle VMAs during a device file restore.

	When restoring VMAs for the device files, criu runs sys_mmap in
	the pie restore context but the offsets and file path within a
	device file may change during restore operation so it needs to be
	adjusted properly.

 *RESUME_DEVICES_LATE:
	Hook to do some special handling in late restore phase.

	During the criu restore phase, when a device is being restored with
	the help of a plugin, some device specific operations might need
	to be delayed until criu finalizes the VMA placements in the address
	space of the target process. But by the time criu finalizes this,
	it's too late, since the pie phase is over and control is back in
	the criu master process. This hook provides an external trigger for
	each resuming task to check whether it has a device specific
	operation pending, such as issuing an ioctl call. Since this is
	called from the criu master process context, it supplies the pid of
	the target process and gives each registered plugin a chance to run
	its device specific operation if the target pid is valid.

A future patch will add consumers for these plugin hooks to support AMD
GPUs.

Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
This commit is contained in:
Rajneesh Bhardwaj 2021-04-15 12:04:45 -04:00 committed by Andrei Vagin
parent dd46e79196
commit 17e2a8c709
5 changed files with 100 additions and 6 deletions

View file

@ -2388,6 +2388,29 @@ skip_ns_bouncing:
pr_err("Unable to flush breakpoints\n");
finalize_restore();
/*
* Some external devices such as GPUs might need a very late
* trigger to kick-off some events, memory notifiers and for
* restarting the previously restored queues during criu restore
* stage. This is needed since criu pie code may shuffle VMAs
* around so things such as registering MMU notifiers (for GPU
* mapped memory) could be done sanely once the pie code hands
* over the control to master process.
*/
for_each_pstree_item(item) {
pr_info("Run late stage hook from criu master for external devices\n");
ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real);
/*
* This may not really be an error. Only certain plugin hooks
* (if available) will return success such as amdgpu_plugin that
* validates the pid of the resuming tasks in the kernel mode.
* Most of the time it will be -ENOTSUP; in a few cases it
* might actually be a true error code, but that would also be
* captured in the plugin, so there is no need to print the error here.
*/
if (ret < 0)
pr_debug("restore late stage hook for external plugin failed\n");
}
ret = run_scripts(ACT_PRE_RESUME);
if (ret)

View file

@ -2267,6 +2267,23 @@ static int open_filemap(int pid, struct vma_area *vma)
BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags);
flags = vma->e->fdflags;
/* update the new device file page offsets and file paths set during restore */
if (vma->e->status & VMA_UNSUPP) {
uint64_t new_pgoff;
char new_path[PATH_MAX];
int ret;
struct reg_file_info *rfi = container_of(vma->vmfd, struct reg_file_info, d);
ret = run_plugins(UPDATE_VMA_MAP, rfi->rfe->name, new_path, vma->e->start, vma->e->pgoff, &new_pgoff);
if (ret == 1) {
pr_info("New mmap %#016" PRIx64 "->%#016" PRIx64 " path %s\n", vma->e->pgoff, new_pgoff,
new_path);
vma->e->pgoff = new_pgoff;
rfi->path = xstrdup(new_path);
pr_debug("Updated rfi->path %s\n", rfi->path);
}
}
if (ctx.flags != flags || ctx.desc != vma->vmfd) {
if (vma->e->status & VMA_AREA_MEMFD)
ret = memfd_open(vma->vmfd, &flags);

View file

@ -22,6 +22,8 @@
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/stat.h>
#define CRIU_PLUGIN_GEN_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))
#define CRIU_PLUGIN_VERSION_MAJOR 0
@ -48,6 +50,12 @@ enum {
CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6,
CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA = 7,
CR_PLUGIN_HOOK__UPDATE_VMA_MAP = 8,
CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9,
CR_PLUGIN_HOOK__MAX
};
@ -60,6 +68,10 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind);
/* HANDLE_DEVICE_VMA: receives the fd and stat of a mapped file so a plugin can claim the mapping. */
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct stat *stat);
/*
 * UPDATE_VMA_MAP: given the recorded path, address and page offset of a
 * device file mapping, a plugin may write a replacement path into new_path
 * and a replacement page offset into *new_pgoff.
 */
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *old_path, char *new_path, const uint64_t addr,
const uint64_t old_pgoff, uint64_t *new_pgoff);
/* RESUME_DEVICES_LATE: receives the pid of a restored task for late, device specific work. */
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
enum {
CR_PLUGIN_STAGE__DUMP,
@ -130,5 +142,9 @@ typedef int(cr_plugin_restore_file_t)(int id);
typedef int(cr_plugin_dump_ext_mount_t)(char *mountpoint, int id);
typedef int(cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file);
typedef int(cr_plugin_dump_ext_link_t)(int index, int type, char *kind);
/*
 * Signature of the "cr_plugin_handle_device_vma" entry point.
 *
 * fd:   descriptor of the file backing the mapping.
 * stat: stat information of that file.
 *
 * The visible caller treats a negative return value as "no plugin can
 * handle this mapping" and anything else as "handled".
 */
typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat);
/*
 * Signature of the "cr_plugin_update_vma_map" entry point.
 *
 * old_path:  path recorded for the device file mapping.
 * new_path:  output buffer for the replacement path; the visible caller
 *            passes a PATH_MAX sized buffer.
 * addr:      start address of the mapping.
 * old_pgoff: page offset recorded for the mapping.
 * new_pgoff: output for the replacement page offset.
 *
 * The visible caller only consumes new_path and *new_pgoff when the
 * return value is exactly 1.
 */
typedef int(cr_plugin_update_vma_map_t)(const char *old_path, char *new_path, const uint64_t addr,
const uint64_t old_pgoff, uint64_t *new_pgoff);
/*
 * Signature of the "cr_plugin_resume_devices_late" entry point.
 *
 * pid: real pid of a restored task.
 *
 * The visible caller invokes the hook once per process tree item and
 * only logs a debug message when the result is negative.
 */
typedef int(cr_plugin_resume_devices_late_t)(int pid);
#endif /* __CRIU_PLUGIN_H__ */

View file

@ -54,6 +54,9 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(DUMP_EXT_MOUNT, "cr_plugin_dump_ext_mount");
__assign_hook(RESTORE_EXT_MOUNT, "cr_plugin_restore_ext_mount");
__assign_hook(DUMP_EXT_LINK, "cr_plugin_dump_ext_link");
__assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma");
__assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map");
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
#undef __assign_hook

View file

@ -45,6 +45,7 @@
#include "protobuf.h"
#include "images/fdinfo.pb-c.h"
#include "images/mnt.pb-c.h"
#include "plugin.h"
#include <stdlib.h>
@ -103,6 +104,19 @@ bool is_vma_range_fmt(char *line)
return __is_vma_range_fmt(line);
}
/*
 * Ask the registered plugins whether one of them can take care of a
 * mapping backed by the given file.
 *
 * @fd:   pointer to the descriptor of the mapped file; only the value it
 *        points to is read and handed to the HANDLE_DEVICE_VMA hook.
 * @stat: stat information of the mapped file, forwarded to the hook.
 *
 * Returns true when run_plugins() yields a non-negative result and false
 * otherwise.
 */
bool handle_vma_plugin(int *fd, struct stat *stat)
{
	int ret;

	ret = run_plugins(HANDLE_DEVICE_VMA, *fd, stat);
	if (ret < 0) {
		/*
		 * The failure reason is carried by the return value (for
		 * example -ENOTSUP), so log that code. pr_perror() would
		 * append strerror(errno), which may be stale here.
		 * NOTE(review): presumably run_plugins() does not set errno
		 * itself -- confirm.
		 */
		pr_err("handle_device_vma plugin failed: %d\n", ret);
		return false;
	}

	return true;
}
static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf)
{
char *tok;
@ -188,6 +202,7 @@ struct vma_file_info {
int dev_min;
unsigned long ino;
struct vma_area *vma;
bool has_device_plugin;
};
static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b)
@ -577,11 +592,17 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat
} else if (*vm_file_fd >= 0) {
struct stat *st_buf = vma_area->vmst;
if (S_ISREG(st_buf->st_mode))
if (S_ISREG(st_buf->st_mode)) {
/* regular file mapping -- supported */;
else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO))
pr_debug("Found regular file mapping, OK\n");
} else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) {
/* devzero mapping -- also makes sense */;
else {
pr_debug("Found devzero mapping, OK\n");
} else if (handle_vma_plugin(vm_file_fd, st_buf)) {
pr_info("Found device file mapping, plugin is available\n");
vfi->has_device_plugin = true;
} else {
/* non-regular mapping with no supporting plugin */
pr_err("Can't handle non-regular mapping on %d's map %" PRIx64 "\n", pid, vma_area->e->start);
goto err;
}
@ -646,9 +667,23 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area
struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
{
if (vma_area->e->status & VMA_UNSUPP) {
pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start,
vma_area->e->end);
return -1;
if (vfi->has_device_plugin) {
/* Unsupported VMAs that are backed by a special device
* plugin can be treated as regular VMAs, and criu
* should only save their metadata in the dump files.
* There can be several special plugin hooks
* that run at different stages during checkpoint
* and restore.
*/
pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " "
"must be supported via device plugins\n",
vma_area->e->start, vma_area->e->end);
} else {
pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start,
vma_area->e->end);
return -1;
}
}
/* Add a guard page only if there is enough space for it */