criu/plugin: Support AMD ROCm Checkpoint Restore with KFD

To support Checkpoint Restore with AMDGPUs for ROCm workloads, introduce
a new plugin to assist CRIU with the help of AMD KFD kernel driver. This
initial commit just provides the basic framework to build up further
capabilities. Like CRIU, the amdgpu plugin also uses protobuf to serialize
and save the amdkfd data, which is mostly VRAM contents with some metadata.
We generate a data file "amdgpu-kfd-<id>.img" during the dump stage. On restore
this file is read and extracted to re-create various types of buffer
objects that belonged to the previously checkpointed process. Upon
restore the mmap page offset within a device file might change so we use
the new hook to update and adjust the mmap offsets for the newly created
target process. This is needed for the sys_mmap call in the pie restorer phase.
Support for queues and events is added in future patches of this series.

With the current implementation (amdgpu_plugin), we support:
     - Only compute workloads (non-Gfx) are supported
     - GPU visible inside a container
     - AMD GPU Gfx 9 Family
     - Pytorch Benchmarks such as BERT Base

amdgpu plugin depends on libdrm and libdrm_amdgpu which are typically
installed with libdrm-dev package. We build amdgpu_plugin only when the
dependencies are met on the target system and when user intends to
install the amdgpu plugin and not by default with criu build.

Suggested-by: Felix Kuehling <felix.kuehling@amd.com>
Co-authored-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
This commit is contained in:
Rajneesh Bhardwaj 2021-12-15 14:18:02 -05:00 committed by Andrei Vagin
parent 71ff9cc045
commit 55a5993bc7
9 changed files with 1065 additions and 52 deletions

View file

@ -16,6 +16,7 @@ ifeq ($(PYTHON),python3)
SRC1 += criu-ns.txt
endif
# Documentation sources are collected in two lists, SRC1 and SRC8
# (presumably one per man-page section — TODO confirm); the amdgpu plugin
# page is added to SRC1.
SRC1 += compel.txt
SRC1 += amdgpu_plugin.txt
SRC8 += criu.txt
# Every source listed above, regardless of which list it came from.
SRC := $(SRC1) $(SRC8)
# One .xml per .txt source, obtained by swapping the file suffix.
XMLS := $(patsubst %.txt,%.xml,$(SRC))

View file

@ -0,0 +1,45 @@
ROCM Support(1)
===============
NAME
----
amdgpu_plugin - A plugin extension to CRIU to support checkpoint/restore in
userspace for AMD GPUs.
CURRENT SUPPORT
---------------
Single GPU systems (Gfx9)
Checkpoint / Restore on same system
Checkpoint / Restore inside a docker container
Pytorch
DESCRIPTION
-----------
Though *criu* is a great tool for checkpointing and restoring running
applications, it has certain limitations; for example, it cannot handle
applications that have device files open. In order to support *ROCm* based
workloads with *criu* we need to augment criu's core functionality with a
plugin based extension mechanism. *amdgpu_plugin* provides the necessary support
to criu to allow Checkpoint / Restore with ROCm.
Dependencies
~~~~~~~~~~~~~~
*amdkfd support*::
In order to snapshot the *VRAM* and other *GPU* device states, we require
an updated version of amdkfd(amdgpu) driver. The kernel patches are under
review currently.
*criu 3.16*::
This work is rebased on the latest criu release available at this time.
AUTHOR
------
The AMDKFD team.
COPYRIGHT
---------
Copyright \(C) 2020-2021, Advanced Micro Devices, Inc. (AMD)

View file

@ -284,9 +284,9 @@ clean mrproper:
$(Q) $(MAKE) $(build)=crit $@
.PHONY: clean mrproper
clean-dummy_amdgpu_plugin:
clean-amdgpu_plugin:
$(Q) $(MAKE) -C plugins/amdgpu clean
.PHONY: clean dummy_amdgpu_plugin
.PHONY: clean-amdgpu_plugin
clean-top:
$(Q) $(MAKE) -C Documentation clean
@ -294,9 +294,9 @@ clean-top:
$(Q) $(RM) .gitid
.PHONY: clean-top
clean: clean-top clean-dummy_amdgpu_plugin
clean: clean-top clean-amdgpu_plugin
mrproper-top: clean-top clean-dummy_amdgpu_plugin
mrproper-top: clean-top clean-amdgpu_plugin
$(Q) $(RM) $(CONFIG_HEADER)
$(Q) $(RM) $(VERSION_HEADER)
$(Q) $(RM) $(COMPEL_VERSION_HEADER)
@ -324,9 +324,9 @@ test: zdtm
$(Q) $(MAKE) -C test
.PHONY: test
dummy_amdgpu_plugin:
amdgpu_plugin: criu
$(Q) $(MAKE) -C plugins/amdgpu all
.PHONY: dummy_amdgpu_plugin
.PHONY: amdgpu_plugin
#
# Generating tar requires tag matched CRIU_VERSION.
@ -408,6 +408,7 @@ help:
@echo ' unittest - Run unit tests'
@echo ' lint - Run code linters'
@echo ' indent - Indent C code'
@echo ' amdgpu_plugin - Make AMD GPU plugin'
.PHONY: help
lint:

View file

@ -21,6 +21,14 @@ ifeq ($(call pkg-config-check,libbpf),y)
export CONFIG_HAS_LIBBPF := y
endif
# Decide at parse time whether the AMD GPU plugin can be built: export
# CONFIG_AMDGPU=y when the pkg-config probe for libdrm succeeds, otherwise
# only print a note explaining why the plugin is skipped.
# NOTE(review): only libdrm is probed here although the note below also
# names libdrm_amdgpu as a requirement — confirm whether a second probe is needed.
ifeq ($(call pkg-config-check,libdrm),y)
export CONFIG_AMDGPU := y
$(info Note: Building criu with amdgpu_plugin.)
else
$(info Note: Building criu without amdgpu_plugin.)
$(info Note: libdrm and libdrm_amdgpu are required to build amdgpu_plugin.)
endif
ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy)
LIBS_FEATURES += -lgnutls
export CONFIG_GNUTLS := y

View file

@ -41,16 +41,16 @@ install-criu: criu
$(Q) $(MAKE) $(build)=criu install
.PHONY: install-criu
install-dummy_amdgpu_plugin: dummy_amdgpu_plugin
install-amdgpu_plugin: amdgpu_plugin
$(Q) $(MAKE) -C plugins/amdgpu install
.PHONY: install-dummy_amdgpu_plugin
.PHONY: install-amdgpu_plugin
install-compel: $(compel-install-targets)
$(Q) $(MAKE) $(build)=compel install
$(Q) $(MAKE) $(build)=compel/plugins install
.PHONY: install-compel
install: install-man install-lib install-criu install-compel ;
install: install-man install-lib install-criu install-compel install-amdgpu_plugin ;
.PHONY: install
uninstall:

View file

@ -1,13 +1,49 @@
all: dummy_plugin.so
PLUGIN_NAME := amdgpu_plugin
PLUGIN_SOBJ := amdgpu_plugin.so
dummy_plugin.so: dummy_plugin.c
gcc -g -Werror -D _GNU_SOURCE -Wall -shared -nostartfiles dummy_plugin.c -o dummy_plugin.so -iquote ../../../criu/include -iquote ../../criu/include -fPIC
# Two candidate locations of the CRIU headers, both passed to the compiler
# as -iquote search paths.
PLUGIN_INC := ../../../criu/include
PLUGIN_INC_EXTRA := ../../criu/include
PLUGIN_INCLUDE := -iquote$(PLUGIN_INC) -iquote$(PLUGIN_INC_EXTRA)
# NOTE(review): LIBDRM_INC is not referenced by any rule visible in this
# file — confirm whether it should be added to the compile line.
LIBDRM_INC := -I/usr/include/libdrm
# What `all` depends on when the plugin can be built ...
DEPS_OK := amdgpu_plugin.so
# ... and when it cannot: `all: $(DEPS_NOK)` then expands to `all: ;`,
# i.e. a rule with an empty recipe, so `make all` succeeds without building.
DEPS_NOK := ;
# Presumably provides the msg-clean helper used by the clean rule — TODO confirm.
include $(__nmk_dir)msg.mk
CC := gcc
# -shared/-nostartfiles/-fPIC: the plugin is built as a shared object.
PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC
# CONFIG_AMDGPU is exported by the top-level feature probe when libdrm is found.
ifeq ($(CONFIG_AMDGPU),y)
all: $(DEPS_OK)
else
all: $(DEPS_NOK)
endif
# Generate the protobuf-c source for the plugin image format from its schema.
criu-amdgpu.pb-c.c: criu-amdgpu.proto
	protoc-c --proto_path=. --c_out=. $<

# Build the plugin shared object from the plugin source and the generated
# protobuf-c source.
amdgpu_plugin.so: amdgpu_plugin.c criu-amdgpu.pb-c.c
	$(CC) $(PLUGIN_CFLAGS) amdgpu_plugin.c criu-amdgpu.pb-c.c -o amdgpu_plugin.so $(PLUGIN_INCLUDE)
# Remove the built plugin and every generated criu-amdgpu.pb-c* file.
amdgpu_plugin_clean:
$(call msg-clean, $@)
$(Q) $(RM) amdgpu_plugin.so criu-amdgpu.pb-c*
.PHONY: amdgpu_plugin_clean
# `clean` delegates to the plugin-specific clean target; `mrproper` is an
# alias for `clean` in this directory.
clean: amdgpu_plugin_clean
mrproper: clean
clean:
$(Q) $(RM) dummy_plugin.so
install:
$(Q) mkdir -p $(PLUGINDIR)
$(Q) install -m 644 dummy_plugin.so $(PLUGINDIR)
ifeq ($(CONFIG_AMDGPU),y)
$(E) " INSTALL " $(PLUGIN_NAME)
$(Q) install -m 644 $(PLUGIN_SOBJ) $(PLUGINDIR)
endif
.PHONY: install
uninstall:
$(Q) $(RM) $(PLUGINDIR)/dummy_plugin.so
ifeq ($(CONFIG_AMDGPU),y)
$(E) " UNINSTALL" $(PLUGIN_NAME)
$(Q) $(RM) $(PLUGINDIR)/$(PLUGIN_SOBJ)
endif
.PHONY: uninstall

View file

@ -0,0 +1,930 @@
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/limits.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <stdint.h>
#include "criu-plugin.h"
#include "plugin.h"
#include "criu-amdgpu.pb-c.h"
#include "kfd_ioctl.h"
#include "xmalloc.h"
#include "criu-log.h"
#include "common/list.h"
/* Inclusive range of DRM minor numbers accepted by open_drm_render_device(). */
#define DRM_FIRST_RENDER_NODE 128
#define DRM_LAST_RENDER_NODE 255
/* Path of the KFD character device. */
#define AMDGPU_KFD_DEVICE "/dev/kfd"
/* printf-style template for a process memory file; takes the pid as %d. */
#define PROCPIDMEM "/proc/%d/mem"
/*
 * NOTE(review): this definition comes after the system headers have already
 * been included, where a feature-test macro normally has no effect; the
 * plugin Makefile passes -D _GNU_SOURCE on the command line.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
/* Replace any previously defined log prefix with the plugin's own. */
#ifdef LOG_PREFIX
#undef LOG_PREFIX
#endif
#define LOG_PREFIX "amdgpu_plugin: "
/*
 * Verbose plugin logging: forwards to pr_debug() when DEBUG is defined,
 * otherwise expands to an empty block, so the arguments are neither
 * compiled nor evaluated.
 */
#ifdef DEBUG
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
#else
#define plugin_log_msg(fmt, ...) \
{ \
}
#endif
/*
 * One record per restored buffer object whose mmap offset may have changed.
 * Records are appended by restore_bo_data() and looked up by
 * amdgpu_plugin_update_vmamap().
 */
struct vma_metadata {
/* Linkage into update_vma_info_list. */
struct list_head list;
/* Offset of the BO as recorded in the image (bo_bucket->offset). */
uint64_t old_pgoff;
/* Offset reported after restore (bo_bucket->restored_offset). */
uint64_t new_pgoff;
/* Address of the BO (bo_bucket->addr); matched against the VMA address. */
uint64_t vma_entry;
};
/* List of all vma_metadata records collected during restore. */
static LIST_HEAD(update_vma_info_list);
/*
 * Open the DRM render node /dev/dri/renderD<minor> read-write with O_CLOEXEC.
 *
 * Returns the open file descriptor on success, -EINVAL when @minor lies
 * outside [DRM_FIRST_RENDER_NODE, DRM_LAST_RENDER_NODE], and -EBADFD when
 * the node cannot be opened. ENOENT and EPERM failures are not logged.
 */
int open_drm_render_device(int minor)
{
	char path[128];
	int fd;

	if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) {
		/* Not a failed syscall: errno is meaningless here, so don't append it */
		pr_err("DRM render minor %d out of range [%d, %d]\n", minor, DRM_FIRST_RENDER_NODE,
		       DRM_LAST_RENDER_NODE);
		return -EINVAL;
	}

	snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor);

	fd = open(path, O_RDWR | O_CLOEXEC);
	if (fd < 0) {
		/* Logging may overwrite errno, so latch it before the checks below */
		int err = errno;

		if (err != ENOENT && err != EPERM) {
			pr_err("Failed to open %s: %s\n", path, strerror(err));
			if (err == EACCES)
				pr_err("Check user is in \"video\" group\n");
		}
		return -EBADFD;
	}

	return fd;
}
/*
 * Write @buf_len bytes from @buf to @file_path, resolved relative to the
 * CRIU image directory. The file is created with mode 0600 if missing and
 * truncated if it already exists, so no stale bytes from an older, longer
 * image can survive behind the new contents.
 *
 * Returns 0 on success, a negative errno value if the file cannot be
 * opened, and -EIO if the data cannot be written out completely.
 */
int write_file(const char *file_path, const void *buf, const size_t buf_len)
{
	int fd, ret;
	FILE *fp;
	size_t len_wrote;

	fd = openat(criu_get_image_dir(), file_path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
	if (fd < 0) {
		/* Latch errno first: logging may change it */
		ret = -errno;
		pr_perror("Cannot open %s", file_path);
		return ret;
	}

	fp = fdopen(fd, "w");
	if (!fp) {
		ret = -errno;
		pr_perror("Cannot fdopen %s", file_path);
		/* No stream owns fd yet, so it has to be closed by hand */
		close(fd);
		return ret;
	}

	len_wrote = fwrite(buf, 1, buf_len, fp);
	if (len_wrote != buf_len) {
		pr_perror("Unable to write %s (wrote:%zu buf_len:%zu)", file_path, len_wrote, buf_len);
		fclose(fp);
		return -EIO;
	}

	/* fclose() flushes the stdio buffer and also closes fd; a flush error is a write error */
	if (fclose(fp)) {
		pr_perror("Unable to flush %s", file_path);
		return -EIO;
	}

	pr_info("Wrote file:%s (%zu bytes)\n", file_path, buf_len);
	return 0;
}
/*
 * Read exactly @buf_len bytes of @file_path, resolved relative to the CRIU
 * image directory, into @buf.
 *
 * Returns 0 on success, a negative errno value if the file cannot be
 * opened, and -EIO if fewer than @buf_len bytes could be read.
 */
int read_file(const char *file_path, void *buf, const size_t buf_len)
{
	int fd, ret;
	FILE *fp;
	size_t len_read;

	fd = openat(criu_get_image_dir(), file_path, O_RDONLY);
	if (fd < 0) {
		/* Latch errno first: logging may change it */
		ret = -errno;
		pr_perror("Cannot open %s", file_path);
		return ret;
	}

	fp = fdopen(fd, "r");
	if (!fp) {
		ret = -errno;
		pr_perror("Cannot fdopen %s", file_path);
		/* No stream owns fd yet, so it has to be closed by hand */
		close(fd);
		return ret;
	}

	len_read = fread(buf, 1, buf_len, fp);
	if (len_read != buf_len) {
		pr_perror("Unable to read %s", file_path);
		fclose(fp);
		return -EIO;
	}

	pr_info("Read file:%s (%zu bytes)\n", file_path, buf_len);

	/* this will also close fd */
	fclose(fp);
	return 0;
}
/*
 * ioctl() wrapper that reissues the request while it fails with EINTR or
 * EAGAIN, for at most 200 retries after the first attempt.
 *
 * Returns the result of the last ioctl() call. A failure with EBADF is
 * additionally reported in the log.
 */
int kmtIoctl(int fd, unsigned long request, void *arg)
{
	int retries_left = 200;
	int rc;

	for (;;) {
		rc = ioctl(fd, request, arg);
		if (rc != -1)
			break;
		/* The retry budget is consumed on every failure, whatever the errno */
		if (retries_left-- <= 0)
			break;
		if (errno != EINTR && errno != EAGAIN)
			break;
	}

	if (rc == -1 && errno == EBADF) {
		/* The descriptor itself is unusable in this process */
		pr_perror("KFD file descriptor not valid in this process");
	}

	return rc;
}
/*
 * Release a CriuKfd message built by the dump path: every BO entry with its
 * raw data buffer, every device entry, the two pointer arrays and the
 * message itself. priv_data is not touched; the dump path frees that buffer
 * through its ioctl argument structure.
 *
 * Accepts NULL, so error paths that bail out before the message was
 * allocated can call it unconditionally.
 */
static void free_e(CriuKfd *e)
{
	if (!e)
		return;

	for (size_t i = 0; i < e->n_bo_entries; i++) {
		if (e->bo_entries[i]) {
			xfree(e->bo_entries[i]->rawdata.data);
			xfree(e->bo_entries[i]);
		}
	}
	/* The array of entry pointers is a separate allocation */
	xfree(e->bo_entries);

	for (size_t i = 0; i < e->n_device_entries; i++)
		xfree(e->device_entries[i]);
	xfree(e->device_entries);

	xfree(e);
}
/*
 * Allocate the device_entries array of @e plus @num_of_devices initialised
 * DeviceEntry messages. n_device_entries is bumped once per stored entry,
 * so after a mid-way failure it still counts exactly the entries that exist.
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static int allocate_device_entries(CriuKfd *e, int num_of_devices)
{
	int idx = 0;

	e->device_entries = xmalloc(sizeof(DeviceEntry *) * num_of_devices);
	if (e->device_entries == NULL) {
		pr_err("Failed to allocate device_entries\n");
		return -ENOMEM;
	}

	while (idx < num_of_devices) {
		DeviceEntry *dev = xzalloc(sizeof(*dev));

		if (dev == NULL) {
			pr_err("Failed to allocate entry\n");
			return -ENOMEM;
		}

		device_entry__init(dev);
		e->device_entries[idx] = dev;
		e->n_device_entries++;
		idx++;
	}

	return 0;
}
/*
 * Allocate the bo_entries array of @e plus @num_bos initialised BoEntry
 * messages. For buckets flagged VRAM or GTT a raw data buffer of the
 * bucket's size is allocated as well, to receive the BO contents.
 *
 * n_bo_entries is bumped once per stored entry, so after a mid-way failure
 * it still counts exactly the entries that exist.
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr)
{
	e->bo_entries = xmalloc(sizeof(BoEntry *) * num_bos);
	if (!e->bo_entries) {
		pr_err("Failed to allocate bo_info\n");
		return -ENOMEM;
	}

	for (int i = 0; i < num_bos; i++) {
		BoEntry *entry = xzalloc(sizeof(*entry));

		if (!entry) {
			pr_err("Failed to allocate BO entry\n");
			return -ENOMEM;
		}

		bo_entry__init(entry);

		if (bo_bucket_ptr[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
			entry->rawdata.data = xmalloc(bo_bucket_ptr[i].size);
			/* The buffer is written to later; a NULL here must not go unnoticed */
			if (!entry->rawdata.data && bo_bucket_ptr[i].size) {
				pr_err("Failed to allocate %llu bytes for BO contents\n",
				       (unsigned long long)bo_bucket_ptr[i].size);
				/* Not yet stored in bo_entries, so nobody else would free it */
				xfree(entry);
				return -ENOMEM;
			}
			entry->rawdata.len = bo_bucket_ptr[i].size;
		}

		e->bo_entries[i] = entry;
		e->n_bo_entries++;
	}

	return 0;
}
/* Plugin init callback: only logs the plugin name. Always returns 0. */
int amdgpu_plugin_init(int stage)
{
	const char *plugin_name = CR_PLUGIN_DESC.name;

	pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", plugin_name);

	return 0;
}

/* Plugin exit callback: only logs the plugin name. */
void amdgpu_plugin_fini(int stage, int ret)
{
	const char *plugin_name = CR_PLUGIN_DESC.name;

	pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", plugin_name);
}

/* Register the plugin under its name together with the two callbacks above. */
CR_PLUGIN_REGISTER("amdgpu_plugin", amdgpu_plugin_init, amdgpu_plugin_fini)
/*
 * HANDLE_DEVICE_VMA hook: decide whether a device-backed mapping belongs to
 * this plugin.
 *
 * @st_buf describes the mapped device file. The mapping is accepted when
 * its major number equals that of /dev/kfd, or when its major equals that
 * of the first DRM render node and its minor is not below that node's minor
 * nor below DRM_FIRST_RENDER_NODE.
 *
 * Returns 0 when the mapping is accepted, -1 when one of the reference
 * device nodes cannot be stat()ed, and -ENOTSUP otherwise.
 */
int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
{
	struct stat st_kfd, st_dri_min;
	char img_path[128];
	int ret = 0;

	pr_debug("amdgpu_plugin: Enter %s\n", __func__);

	ret = stat(AMDGPU_KFD_DEVICE, &st_kfd);
	if (ret == -1) {
		pr_perror("stat error for /dev/kfd");
		return ret;
	}

	snprintf(img_path, sizeof(img_path), "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE);

	ret = stat(img_path, &st_dri_min);
	if (ret == -1) {
		pr_perror("stat error for %s", img_path);
		return ret;
	}

	if (major(st_buf->st_rdev) == major(st_kfd.st_rdev) ||
	    ((major(st_buf->st_rdev) == major(st_dri_min.st_rdev)) &&
	     (minor(st_buf->st_rdev) >= minor(st_dri_min.st_rdev) &&
	      minor(st_buf->st_rdev) >= DRM_FIRST_RENDER_NODE))) {
		pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev));
		pr_debug("AMD KFD(maj) = %d, DRI(maj,min) = %d:%d VMA Device fd(maj,min) = %d:%d\n",
			 major(st_kfd.st_rdev), major(st_dri_min.st_rdev), minor(st_dri_min.st_rdev),
			 major(st_buf->st_rdev), minor(st_buf->st_rdev));
		/* VMA belongs to kfd */
		return 0;
	}

	/* No syscall failed here, so errno must not be appended to the message */
	pr_err("amdgpu_plugin: Can't handle the VMA mapping\n");
	return -ENOTSUP;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
/*
 * Issue the KFD_CRIU_OP_UNPAUSE operation on the KFD descriptor @fd.
 *
 * Returns the ioctl result (0 on success); the outcome is logged either way.
 */
static int unpause_process(int fd)
{
	struct kfd_ioctl_criu_args args = { 0 };
	int rc;

	args.op = KFD_CRIU_OP_UNPAUSE;

	rc = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
	if (rc)
		pr_perror("amdgpu_plugin: Failed to unpause process");

	pr_info("Process unpaused %s (ret:%d)\n", rc ? "Failed" : "Ok", rc);

	return rc;
}
/*
 * Record per-device information from the checkpoint ioctl in the image
 * message @e: the device count and, for each device, the user_gpu_id of the
 * matching bucket in @device_buckets. The restore path reads that value
 * back from DeviceEntry.gpu_id, so it has to be stored here.
 *
 * Returns 0 on success or -ENOMEM if the entries cannot be allocated.
 */
static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
			CriuKfd *e)
{
	int ret = 0;

	pr_debug("Dumping %d devices\n", args->num_devices);

	e->num_of_gpus = args->num_devices;
	ret = allocate_device_entries(e, e->num_of_gpus);
	if (ret) {
		ret = -ENOMEM;
		goto exit;
	}

	plugin_log_msg("Number of GPUs:%d\n", e->num_of_gpus);

	for (int i = 0; i < e->num_of_gpus; i++) {
		e->device_entries[i]->gpu_id = device_buckets[i].user_gpu_id;
		plugin_log_msg("Device[%d] gpu_id:%x\n", i, e->device_entries[i]->gpu_id);
	}

exit:
	pr_info("Dumped devices %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
	return ret;
}
/*
 * Copy the metadata of every buffer object in @bo_buckets into the image
 * message @e and, for BOs flagged VRAM or GTT, capture their contents:
 *   - BOs flagged PUBLIC are mapped read-only through the KFD descriptor
 *     @fd at the BO offset and copied with memcpy;
 *   - all others are read from /proc/<args->pid>/mem at the BO address.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int save_bos(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
	int ret = 0, i;

	pr_debug("Dumping %d BOs\n", args->num_bos);

	e->num_of_bos = args->num_bos;
	ret = allocate_bo_entries(e, e->num_of_bos, bo_buckets);
	if (ret)
		goto exit;

	for (i = 0; i < e->num_of_bos; i++) {
		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
		BoEntry *boinfo = e->bo_entries[i];

		boinfo->gpu_id = bo_bucket->gpu_id;
		boinfo->addr = bo_bucket->addr;
		boinfo->size = bo_bucket->size;
		boinfo->offset = bo_bucket->offset;
		boinfo->alloc_flags = bo_bucket->alloc_flags;

		/* Only VRAM and GTT buffers have contents to capture */
		if (!(bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)))
			continue;

		/* Never copy into a buffer that failed to allocate */
		if (!boinfo->rawdata.data && boinfo->size) {
			pr_err("No buffer to store contents of BO %d\n", i);
			ret = -ENOMEM;
			goto exit;
		}

		if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) {
			void *addr;

			pr_info("amdgpu_plugin: large bar read possible\n");

			addr = mmap(NULL, boinfo->size, PROT_READ, MAP_SHARED, fd, boinfo->offset);
			if (addr == MAP_FAILED) {
				/* Latch errno first: logging may change it */
				ret = -errno;
				pr_perror("amdgpu_plugin: mmap failed");
				goto exit;
			}

			/* direct memcpy is possible on large bars */
			memcpy(boinfo->rawdata.data, addr, boinfo->size);
			munmap(addr, boinfo->size);
		} else {
			size_t done = 0;
			char *fname;
			int mem_fd;

			pr_info("Now try reading BO contents with /proc/pid/mem\n");

			/* On failure the contents of fname are undefined, so it is not printed */
			if (asprintf(&fname, PROCPIDMEM, args->pid) < 0) {
				pr_perror("failed in asprintf for pid %d", args->pid);
				ret = -1;
				goto exit;
			}

			mem_fd = open(fname, O_RDONLY);
			if (mem_fd < 0) {
				pr_perror("Can't open %s for pid %d", fname, args->pid);
				free(fname);
				ret = -1;
				goto exit;
			}

			pr_info("Opened %s file for pid = %d\n", fname, args->pid);
			free(fname);

			if (lseek(mem_fd, (off_t)bo_bucket->addr, SEEK_SET) == -1) {
				pr_perror("Can't lseek for bo_offset for pid = %d", args->pid);
				close(mem_fd);
				ret = -1;
				goto exit;
			}

			/* read() may legitimately return less than requested; keep going */
			while (done < boinfo->size) {
				ssize_t n = read(mem_fd, boinfo->rawdata.data + done, boinfo->size - done);

				if (n < 0 && errno == EINTR)
					continue;
				if (n < 0) {
					pr_perror("Can't read buffer");
					close(mem_fd);
					ret = -1;
					goto exit;
				}
				if (n == 0) {
					pr_err("Can't read buffer: unexpected end of file\n");
					close(mem_fd);
					ret = -1;
					goto exit;
				}
				done += n;
			}

			close(mem_fd);
		}
	}

exit:
	pr_info("Dumped bos %s (ret:%d)\n", ret ? "failed" : "ok", ret);
	return ret;
}
/*
 * DUMP_EXT_FILE hook: dump the state behind the device file descriptor @fd
 * into an image named after @id.
 *
 * Two kinds of descriptors arrive here:
 *   - anything that is not /dev/kfd itself (different major, or minor != 0)
 *     is handled as a DRM render node: only its minor number is saved, in
 *     "amdgpu-renderD-<id>.img";
 *   - for /dev/kfd the process info is queried, the checkpoint ioctl is run,
 *     devices and buffer objects are collected and everything is packed
 *     into "amdgpu-kfd-<id>.img".
 *
 * On the KFD path the process is unpaused on exit, on success and on
 * failure alike.
 *
 * Returns 0 on success and a negative value on failure.
 */
int amdgpu_plugin_dump_file(int fd, int id)
{
	struct kfd_ioctl_criu_args args = { 0 };
	char img_path[PATH_MAX];
	struct stat st, st_kfd;
	unsigned char *buf;
	CriuKfd *e = NULL;
	int ret = 0;
	size_t len;

	if (fstat(fd, &st) == -1) {
		pr_perror("amdgpu_plugin: fstat error");
		return -1;
	}

	ret = stat(AMDGPU_KFD_DEVICE, &st_kfd);
	if (ret == -1) {
		pr_perror("amdgpu_plugin: stat error for /dev/kfd");
		return -1;
	}

	/* Check whether this plugin was called for kfd or render nodes */
	if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
		/* This is RenderD dumper plugin, for now just save renderD
		 * minor number to be used during restore. In later phases this
		 * needs to save more data for video decode etc.
		 */
		CriuRenderNode rd = CRIU_RENDER_NODE__INIT;

		pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev),
			fd, id);

		rd.minor_number = minor(st.st_rdev);

		len = criu_render_node__get_packed_size(&rd);
		buf = xmalloc(len);
		if (!buf)
			return -ENOMEM;

		criu_render_node__pack(&rd, buf);

		snprintf(img_path, sizeof(img_path), "amdgpu-renderD-%d.img", id);
		ret = write_file(img_path, buf, len);
		xfree(buf);

		/* Need to return success here so that criu can call plugins for renderD nodes */
		return ret;
	}

	pr_info("amdgpu_plugin: %s : %s() called for fd = %d\n", CR_PLUGIN_DESC.name, __func__, major(st.st_rdev));

	/* First query how many devices, BOs and private-data bytes to make room for */
	args.op = KFD_CRIU_OP_PROCESS_INFO;
	if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
		pr_perror("amdgpu_plugin: Failed to call process info ioctl");
		ret = -1;
		goto exit;
	}

	pr_info("amdgpu_plugin: devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos,
		args.num_objects, args.priv_data_size);

	e = xmalloc(sizeof(*e));
	if (!e) {
		pr_err("Failed to allocate proto structure\n");
		ret = -ENOMEM;
		goto exit;
	}

	criu_kfd__init(e);
	e->pid = args.pid;

	args.devices = (uintptr_t)xzalloc((args.num_devices * sizeof(struct kfd_criu_device_bucket)));
	if (!args.devices) {
		ret = -ENOMEM;
		goto exit;
	}

	args.bos = (uintptr_t)xzalloc((args.num_bos * sizeof(struct kfd_criu_bo_bucket)));
	if (!args.bos) {
		ret = -ENOMEM;
		goto exit;
	}

	args.priv_data = (uintptr_t)xzalloc((args.priv_data_size));
	if (!args.priv_data) {
		ret = -ENOMEM;
		goto exit;
	}

	args.op = KFD_CRIU_OP_CHECKPOINT;
	ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
	if (ret) {
		pr_perror("amdgpu_plugin: Failed to call dumper (process) ioctl");
		goto exit;
	}

	ret = save_devices(fd, &args, (struct kfd_criu_device_bucket *)args.devices, e);
	if (ret)
		goto exit;

	ret = save_bos(fd, &args, (struct kfd_criu_bo_bucket *)args.bos, e);
	if (ret)
		goto exit;

	e->num_of_objects = args.num_objects;

	/* The message only borrows this buffer; it is released via args.priv_data below */
	e->priv_data.data = (void *)args.priv_data;
	e->priv_data.len = args.priv_data_size;

	snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);
	pr_info("amdgpu_plugin: img_path = %s\n", img_path);

	len = criu_kfd__get_packed_size(e);

	pr_info("amdgpu_plugin: Len = %zu\n", len);

	buf = xmalloc(len);
	if (!buf) {
		pr_perror("Failed to allocate memory to store protobuf");
		ret = -ENOMEM;
		goto exit;
	}

	criu_kfd__pack(e, buf);

	ret = write_file(img_path, buf, len);

	xfree(buf);
exit:
	/* Restore all queues */
	unpause_process(fd);

	xfree((void *)args.devices);
	xfree((void *)args.bos);
	xfree((void *)args.priv_data);

	/* e is still NULL when the process-info ioctl or its own allocation failed */
	if (e)
		free_e(e);

	if (ret)
		pr_err("amdgpu_plugin: Failed to dump (ret:%d)\n", ret);
	else
		pr_info("amdgpu_plugin: Dump successful\n");

	return ret;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_EXT_FILE, amdgpu_plugin_dump_file)
/*
 * Restore per-device information: allocate one device bucket per GPU
 * recorded in @e, fill in the saved gpu_id and open a DRM render node for
 * each (render minors are assigned sequentially starting at
 * DRM_FIRST_RENDER_NODE).
 *
 * The bucket array is stored in args->devices and is owned by the caller.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL if the image
 * holds fewer device entries than it announces, or the negative error
 * returned by open_drm_render_device().
 */
static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
{
	struct kfd_criu_device_bucket *device_buckets;
	int ret = 0;

	pr_debug("Restoring %d devices\n", e->num_of_gpus);

	/* The loop below indexes device_entries by GPU number */
	if (e->n_device_entries < e->num_of_gpus) {
		pr_err("Image has %zu device entries but announces %d GPUs\n", e->n_device_entries, e->num_of_gpus);
		return -EINVAL;
	}

	args->num_devices = e->num_of_gpus;
	device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices);
	if (!device_buckets)
		return -ENOMEM;

	args->devices = (uintptr_t)device_buckets;

	for (int i = 0; i < e->num_of_gpus; i++) {
		struct kfd_criu_device_bucket *device_bucket = &device_buckets[i];
		DeviceEntry *devinfo = e->device_entries[i];
		int drm_fd;

		device_bucket->user_gpu_id = devinfo->gpu_id;

		/*
		 * Check the result in a plain int: the bucket field's type is
		 * declared in kfd_ioctl.h (not visible here) and a negative
		 * value may not survive being stored in it.
		 */
		drm_fd = open_drm_render_device(i + DRM_FIRST_RENDER_NODE);
		if (drm_fd < 0) {
			pr_err("amdgpu_plugin: Can't pass NULL drm render fd to driver\n");
			/* Report the failure instead of returning success */
			ret = drm_fd;
			goto exit;
		}

		device_bucket->drm_fd = drm_fd;
		pr_info("amdgpu_plugin: passing drm render fd = %d to driver\n", drm_fd);
	}

exit:
	pr_info("Restore devices %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
	return ret;
}
/*
 * Build the buffer-object bucket array for the restore ioctl from the BO
 * entries stored in @e. Each bucket receives gpu_id, addr, size, offset and
 * alloc_flags of the matching entry; BO contents are not handled here.
 *
 * The array is stored in args->bos and is owned by the caller.
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
{
	struct kfd_criu_bo_bucket *buckets;
	int idx;

	pr_debug("Restoring %ld BOs\n", e->num_of_bos);

	args->num_bos = e->num_of_bos;

	buckets = xzalloc(sizeof(*buckets) * args->num_bos);
	if (buckets == NULL)
		return -ENOMEM;

	args->bos = (uintptr_t)buckets;

	for (idx = 0; idx < args->num_bos; idx++) {
		struct kfd_criu_bo_bucket *dst = buckets + idx;
		BoEntry *src = e->bo_entries[idx];

		dst->gpu_id = src->gpu_id;
		dst->addr = src->addr;
		dst->size = src->size;
		dst->offset = src->offset;
		dst->alloc_flags = src->alloc_flags;

		plugin_log_msg("BO [%d] gpu_id:%x addr:%llx size:%llx offset:%llx\n", idx, dst->gpu_id,
			       dst->addr, dst->size, dst->offset);
	}

	pr_info("Restore BOs Ok\n");
	return 0;
}
/*
 * Second restore stage, run after the restore ioctl filled in
 * restored_offset for every bucket in @bo_buckets:
 *   - for VRAM, GTT, MMIO-remap and doorbell BOs, remember the old and new
 *     mmap offsets in update_vma_info_list for the UPDATE_VMA_MAP hook;
 *   - for VRAM and GTT BOs, write the saved contents back: directly through
 *     a writable mapping of the KFD descriptor @fd for PUBLIC or GTT BOs,
 *     otherwise by mapping the BO and writing to that address through
 *     /proc/<e->pid>/mem.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL if an image
 * entry carries less data than its size field claims, and -EBADFD on any
 * mapping or I/O failure.
 */
static int restore_bo_data(int fd, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
	int mem_fd = -1, ret = 0;

	for (int i = 0; i < e->num_of_bos; i++) {
		void *addr;
		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
		BoEntry *bo_entry = e->bo_entries[i];

		if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT |
					      KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)) {
			struct vma_metadata *vma_md;

			vma_md = xmalloc(sizeof(*vma_md));
			if (!vma_md) {
				ret = -ENOMEM;
				goto exit;
			}

			vma_md->old_pgoff = bo_bucket->offset;
			vma_md->vma_entry = bo_bucket->addr;
			vma_md->new_pgoff = bo_bucket->restored_offset;

			plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx "
				       "new_off:0x%lx\n",
				       vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff);

			list_add_tail(&vma_md->list, &update_vma_info_list);
		}

		if (!(bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) {
			plugin_log_msg("Not a VRAM BO\n");
			continue;
		}

		/* The copies below read bo_entry->size bytes from rawdata */
		if (bo_entry->rawdata.len < bo_entry->size) {
			pr_err("BO %d: image holds %zu bytes but size is %llu\n", i, bo_entry->rawdata.len,
			       (unsigned long long)bo_entry->size);
			ret = -EINVAL;
			goto exit;
		}

		pr_info("amdgpu_plugin: Trying mmap in stage 2\n");

		if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC ||
		    bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
			plugin_log_msg("amdgpu_plugin: large bar write possible\n");

			addr = mmap(NULL, bo_bucket->size, PROT_WRITE, MAP_SHARED, fd, bo_bucket->restored_offset);
			if (addr == MAP_FAILED) {
				pr_perror("amdgpu_plugin: mmap failed");
				ret = -EBADFD;
				goto exit;
			}

			/* direct memcpy is possible on large bars */
			memcpy(addr, (void *)bo_entry->rawdata.data, bo_entry->size);
			munmap(addr, bo_entry->size);
		} else {
			size_t done = 0;
			char *fname;

			/* Use indirect host data path via /proc/pid/mem
			 * on small pci bar GPUs or for Buffer Objects
			 * that don't have HostAccess permissions.
			 */
			plugin_log_msg("amdgpu_plugin: using PROCPIDMEM to restore BO contents\n");

			addr = mmap(NULL, bo_bucket->size, PROT_NONE, MAP_SHARED, fd, bo_bucket->restored_offset);
			if (addr == MAP_FAILED) {
				pr_perror("amdgpu_plugin: mmap failed");
				ret = -EBADFD;
				goto exit;
			}

			/* On failure the contents of fname are undefined, so it is not printed */
			if (asprintf(&fname, PROCPIDMEM, e->pid) < 0) {
				pr_perror("failed in asprintf for pid %d", e->pid);
				munmap(addr, bo_bucket->size);
				ret = -EBADFD;
				goto exit;
			}

			mem_fd = open(fname, O_RDWR);
			if (mem_fd < 0) {
				pr_perror("Can't open %s for pid %d", fname, e->pid);
				free(fname);
				munmap(addr, bo_bucket->size);
				ret = -EBADFD;
				goto exit;
			}

			plugin_log_msg("Opened %s file for pid = %d", fname, e->pid);
			free(fname);

			if (lseek(mem_fd, (off_t)addr, SEEK_SET) == -1) {
				pr_perror("Can't lseek for bo_offset for pid = %d", e->pid);
				munmap(addr, bo_entry->size);
				ret = -EBADFD;
				goto exit;
			}

			plugin_log_msg("Attempt writing now");

			/* write() may legitimately accept less than requested; keep going */
			while (done < bo_entry->size) {
				ssize_t n = write(mem_fd, bo_entry->rawdata.data + done, bo_entry->size - done);

				if (n < 0 && errno == EINTR)
					continue;
				if (n <= 0) {
					pr_perror("Can't write buffer");
					munmap(addr, bo_entry->size);
					ret = -EBADFD;
					goto exit;
				}
				done += n;
			}

			munmap(addr, bo_entry->size);
			close(mem_fd);
			/* Forget the closed descriptor so the exit path can't close it twice */
			mem_fd = -1;
		}
	}

exit:
	if (mem_fd >= 0)
		close(mem_fd);

	return ret;
}
/*
 * RESTORE_EXT_FILE hook: recreate the device file descriptor that was
 * dumped under @id.
 *
 * If "amdgpu-kfd-<id>.img" does not exist in the image directory, @id is
 * treated as a DRM render node: "amdgpu-renderD-<id>.img" is read and the
 * render node with the saved minor number is opened.
 *
 * Otherwise /dev/kfd is opened, the image is unpacked, the device and BO
 * buckets are rebuilt, the restore ioctl is issued and the BO contents are
 * written back.
 *
 * Returns the restored file descriptor on success and a negative value on
 * failure. A KFD descriptor opened here is closed again on failure.
 */
int amdgpu_plugin_restore_file(int id)
{
	int ret = 0, fd;
	char img_path[PATH_MAX];
	struct stat filestat;
	unsigned char *buf = NULL;
	CriuRenderNode *rd;
	CriuKfd *e = NULL;
	struct kfd_ioctl_criu_args args = { 0 };

	pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id);

	snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);

	/* Look in the image directory, the same place read_file() opens the file from */
	if (fstatat(criu_get_image_dir(), img_path, &filestat, 0) == -1) {
		pr_perror("open(%s)", img_path);

		/* This is restorer plugin for renderD nodes. Since criu doesn't
		 * guarantee that they will be called before the plugin is called
		 * for kfd file descriptor, we need to make sure we open the render
		 * nodes only once and before /dev/kfd is open, the render nodes
		 * are open too. Generally, it is seen that during checkpoint and
		 * restore both, the kfd plugin gets called first.
		 */
		snprintf(img_path, sizeof(img_path), "amdgpu-renderD-%d.img", id);

		if (fstatat(criu_get_image_dir(), img_path, &filestat, 0) == -1) {
			pr_perror("Failed to read file stats");
			return -1;
		}
		pr_info("renderD file size on disk = %ld\n", filestat.st_size);

		buf = xmalloc(filestat.st_size);
		if (!buf) {
			pr_perror("Failed to allocate memory");
			return -ENOMEM;
		}

		if (read_file(img_path, buf, filestat.st_size)) {
			pr_perror("Unable to read from %s", img_path);
			xfree(buf);
			return -1;
		}

		rd = criu_render_node__unpack(NULL, filestat.st_size, buf);
		if (rd == NULL) {
			pr_err("Unable to parse the render node message %d\n", id);
			xfree(buf);
			return -1;
		}

		pr_info("amdgpu_plugin: render node minor num = %d\n", rd->minor_number);
		fd = open_drm_render_device(rd->minor_number);
		criu_render_node__free_unpacked(rd, NULL);
		xfree(buf);
		return fd;
	}

	fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
	if (fd < 0) {
		pr_perror("failed to open kfd in plugin");
		return -1;
	}

	pr_info("amdgpu_plugin: Opened kfd, fd = %d\n", fd);
	pr_info("kfd img file size on disk = %ld\n", filestat.st_size);

	/* From here on every failure leaves through exit, which closes fd */
	buf = xmalloc(filestat.st_size);
	if (!buf) {
		pr_perror("Failed to allocate memory");
		ret = -ENOMEM;
		goto exit;
	}

	if (read_file(img_path, buf, filestat.st_size)) {
		pr_perror("Unable to read from %s", img_path);
		ret = -1;
		goto exit;
	}

	e = criu_kfd__unpack(NULL, filestat.st_size, buf);
	if (e == NULL) {
		pr_err("Unable to parse the KFD message %#x\n", id);
		ret = -1;
		goto exit;
	}

	plugin_log_msg("amdgpu_plugin: read image file data\n");

	ret = restore_devices(&args, e);
	if (ret)
		goto exit;

	ret = restore_bos(&args, e);
	if (ret)
		goto exit;

	args.num_objects = e->num_of_objects;
	args.priv_data_size = e->priv_data.len;
	/* Borrowed from the unpacked message; released together with e below */
	args.priv_data = (uintptr_t)e->priv_data.data;

	args.op = KFD_CRIU_OP_RESTORE;
	if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
		pr_perror("Restore ioctl failed");
		ret = -1;
		goto exit;
	}

	ret = restore_bo_data(fd, (struct kfd_criu_bo_bucket *)args.bos, e);

exit:
	if (e)
		criu_kfd__free_unpacked(e, NULL);

	xfree((void *)args.devices);
	xfree((void *)args.bos);
	xfree(buf);

	if (ret) {
		pr_err("amdgpu_plugin: Failed to restore (ret:%d)\n", ret);
		/* The caller only gets an error code, so the descriptor must not stay open */
		close(fd);
		fd = ret;
	} else {
		pr_info("amdgpu_plugin: Restore successful (fd:%d)\n", fd);
	}

	return fd;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, amdgpu_plugin_restore_file)
/*
 * UPDATE_VMA_MAP hook: look up the mmap offset a restored mapping must use.
 *
 * The list filled during BO restore is searched for a record whose address
 * equals @addr and whose saved offset equals @old_offset. On a match the
 * restored offset is stored in *@new_offset. *@updated_fd is always set
 * to -1.
 *
 * Returns 1 if the mapping must be adjusted, 0 if no record matches.
 */
int amdgpu_plugin_update_vmamap(const char *path, const uint64_t addr, const uint64_t old_offset, uint64_t *new_offset,
				int *updated_fd)
{
	struct vma_metadata *entry;

	plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__);

	/*
	 * On newer versions of AMD KFD driver, only the file descriptor that was used to open the
	 * device can be used for mmap, so we will have to return the proper file descriptor here
	 */
	*updated_fd = -1;

	list_for_each_entry(entry, &update_vma_info_list, list) {
		if (addr != entry->vma_entry || old_offset != entry->old_pgoff)
			continue;

		*new_offset = entry->new_pgoff;

		plugin_log_msg("amdgpu_plugin: old_pgoff= 0x%lx new_pgoff = 0x%lx path = %s\n",
			       entry->old_pgoff, entry->new_pgoff, path);

		return 1;
	}

	pr_info("No match for addr:0x%lx offset:%lx\n", addr, old_offset);
	return 0;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, amdgpu_plugin_update_vmamap)
/*
 * RESUME_DEVICES_LATE hook: open /dev/kfd and issue KFD_CRIU_OP_RESUME for
 * @target_pid. The descriptor is closed again before returning.
 *
 * Returns 0 on success, -1 if the device cannot be opened or the ioctl fails.
 */
int amdgpu_plugin_resume_devices_late(int target_pid)
{
	struct kfd_ioctl_criu_args args = { 0 };
	int status = 0;
	int kfd_fd;

	pr_info("amdgpu_plugin: Inside %s for target pid = %d\n", __func__, target_pid);

	kfd_fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
	if (kfd_fd < 0) {
		pr_perror("failed to open kfd in plugin");
		return -1;
	}

	args.pid = target_pid;
	args.op = KFD_CRIU_OP_RESUME;

	pr_info("amdgpu_plugin: Calling IOCTL to start notifiers and queues\n");

	if (kmtIoctl(kfd_fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
		pr_perror("restore late ioctl failed");
		status = -1;
	}

	close(kfd_fd);
	return status;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)

View file

@ -0,0 +1,28 @@
syntax = "proto2";
// One record per GPU in the KFD image.
message device_entry {
// On restore this value is passed to the driver as the device bucket's user_gpu_id.
required uint32 gpu_id = 1;
}
// One record per buffer object (BO). addr, size, offset, alloc_flags and
// gpu_id are copied from the driver's BO bucket at dump time and copied
// back into a bucket at restore time.
message bo_entry {
required uint64 addr = 1;
required uint64 size = 2;
required uint64 offset = 3;
required uint32 alloc_flags = 4;
required uint32 gpu_id = 5;
// BO contents; a buffer is only allocated for BOs flagged VRAM or GTT.
required bytes rawdata = 6;
}
// Top-level message stored in "amdgpu-kfd-<id>.img".
message criu_kfd {
// pid reported by the driver's process-info operation at dump time.
required uint32 pid = 1;
// Device count reported by the driver; device_entries holds one entry each.
required uint32 num_of_gpus = 2;
repeated device_entry device_entries = 3;
// BO count reported by the driver; bo_entries holds one entry each.
required uint64 num_of_bos = 4;
repeated bo_entry bo_entries = 5;
// Object count reported by the driver, handed back unchanged on restore.
required uint32 num_of_objects = 6;
// Private data blob filled by the checkpoint ioctl and handed back
// unchanged to the restore ioctl.
required bytes priv_data = 7;
}
// Message stored in "amdgpu-renderD-<id>.img" for a DRM render node descriptor.
message criu_render_node {
// Device minor number of the dumped render node; used on restore to
// reopen /dev/dri/renderD<minor_number>.
required uint32 minor_number = 1;
}

View file

@ -1,36 +0,0 @@
#include <sys/stat.h>
#include "criu-log.h"
#include "criu-plugin.h"
/*
 * HANDLE_DEVICE_VMA hook of the dummy plugin: logs the call and always
 * answers -ENOTSUP, leaving it to criu to report the unsupported mapping.
 */
int dummy_plugin_handle_device_vma(int fd, const struct stat *stat)
{
	const int verdict = -ENOTSUP;

	pr_info("dummy_plugin: Inside %s for fd = %d\n", __func__, fd);

	return verdict;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, dummy_plugin_handle_device_vma)
/*
 * RESUME_DEVICES_LATE hook of the dummy plugin: logs the target pid and
 * always answers -ENOTSUP.
 */
int dummy_plugin_resume_devices_late(int target_pid)
{
	const int verdict = -ENOTSUP;

	pr_info("dummy_plugin: Inside %s for target pid = %d\n", __func__, target_pid);

	return verdict;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, dummy_plugin_resume_devices_late)
/*
 * UPDATE_VMA_MAP hook of the dummy plugin.
 *
 * Hook contract:
 * return 0 if no match found
 * return -1 for error or -ENOTSUP.
 * return 1 if vmap map must be adjusted.
 *
 * This implementation stores the placeholder value 100 in *new_offset, logs
 * its arguments and always returns -ENOTSUP.
 */
int dummy_plugin_update_vmamap(const char *old_path, char *new_path, const uint64_t addr, const uint64_t old_offset,
			       uint64_t *new_offset)
{
	uint64_t temp = 100;

	*new_offset = temp;

	/* Values behind a "0x" prefix are printed in hex (%lx), not decimal */
	pr_info("dummy_plugin: old_pgoff= 0x%lx new_pgoff = 0x%lx old_path = %s new_path = %s addr = 0x%lx\n",
		old_offset, *new_offset, old_path, new_path, addr);

	return -ENOTSUP;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, dummy_plugin_update_vmamap)