Merge remote-tracking branch 'upstream/criu-dev' into criu-dev

asafpamzn 2025-11-16 12:19:02 +02:00
commit d39e2b5ff2
28 changed files with 5997 additions and 143 deletions


@ -1,3 +1,7 @@
[codespell]
skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h
ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems


@ -1,10 +1,10 @@
#
# CRIU version.
CRIU_VERSION_MAJOR := 4
CRIU_VERSION_MINOR := 0
CRIU_VERSION_MINOR := 2
CRIU_VERSION_SUBLEVEL :=
CRIU_VERSION_EXTRA :=
CRIU_VERSION_NAME := CRIUDA
CRIU_VERSION_NAME := CRIUTIBILITY
CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA))
export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL


@ -794,7 +794,12 @@ class coredump_generator:
off = 0 # in pages
for m in pagemap[1:]:
found = False
num_pages = m.get("nr_pages", m["compat_nr_pages"])
for i in range(num_pages):
if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE:
found = True


@ -705,7 +705,10 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
BOOL_OPT("unprivileged", &opts.unprivileged),
BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap),
BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes),
{ "cow-dump", no_argument, 0, 1105 },
{},
};


@ -2289,6 +2289,10 @@ int cr_dump_tasks(pid_t pid)
goto err;
}
ret = run_plugins(DUMP_DEVICES_LATE, pid);
if (ret && ret != -ENOTSUP)
goto err;
if (parent_ie) {
inventory_entry__free_unpacked(parent_ie, NULL);
parent_ie = NULL;


@ -439,12 +439,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
if (req->has_unprivileged)
opts.unprivileged = req->unprivileged;
if (check_caps())
return 1;
if (kerndat_init())
return 1;
if (log_keep_err()) {
pr_perror("Can't tune log");
goto err;
@ -738,9 +732,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
}
}
if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk))
goto err;
if (req->orphan_pts_master)
opts.orphan_pts_master = true;
@ -817,6 +808,19 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
if (setup_logging_from_req(req, output_changed_by_rpc_conf))
goto err;
if (check_caps())
goto err;
if (kerndat_init())
goto err;
/* init_pidfd_store_sk must be called after kerndat_init. */
if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk))
goto err;
if (req->mntns_compat_mode)
opts.mntns_compat_mode = true;


@ -45,10 +45,11 @@ static int open_fd(struct file_desc *d, int *new_fd)
{
struct ext_file_info *xfi;
int fd;
bool retry_needed;
xfi = container_of(d, struct ext_file_info, d);
fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id);
fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, &retry_needed);
if (fd < 0) {
pr_err("Unable to restore %#x\n", xfi->xfe->id);
return -1;
@ -57,8 +58,11 @@ static int open_fd(struct file_desc *d, int *new_fd)
if (restore_fown(fd, xfi->xfe->fown))
return -1;
*new_fd = fd;
return 0;
if (!retry_needed)
*new_fd = fd;
else
*new_fd = -1;
return retry_needed;
}
static struct file_desc_ops ext_desc_ops = {


@ -62,6 +62,13 @@ enum {
CR_PLUGIN_HOOK__POST_FORKING = 12,
CR_PLUGIN_HOOK__RESTORE_INIT = 13,
CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14,
CR_PLUGIN_HOOK__MAX
};
@ -70,7 +77,7 @@ enum {
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind);
@ -81,6 +88,11 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id);
enum {
CR_PLUGIN_STAGE__DUMP,


@ -190,7 +190,7 @@ void flush_early_log_buffer(int fd)
* with reading the log_level.
*/
struct early_log_hdr *hdr = (void *)early_log_buffer + pos;
pos += sizeof(hdr);
pos += sizeof(*hdr);
if (hdr->level <= current_loglevel) {
size_t size = 0;
while (size < hdr->len) {
@ -202,7 +202,7 @@ void flush_early_log_buffer(int fd)
}
pos += hdr->len;
}
if (early_log_buf_off == EARLY_LOG_BUF_LEN)
if ((early_log_buf_off + sizeof(struct early_log_hdr)) >= EARLY_LOG_BUF_LEN)
pr_warn("The early log buffer is full, some messages may have been lost\n");
early_log_buf_off = 0;
}
@ -320,10 +320,10 @@ unsigned int log_get_loglevel(void)
static void early_vprint(const char *format, unsigned int loglevel, va_list params)
{
unsigned int log_size = 0;
int log_size = 0, log_space;
struct early_log_hdr *hdr;
if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN)
if ((early_log_buf_off + sizeof(*hdr)) >= EARLY_LOG_BUF_LEN)
return;
/* Save loglevel */
@ -331,7 +331,8 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para
hdr = (void *)early_log_buffer + early_log_buf_off;
hdr->level = loglevel;
/* Skip the log entry size */
early_log_buf_off += sizeof(hdr);
early_log_buf_off += sizeof(*hdr);
log_space = EARLY_LOG_BUF_LEN - early_log_buf_off;
if (loglevel >= LOG_TIMESTAMP) {
/*
* If logging is not yet setup we just write zeros
@ -339,12 +340,17 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para
* keep the same format as the other messages on
* log levels with timestamps (>=LOG_TIMESTAMP).
*/
log_size = snprintf(early_log_buffer + early_log_buf_off, sizeof(early_log_buffer) - early_log_buf_off,
log_size = snprintf(early_log_buffer + early_log_buf_off, log_space,
"(00.000000) ");
}
log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size,
sizeof(early_log_buffer) - early_log_buf_off - log_size, format, params);
if (log_size < log_space)
log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size,
log_space - log_size, format, params);
if (log_size > log_space) {
/* vsnprintf always add the terminating null byte. */
log_size = log_space - 1;
}
/* Save log entry size */
hdr->len = log_size;


@ -1218,11 +1218,23 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
* on 32-bit platforms (e.g. armv7). */
nr_pages = pi->nr_pages;
ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY);
if (ret) {
ps_stats.get_errors++;
return ret;
}
/*
* The pi is reused for send_psi here, so .nr_pages, .vaddr and
* .dst_id all remain intact.
*/
pi->nr_pages = nr_pages;
if (pi->nr_pages == 0) {
pr_debug("no iovs found, zero pages\n");


@ -1989,6 +1989,9 @@ __visible long __export_restore_task(struct task_restore_args *args)
for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
if (vma_entry->madv & (1ul << m)) {
if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
continue;
ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m);
if (ret) {
pr_err("madvise(%" PRIx64 ", %" PRIu64 ", %ld) "


@ -60,6 +60,11 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
__assign_hook(POST_FORKING, "cr_plugin_post_forking");
__assign_hook(RESTORE_INIT, "cr_plugin_restore_init");
__assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late");
#undef __assign_hook
@ -257,8 +262,21 @@ int cr_plugin_init(int stage)
goto err;
}
if (stage == CR_PLUGIN_STAGE__RESTORE) {
int ret;
if (check_inventory_plugins())
goto err;
ret = run_plugins(RESTORE_INIT);
if (ret < 0 && ret != -ENOTSUP)
goto err;
}
exit_code = 0;
err:


@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me)
ret = 0;
return ret;
}
}


@ -953,6 +953,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsi
return 0;
}
static int xfer_pages(struct lazy_pages_info *lpi);
/*
* Aggressively refill pipeline to maximum capacity.
@ -973,6 +974,8 @@ static int refill_pipeline(struct lazy_pages_info *lpi)
return 0;
}
static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages)
{
struct uffdio_copy uffdio_copy;


@ -27,7 +27,11 @@ endif
criu-amdgpu.pb-c.c: criu-amdgpu.proto
protoc --proto_path=. --c_out=. criu-amdgpu.proto
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
amdgpu_plugin_clean:

plugins/amdgpu/amdgpu_drm.h (new file, 1801 lines): diff suppressed because it is too large.


@ -12,25 +12,36 @@
#include <sys/sysmacros.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdint.h>
#include <pthread.h>
#include <semaphore.h>
#include <xf86drm.h>
#include <libdrm/amdgpu.h>
#include <libdrm/amdgpu_drm.h>
#include "criu-plugin.h"
#include "plugin.h"
#include "criu-amdgpu.pb-c.h"
#include "util.h"
#include "util-pie.h"
#include "fdstore.h"
#include "kfd_ioctl.h"
#include "xmalloc.h"
#include "criu-log.h"
#include "files.h"
#include "pstree.h"
#include "sockets.h"
#include "rst-malloc.h"
#include "common/list.h"
#include "amdgpu_drm.h"
#include "amdgpu_plugin_dmabuf.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
@ -39,6 +50,7 @@
#include "img-streamer.h"
#include "image.h"
#include "cr_options.h"
#include "util.h"
struct vma_metadata {
struct list_head list;
@ -51,13 +63,6 @@ struct vma_metadata {
/************************************ Global Variables ********************************************/
/**
* FD of KFD device used to checkpoint. On a multi-process
* tree the order of checkpointing goes from parent to child
* and so on - so saving the FD will not be overwritten
*/
static int kfd_checkpoint_fd;
static LIST_HEAD(update_vma_info_list);
size_t kfd_max_buffer_size;
@ -66,6 +71,22 @@ bool plugin_added_to_inventory = false;
bool plugin_disabled = false;
struct handle_id {
int handle;
int fdstore_id;
};
struct shared_handle_ids {
int num_handles;
struct handle_id *handles;
};
struct shared_handle_ids *shared_memory = NULL;
static mutex_t *shared_memory_mutex;
int current_pid;
/*
* In the case of a single process (common case), this optimization can effectively
* reduce the restore latency with parallel restore. In the case of multiple processes,
@ -313,8 +334,6 @@ void getenv_size_t(const char *var, size_t *value)
int sh = 0;
size_t size;
pr_info("Value str: %s\n", value_str);
if (value_str) {
size = (size_t)strtoul(value_str, &endp, 0);
if (errno || value_str == endp) {
@ -526,11 +545,11 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va,
amdgpu_bo_free(h_bo);
}
static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
uint64_t max_copy_size, enum sdma_op_type type)
int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free)
{
uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size;
amdgpu_va_handle h_va_src, h_va_dst, h_va_ib;
amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib;
@ -543,10 +562,8 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
uint32_t expired;
amdgpu_context_handle h_ctx;
uint32_t *ib = NULL;
int j, err, shared_fd, packets_per_buffer;
int j, err, packets_per_buffer;
shared_fd = bo_bucket.dmabuf_fd;
size = bo_bucket.size;
buffer_bo_size = min(size, buffer_size);
packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1;
src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size;
@ -757,7 +774,8 @@ err_dst_bo_map:
if (err)
pr_perror("dest range free failed");
err_dst_va:
err = amdgpu_bo_free(h_bo_dst);
if (!do_not_free)
err = amdgpu_bo_free(h_bo_dst);
if (err)
pr_perror("dest bo free failed");
err_dst_bo_prep:
@ -845,8 +863,9 @@ void *dump_bo_contents(void *_thread_data)
num_bos++;
/* perform sDMA based vram copy */
ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
SDMA_OP_VRAM_READ);
ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
SDMA_OP_VRAM_READ, false);
if (ret) {
pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i);
break;
@ -943,8 +962,8 @@ void *restore_bo_contents(void *_thread_data)
num_bos++;
ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
SDMA_OP_VRAM_WRITE);
ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
SDMA_OP_VRAM_WRITE, false);
if (ret) {
pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
break;
@ -1030,28 +1049,163 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
return 0;
}
static int unpause_process(int fd)
int amdgpu_unpause_processes(int pid)
{
int ret = 0;
struct kfd_ioctl_criu_args args = { 0 };
struct list_head *l = get_dumped_fds();
struct dumped_fd *st;
args.op = KFD_CRIU_OP_UNPAUSE;
list_for_each_entry(st, l, l) {
if (st->is_drm) {
close(st->fd);
} else {
args.op = KFD_CRIU_OP_UNPAUSE;
ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
if (ret) {
pr_perror("Failed to unpause process");
goto exit;
ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args);
if (ret) {
pr_perror("Failed to unpause process");
goto exit;
}
}
}
// Reset the KFD FD
kfd_checkpoint_fd = -1;
sys_close_drm_render_devices(&src_topology);
if (post_dump_dmabuf_check() < 0)
ret = -1;
exit:
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
clear_dumped_fds();
return ret;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes)
int store_dmabuf_fd(int handle, int fd)
{
int id;
id = fdstore_add(fd);
mutex_lock(shared_memory_mutex);
for (int i = 0; i < shared_memory->num_handles; i++) {
if (shared_memory->handles[i].handle == handle) {
mutex_unlock(shared_memory_mutex);
return 0;
}
if (shared_memory->handles[i].handle == -1) {
shared_memory->handles[i].handle = handle;
shared_memory->handles[i].fdstore_id = id;
mutex_unlock(shared_memory_mutex);
return 0;
}
}
mutex_unlock(shared_memory_mutex);
return -1;
}
int amdgpu_id_for_handle(int handle)
{
mutex_lock(shared_memory_mutex);
for (int i = 0; i < shared_memory->num_handles; i++) {
if (shared_memory->handles[i].handle == handle) {
mutex_unlock(shared_memory_mutex);
return shared_memory->handles[i].fdstore_id;
}
}
mutex_unlock(shared_memory_mutex);
return -1;
}
int amdgpu_restore_init(void)
{
if (!shared_memory) {
int protection = PROT_READ | PROT_WRITE;
int visibility = MAP_SHARED | MAP_ANONYMOUS;
size_t img_size;
FILE *img_fp = NULL;
int ret;
unsigned char *buf;
int num_handles = 0;
char img_path[PATH_MAX];
CriuRenderNode *rd = NULL;
CriuKfd *e = NULL;
DIR *d;
struct dirent *dir;
d = opendir(".");
if (d) {
while ((dir = readdir(d)) != NULL) {
if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) {
img_fp = open_img_file(dir->d_name, false, &img_size);
buf = xmalloc(img_size);
if (!buf) {
fclose(img_fp);
return -ENOMEM;
}
ret = read_fp(img_fp, buf, img_size);
if (ret) {
pr_perror("Unable to read from %s", img_path);
fclose(img_fp);
xfree(buf);
return ret;
}
fclose(img_fp);
e = criu_kfd__unpack(NULL, img_size, buf);
num_handles += e->num_of_bos;
criu_kfd__free_unpacked(e, NULL);
xfree(buf);
}
if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) {
img_fp = open_img_file(dir->d_name, false, &img_size);
buf = xmalloc(img_size);
if (!buf) {
fclose(img_fp);
return -ENOMEM;
}
ret = read_fp(img_fp, buf, img_size);
if (ret) {
pr_perror("Unable to read from %s", img_path);
fclose(img_fp);
xfree(buf);
return ret;
}
fclose(img_fp);
rd = criu_render_node__unpack(NULL, img_size, buf);
num_handles += rd->num_of_bos;
criu_render_node__free_unpacked(rd, NULL);
xfree(buf);
}
}
closedir(d);
}
if (num_handles > 0) {
shared_memory = mmap(NULL, sizeof(shared_memory), protection, visibility, -1, 0);
shared_memory->num_handles = num_handles;
shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0);
for (int i = 0; i < num_handles; i++) {
shared_memory->handles[i].handle = -1;
shared_memory->handles[i].fdstore_id = -1;
}
shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex));
if (!shared_memory_mutex) {
pr_err("Can't create amdgpu mutex\n");
return -1;
}
mutex_init(shared_memory_mutex);
}
}
return 0;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init)
static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
CriuKfd *e)
@ -1095,6 +1249,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
{
struct thread_data *thread_datas;
int ret = 0, i;
amdgpu_device_handle h_dev;
uint32_t major, minor;
pr_debug("Dumping %d BOs\n", args->num_bos);
@ -1118,6 +1274,19 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
boinfo->size = bo_bucket->size;
boinfo->offset = bo_bucket->offset;
boinfo->alloc_flags = bo_bucket->alloc_flags;
ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev);
boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd);
amdgpu_device_deinitialize(h_dev);
}
for (i = 0; i < e->num_of_bos; i++) {
KfdBoEntry *boinfo = e->bo_entries[i];
ret = record_shared_bo(boinfo->handle, false);
if (ret)
goto exit;
}
for (int i = 0; i < e->num_of_gpus; i++) {
@ -1238,10 +1407,17 @@ int amdgpu_plugin_dump_file(int fd, int id)
return -1;
}
/* Initialize number of device files that will be checkpointed */
init_gpu_count(&src_topology);
/* Check whether this plugin was called for kfd, dmabuf or render nodes */
ret = get_dmabuf_info(fd, &st);
if (ret < 0) {
pr_perror("Failed to get dmabuf info");
return -1;
}
if (ret == 0) {
pr_info("Dumping dmabuf fd = %d\n", fd);
return amdgpu_plugin_dmabuf_dump(fd, id);
}
/* Check whether this plugin was called for kfd or render nodes */
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
/* This is RenderD dumper plugin, for now just save renderD
@ -1252,14 +1428,12 @@ int amdgpu_plugin_dump_file(int fd, int id)
if (ret)
return ret;
/* Invoke unpause process if needed */
decrement_checkpoint_count();
if (checkpoint_is_complete()) {
ret = unpause_process(kfd_checkpoint_fd);
}
ret = record_dumped_fd(fd, true);
if (ret)
return ret;
/* Need to return success here so that criu can call plugins for renderD nodes */
return ret;
return try_dump_dmabuf_list();
}
pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev));
@ -1354,14 +1528,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
xfree(buf);
exit:
/* Restore all queues if conditions permit */
kfd_checkpoint_fd = fd;
decrement_checkpoint_count();
if (checkpoint_is_complete()) {
ret = unpause_process(fd);
}
ret = record_dumped_fd(fd, false);
if (ret)
goto exit;
exit:
xfree((void *)args.devices);
xfree((void *)args.bos);
xfree((void *)args.priv_data);
@ -1384,7 +1555,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
int ret = 0, bucket_index = 0;
pr_debug("Restoring %d devices\n", e->num_of_gpus);
args->num_devices = e->num_of_gpus;
device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices);
if (!device_buckets)
@ -1457,6 +1627,29 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
}
pr_info("Restore BOs Ok\n");
return 0;
}
int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd)
{
struct vma_metadata *vma_md;
vma_md = xmalloc(sizeof(*vma_md));
if (!vma_md) {
return -ENOMEM;
}
memset(vma_md, 0, sizeof(*vma_md));
vma_md->old_pgoff = offset;
vma_md->vma_entry = addr;
vma_md->new_pgoff = restored_offset;
vma_md->fd = fd;
list_add_tail(&vma_md->list, &update_vma_info_list);
return 0;
}
@ -1614,7 +1807,7 @@ exit:
return ret;
}
int amdgpu_plugin_restore_file(int id)
int amdgpu_plugin_restore_file(int id, bool *retry_needed)
{
int ret = 0, fd;
char img_path[PATH_MAX];
@ -1625,6 +1818,11 @@ int amdgpu_plugin_restore_file(int id)
size_t img_size;
FILE *img_fp = NULL;
*retry_needed = false;
if (plugin_disabled)
return -ENOTSUP;
@ -1643,12 +1841,21 @@ int amdgpu_plugin_restore_file(int id)
* first as we assume restore_maps is already filled. Need to fix this later.
*/
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
pr_info("Restoring RenderD %s\n", img_path);
img_fp = open_img_file(img_path, false, &img_size);
if (!img_fp)
return -EINVAL;
if (!img_fp) {
ret = amdgpu_plugin_dmabuf_restore(id);
if (ret == 1) {
/* This is a dmabuf fd, but the corresponding buffer object that was
* exported to make it has not yet been restored. Need to try again
* later when the buffer object exists, so it can be re-exported.
*/
*retry_needed = true;
return 0;
}
return ret;
}
pr_info("Restoring RenderD %s\n", img_path);
pr_debug("RenderD Image file size:%ld\n", img_size);
buf = xmalloc(img_size);
if (!buf) {
@ -1689,8 +1896,18 @@ int amdgpu_plugin_restore_file(int id)
pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);
fd = node_get_drm_render_device(tp_node);
if (fd < 0)
if (fd < 0) {
pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor);
return -1;
}
ret = amdgpu_plugin_drm_restore_file(fd, rd);
if (ret == 1)
*retry_needed = true;
if (ret < 0) {
fd = ret;
goto fail;
}
fail:
criu_render_node__free_unpacked(rd, NULL);
xfree(buf);
@ -1702,12 +1919,20 @@ int amdgpu_plugin_restore_file(int id)
* copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in
* tp_node.
*/
fd = dup(fd);
if (fd == -1) {
pr_perror("unable to duplicate the render fd");
return -1;
if (fd < 0)
return fd;
if (!(*retry_needed)) {
fd = dup(fd);
if (fd == -1) {
pr_perror("unable to duplicate the render fd");
return -1;
}
return fd;
}
return fd;
return 0;
}
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
@ -1751,11 +1976,13 @@ int amdgpu_plugin_restore_file(int id)
* This way, we know that the file descriptors we store will not conflict with file descriptors inside core
* CRIU.
*/
fd_next = find_unused_fd_pid(e->pid);
if (fd_next <= 0) {
pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
ret = -EINVAL;
goto exit;
if (fd_next == -1) {
fd_next = find_unused_fd_pid(e->pid);
if (fd_next <= 0) {
pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
ret = -EINVAL;
goto exit;
}
}
ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology);
@ -1788,14 +2015,26 @@ int amdgpu_plugin_restore_file(int id)
args.num_objects = e->num_of_objects;
args.priv_data_size = e->priv_data.len;
args.priv_data = (uintptr_t)e->priv_data.data;
args.op = KFD_CRIU_OP_RESTORE;
if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
pr_perror("Restore ioctl failed");
ret = -1;
goto exit;
}
if (ret < 0)
goto exit;
for (int i = 0; i < args.num_bos; i++) {
struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i];
KfdBoEntry *bo_entry = e->bo_entries[i];
if (bo_entry->handle != -1) {
store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd);
}
}
ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e);
if (ret)
goto exit;
@ -1938,12 +2177,15 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
}
}
clear_restore_state();
close(fd);
return exit_code;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)
int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size)
{
int ret = 0;
@ -2059,8 +2303,15 @@ void *parallel_restore_bo_contents(void *_thread_data)
entry = &restore_cmd->entries[i];
fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp,
buffer, buffer_size, h_dev,
max_copy_size, SDMA_OP_VRAM_WRITE, false);
if (ret) {
pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
goto err_sdma;


@ -0,0 +1,197 @@
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <linux/limits.h>
#include "common/list.h"
#include "criu-amdgpu.pb-c.h"
#include "xmalloc.h"
#include "criu-log.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_dmabuf.h"
#include "fdstore.h"
#include "util.h"
#include "common/scm.h"
struct dmabuf {
int id;
int dmabuf_fd;
struct list_head node;
};
static LIST_HEAD(dmabuf_list);
/* Return < 0 for error, > 0 for "not a dmabuf" and 0 "is a dmabuf" */
int get_dmabuf_info(int fd, struct stat *st)
{
char path[PATH_MAX];
if (read_fd_link(fd, path, sizeof(path)) < 0)
return -1;
if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0)
return 1;
return 0;
}
int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id)
{
int ret = 0;
char path[PATH_MAX];
size_t len = 0;
unsigned char *buf = NULL;
int gem_handle;
gem_handle = handle_for_shared_bo_fd(dmabuf_fd);
if (gem_handle < 0) {
pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd);
return -EAGAIN; /* Retry needed */
}
CriuDmabufNode *node = xmalloc(sizeof(*node));
if (!node) {
pr_err("Failed to allocate memory for dmabuf node\n");
return -ENOMEM;
}
criu_dmabuf_node__init(node);
node->gem_handle = gem_handle;
if (node->gem_handle < 0) {
pr_err("Failed to get handle for dmabuf_fd\n");
xfree(node);
return -EINVAL;
}
/* Serialize metadata to a file */
snprintf(path, sizeof(path), IMG_DMABUF_FILE, id);
len = criu_dmabuf_node__get_packed_size(node);
buf = xmalloc(len);
if (!buf) {
pr_err("Failed to allocate buffer for dmabuf metadata\n");
xfree(node);
return -ENOMEM;
}
criu_dmabuf_node__pack(node, buf);
ret = write_img_file(path, buf, len);
xfree(buf);
xfree(node);
return ret;
}
int amdgpu_plugin_dmabuf_restore(int id)
{
char path[PATH_MAX];
size_t img_size;
FILE *img_fp = NULL;
int ret = 0;
CriuDmabufNode *rd = NULL;
unsigned char *buf = NULL;
int fd_id;
snprintf(path, sizeof(path), IMG_DMABUF_FILE, id);
/* Read serialized metadata */
img_fp = open_img_file(path, false, &img_size);
if (!img_fp) {
pr_err("Failed to open dmabuf metadata file: %s\n", path);
return -EINVAL;
}
pr_debug("dmabuf Image file size:%ld\n", img_size);
buf = xmalloc(img_size);
if (!buf) {
pr_perror("Failed to allocate memory");
return -ENOMEM;
}
ret = read_fp(img_fp, buf, img_size);
if (ret) {
pr_perror("Unable to read from %s", path);
xfree(buf);
return ret;
}
rd = criu_dmabuf_node__unpack(NULL, img_size, buf);
if (rd == NULL) {
pr_perror("Unable to parse the dmabuf message %d", id);
xfree(buf);
fclose(img_fp);
return -1;
}
fclose(img_fp);
/* Match GEM handle with shared_dmabuf list */
fd_id = amdgpu_id_for_handle(rd->gem_handle);
if (fd_id == -1) {
pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle);
return 1;
}
int dmabuf_fd = fdstore_get(fd_id);
if (dmabuf_fd == -1) {
pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle);
return 1; /* Retry needed */
}
pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", dmabuf_fd, rd->gem_handle);
ret = dmabuf_fd;
pr_info("Successfully restored dmabuf_fd %d\n", dmabuf_fd);
criu_dmabuf_node__free_unpacked(rd, NULL);
xfree(buf);
return ret;
}
int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id)
{
int ret;
ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id);
if (ret == -EAGAIN) {
struct dmabuf *b = xmalloc(sizeof(*b));
b->id = id;
b->dmabuf_fd = dmabuf_fd;
list_add(&b->node, &dmabuf_list);
return 0;
}
return ret;
}
int try_dump_dmabuf_list()
{
struct dmabuf *b, *t;
list_for_each_entry_safe(b, t, &dmabuf_list, node) {
int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id);
if (ret == -EAGAIN)
continue;
if (ret)
return ret;
list_del(&b->node);
xfree(b);
}
return 0;
}
int post_dump_dmabuf_check()
{
if (!list_empty(&dmabuf_list)) {
pr_err("Not all dma buffers have been dumped\n");
return -1;
}
return 0;
}


@ -0,0 +1,16 @@
#ifndef __AMDGPU_PLUGIN_DMABUF_H__
#define __AMDGPU_PLUGIN_DMABUF_H__
#include "amdgpu_plugin_util.h"
#include "criu-amdgpu.pb-c.h"
int amdgpu_plugin_dmabuf_dump(int fd, int id);
int amdgpu_plugin_dmabuf_restore(int id);
int try_dump_dmabuf_list();
int post_dump_dmabuf_check();
int get_dmabuf_info(int fd, struct stat *st);
#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */


@ -19,19 +19,115 @@
#include <dirent.h>
#include "common/list.h"
#include "files.h"
#include "fdstore.h"
#include "criu-amdgpu.pb-c.h"
/* Define __user as empty for kernel headers in user-space */
#define __user
#include "drm.h"
#include <xf86drm.h>
#include <libdrm/amdgpu.h>
#include "xmalloc.h"
#include "criu-log.h"
#include "kfd_ioctl.h"
#include "amdgpu_drm.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
#include "util.h"
#include "common/scm.h"
int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd)
{
uint32_t handle;
int fd = amdgpu_device_get_fd(h_dev);
if (dmabuf_fd == -1) {
return -1;
}
if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle))
return -1;
return handle;
}
int drmIoctl(int fd, unsigned long request, void *arg)
{
int ret, max_retries = 200;
do {
ret = ioctl(fd, request, arg);
} while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN));
if (ret == -1 && errno == EBADF)
/* In case pthread_atfork didn't catch it, this will
* make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
*/
pr_perror("KFD file descriptor not valid in this process");
return ret;
}
static int allocate_bo_entries(CriuRenderNode *e, int num_bos)
{
e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos);
if (!e->bo_entries) {
pr_err("Failed to allocate bo_info\n");
return -ENOMEM;
}
for (int i = 0; i < num_bos; i++) {
DrmBoEntry *entry = xzalloc(sizeof(*entry));
if (!entry) {
pr_err("Failed to allocate botest\n");
return -ENOMEM;
}
drm_bo_entry__init(entry);
e->bo_entries[i] = entry;
e->n_bo_entries++;
}
return 0;
}
static int allocate_vm_entries(DrmBoEntry *e, int num_vms)
{
e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms);
if (!e->vm_entries) {
pr_err("Failed to allocate bo_info\n");
return -ENOMEM;
}
for (int i = 0; i < num_vms; i++) {
DrmVmEntry *entry = xzalloc(sizeof(*entry));
if (!entry) {
pr_err("Failed to allocate botest\n");
return -ENOMEM;
}
drm_vm_entry__init(entry);
e->vm_entries[i] = entry;
e->n_vm_entries++;
}
return 0;
}
static void free_e(CriuRenderNode *e)
{
for (int i = 0; i < e->n_bo_entries; i++) {
if (e->bo_entries[i])
xfree(e->bo_entries[i]);
}
xfree(e);
}
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
{
@ -60,19 +156,257 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
return 0;
}
static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs)
{
size_t image_size = 0, max_bo_size = 0, buffer_size;
struct amdgpu_gpu_info gpu_info = { 0 };
amdgpu_device_handle h_dev;
uint64_t max_copy_size;
uint32_t major, minor;
FILE *bo_contents_fp = NULL;
void *buffer = NULL;
char img_path[40];
int i, ret = 0;
ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev);
if (ret) {
pr_perror("failed to initialize device");
goto exit;
}
plugin_log_msg("libdrm initialized successfully\n");
ret = amdgpu_query_gpu_info(h_dev, &gpu_info);
if (ret) {
pr_perror("failed to query gpuinfo via libdrm");
goto exit;
}
max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
SDMA_LINEAR_COPY_MAX_SIZE - 1;
for (i = 0; i < rd->num_of_bos; i++) {
if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) {
if (rd->bo_entries[i]->size > max_bo_size)
max_bo_size = rd->bo_entries[i]->size;
}
}
buffer_size = max_bo_size;
posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
if (!buffer) {
pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.");
ret = -ENOMEM;
goto exit;
}
for (i = 0; i < rd->num_of_bos; i++) {
if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)))
continue;
if (rd->bo_entries[i]->num_of_vms == 0)
continue;
snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i);
bo_contents_fp = open_img_file(img_path, false, &image_size);
ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
SDMA_OP_VRAM_WRITE, true);
if (ret) {
pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
break;
}
plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i);
if (bo_contents_fp)
fclose(bo_contents_fp);
}
exit:
for (int i = 0; i < rd->num_of_bos; i++) {
if (dmabufs[i] != KFD_INVALID_FD)
close(dmabufs[i]);
}
xfree(buffer);
amdgpu_device_deinitialize(h_dev);
return ret;
}
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
{
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
struct tp_node *tp_node;
CriuRenderNode *rd = NULL;
char path[PATH_MAX];
unsigned char *buf;
int minor;
int len;
int ret;
size_t image_size;
struct tp_node *tp_node;
struct drm_amdgpu_gem_list_handles list_handles_args = { 0 };
struct drm_amdgpu_gem_list_handles_entry *list_handles_entries;
int num_bos;
rd = xmalloc(sizeof(*rd));
if (!rd) {
ret = -ENOMEM;
goto exit;
}
criu_render_node__init(rd);
/* Get the topology node of the DRM device */
minor = minor(drm->st_rdev);
rd->drm_render_minor = minor;
rd->id = id;
num_bos = 8;
list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos);
list_handles_args.num_entries = num_bos;
list_handles_args.entries = (uintptr_t)list_handles_entries;
ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args);
if (ret && errno == EINVAL) {
pr_info("This kernel appears not to have AMDGPU_GEM_LIST_HANDLES ioctl. Consider disabling Dmabuf IPC or updating your kernel.\n");
list_handles_args.num_entries = 0;
} else if (ret) {
pr_perror("Failed to call bo info ioctl");
goto exit;
}
if (list_handles_args.num_entries > num_bos) {
num_bos = list_handles_args.num_entries;
xfree(list_handles_entries);
list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos);
list_handles_args.num_entries = num_bos;
list_handles_args.entries = (uintptr_t)list_handles_entries;
ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args);
if (ret) {
pr_perror("Failed to call bo info ioctl");
goto exit;
}
} else {
num_bos = list_handles_args.num_entries;
}
rd->num_of_bos = num_bos;
ret = allocate_bo_entries(rd, num_bos);
if (ret)
goto exit;
for (int i = 0; i < num_bos; i++) {
int num_vm_entries = 8;
struct drm_amdgpu_gem_vm_entry *vm_info_entries;
struct drm_amdgpu_gem_op vm_info_args = { 0 };
DrmBoEntry *boinfo = rd->bo_entries[i];
struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i];
union drm_amdgpu_gem_mmap mmap_args = { 0 };
int dmabuf_fd;
uint32_t major, minor;
amdgpu_device_handle h_dev;
void *buffer = NULL;
char img_path[40];
FILE *bo_contents_fp = NULL;
int device_fd;
boinfo->size = handle_entry.size;
boinfo->alloc_flags = handle_entry.alloc_flags;
boinfo->preferred_domains = handle_entry.preferred_domains;
boinfo->alignment = handle_entry.alignment;
boinfo->handle = handle_entry.gem_handle;
boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle);
mmap_args.in.handle = boinfo->handle;
if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) {
pr_perror("Error Failed to call mmap ioctl");
ret = -1;
goto exit;
}
boinfo->offset = mmap_args.out.addr_ptr;
vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries);
vm_info_args.handle = handle_entry.gem_handle;
vm_info_args.num_entries = num_vm_entries;
vm_info_args.value = (uintptr_t)vm_info_entries;
vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO;
ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args);
if (ret) {
pr_perror("Failed to call vm info ioctl");
goto exit;
}
if (vm_info_args.num_entries > num_vm_entries) {
num_vm_entries = vm_info_args.num_entries;
xfree(vm_info_entries);
vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries);
vm_info_args.handle = handle_entry.gem_handle;
vm_info_args.num_entries = num_vm_entries;
vm_info_args.value = (uintptr_t)vm_info_entries;
vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO;
ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args);
if (ret) {
pr_perror("Failed to call vm info ioctl");
goto exit;
}
} else {
num_vm_entries = vm_info_args.num_entries;
}
boinfo->num_of_vms = num_vm_entries;
ret = allocate_vm_entries(boinfo, num_vm_entries);
if (ret)
goto exit;
for (int j = 0; j < num_vm_entries; j++) {
DrmVmEntry *vminfo = boinfo->vm_entries[j];
boinfo->addr = vm_info_entries[j].addr;
vminfo->addr = vm_info_entries[j].addr;
vminfo->size = vm_info_entries[j].size;
vminfo->offset = vm_info_entries[j].offset;
vminfo->flags = vm_info_entries[j].flags;
}
ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);
device_fd = amdgpu_device_get_fd(h_dev);
drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd);
snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i);
bo_contents_fp = open_img_file(img_path, true, &image_size);
posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size);
ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000,
SDMA_OP_VRAM_READ, false);
if (dmabuf_fd != KFD_INVALID_FD)
close(dmabuf_fd);
if (bo_contents_fp)
fclose(bo_contents_fp);
ret = amdgpu_device_deinitialize(h_dev);
if (ret)
goto exit;
xfree(vm_info_entries);
}
xfree(list_handles_entries);
for (int i = 0; i < num_bos; i++) {
DrmBoEntry *boinfo = rd->bo_entries[i];
ret = record_shared_bo(boinfo->handle, boinfo->is_import);
if (ret)
goto exit;
}
tp_node = sys_get_node_by_render_minor(&src_topology, minor);
if (!tp_node) {
pr_err("Failed to find a device with minor number = %d\n", minor);
@ -80,21 +414,156 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
}
/* Get the GPU_ID of the DRM device */
rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
if (!rd.gpu_id) {
pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id);
rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
if (!rd->gpu_id) {
pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id);
return -ENODEV;
}
len = criu_render_node__get_packed_size(&rd);
len = criu_render_node__get_packed_size(rd);
buf = xmalloc(len);
if (!buf)
return -ENOMEM;
criu_render_node__pack(&rd, buf);
criu_render_node__pack(rd, buf);
snprintf(path, sizeof(path), IMG_DRM_FILE, id);
ret = write_img_file(path, buf, len);
xfree(buf);
exit:
free_e(rd);
return ret;
}
int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
{
int ret = 0;
bool retry_needed = false;
uint32_t major, minor;
amdgpu_device_handle h_dev;
int device_fd;
int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos);
ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);
if (ret) {
pr_info("Error in init amdgpu device\n");
goto exit;
}
device_fd = amdgpu_device_get_fd(h_dev);
for (int i = 0; i < rd->num_of_bos; i++) {
DrmBoEntry *boinfo = rd->bo_entries[i];
int dmabuf_fd = -1;
uint32_t handle;
struct drm_gem_change_handle change_args = { 0 };
union drm_amdgpu_gem_mmap mmap_args = { 0 };
struct drm_amdgpu_gem_va va_args = { 0 };
int fd_id;
if (work_already_completed(boinfo->handle, rd->drm_render_minor)) {
continue;
} else if (boinfo->handle != -1) {
if (boinfo->is_import) {
fd_id = amdgpu_id_for_handle(boinfo->handle);
if (fd_id == -1) {
retry_needed = true;
continue;
}
dmabuf_fd = fdstore_get(fd_id);
}
}
if (boinfo->is_import) {
drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle);
} else {
union drm_amdgpu_gem_create create_args = { 0 };
create_args.in.bo_size = boinfo->size;
create_args.in.alignment = boinfo->alignment;
create_args.in.domains = boinfo->preferred_domains;
create_args.in.domain_flags = boinfo->alloc_flags;
if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) {
pr_perror("Error Failed to call create ioctl");
ret = -1;
goto exit;
}
handle = create_args.out.handle;
drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd);
}
change_args.handle = handle;
change_args.new_handle = boinfo->handle;
if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) {
pr_perror("Error Failed to call change ioctl; check if the kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support");
ret = -1;
goto exit;
}
if (!boinfo->is_import)
store_dmabuf_fd(boinfo->handle, dmabuf_fd);
dmabufs[i] = dmabuf_fd;
ret = record_completed_work(boinfo->handle, rd->drm_render_minor);
if (ret)
goto exit;
mmap_args.in.handle = boinfo->handle;
if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) {
pr_perror("Error Failed to call mmap ioctl");
ret = -1;
goto exit;
}
for (int j = 0; j < boinfo->num_of_vms; j++) {
DrmVmEntry *vminfo = boinfo->vm_entries[j];
va_args.handle = boinfo->handle;
va_args.operation = AMDGPU_VA_OP_MAP;
va_args.flags = vminfo->flags;
va_args.va_address = vminfo->addr;
va_args.offset_in_bo = vminfo->offset;
va_args.map_size = vminfo->size;
if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) {
pr_perror("Error Failed to call gem va ioctl");
ret = -1;
goto exit;
}
}
ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd);
if (ret < 0)
goto exit;
}
if (ret) {
pr_info("Error in deinit amdgpu device\n");
goto exit;
}
ret = record_completed_work(-1, rd->drm_render_minor);
if (ret)
goto exit;
ret = amdgpu_device_deinitialize(h_dev);
if (rd->num_of_bos > 0) {
ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs);
if (ret)
goto exit;
}
exit:
if (ret < 0)
return ret;
xfree(dmabufs);
return retry_needed;
}


@ -24,5 +24,17 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
*/
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);
int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd);
int amdgpu_plugin_drm_unpause_file(int fd);
int amdgpu_id_for_handle(int handle);
int store_dmabuf_fd(int handle, int fd);
int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd);
int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id);
#endif /* __AMDGPU_PLUGIN_DRM_H__ */


@ -37,9 +37,11 @@
#include "amdgpu_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
#include "amdgpu_plugin_drm.h"
/* Tracks number of device files that need to be checkpointed */
static int dev_file_cnt = 0;
static LIST_HEAD(dumped_fds);
static LIST_HEAD(shared_bos);
static LIST_HEAD(completed_work);
/* Helper structures to encode device topology of SRC and DEST platforms */
struct tp_system src_topology;
@ -49,23 +51,145 @@ struct tp_system dest_topology;
struct device_maps checkpoint_maps;
struct device_maps restore_maps;
bool checkpoint_is_complete()
int record_dumped_fd(int fd, bool is_drm)
{
return (dev_file_cnt == 0);
int newfd = dup(fd);
if (newfd < 0)
return newfd;
struct dumped_fd *st = malloc(sizeof(struct dumped_fd));
if (!st)
return -1;
st->fd = newfd;
st->is_drm = is_drm;
list_add(&st->l, &dumped_fds);
return 0;
}
void decrement_checkpoint_count()
struct list_head *get_dumped_fds()
{
dev_file_cnt--;
return &dumped_fds;
}
void init_gpu_count(struct tp_system *topo)
bool shared_bo_has_exporter(int handle)
{
if (dev_file_cnt != 0)
return;
struct shared_bo *bo;
/* We add ONE to include checkpointing of KFD device */
dev_file_cnt = 1 + topology_gpu_count(topo);
if (handle == -1)
return false;
list_for_each_entry(bo, &shared_bos, l) {
if (bo->handle == handle) {
return bo->has_exporter;
}
}
return false;
}
int record_shared_bo(int handle, bool is_imported)
{
struct shared_bo *bo;
if (handle == -1)
return 0;
list_for_each_entry(bo, &shared_bos, l) {
if (bo->handle == handle) {
return 0;
}
}
bo = malloc(sizeof(struct shared_bo));
if (!bo)
return -1;
bo->handle = handle;
bo->has_exporter = !is_imported;
list_add(&bo->l, &shared_bos);
return 0;
}
int handle_for_shared_bo_fd(int fd)
{
struct dumped_fd *df;
int trial_handle;
amdgpu_device_handle h_dev;
uint32_t major, minor;
struct shared_bo *bo;
list_for_each_entry(df, &dumped_fds, l) {
/* see if the gem handle for fd using the hdev for df->fd is the
same as bo->handle. */
if (!df->is_drm) {
continue;
}
if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) {
pr_err("Failed to initialize amdgpu device\n");
continue;
}
trial_handle = get_gem_handle(h_dev, fd);
if (trial_handle < 0)
continue;
list_for_each_entry(bo, &shared_bos, l) {
if (bo->handle == trial_handle)
return trial_handle;
}
amdgpu_device_deinitialize(h_dev);
}
return -1;
}
int record_completed_work(int handle, int id)
{
struct restore_completed_work *work;
work = malloc(sizeof(struct restore_completed_work));
if (!work)
return -1;
work->handle = handle;
work->id = id;
list_add(&work->l, &completed_work);
return 0;
}
bool work_already_completed(int handle, int id)
{
struct restore_completed_work *work;
list_for_each_entry(work, &completed_work, l) {
if (work->handle == handle && work->id == id) {
return true;
}
}
return false;
}
void clear_restore_state()
{
while (!list_empty(&completed_work)) {
struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l);
list_del(&st->l);
free(st);
}
}
void clear_dumped_fds()
{
while (!list_empty(&dumped_fds)) {
struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l);
list_del(&st->l);
close(st->fd);
free(st);
}
}
int read_fp(FILE *fp, void *buf, const size_t buf_len)


@ -1,6 +1,8 @@
#ifndef __AMDGPU_PLUGIN_UTIL_H__
#define __AMDGPU_PLUGIN_UTIL_H__
#include <libdrm/amdgpu.h>
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
@ -51,14 +53,18 @@
/* Name of file having serialized data of DRM device */
#define IMG_DRM_FILE "amdgpu-renderD-%d.img"
/* Name of file having serialized data of dmabuf meta */
#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img"
/* Name of file having serialized data of DRM device buffer objects (BOs) */
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img"
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"
/* Helper macros to Checkpoint and Restore a ROCm file */
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
#define HSAKMT_SHM "/hsakmt_shared_mem"
#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore"
#define HSAKMT_SEM "hsakmt_semaphore"
#define DMABUF_LINK "/dmabuf"
/* Help macros to build sDMA command packets */
#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0))
@ -73,6 +79,24 @@ enum sdma_op_type {
SDMA_OP_VRAM_WRITE,
};
struct dumped_fd {
struct list_head l;
int fd;
bool is_drm;
};
struct shared_bo {
struct list_head l;
int handle;
bool has_exporter;
};
struct restore_completed_work {
struct list_head l;
int handle;
int id;
};
/* Helper structures to encode device topology of SRC and DEST platforms */
extern struct tp_system src_topology;
extern struct tp_system dest_topology;
@ -97,10 +121,25 @@ int read_file(const char *file_path, void *buf, const size_t buf_len);
int write_img_file(char *path, const void *buf, const size_t buf_len);
FILE *open_img_file(char *path, bool write, size_t *size);
bool checkpoint_is_complete();
void decrement_checkpoint_count();
void init_gpu_count(struct tp_system *topology);
int record_dumped_fd(int fd, bool is_drm);
struct list_head *get_dumped_fds();
void clear_dumped_fds();
bool shared_bo_has_exporter(int handle);
int record_shared_bo(int handle, bool is_imported);
int handle_for_shared_bo_fd(int dmabuf_fd);
int record_completed_work(int handle, int id);
bool work_already_completed(int handle, int id);
void clear_restore_state();
void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list);
int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free);
int serve_out_dmabuf_fd(int handle, int fd);
#endif /* __AMDGPU_PLUGIN_UTIL_H__ */


@ -46,6 +46,7 @@ message kfd_bo_entry {
required uint64 offset = 3;
required uint32 alloc_flags = 4;
required uint32 gpu_id = 5;
required uint32 handle = 6;
}
message criu_kfd {
@ -61,6 +62,34 @@ message criu_kfd {
required bytes priv_data = 10;
}
message drm_bo_entry {
required uint64 addr = 1;
required uint64 size = 2;
required uint64 offset = 3;
required uint64 alloc_flags = 4;
required uint64 alignment = 5;
required uint32 preferred_domains = 6;
required uint32 handle = 7;
required uint32 is_import = 8;
required uint32 num_of_vms = 9;
repeated drm_vm_entry vm_entries = 10;
}
message drm_vm_entry {
required uint64 addr = 1;
required uint64 size = 2;
required uint64 offset = 3;
required uint64 flags = 4;
}
message criu_render_node {
required uint32 gpu_id = 1;
required uint32 id = 2;
required uint32 drm_render_minor = 3;
required uint64 num_of_bos = 4;
repeated drm_bo_entry bo_entries = 5;
}
message criu_dmabuf_node {
required uint32 gem_handle = 1;
}

plugins/amdgpu/drm.h (new file, 1476 lines): diff suppressed because it is too large.

plugins/amdgpu/drm_mode.h (new file, 1362 lines): diff suppressed because it is too large.


@ -23,9 +23,12 @@
#ifndef KFD_IOCTL_H_INCLUDED
#define KFD_IOCTL_H_INCLUDED
#include <libdrm/drm.h>
#include <linux/ioctl.h>
/* Define __user as empty for kernel headers in user-space */
#define __user
#include "drm.h"
/*
* - 1.1 - initial version
* - 1.3 - Add SMI events support


@ -70,6 +70,7 @@ static int parse_maps(struct vm_area *vmas)
#endif
v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL;
v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL;
v->is_vvar_or_vdso |= strstr(buf, "[vvar_vclock]") != NULL;
test_msg("[NOTE]\tVMA: [%#" PRIx64 ", %#" PRIx64 "]\n", v->start, v->end);
}
@ -86,42 +87,35 @@ static int parse_maps(struct vm_area *vmas)
return i;
}
int compare_vmas(struct vm_area *vmax, struct vm_area *vmay)
{
if (vmax->start > vmay->start)
return 1;
if (vmax->start < vmay->start)
return -1;
if (vmax->end > vmay->end)
return 1;
if (vmax->end < vmay->end)
return -1;
return 0;
}
static int check_vvar_vdso(struct vm_area *before, struct vm_area *after)
static int check_vvar_vdso(struct vm_area *before, int nr_before, struct vm_area *after, int nr_after)
{
int i, j = 0;
for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) {
int cmp = compare_vmas(&before[i], &after[j]);
if (cmp == 0)
continue;
if (cmp < 0) { /* Lost mapping */
for (i = 0, j = 0; i < nr_before || j < nr_after;) {
if (j == nr_after || before[i].start < after[j].start) {
test_msg("[NOTE]\tLost mapping: %#" PRIx64 "-%#" PRIx64 "\n", before[i].start, before[i].end);
j--;
if (before[i].is_vvar_or_vdso) {
fail("Lost vvar/vdso mapping");
return -1;
}
i++;
continue;
}
test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end);
i--;
if (i == nr_before || before[i].start > after[j].start) {
test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end);
j++;
continue;
}
if (before[i].end == after[j].end) {
i++;
j++;
} else if (before[i].end > after[j].end) {
before[i].start = after[j].end;
j++;
} else {
after[j].start = before[i].end;
i++;
}
}
return 0;
@ -129,11 +123,10 @@ static int check_vvar_vdso(struct vm_area *before, struct vm_area *after)
static struct vm_area vmas_before[MAX_VMAS];
static struct vm_area vmas_after[MAX_VMAS];
static int nr_before, nr_after;
int main(int argc, char *argv[])
{
int nr_before, nr_after;
test_init(argc, argv);
test_msg("[NOTE]\tMappings before:\n");
@ -154,7 +147,7 @@ int main(int argc, char *argv[])
}
/* After restore vDSO/VVAR blobs must remain in the old place. */
if (check_vvar_vdso(vmas_before, vmas_after))
if (check_vvar_vdso(vmas_before, nr_before, vmas_after, nr_after))
return -1;
if (nr_before + 2 < nr_after) {