plugin/amdgpu: Add handling for amdgpu drm buffer objects
Buffer objects held by the amdgpu drm driver are checkpointed with the new BO_INFO and MAPPING_INFO ioctls/ioctl options. Handling is in amdgpu_plugin_drm.h.

Handling of imported buffer objects may require dmabuf fds to be transferred between processes. These transfers go through the fdstore, with the handle-to-fdstore-id relationships kept in shared memory. There is a new plugin callback, RESTORE_INIT, to create that shared memory.

During checkpoint, track shared buffer objects, so that buffer objects that are shared across processes can be identified.

During restore, track which buffer objects have been restored. Retry the restore of a drm file if a buffer object is imported and the original has not been exported yet. Skip buffer objects that have already been completed or cannot be completed in the current restore.

So that the drm code can use sdma_copy_bo, that function no longer requires kfd bo structs.

Update the protobuf messages with the new amdgpu drm information.

Signed-off-by: David Francis <David.Francis@amd.com>
parent 5eb61e1b14
commit db0ec806d1

9 changed files with 900 additions and 46 deletions
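
The mechanism at the heart of the change: a RESTORE_INIT hook sizes and maps a shared table; each exporting restore task parks its dmabuf fd in CRIU's fdstore and records the (GEM handle, fdstore id) pair in that table; an importing task looks the handle up and retries later if the exporter has not run yet. Below is a minimal standalone sketch of that flow (the *_model names are illustrative, fdstore_add/fdstore_get are stubs standing in for CRIU's fdstore API, and the mutex the real code takes around the table is omitted):

/*
 * Minimal standalone model of the handle <-> fdstore-id table.
 * fdstore_add()/fdstore_get() are stubbed so the sketch compiles on
 * its own; the real code guards the table with a mutex in shared memory.
 */
#include <stdio.h>
#include <sys/mman.h>

struct handle_id { int handle; int fdstore_id; };	/* one table slot */
struct shared_handle_ids { int num_handles; struct handle_id *handles; };

static struct shared_handle_ids *shared_memory;

/* Stubs for CRIU's fdstore (illustration only). */
static int fdstore_add(int fd) { return fd; }
static int fdstore_get(int id) { return id; }

/* RESTORE_INIT: carve the table out of MAP_SHARED|MAP_ANONYMOUS memory so
 * every forked restore task sees the same handle -> fdstore-id mapping. */
static int restore_init_model(int num_handles)
{
	shared_memory = mmap(NULL, sizeof(*shared_memory), PROT_READ | PROT_WRITE,
			     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (shared_memory == MAP_FAILED)
		return -1;
	shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles,
				      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (shared_memory->handles == MAP_FAILED)
		return -1;
	shared_memory->num_handles = num_handles;
	for (int i = 0; i < num_handles; i++)
		shared_memory->handles[i] = (struct handle_id){ -1, -1 };
	return 0;
}

/* Exporter side: publish the dmabuf fd under the BO's GEM handle. */
static int store_dmabuf_fd_model(int handle, int fd)
{
	for (int i = 0; i < shared_memory->num_handles; i++) {
		if (shared_memory->handles[i].handle == handle)
			return 0;	/* already published */
		if (shared_memory->handles[i].handle == -1) {
			shared_memory->handles[i].handle = handle;
			shared_memory->handles[i].fdstore_id = fdstore_add(fd);
			return 0;
		}
	}
	return -1;	/* table full */
}

/* Importer side: -1 means "exporter not restored yet", so the caller retries. */
static int id_for_handle_model(int handle)
{
	for (int i = 0; i < shared_memory->num_handles; i++)
		if (shared_memory->handles[i].handle == handle)
			return shared_memory->handles[i].fdstore_id;
	return -1;
}

int main(void)
{
	if (restore_init_model(4))
		return 1;
	printf("before export: %d\n", id_for_handle_model(7));	/* -1: retry */
	store_dmabuf_fd_model(7, 42);
	printf("after export: fd=%d\n", fdstore_get(id_for_handle_model(7)));
	return 0;
}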

criu/include/criu-plugin.h

@@ -62,6 +62,8 @@ enum {
 	CR_PLUGIN_HOOK__POST_FORKING = 12,

+	CR_PLUGIN_HOOK__RESTORE_INIT = 13,
+
 	CR_PLUGIN_HOOK__MAX
 };

@@ -81,6 +83,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
 DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
 DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
 DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void);

 enum {
 	CR_PLUGIN_STAGE__DUMP,

criu/plugin.c

@@ -60,6 +60,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
 	__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
 	__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
 	__assign_hook(POST_FORKING, "cr_plugin_post_forking");
+	__assign_hook(RESTORE_INIT, "cr_plugin_restore_init");

 #undef __assign_hook

@@ -257,8 +258,16 @@ int cr_plugin_init(int stage)
 			goto err;
 	}

-	if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins())
-		goto err;
+	if (stage == CR_PLUGIN_STAGE__RESTORE) {
+		int ret;
+
+		if (check_inventory_plugins())
+			goto err;
+
+		ret = run_plugins(RESTORE_INIT);
+		if (ret < 0 && ret != -ENOTSUP)
+			goto err;
+	}

 	exit_code = 0;
 err:

criu/servicefd.c

@@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me)
 	ret = 0;

 	return ret;
-}
+}

plugins/amdgpu/amdgpu_plugin.c

@@ -12,6 +12,8 @@
 #include <sys/sysmacros.h>
 #include <sys/mman.h>
 #include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
 #include <stdint.h>
 #include <pthread.h>
 #include <semaphore.h>

@@ -23,12 +25,17 @@
 #include "criu-plugin.h"
 #include "plugin.h"
 #include "criu-amdgpu.pb-c.h"
 #include "util.h"
+#include "util-pie.h"
+#include "fdstore.h"

 #include "kfd_ioctl.h"
 #include "xmalloc.h"
 #include "criu-log.h"
 #include "files.h"
+#include "pstree.h"
+#include "sockets.h"
+#include "rst-malloc.h"

 #include "common/list.h"
 #include "amdgpu_plugin_drm.h"

@@ -66,6 +73,19 @@ bool plugin_added_to_inventory = false;

 bool plugin_disabled = false;

+struct handle_id {
+	int handle;
+	int fdstore_id;
+};
+
+struct shared_handle_ids {
+	int num_handles;
+	struct handle_id *handles;
+};
+struct shared_handle_ids *shared_memory = NULL;
+
+static mutex_t *shared_memory_mutex;
+
 int current_pid;
 /*
  * In the case of a single process (common case), this optimization can effectively
  * reduce the restore latency with parallel restore. In the case of multiple processes,

@@ -526,11 +546,11 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va,
 	amdgpu_bo_free(h_bo);
 }

-static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
-			void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
-			uint64_t max_copy_size, enum sdma_op_type type)
+int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
+		 void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
+		 uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free)
 {
-	uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
+	uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
 	uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size;
 	amdgpu_va_handle h_va_src, h_va_dst, h_va_ib;
 	amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib;

@@ -543,10 +563,8 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
 	uint32_t expired;
 	amdgpu_context_handle h_ctx;
 	uint32_t *ib = NULL;
-	int j, err, shared_fd, packets_per_buffer;
+	int j, err, packets_per_buffer;

-	shared_fd = bo_bucket.dmabuf_fd;
-	size = bo_bucket.size;
 	buffer_bo_size = min(size, buffer_size);
 	packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1;
 	src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size;

@@ -757,7 +775,8 @@ err_dst_bo_map:
 	if (err)
 		pr_perror("dest range free failed");
 err_dst_va:
-	err = amdgpu_bo_free(h_bo_dst);
+	if (!do_not_free)
+		err = amdgpu_bo_free(h_bo_dst);
 	if (err)
 		pr_perror("dest bo free failed");
 err_dst_bo_prep:

@@ -845,8 +864,9 @@ void *dump_bo_contents(void *_thread_data)
 		num_bos++;

 		/* perform sDMA based vram copy */
-		ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
-				   SDMA_OP_VRAM_READ);
+		ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
+				   SDMA_OP_VRAM_READ, false);
+
 		if (ret) {
 			pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i);
 			break;

@@ -943,8 +963,8 @@ void *restore_bo_contents(void *_thread_data)
 		num_bos++;

-		ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
-				   SDMA_OP_VRAM_WRITE);
+		ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
+				   SDMA_OP_VRAM_WRITE, false);
 		if (ret) {
 			pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
 			break;

@@ -1053,6 +1073,134 @@ exit:
 	return ret;
 }

+int store_dmabuf_fd(int handle, int fd)
+{
+	int id;
+
+	id = fdstore_add(fd);
+	mutex_lock(shared_memory_mutex);
+	for (int i = 0; i < shared_memory->num_handles; i++) {
+		if (shared_memory->handles[i].handle == handle) {
+			mutex_unlock(shared_memory_mutex);
+			return 0;
+		}
+		if (shared_memory->handles[i].handle == -1) {
+			shared_memory->handles[i].handle = handle;
+			shared_memory->handles[i].fdstore_id = id;
+			mutex_unlock(shared_memory_mutex);
+			return 0;
+		}
+	}
+	mutex_unlock(shared_memory_mutex);
+
+	return -1;
+}
+
+int amdgpu_id_for_handle(int handle)
+{
+	mutex_lock(shared_memory_mutex);
+	for (int i = 0; i < shared_memory->num_handles; i++) {
+		if (shared_memory->handles[i].handle == handle) {
+			mutex_unlock(shared_memory_mutex);
+			return shared_memory->handles[i].fdstore_id;
+		}
+	}
+	mutex_unlock(shared_memory_mutex);
+	return -1;
+}
+
+int amdgpu_restore_init(void)
+{
+	if (!shared_memory) {
+		int protection = PROT_READ | PROT_WRITE;
+		int visibility = MAP_SHARED | MAP_ANONYMOUS;
+		size_t img_size;
+		FILE *img_fp = NULL;
+		int ret;
+		unsigned char *buf;
+		int num_handles = 0;
+		char img_path[PATH_MAX];
+		CriuRenderNode *rd = NULL;
+		CriuKfd *e = NULL;
+
+		DIR *d;
+		struct dirent *dir;
+		d = opendir(".");
+		if (d) {
+			while ((dir = readdir(d)) != NULL) {
+				if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) {
+					pr_info("CC3: Found kfd file\n");
+					img_fp = open_img_file(dir->d_name, false, &img_size);
+					buf = xmalloc(img_size);
+					if (!buf) {
+						fclose(img_fp);
+						return -ENOMEM;
+					}
+
+					ret = read_fp(img_fp, buf, img_size);
+					if (ret) {
+						pr_perror("Unable to read from %s", img_path);
+						fclose(img_fp);
+						xfree(buf);
+						return ret;
+					}
+
+					fclose(img_fp);
+					e = criu_kfd__unpack(NULL, img_size, buf);
+					num_handles += e->num_of_bos;
+					criu_kfd__free_unpacked(e, NULL);
+					xfree(buf);
+				}
+				if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) {
+					pr_info("CC3: Found drm file\n");
+					img_fp = open_img_file(dir->d_name, false, &img_size);
+					buf = xmalloc(img_size);
+					if (!buf) {
+						fclose(img_fp);
+						return -ENOMEM;
+					}
+
+					ret = read_fp(img_fp, buf, img_size);
+					if (ret) {
+						pr_perror("Unable to read from %s", img_path);
+						fclose(img_fp);
+						xfree(buf);
+						return ret;
+					}
+
+					fclose(img_fp);
+					rd = criu_render_node__unpack(NULL, img_size, buf);
+					num_handles += rd->num_of_bos;
+					criu_render_node__free_unpacked(rd, NULL);
+					xfree(buf);
+				}
+			}
+			closedir(d);
+		}
+
+		if (num_handles > 0) {
+			shared_memory = mmap(NULL, sizeof(shared_memory), protection, visibility, -1, 0);
+			shared_memory->num_handles = num_handles;
+			shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0);
+
+			for (int i = 0; i < num_handles; i++) {
+				shared_memory->handles[i].handle = -1;
+				shared_memory->handles[i].fdstore_id = -1;
+			}
+
+			shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex));
+			if (!shared_memory_mutex) {
+				pr_err("Can't create amdgpu mutex\n");
+				return -1;
+			}
+			mutex_init(shared_memory_mutex);
+		}
+	}

+	return 0;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init)
+
 static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
 			CriuKfd *e)
 {

@@ -1095,6 +1243,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
 {
 	struct thread_data *thread_datas;
 	int ret = 0, i;
+	amdgpu_device_handle h_dev;
+	uint32_t major, minor;

 	pr_debug("Dumping %d BOs\n", args->num_bos);

@@ -1118,6 +1268,19 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
 		boinfo->size = bo_bucket->size;
 		boinfo->offset = bo_bucket->offset;
 		boinfo->alloc_flags = bo_bucket->alloc_flags;
+
+		ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev);
+
+		boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd);
+
+		amdgpu_device_deinitialize(h_dev);
+	}
+	for (i = 0; i < e->num_of_bos; i++) {
+		KfdBoEntry *boinfo = e->bo_entries[i];
+
+		ret = record_shared_bo(boinfo->handle, false);
+		if (ret)
+			goto exit;
 	}

 	for (int i = 0; i < e->num_of_gpus; i++) {

@@ -1457,6 +1620,29 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
 	}

 	pr_info("Restore BOs Ok\n");

 	return 0;
 }

+int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd)
+{
+	struct vma_metadata *vma_md;
+
+	vma_md = xmalloc(sizeof(*vma_md));
+	if (!vma_md) {
+		return -ENOMEM;
+	}
+
+	memset(vma_md, 0, sizeof(*vma_md));
+
+	vma_md->old_pgoff = offset;
+	vma_md->vma_entry = addr;
+
+	vma_md->new_pgoff = restored_offset;
+	vma_md->fd = fd;
+
+	list_add_tail(&vma_md->list, &update_vma_info_list);
+
+	return 0;
+}
+

@@ -1691,8 +1877,18 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
 		pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);

 		fd = node_get_drm_render_device(tp_node);
-		if (fd < 0)
+		if (fd < 0) {
 			pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor);
+			return -1;
+		}
+
+		ret = amdgpu_plugin_drm_restore_file(fd, rd);
+		if (ret == 1)
+			*retry_needed = true;
+		if (ret < 0) {
+			fd = ret;
+			goto fail;
+		}
+fail:
 		criu_render_node__free_unpacked(rd, NULL);
 		xfree(buf);

@@ -1704,12 +1900,20 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
 		 * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in
 		 * tp_node.
 		 */
-		fd = dup(fd);
-		if (fd == -1) {
-			pr_perror("unable to duplicate the render fd");
-			return -1;
+		if (fd < 0)
+			return fd;
+
+		if (!(*retry_needed)) {
+			fd = dup(fd);
+			if (fd == -1) {
+				pr_perror("unable to duplicate the render fd");
+				return -1;
+			}
+			return fd;
 		}
-		return fd;
+
+		return 0;
 	}

 	fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);

@@ -1753,11 +1957,13 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
 	 * This way, we know that the file descriptors we store will not conflict with file descriptors inside core
 	 * CRIU.
 	 */
-	fd_next = find_unused_fd_pid(e->pid);
-	if (fd_next <= 0) {
-		pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
-		ret = -EINVAL;
-		goto exit;
+	if (fd_next == -1) {
+		fd_next = find_unused_fd_pid(e->pid);
+		if (fd_next <= 0) {
+			pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
+			ret = -EINVAL;
+			goto exit;
+		}
 	}

 	ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology);

@@ -1790,14 +1996,26 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
 	args.num_objects = e->num_of_objects;
 	args.priv_data_size = e->priv_data.len;
 	args.priv_data = (uintptr_t)e->priv_data.data;

 	args.op = KFD_CRIU_OP_RESTORE;
 	if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
 		pr_perror("Restore ioctl failed");
 		ret = -1;
 		goto exit;
 	}

+	if (ret < 0)
+		goto exit;
+
+	for (int i = 0; i < args.num_bos; i++) {
+		struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i];
+		KfdBoEntry *bo_entry = e->bo_entries[i];
+
+		if (bo_entry->handle != -1) {
+			store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd);
+		}
+	}
+
 	ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e);
 	if (ret)
 		goto exit;

@@ -1940,19 +2158,14 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
 		}
 	}

+	clear_restore_state();
+
 	close(fd);
 	return exit_code;
 }

 CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)

-int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size,
-			amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type)
-{
-	return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer,
-			    buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
-}
-
 int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size)
 {
 	int ret = 0;

@@ -2061,8 +2274,10 @@ void *parallel_restore_bo_contents(void *_thread_data)
 		entry = &restore_cmd->entries[i];
 		fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
-		ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer,
-					  buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
+		ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp,
+				   buffer, buffer_size, h_dev,
+				   max_copy_size, SDMA_OP_VRAM_WRITE, false);

 		if (ret) {
 			pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
 			goto err_sdma;

plugins/amdgpu/amdgpu_plugin_drm.c

@@ -19,19 +19,112 @@
 #include <dirent.h>
+#include "common/list.h"
+#include "files.h"
+#include "fdstore.h"

 #include "criu-amdgpu.pb-c.h"
 #define __user
 #include "drm.h"

 #include <xf86drm.h>
 #include <libdrm/amdgpu.h>

 #include "xmalloc.h"
 #include "criu-log.h"
 #include "kfd_ioctl.h"
+#include "amdgpu_drm.h"
 #include "amdgpu_plugin_drm.h"
 #include "amdgpu_plugin_util.h"
 #include "amdgpu_plugin_topology.h"

+#include "util.h"
+#include "common/scm.h"
+
+int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd)
+{
+	uint32_t handle;
+	int fd = amdgpu_device_get_fd(h_dev);
+
+	if (dmabuf_fd == -1) {
+		return -1;
+	}
+
+	drmPrimeFDToHandle(fd, dmabuf_fd, &handle);
+
+	return handle;
+}
+
+int drmIoctl(int fd, unsigned long request, void *arg)
+{
+	int ret, max_retries = 200;
+
+	do {
+		ret = ioctl(fd, request, arg);
+	} while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN));
+
+	if (ret == -1 && errno == EBADF)
+		/* In case pthread_atfork didn't catch it, this will
+		 * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
+		 */
+		pr_perror("KFD file descriptor not valid in this process");
+	return ret;
+}
+
+static int allocate_bo_entries(CriuRenderNode *e, int num_bos)
+{
+	e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos);
+	if (!e->bo_entries) {
+		pr_err("Failed to allocate bo_info\n");
+		return -ENOMEM;
+	}
+
+	for (int i = 0; i < num_bos; i++) {
+		DrmBoEntry *entry = xzalloc(sizeof(*entry));
+
+		if (!entry) {
+			pr_err("Failed to allocate botest\n");
+			return -ENOMEM;
+		}
+
+		drm_bo_entry__init(entry);
+
+		e->bo_entries[i] = entry;
+		e->n_bo_entries++;
+	}
+	return 0;
+}
+
+static int allocate_vm_entries(DrmBoEntry *e, int num_vms)
+{
+	e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms);
+	if (!e->vm_entries) {
+		pr_err("Failed to allocate bo_info\n");
+		return -ENOMEM;
+	}
+
+	for (int i = 0; i < num_vms; i++) {
+		DrmVmEntry *entry = xzalloc(sizeof(*entry));
+
+		if (!entry) {
+			pr_err("Failed to allocate botest\n");
+			return -ENOMEM;
+		}
+
+		drm_vm_entry__init(entry);
+
+		e->vm_entries[i] = entry;
+		e->n_vm_entries++;
+	}
+	return 0;
+}
+
+static void free_e(CriuRenderNode *e)
+{
+	for (int i = 0; i < e->n_bo_entries; i++) {
+		if (e->bo_entries[i])
+			xfree(e->bo_entries[i]);
+	}
+
+	xfree(e);
+}
+
 int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
 {

@@ -60,19 +153,260 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
 	return 0;
 }

+static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs)
+{
+	size_t image_size = 0, max_bo_size = 0, buffer_size;
+	struct amdgpu_gpu_info gpu_info = { 0 };
+	amdgpu_device_handle h_dev;
+	uint64_t max_copy_size;
+	uint32_t major, minor;
+	FILE *bo_contents_fp = NULL;
+	void *buffer = NULL;
+	char img_path[40];
+	int num_bos = 0;
+	int i, ret = 0;
+
+	ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev);
+	if (ret) {
+		pr_perror("failed to initialize device");
+		goto exit;
+	}
+	plugin_log_msg("libdrm initialized successfully\n");
+
+	ret = amdgpu_query_gpu_info(h_dev, &gpu_info);
+	if (ret) {
+		pr_perror("failed to query gpuinfo via libdrm");
+		goto exit;
+	}
+
+	max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
+								   SDMA_LINEAR_COPY_MAX_SIZE - 1;
+
+	for (i = 0; i < rd->num_of_bos; i++) {
+		if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) {
+			if (rd->bo_entries[i]->size > max_bo_size)
+				max_bo_size = rd->bo_entries[i]->size;
+		}
+	}
+
+	buffer_size = max_bo_size;
+
+	posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
+	if (!buffer) {
+		pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.");
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	for (i = 0; i < rd->num_of_bos; i++) {
+		if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)))
+			continue;
+
+		if (rd->bo_entries[i]->num_of_vms == 0)
+			continue;
+
+		num_bos++;
+
+		snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i);
+
+		bo_contents_fp = open_img_file(img_path, false, &image_size);
+
+		ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
+				   SDMA_OP_VRAM_WRITE, true);
+		if (ret) {
+			pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
+			break;
+		}
+		plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i);
+
+		if (bo_contents_fp)
+			fclose(bo_contents_fp);
+	}
+
+exit:
+	for (int i = 0; i < rd->num_of_bos; i++) {
+		if (dmabufs[i] != KFD_INVALID_FD)
+			close(dmabufs[i]);
+	}
+
+	xfree(buffer);
+
+	amdgpu_device_deinitialize(h_dev);
+	return ret;
+}
+
 int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
 {
-	CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
-	struct tp_node *tp_node;
+	CriuRenderNode *rd = NULL;
 	char path[PATH_MAX];
 	unsigned char *buf;
 	int minor;
 	int len;
 	int ret;
+	size_t image_size;
+	struct tp_node *tp_node;
+	struct drm_amdgpu_gem_list_handles list_handles_args = { 0 };
+	struct drm_amdgpu_gem_list_handles_entry *list_handles_entries;
+	int num_bos;
+
+	rd = xmalloc(sizeof(*rd));
+	if (!rd) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+	criu_render_node__init(rd);

 	/* Get the topology node of the DRM device */
 	minor = minor(drm->st_rdev);
+	rd->drm_render_minor = minor;
+	rd->id = id;
+
+	num_bos = 8;
+	list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos);
+	list_handles_args.num_entries = num_bos;
+	list_handles_args.entries = (uintptr_t)list_handles_entries;
+
+	ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args);
+	if (ret && errno == EINVAL) {
+		pr_info("This kernel appears not to have AMDGPU_GEM_LIST_HANDLES ioctl. Consider disabling Dmabuf IPC or updating your kernel.\n");
+		list_handles_args.num_entries = 0;
+	} else if (ret) {
+		pr_perror("Failed to call bo info ioctl");
+		goto exit;
+	}
+
+	if (list_handles_args.num_entries > num_bos) {
+		num_bos = list_handles_args.num_entries;
+		xfree(list_handles_entries);
+		list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos);
+		list_handles_args.num_entries = num_bos;
+		list_handles_args.entries = (uintptr_t)list_handles_entries;
+		ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args);
+		if (ret) {
+			pr_perror("Failed to call bo info ioctl");
+			goto exit;
+		}
+	} else {
+		num_bos = list_handles_args.num_entries;
+	}
+
+	rd->num_of_bos = num_bos;
+	ret = allocate_bo_entries(rd, num_bos);
+	if (ret)
+		goto exit;
+
+	for (int i = 0; i < num_bos; i++) {
+		int num_vm_entries = 8;
+		struct drm_amdgpu_gem_vm_entry *vm_info_entries;
+		struct drm_amdgpu_gem_op vm_info_args = { 0 };
+		DrmBoEntry *boinfo = rd->bo_entries[i];
+		struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i];
+		union drm_amdgpu_gem_mmap mmap_args = { 0 };
+		int dmabuf_fd;
+		uint32_t major, minor;
+		amdgpu_device_handle h_dev;
+		void *buffer = NULL;
+		char img_path[40];
+		FILE *bo_contents_fp = NULL;
+		int device_fd;
+
+		boinfo->size = handle_entry.size;
+
+		boinfo->alloc_flags = handle_entry.alloc_flags;
+		boinfo->preferred_domains = handle_entry.preferred_domains;
+		boinfo->alignment = handle_entry.alignment;
+		boinfo->handle = handle_entry.gem_handle;
+		boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle);
+
+		mmap_args.in.handle = boinfo->handle;
+
+		if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) {
+			pr_perror("Error Failed to call mmap ioctl");
+			ret = -1;
+			goto exit;
+		}
+
+		boinfo->offset = mmap_args.out.addr_ptr;
+
+		vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries);
+		vm_info_args.handle = handle_entry.gem_handle;
+		vm_info_args.num_entries = num_vm_entries;
+		vm_info_args.value = (uintptr_t)vm_info_entries;
+		vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO;
+		ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args);
+		if (ret) {
+			pr_perror("Failed to call vm info ioctl");
+			goto exit;
+		}
+
+		if (vm_info_args.num_entries > num_vm_entries) {
+			num_vm_entries = vm_info_args.num_entries;
+			xfree(vm_info_entries);
+			vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries);
+			vm_info_args.handle = handle_entry.gem_handle;
+			vm_info_args.num_entries = num_vm_entries;
+			vm_info_args.value = (uintptr_t)vm_info_entries;
+			vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO;
+			ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args);
+			if (ret) {
+				pr_perror("Failed to call vm info ioctl");
+				goto exit;
+			}
+		} else {
+			num_vm_entries = vm_info_args.num_entries;
+		}
+
+		boinfo->num_of_vms = num_vm_entries;
+		ret = allocate_vm_entries(boinfo, num_vm_entries);
+		if (ret)
+			goto exit;
+
+		for (int j = 0; j < num_vm_entries; j++) {
+			DrmVmEntry *vminfo = boinfo->vm_entries[j];
+
+			boinfo->addr = vm_info_entries[j].addr;
+			vminfo->addr = vm_info_entries[j].addr;
+			vminfo->size = vm_info_entries[j].size;
+			vminfo->offset = vm_info_entries[j].offset;
+			vminfo->flags = vm_info_entries[j].flags;
+		}
+
+		ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);
+
+		device_fd = amdgpu_device_get_fd(h_dev);
+
+		drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd);
+
+		snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i);
+		bo_contents_fp = open_img_file(img_path, true, &image_size);
+
+		posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size);
+
+		ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000,
+				   SDMA_OP_VRAM_READ, false);
+
+		if (dmabuf_fd != KFD_INVALID_FD)
+			close(dmabuf_fd);
+
+		if (bo_contents_fp)
+			fclose(bo_contents_fp);
+
+		ret = amdgpu_device_deinitialize(h_dev);
+		if (ret)
+			goto exit;
+
+		xfree(vm_info_entries);
+	}
+	xfree(list_handles_entries);
+
+	for (int i = 0; i < num_bos; i++) {
+		DrmBoEntry *boinfo = rd->bo_entries[i];
+
+		ret = record_shared_bo(boinfo->handle, boinfo->is_import);
+		if (ret)
+			goto exit;
+	}

 	tp_node = sys_get_node_by_render_minor(&src_topology, minor);
 	if (!tp_node) {
 		pr_err("Failed to find a device with minor number = %d\n", minor);

@@ -80,21 +414,156 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
 	}

 	/* Get the GPU_ID of the DRM device */
-	rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
-	if (!rd.gpu_id) {
-		pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id);
+	rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
+	if (!rd->gpu_id) {
+		pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id);
 		return -ENODEV;
 	}

-	len = criu_render_node__get_packed_size(&rd);
+	len = criu_render_node__get_packed_size(rd);
 	buf = xmalloc(len);
 	if (!buf)
 		return -ENOMEM;

-	criu_render_node__pack(&rd, buf);
+	criu_render_node__pack(rd, buf);

 	snprintf(path, sizeof(path), IMG_DRM_FILE, id);
 	ret = write_img_file(path, buf, len);

 	xfree(buf);
+exit:
+	free_e(rd);
 	return ret;
 }

+int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
+{
+	int ret = 0;
+	bool retry_needed = false;
+	uint32_t major, minor;
+	amdgpu_device_handle h_dev;
+	int device_fd;
+	int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos);
+
+	ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);
+	if (ret) {
+		pr_info("Error in init amdgpu device\n");
+		goto exit;
+	}
+
+	device_fd = amdgpu_device_get_fd(h_dev);
+
+	for (int i = 0; i < rd->num_of_bos; i++) {
+		DrmBoEntry *boinfo = rd->bo_entries[i];
+		int dmabuf_fd = -1;
+		uint32_t handle;
+		struct drm_gem_change_handle change_args = { 0 };
+		union drm_amdgpu_gem_mmap mmap_args = { 0 };
+		struct drm_amdgpu_gem_va va_args = { 0 };
+		int fd_id;
+
+		if (work_already_completed(boinfo->handle, rd->drm_render_minor)) {
+			continue;
+		} else if (boinfo->handle != -1) {
+			if (boinfo->is_import) {
+				fd_id = amdgpu_id_for_handle(boinfo->handle);
+				if (fd_id == -1) {
+					retry_needed = true;
+					continue;
+				}
+				dmabuf_fd = fdstore_get(fd_id);
+			}
+		}
+
+		if (boinfo->is_import) {
+			drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle);
+		} else {
+			union drm_amdgpu_gem_create create_args = { 0 };
+
+			create_args.in.bo_size = boinfo->size;
+			create_args.in.alignment = boinfo->alignment;
+			create_args.in.domains = boinfo->preferred_domains;
+			create_args.in.domain_flags = boinfo->alloc_flags;
+
+			if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) {
+				pr_perror("Error Failed to call create ioctl");
+				ret = -1;
+				goto exit;
+			}
+			handle = create_args.out.handle;
+
+			drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd);
+		}
+
+		change_args.handle = handle;
+		change_args.new_handle = boinfo->handle;
+
+		if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) {
+			pr_perror("Error Failed to call change ioctl; check if the kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support");
+			ret = -1;
+			goto exit;
+		}
+
+		if (!boinfo->is_import)
+			store_dmabuf_fd(boinfo->handle, dmabuf_fd);
+
+		dmabufs[i] = dmabuf_fd;
+
+		ret = record_completed_work(boinfo->handle, rd->drm_render_minor);
+		if (ret)
+			goto exit;
+
+		mmap_args.in.handle = boinfo->handle;
+
+		if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) {
+			pr_perror("Error Failed to call mmap ioctl");
+			ret = -1;
+			goto exit;
+		}
+
+		for (int j = 0; j < boinfo->num_of_vms; j++) {
+			DrmVmEntry *vminfo = boinfo->vm_entries[j];
+
+			va_args.handle = boinfo->handle;
+			va_args.operation = AMDGPU_VA_OP_MAP;
+			va_args.flags = vminfo->flags;
+			va_args.va_address = vminfo->addr;
+			va_args.offset_in_bo = vminfo->offset;
+			va_args.map_size = vminfo->size;
+
+			if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) {
+				pr_perror("Error Failed to call gem va ioctl");
+				ret = -1;
+				goto exit;
+			}
+		}
+
+		ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd);
+		if (ret < 0)
+			goto exit;
+	}
+
+	if (ret) {
+		pr_info("Error in deinit amdgpu device\n");
+		goto exit;
+	}
+
+	ret = record_completed_work(-1, rd->drm_render_minor);
+	if (ret)
+		goto exit;
+
+	ret = amdgpu_device_deinitialize(h_dev);
+
+	if (rd->num_of_bos > 0) {
+		ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs);
+		if (ret)
+			goto exit;
+	}
+
+exit:
+	if (ret < 0)
+		return ret;
+	xfree(dmabufs);
+
+	return retry_needed;
+}

plugins/amdgpu/amdgpu_plugin_drm.h

@@ -24,5 +24,17 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
  */
 int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);

+int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd);
+
 int amdgpu_plugin_drm_unpause_file(int fd);

+int amdgpu_id_for_handle(int handle);
+
+int store_dmabuf_fd(int handle, int fd);
+
+int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd);
+
+int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id);
+
 #endif /* __AMDGPU_PLUGIN_DRM_H__ */

plugins/amdgpu/amdgpu_plugin_util.c

@@ -41,6 +41,9 @@
 /* Tracks number of device files that need to be checkpointed */
 static int dev_file_cnt = 0;

+static LIST_HEAD(shared_bos);
+static LIST_HEAD(completed_work);
+
 /* Helper structures to encode device topology of SRC and DEST platforms */
 struct tp_system src_topology;
 struct tp_system dest_topology;

@@ -68,6 +71,87 @@ void init_gpu_count(struct tp_system *topo)
 	dev_file_cnt = 1 + topology_gpu_count(topo);
 }

+bool shared_bo_has_exporter(int handle)
+{
+	struct shared_bo *bo;
+
+	if (handle == -1)
+		return false;
+
+	list_for_each_entry(bo, &shared_bos, l) {
+		if (bo->handle == handle) {
+			return bo->has_exporter;
+		}
+	}
+
+	return false;
+}
+
+int record_shared_bo(int handle, bool is_imported)
+{
+	struct shared_bo *bo;
+
+	if (handle == -1)
+		return 0;
+
+	list_for_each_entry(bo, &shared_bos, l) {
+		if (bo->handle == handle) {
+			return 0;
+		}
+	}
+	bo = malloc(sizeof(struct shared_bo));
+	if (!bo)
+		return -1;
+	bo->handle = handle;
+	bo->has_exporter = !is_imported;
+	list_add(&bo->l, &shared_bos);
+
+	return 0;
+}
+
+int record_completed_work(int handle, int id)
+{
+	struct restore_completed_work *work;
+
+	work = malloc(sizeof(struct restore_completed_work));
+	if (!work)
+		return -1;
+	work->handle = handle;
+	work->id = id;
+	list_add(&work->l, &completed_work);
+
+	return 0;
+}
+
+bool work_already_completed(int handle, int id)
+{
+	struct restore_completed_work *work;
+
+	list_for_each_entry(work, &completed_work, l) {
+		if (work->handle == handle && work->id == id) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+void clear_restore_state()
+{
+	while (!list_empty(&shared_dmabuf_fds)) {
+		struct shared_dmabuf *st = list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l);
+		list_del(&st->l);
+		close(st->dmabuf_fd);
+		free(st);
+	}
+
+	while (!list_empty(&completed_work)) {
+		struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l);
+		list_del(&st->l);
+		free(st);
+	}
+}
+
 int read_fp(FILE *fp, void *buf, const size_t buf_len)
 {
 	size_t len_read;

plugins/amdgpu/amdgpu_plugin_util.h

@@ -1,6 +1,8 @@
 #ifndef __AMDGPU_PLUGIN_UTIL_H__
 #define __AMDGPU_PLUGIN_UTIL_H__

+#include <libdrm/amdgpu.h>
+
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE 1
 #endif

@@ -52,7 +54,7 @@
 #define IMG_DRM_FILE "amdgpu-renderD-%d.img"

 /* Name of file having serialized data of DRM device buffer objects (BOs) */
-#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img"
+#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"

 /* Helper macros to Checkpoint and Restore a ROCm file */
 #define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"

@@ -73,6 +75,24 @@ enum sdma_op_type {
 	SDMA_OP_VRAM_WRITE,
 };

+struct dumped_fd {
+	struct list_head l;
+	int fd;
+	bool is_drm;
+};
+
+struct shared_bo {
+	struct list_head l;
+	int handle;
+	bool has_exporter;
+};
+
+struct restore_completed_work {
+	struct list_head l;
+	int handle;
+	int id;
+};
+
 /* Helper structures to encode device topology of SRC and DEST platforms */
 extern struct tp_system src_topology;
 extern struct tp_system dest_topology;

@@ -101,6 +121,23 @@ bool checkpoint_is_complete();
 void decrement_checkpoint_count();
 void init_gpu_count(struct tp_system *topology);

+bool shared_bo_has_exporter(int handle);
+int record_shared_bo(int handle, bool is_imported);
+
+int record_shared_dmabuf_fd(int handle, int dmabuf_fd);
+int dmabuf_fd_for_handle(int handle);
+
+int record_completed_work(int handle, int id);
+bool work_already_completed(int handle, int id);
+
+void clear_restore_state();
+
 void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list);

+int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
+		 void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
+		 uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free);
+
+int serve_out_dmabuf_fd(int handle, int fd);
+
 #endif /* __AMDGPU_PLUGIN_UTIL_H__ */

plugins/amdgpu/criu-amdgpu.proto

@@ -46,6 +46,7 @@ message kfd_bo_entry {
	required uint64 offset = 3;
	required uint32 alloc_flags = 4;
	required uint32 gpu_id = 5;
+	required uint32 handle = 6;
 }

 message criu_kfd {

@@ -61,6 +62,30 @@ message criu_kfd {
 	required bytes priv_data = 10;
 }

+message drm_bo_entry {
+	required uint64 addr = 1;
+	required uint64 size = 2;
+	required uint64 offset = 3;
+	required uint64 alloc_flags = 4;
+	required uint64 alignment = 5;
+	required uint32 preferred_domains = 6;
+	required uint32 handle = 7;
+	required uint32 is_import = 8;
+	required uint32 num_of_vms = 9;
+	repeated drm_vm_entry vm_entries = 10;
+}
+
+message drm_vm_entry {
+	required uint64 addr = 1;
+	required uint64 size = 2;
+	required uint64 offset = 3;
+	required uint64 flags = 4;
+}
+
 message criu_render_node {
 	required uint32 gpu_id = 1;
+	required uint32 id = 2;
+	required uint32 drm_render_minor = 3;
+	required uint64 num_of_bos = 4;
+	repeated drm_bo_entry bo_entries = 5;
 }