mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 02:14:37 +00:00
plugin/amdgpu: Support for checkpoint of dmabuf fds
amdgpu libraries that use dmabuf fds to share GPU memory between processes close the dmabuf fds immediately after using them. However, it is possible that a checkpoint of a process catches one of the dmabuf fds open. In that case, the amdgpu plugin needs to handle it. The checkpoint of the dmabuf fd requires the device file it was exported from to have already been dumped. To identify which device this dmabuf fd was exported from, attempt to import it on each device, then record the dmabuf handle it imports as. This handle can be used to restore it. Signed-off-by: David Francis <David.Francis@amd.com>
This commit is contained in:
parent
d43217dadb
commit
9e404e2083
8 changed files with 306 additions and 20 deletions
|
|
@ -27,7 +27,7 @@ endif
|
|||
criu-amdgpu.pb-c.c: criu-amdgpu.proto
|
||||
protoc --proto_path=. --c_out=. criu-amdgpu.proto
|
||||
|
||||
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
|
||||
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
|
||||
$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
|
||||
|
||||
amdgpu_plugin_clean:
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@
|
|||
#include "rst-malloc.h"
|
||||
|
||||
#include "common/list.h"
|
||||
#include "amdgpu_plugin_dmabuf.h"
|
||||
#include "amdgpu_plugin_drm.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
|
|
@ -46,7 +47,7 @@
|
|||
#include "img-streamer.h"
|
||||
#include "image.h"
|
||||
#include "cr_options.h"
|
||||
|
||||
#include "util.h"
|
||||
struct vma_metadata {
|
||||
struct list_head list;
|
||||
uint64_t old_pgoff;
|
||||
|
|
@ -1064,6 +1065,9 @@ int amdgpu_unpause_processes(int pid)
|
|||
}
|
||||
}
|
||||
|
||||
if (post_dump_dmabuf_check() < 0)
|
||||
ret = -1;
|
||||
|
||||
exit:
|
||||
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
||||
clear_dumped_fds();
|
||||
|
|
@ -1400,7 +1404,17 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||
return -1;
|
||||
}
|
||||
|
||||
/* Check whether this plugin was called for kfd or render nodes */
|
||||
/* Check whether this plugin was called for kfd, dmabuf or render nodes */
|
||||
ret = get_dmabuf_info(fd, &st);
|
||||
if (ret < 0) {
|
||||
pr_perror("Failed to get dmabuf info");
|
||||
return -1;
|
||||
} else if (ret == 0) {
|
||||
pr_info("Dumping dmabuf fd = %d\n", fd);
|
||||
ret = amdgpu_plugin_dmabuf_dump(fd, id);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
|
||||
|
||||
/* This is RenderD dumper plugin, for now just save renderD
|
||||
|
|
@ -1414,7 +1428,7 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||
ret = record_dumped_fd(fd, true);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = try_dump_dmabuf_list();
|
||||
/* Need to return success here so that criu can call plugins for renderD nodes */
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -1538,7 +1552,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
|
|||
int ret = 0, bucket_index = 0;
|
||||
|
||||
pr_debug("Restoring %d devices\n", e->num_of_gpus);
|
||||
|
||||
args->num_devices = e->num_of_gpus;
|
||||
device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices);
|
||||
if (!device_buckets)
|
||||
|
|
@ -1822,12 +1835,17 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
|
|||
* first as we assume restore_maps is already filled. Need to fix this later.
|
||||
*/
|
||||
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
|
||||
pr_info("Restoring RenderD %s\n", img_path);
|
||||
|
||||
img_fp = open_img_file(img_path, false, &img_size);
|
||||
if (!img_fp)
|
||||
return -EINVAL;
|
||||
|
||||
if (!img_fp) {
|
||||
ret = amdgpu_plugin_dmabuf_restore(id);
|
||||
if (ret == 1) {
|
||||
*retry_needed = true;
|
||||
return 0;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
pr_info("Restoring RenderD %s\n", img_path);
|
||||
pr_debug("RenderD Image file size:%ld\n", img_size);
|
||||
buf = xmalloc(img_size);
|
||||
if (!buf) {
|
||||
|
|
|
|||
207
plugins/amdgpu/amdgpu_plugin_dmabuf.c
Normal file
207
plugins/amdgpu/amdgpu_plugin_dmabuf.c
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <linux/limits.h>
|
||||
|
||||
#include "common/list.h"
|
||||
#include "criu-amdgpu.pb-c.h"
|
||||
|
||||
#include "xmalloc.h"
|
||||
#include "criu-log.h"
|
||||
#include "amdgpu_plugin_drm.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_dmabuf.h"
|
||||
#include "fdstore.h"
|
||||
|
||||
#include "util.h"
|
||||
#include "common/scm.h"
|
||||
|
||||
struct dmabuf {
|
||||
int id;
|
||||
int dmabuf_fd;
|
||||
struct list_head node;
|
||||
};
|
||||
|
||||
static LIST_HEAD(dmabuf_list);
|
||||
|
||||
/* Return < 0 for error, > 0 for "not a dmabuf" and 0 "is a dmabuf" */
|
||||
int get_dmabuf_info(int fd, struct stat *st)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
|
||||
if (read_fd_link(fd, path, sizeof(path)) < 0)
|
||||
return -1;
|
||||
|
||||
if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id)
|
||||
{
|
||||
int ret = 0;
|
||||
char path[PATH_MAX];
|
||||
size_t len = 0;
|
||||
unsigned char *buf = NULL;
|
||||
int gem_handle;
|
||||
|
||||
pr_info("TWI: Dumping dmabuf fd = %d\n", dmabuf_fd);
|
||||
|
||||
gem_handle = handle_for_shared_bo_fd(dmabuf_fd);
|
||||
if (gem_handle < 0) {
|
||||
pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd);
|
||||
return -EAGAIN; /* Retry needed */
|
||||
}
|
||||
|
||||
CriuDmabufNode *node = xmalloc(sizeof(*node));
|
||||
if (!node) {
|
||||
pr_err("Failed to allocate memory for dmabuf node\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
criu_dmabuf_node__init(node);
|
||||
|
||||
node->gem_handle = gem_handle;
|
||||
|
||||
if (node->gem_handle < 0) {
|
||||
pr_err("Failed to get handle for dmabuf_fd\n");
|
||||
xfree(node);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Serialize metadata to a file */
|
||||
snprintf(path, sizeof(path), IMG_DMABUF_FILE, id);
|
||||
len = criu_dmabuf_node__get_packed_size(node);
|
||||
buf = xmalloc(len);
|
||||
if (!buf) {
|
||||
pr_err("Failed to allocate buffer for dmabuf metadata\n");
|
||||
xfree(node);
|
||||
return -ENOMEM;
|
||||
}
|
||||
criu_dmabuf_node__pack(node, buf);
|
||||
ret = write_img_file(path, buf, len);
|
||||
|
||||
xfree(buf);
|
||||
xfree(node);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int amdgpu_plugin_dmabuf_restore(int id)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
size_t img_size;
|
||||
FILE *img_fp = NULL;
|
||||
int ret = 0;
|
||||
CriuDmabufNode *rd = NULL;
|
||||
unsigned char *buf = NULL;
|
||||
int fd_id;
|
||||
|
||||
snprintf(path, sizeof(path), IMG_DMABUF_FILE, id);
|
||||
|
||||
pr_info("TWI: Restoring dmabuf fd, id = %d\n", id);
|
||||
|
||||
/* Read serialized metadata */
|
||||
img_fp = open_img_file(path, false, &img_size);
|
||||
if (!img_fp) {
|
||||
pr_err("Failed to open dmabuf metadata file: %s\n", path);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
pr_debug("dmabuf Image file size:%ld\n", img_size);
|
||||
buf = xmalloc(img_size);
|
||||
if (!buf) {
|
||||
pr_perror("Failed to allocate memory");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
ret = read_fp(img_fp, buf, img_size);
|
||||
if (ret) {
|
||||
pr_perror("Unable to read from %s", path);
|
||||
xfree(buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
rd = criu_dmabuf_node__unpack(NULL, img_size, buf);
|
||||
if (rd == NULL) {
|
||||
pr_perror("Unable to parse the dmabuf message %d", id);
|
||||
xfree(buf);
|
||||
fclose(img_fp);
|
||||
return -1;
|
||||
}
|
||||
fclose(img_fp);
|
||||
|
||||
pr_info("TWI: dmabuf node gem_handle = %d\n", rd->gem_handle);
|
||||
|
||||
/* Match GEM handle with shared_dmabuf list */
|
||||
fd_id = amdgpu_id_for_handle(rd->gem_handle);
|
||||
if (fd_id == -1) {
|
||||
pr_err("Failed to find dmabuf_fd for GEM handle = %d\n",
|
||||
rd->gem_handle);
|
||||
return 1;
|
||||
}
|
||||
int dmabuf_fd = fdstore_get(fd_id);
|
||||
pr_info("TWI: dmabuf node fd_id = %d, dmabuf_fd = %d\n", fd_id, dmabuf_fd);
|
||||
if (dmabuf_fd == -1) {
|
||||
pr_err("Failed to find dmabuf_fd for GEM handle = %d\n",
|
||||
rd->gem_handle);
|
||||
return 1; /* Retry needed */
|
||||
} else {
|
||||
pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n",
|
||||
dmabuf_fd, rd->gem_handle);
|
||||
}
|
||||
ret = dmabuf_fd;
|
||||
|
||||
pr_info("Successfully restored dmabuf_fd %d\n",
|
||||
dmabuf_fd);
|
||||
criu_dmabuf_node__free_unpacked(rd, NULL);
|
||||
xfree(buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id);
|
||||
if (ret == -EAGAIN) {
|
||||
struct dmabuf *b = xmalloc(sizeof(*b));
|
||||
b->id = id;
|
||||
b->dmabuf_fd = dmabuf_fd;
|
||||
list_add(&b->node, &dmabuf_list);
|
||||
return 0;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int try_dump_dmabuf_list()
|
||||
{
|
||||
struct dmabuf *b, *t;
|
||||
list_for_each_entry_safe(b, t, &dmabuf_list, node) {
|
||||
int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id);
|
||||
if (ret == -EAGAIN)
|
||||
continue;
|
||||
else if (ret)
|
||||
return ret;
|
||||
list_del(&b->node);
|
||||
xfree(b);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int post_dump_dmabuf_check()
|
||||
{
|
||||
if (!list_empty(&dmabuf_list)) {
|
||||
pr_err("Not all dma buffers have been dumped\n");
|
||||
return -1;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
16
plugins/amdgpu/amdgpu_plugin_dmabuf.h
Normal file
16
plugins/amdgpu/amdgpu_plugin_dmabuf.h
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
#ifndef __AMDGPU_PLUGIN_DMABUF_H__
#define __AMDGPU_PLUGIN_DMABUF_H__

/* struct stat is named in the get_dmabuf_info() prototype below */
#include <sys/stat.h>

#include "amdgpu_plugin_util.h"
#include "criu-amdgpu.pb-c.h"

/*
 * Dump the dmabuf @fd under @id.  If the dump cannot be done yet the fd is
 * queued for try_dump_dmabuf_list() and 0 is returned.
 */
int amdgpu_plugin_dmabuf_dump(int fd, int id);

/*
 * Restore the dmabuf dumped under @id.  Returns the restored fd, 1 when the
 * caller should retry later, or a negative value on error.
 */
int amdgpu_plugin_dmabuf_restore(int id);

/* Retry every queued dmabuf dump; returns 0 or the first hard error. */
int try_dump_dmabuf_list(void);

/* Returns a negative value if queued dmabuf fds were left undumped. */
int post_dump_dmabuf_check(void);

/* Returns < 0 for error, > 0 for "not a dmabuf" and 0 for "is a dmabuf". */
int get_dmabuf_info(int fd, struct stat *st);

#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */
|
||||
|
|
@ -47,7 +47,8 @@ int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd)
|
|||
return -1;
|
||||
}
|
||||
|
||||
drmPrimeFDToHandle(fd, dmabuf_fd, &handle);
|
||||
if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle))
|
||||
return -1;
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
|
@ -465,6 +466,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
|
|||
if (work_already_completed(boinfo->handle, rd->drm_render_minor)) {
|
||||
continue;
|
||||
} else if (boinfo->handle != -1) {
|
||||
pr_info("TWI: restore bo %d\n", boinfo->handle);
|
||||
if (boinfo->is_import) {
|
||||
fd_id = amdgpu_id_for_handle(boinfo->handle);
|
||||
if (fd_id == -1) {
|
||||
|
|
@ -472,11 +474,13 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
|
|||
continue;
|
||||
}
|
||||
dmabuf_fd = fdstore_get(fd_id);
|
||||
pr_info("TWI: restore bo %d: fd_id %d, dmabuf_fd %d\n", boinfo->handle, fd_id, dmabuf_fd);
|
||||
}
|
||||
}
|
||||
|
||||
if (boinfo->is_import) {
|
||||
drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle);
|
||||
pr_info("TWI: restore bo imported to handle %d\n", handle);
|
||||
} else {
|
||||
union drm_amdgpu_gem_create create_args = { 0 };
|
||||
|
||||
|
|
@ -493,6 +497,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
|
|||
handle = create_args.out.handle;
|
||||
|
||||
drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd);
|
||||
pr_info("TWI: restore bo created at handle %d and exported to fd %d\n", handle, dmabuf_fd);
|
||||
}
|
||||
|
||||
change_args.handle = handle;
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@
|
|||
#include "amdgpu_drm.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
#include "amdgpu_plugin_drm.h"
|
||||
|
||||
static LIST_HEAD(dumped_fds);
|
||||
static LIST_HEAD(shared_bos);
|
||||
|
|
@ -109,6 +110,46 @@ int record_shared_bo(int handle, bool is_imported)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int handle_for_shared_bo_fd(int fd)
|
||||
{
|
||||
struct dumped_fd *df;
|
||||
int trial_handle;
|
||||
amdgpu_device_handle h_dev;
|
||||
uint32_t major, minor;
|
||||
struct shared_bo *bo;
|
||||
|
||||
list_for_each_entry(df, &dumped_fds, l) {
|
||||
/* see if the gem handle for fd using the hdev for df->fd is the
|
||||
same as bo->handle. */
|
||||
|
||||
if (!df->is_drm) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) {
|
||||
pr_err("Failed to initialize amdgpu device\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
trial_handle = get_gem_handle(h_dev, fd);
|
||||
if (trial_handle < 0)
|
||||
continue;
|
||||
|
||||
pr_info("TWI: Check device %d, got handle %d\n", df->fd, trial_handle);
|
||||
|
||||
list_for_each_entry(bo, &shared_bos, l) {
|
||||
if (bo->handle == trial_handle) {
|
||||
pr_info("TWI: And that handle exists\n");
|
||||
return trial_handle;
|
||||
}
|
||||
}
|
||||
|
||||
amdgpu_device_deinitialize(h_dev);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
int record_completed_work(int handle, int id)
|
||||
{
|
||||
struct restore_completed_work *work;
|
||||
|
|
@ -138,13 +179,6 @@ bool work_already_completed(int handle, int id)
|
|||
|
||||
void clear_restore_state()
|
||||
{
|
||||
while (!list_empty(&shared_dmabuf_fds)) {
|
||||
struct shared_dmabuf *st = list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l);
|
||||
list_del(&st->l);
|
||||
close(st->dmabuf_fd);
|
||||
free(st);
|
||||
}
|
||||
|
||||
while (!list_empty(&completed_work)) {
|
||||
struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l);
|
||||
list_del(&st->l);
|
||||
|
|
|
|||
|
|
@ -53,6 +53,9 @@
|
|||
/* Name of file having serialized data of DRM device */
|
||||
#define IMG_DRM_FILE "amdgpu-renderD-%d.img"
|
||||
|
||||
/* Name of file having serialized data of dmabuf meta */
|
||||
#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img"
|
||||
|
||||
/* Name of file having serialized data of DRM device buffer objects (BOs) */
|
||||
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"
|
||||
|
||||
|
|
@ -61,6 +64,7 @@
|
|||
#define HSAKMT_SHM "/hsakmt_shared_mem"
|
||||
#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore"
|
||||
#define HSAKMT_SEM "hsakmt_semaphore"
|
||||
#define DMABUF_LINK "/dmabuf"
|
||||
|
||||
/* Help macros to build sDMA command packets */
|
||||
#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0))
|
||||
|
|
@ -123,9 +127,7 @@ void clear_dumped_fds();
|
|||
|
||||
bool shared_bo_has_exporter(int handle);
|
||||
int record_shared_bo(int handle, bool is_imported);
|
||||
|
||||
int record_shared_dmabuf_fd(int handle, int dmabuf_fd);
|
||||
int dmabuf_fd_for_handle(int handle);
|
||||
int handle_for_shared_bo_fd(int dmabuf_fd);
|
||||
|
||||
int record_completed_work(int handle, int id);
|
||||
bool work_already_completed(int handle, int id);
|
||||
|
|
|
|||
|
|
@ -89,3 +89,7 @@ message criu_render_node {
|
|||
required uint64 num_of_bos = 4;
|
||||
repeated drm_bo_entry bo_entries = 5;
|
||||
}
|
||||
|
||||
// Image payload written for one checkpointed dmabuf fd.
message criu_dmabuf_node {
	// GEM handle the dmabuf fd imported as at dump time
	required uint32 gem_handle = 1;
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue