mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-22 18:05:10 +00:00
plugin: Add DUMP_DEVICES_LATE callback
The amdgpu plugin was counting how many files were checkpointed to determine when it should close the device files. The number of device files is not consistent; a process may have multiple copies of the drm device files open. Instead of doing this counting, add a new callback after all files are checkpointed, so plugins can clean up their resources at an appropriate time. Signed-off-by: David Francis <David.Francis@amd.com>
This commit is contained in:
parent
db0ec806d1
commit
d43217dadb
6 changed files with 60 additions and 51 deletions
|
|
@ -2247,6 +2247,10 @@ int cr_dump_tasks(pid_t pid)
|
|||
goto err;
|
||||
}
|
||||
|
||||
ret = run_plugins(DUMP_DEVICES_LATE, pid);
|
||||
if (ret && ret != -ENOTSUP)
|
||||
goto err;
|
||||
|
||||
if (parent_ie) {
|
||||
inventory_entry__free_unpacked(parent_ie, NULL);
|
||||
parent_ie = NULL;
|
||||
|
|
|
|||
|
|
@ -64,6 +64,8 @@ enum {
|
|||
|
||||
CR_PLUGIN_HOOK__RESTORE_INIT = 13,
|
||||
|
||||
CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14,
|
||||
|
||||
CR_PLUGIN_HOOK__MAX
|
||||
};
|
||||
|
||||
|
|
@ -84,6 +86,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
|
|||
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
|
||||
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
|
||||
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void);
|
||||
/* Invoked via run_plugins(DUMP_DEVICES_LATE, pid) once all files are dumped; the argument is that pid. */
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int pid);
|
||||
|
||||
enum {
|
||||
CR_PLUGIN_STAGE__DUMP,
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
|
|||
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
|
||||
__assign_hook(POST_FORKING, "cr_plugin_post_forking");
|
||||
__assign_hook(RESTORE_INIT, "cr_plugin_restore_init");
|
||||
__assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late");
|
||||
|
||||
#undef __assign_hook
|
||||
|
||||
|
|
|
|||
|
|
@ -58,13 +58,6 @@ struct vma_metadata {
|
|||
|
||||
/************************************ Global Variables ********************************************/
|
||||
|
||||
/**
|
||||
* FD of KFD device used to checkpoint. On a multi-process
|
||||
* tree the order of checkpointing goes from parent to child
|
||||
* and so on - so saving the FD will not be overwritten
|
||||
*/
|
||||
static int kfd_checkpoint_fd;
|
||||
|
||||
static LIST_HEAD(update_vma_info_list);
|
||||
|
||||
size_t kfd_max_buffer_size;
|
||||
|
|
@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int unpause_process(int fd)
|
||||
int amdgpu_unpause_processes(int pid)
|
||||
{
|
||||
int ret = 0;
|
||||
struct kfd_ioctl_criu_args args = { 0 };
|
||||
struct list_head *l = get_dumped_fds();
|
||||
struct dumped_fd *st;
|
||||
|
||||
args.op = KFD_CRIU_OP_UNPAUSE;
|
||||
list_for_each_entry(st, l, l) {
|
||||
if (st->is_drm) {
|
||||
close(st->fd);
|
||||
} else {
|
||||
args.op = KFD_CRIU_OP_UNPAUSE;
|
||||
|
||||
ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
|
||||
if (ret) {
|
||||
pr_perror("Failed to unpause process");
|
||||
goto exit;
|
||||
ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args);
|
||||
if (ret) {
|
||||
pr_perror("Failed to unpause process");
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reset the KFD FD
|
||||
kfd_checkpoint_fd = -1;
|
||||
sys_close_drm_render_devices(&src_topology);
|
||||
|
||||
exit:
|
||||
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
||||
clear_dumped_fds();
|
||||
|
||||
return ret;
|
||||
}
|
||||
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes)
|
||||
|
||||
int store_dmabuf_fd(int handle, int fd)
|
||||
{
|
||||
|
|
@ -1401,9 +1400,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||
return -1;
|
||||
}
|
||||
|
||||
/* Initialize number of device files that will be checkpointed */
|
||||
init_gpu_count(&src_topology);
|
||||
|
||||
/* Check whether this plugin was called for kfd or render nodes */
|
||||
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
|
||||
|
||||
|
|
@ -1415,11 +1411,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* Invoke unpause process if needed */
|
||||
decrement_checkpoint_count();
|
||||
if (checkpoint_is_complete()) {
|
||||
ret = unpause_process(kfd_checkpoint_fd);
|
||||
}
|
||||
ret = record_dumped_fd(fd, true);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* Need to return success here so that criu can call plugins for renderD nodes */
|
||||
return ret;
|
||||
|
|
@ -1517,14 +1511,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||
|
||||
xfree(buf);
|
||||
|
||||
exit:
|
||||
/* Restore all queues if conditions permit */
|
||||
kfd_checkpoint_fd = fd;
|
||||
decrement_checkpoint_count();
|
||||
if (checkpoint_is_complete()) {
|
||||
ret = unpause_process(fd);
|
||||
}
|
||||
ret = record_dumped_fd(fd, false);
|
||||
if (ret)
|
||||
goto exit;
|
||||
|
||||
exit:
|
||||
xfree((void *)args.devices);
|
||||
xfree((void *)args.bos);
|
||||
xfree((void *)args.priv_data);
|
||||
|
|
|
|||
|
|
@ -38,9 +38,7 @@
|
|||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
|
||||
/* Tracks number of device files that need to be checkpointed */
|
||||
static int dev_file_cnt = 0;
|
||||
|
||||
static LIST_HEAD(dumped_fds);
|
||||
static LIST_HEAD(shared_bos);
|
||||
static LIST_HEAD(completed_work);
|
||||
|
||||
|
|
@ -52,23 +50,25 @@ struct tp_system dest_topology;
|
|||
struct device_maps checkpoint_maps;
|
||||
struct device_maps restore_maps;
|
||||
|
||||
bool checkpoint_is_complete()
|
||||
int record_dumped_fd(int fd, bool is_drm)
|
||||
{
|
||||
return (dev_file_cnt == 0);
|
||||
int newfd = dup(fd);
|
||||
|
||||
if (newfd < 0)
|
||||
return newfd;
|
||||
struct dumped_fd *st = malloc(sizeof(struct dumped_fd));
|
||||
if (!st)
|
||||
return -1;
|
||||
st->fd = newfd;
|
||||
st->is_drm = is_drm;
|
||||
list_add(&st->l, &dumped_fds);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void decrement_checkpoint_count()
|
||||
struct list_head *get_dumped_fds()
|
||||
{
|
||||
dev_file_cnt--;
|
||||
}
|
||||
|
||||
void init_gpu_count(struct tp_system *topo)
|
||||
{
|
||||
if (dev_file_cnt != 0)
|
||||
return;
|
||||
|
||||
/* We add ONE to include checkpointing of KFD device */
|
||||
dev_file_cnt = 1 + topology_gpu_count(topo);
|
||||
return &dumped_fds;
|
||||
}
|
||||
|
||||
bool shared_bo_has_exporter(int handle)
|
||||
|
|
@ -152,6 +152,16 @@ void clear_restore_state()
|
|||
}
|
||||
}
|
||||
|
||||
void clear_dumped_fds()
|
||||
{
|
||||
while (!list_empty(&dumped_fds)) {
|
||||
struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l);
|
||||
list_del(&st->l);
|
||||
close(st->fd);
|
||||
free(st);
|
||||
}
|
||||
}
|
||||
|
||||
int read_fp(FILE *fp, void *buf, const size_t buf_len)
|
||||
{
|
||||
size_t len_read;
|
||||
|
|
|
|||
|
|
@ -117,9 +117,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len);
|
|||
int write_img_file(char *path, const void *buf, const size_t buf_len);
|
||||
FILE *open_img_file(char *path, bool write, size_t *size);
|
||||
|
||||
bool checkpoint_is_complete();
|
||||
void decrement_checkpoint_count();
|
||||
void init_gpu_count(struct tp_system *topology);
|
||||
int record_dumped_fd(int fd, bool is_drm);
|
||||
struct list_head *get_dumped_fds();
|
||||
void clear_dumped_fds();
|
||||
|
||||
bool shared_bo_has_exporter(int handle);
|
||||
int record_shared_bo(int handle, bool is_imported);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue