mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 02:14:37 +00:00
amdgpu libraries that use dmabuf fds to share GPU memory between processes close the dmabuf fds immediately after using them. However, it is possible that the checkpoint of a process catches one of the dmabuf fds open. In that case, the amdgpu plugin needs to handle it. The checkpoint of the dmabuf fd does require the device file it was exported from to have already been dumped. To identify which device this dmabuf fd was exported from, attempt to import it on each device, then record the dmabuf handle it imports as. This handle can be used to restore it. Signed-off-by: David Francis <David.Francis@amd.com>
145 lines
4.2 KiB
C
145 lines
4.2 KiB
C
#ifndef __AMDGPU_PLUGIN_UTIL_H__
|
|
#define __AMDGPU_PLUGIN_UTIL_H__
|
|
|
|
#include <libdrm/amdgpu.h>
|
|
|
|
#ifndef _GNU_SOURCE
|
|
#define _GNU_SOURCE 1
|
|
#endif
|
|
|
|
/*
 * When COMPILE_TESTS is defined, any existing pr_err/pr_info/pr_debug/
 * pr_perror definitions are dropped and replaced with fprintf() calls that
 * write to stdout and tag each message with the source file and line.
 */
#ifdef COMPILE_TESTS

#undef pr_err
#define pr_err(format, ...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##__VA_ARGS__)

#undef pr_info
#define pr_info(format, ...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##__VA_ARGS__)

#undef pr_debug
#define pr_debug(format, ...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##__VA_ARGS__)

/* Prints the message followed by the current errno and its strerror() text. */
#undef pr_perror
#define pr_perror(format, ...) \
	fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##__VA_ARGS__, errno, strerror(errno))

#endif /* COMPILE_TESTS */
|
|
|
|
/*
 * Log prefix for this plugin. Any definition of LOG_PREFIX made earlier is
 * discarded first; #undef of a name that is not defined is simply ignored,
 * so no #ifdef guard is needed.
 */
#undef LOG_PREFIX
#define LOG_PREFIX "amdgpu_plugin: "
|
|
|
|
/*
 * plugin_log_msg() - verbose tracing for the plugin.
 *
 * With DEBUG defined the message is forwarded to pr_debug(). Otherwise the
 * call expands to an empty statement and its arguments are never evaluated.
 *
 * The no-op form is written as do { } while (0) rather than a bare { } block
 * so that "plugin_log_msg(...);" is exactly one statement. A bare block
 * followed by ';' is two statements and breaks an unbraced if/else.
 */
#ifdef DEBUG
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
#else
#define plugin_log_msg(fmt, ...) \
	do {                     \
	} while (0)
#endif
|
|
|
|
|
|
/* Device node through which the KFD driver is reached */
#define AMDGPU_KFD_DEVICE           "/dev/kfd"

/* printf-style template for DRM render nodes; the caller supplies the %d */
#define AMDGPU_DRM_DEVICE           "/dev/dri/renderD%d"

/* Oldest KFD ioctl interface version (major, minor) that supports C&R */
#define KFD_IOCTL_MAJOR_VERSION     1
#define MIN_KFD_IOCTL_MINOR_VERSION 8
|
|
|
|
/*
 * printf-style templates for the image files written by the plugin.
 * The conversion arguments are supplied by the caller.
 */

/* Serialized data of a KFD device */
#define IMG_KFD_FILE       "amdgpu-kfd-%d.img"

/* Serialized data of KFD buffer objects (BOs) */
#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img"

/* Serialized data of a DRM device */
#define IMG_DRM_FILE       "amdgpu-renderD-%d.img"

/* Serialized dmabuf metadata */
#define IMG_DMABUF_FILE    "amdgpu-dmabuf_%d.img"

/* Serialized data of DRM device buffer objects (BOs) */
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"
|
|
|
|
/*
 * Names used when checkpointing and restoring ROCm files: the hsakmt shared
 * memory object and semaphore, each given both as a path under /dev/shm and
 * as a bare name.
 */
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
#define HSAKMT_SHM      "/hsakmt_shared_mem"
#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore"
#define HSAKMT_SEM      "hsakmt_semaphore"

/* NOTE(review): presumably the link name that identifies a dmabuf fd; confirm at the use sites */
#define DMABUF_LINK     "/dmabuf"
|
|
|
|
/*
 * Helper macros to build SDMA command packets.
 *
 * SDMA_PACKET() packs a header word: op in bits 7:0, sub_op in bits 15:8 and
 * e in bits 31:16. The masks are unsigned constants so the shifts are carried
 * out in unsigned arithmetic. With plain int masks, a value of e that has
 * bit 15 set would be shifted into the sign bit of an int, which is undefined
 * behaviour. The macro therefore yields an unsigned value.
 */
#define SDMA_PACKET(op, sub_op, e) ((((e) & 0xFFFFu) << 16) | (((sub_op) & 0xFFu) << 8) | (((op) & 0xFFu) << 0))

/* Opcode / sub-opcode values passed to SDMA_PACKET() */
#define SDMA_OPCODE_COPY 1
#define SDMA_COPY_SUB_OPCODE_LINEAR 0
#define SDMA_NOP 0

/* Upper bound for a single linear copy: 1ULL << 21 (2097152) */
#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21)
|
|
|
|
/*
 * Kind of SDMA transfer, passed as the 'type' argument of sdma_copy_bo().
 * The enumerator names indicate a read from or a write to VRAM.
 */
enum sdma_op_type {
	SDMA_OP_VRAM_READ = 0,
	SDMA_OP_VRAM_WRITE = 1,
};
|
|
|
|
struct dumped_fd {
|
|
struct list_head l;
|
|
int fd;
|
|
bool is_drm;
|
|
};
|
|
|
|
struct shared_bo {
|
|
struct list_head l;
|
|
int handle;
|
|
bool has_exporter;
|
|
};
|
|
|
|
struct restore_completed_work {
|
|
struct list_head l;
|
|
int handle;
|
|
int id;
|
|
};
|
|
|
|
/* Device topology of the source (SRC) and destination (DEST) platforms */
extern struct tp_system src_topology;
extern struct tp_system dest_topology;

/* Device maps used during the checkpoint and the restore operations */
extern struct device_maps checkpoint_maps;
extern struct device_maps restore_maps;
|
|
|
|
/* NOTE(review): purpose not visible in this header; see the definition */
extern int fd_next;

/*
 * Per-property switches, one for each named device property.
 * NOTE(review): presumably each flag enables or disables the corresponding
 * comparison during restore; confirm where they are defined.
 */
extern bool kfd_fw_version_check;
extern bool kfd_sdma_fw_version_check;
extern bool kfd_caches_count_check;
extern bool kfd_num_gws_check;
extern bool kfd_vram_size_check;
extern bool kfd_numa_check;
extern bool kfd_capability_check;
|
|
|
|
/*
 * File helpers. All take a caller-provided buffer and its length in bytes.
 * The meaning of the int return values is defined by the implementation and
 * is not visible in this header.
 */

/* Read buf_len bytes from the open stream fp into buf. */
int read_fp(FILE *fp, void *buf, const size_t buf_len);

/* Write buf_len bytes from buf to the open stream fp. */
int write_fp(FILE *fp, const void *buf, const size_t buf_len);

/* Read buf_len bytes from the file at file_path into buf. */
int read_file(const char *file_path, void *buf, const size_t buf_len);

/* Write buf_len bytes from buf to the image file named by path. */
int write_img_file(char *path, const void *buf, const size_t buf_len);

/*
 * Open the image file named by path, for writing when 'write' is true.
 * NOTE(review): 'size' is presumably the image size (in or out); confirm.
 */
FILE *open_img_file(char *path, bool write, size_t *size);
|
|
|
|
/*
 * Bookkeeping of dumped file descriptors (see struct dumped_fd).
 *
 * The functions that take no arguments are declared with (void). An empty
 * parameter list "()" is not a prototype in C before C23, so the compiler
 * would accept calls that pass arguments by mistake.
 */

/* Record fd together with its is_drm flag. Return value: see implementation. */
int record_dumped_fd(int fd, bool is_drm);

/* Return the list of recorded fds. */
struct list_head *get_dumped_fds(void);

/* Drop all recorded fds. */
void clear_dumped_fds(void);
|
|
|
|
/*
 * Shared buffer object tracking (see struct shared_bo).
 * NOTE(review): semantics below are inferred from the names; confirm against
 * the implementation.
 */

/* Report whether the shared BO identified by handle has an exporter. */
bool shared_bo_has_exporter(int handle);

/* Record the shared BO identified by handle; is_imported is the caller's flag. */
int record_shared_bo(int handle, bool is_imported);

/* Return the handle that corresponds to the given dmabuf fd. */
int handle_for_shared_bo_fd(int dmabuf_fd);
|
|
|
|
/*
 * Completed-work tracking, keyed by a (handle, id) pair
 * (see struct restore_completed_work).
 */

/* Record the pair (handle, id). Return value: see implementation. */
int record_completed_work(int handle, int id);

/* Report whether the pair (handle, id) is already recorded as completed. */
bool work_already_completed(int handle, int id);
|
|
|
|
/*
 * Reset the restore-time bookkeeping.
 * NOTE(review): exactly which state is cleared is defined by the
 * implementation.
 *
 * Declared with (void) so the declaration is a real prototype and calls that
 * pass arguments are rejected at compile time.
 */
void clear_restore_state(void);
|
|
|
|
/*
 * Print statistics for a list of KFD BO buckets.
 * bo_cnt is presumably the number of entries in bo_list; confirm against the
 * implementation.
 */
void print_kfd_bo_stat(int bo_cnt,
		       struct kfd_criu_bo_bucket *bo_list);
|
|
|
|
/*
 * Copy a buffer object using SDMA.
 *
 * NOTE(review): the parameter descriptions below are inferred from names and
 * types only; confirm against the implementation.
 *
 * shared_fd      fd referring to the shared buffer object
 * size           size of the buffer object
 * storage_fp     stream holding (or receiving) the BO contents
 * buffer         staging buffer supplied by the caller
 * buffer_size    size of the staging buffer
 * h_dev          libdrm amdgpu device handle
 * max_copy_size  upper bound for a single copy
 * type           direction of the transfer (enum sdma_op_type)
 * do_not_free    caller's flag; by its name, suppresses a free
 *
 * Return value: see implementation.
 */
int sdma_copy_bo(int shared_fd,
		 uint64_t size,
		 FILE *storage_fp,
		 void *buffer,
		 size_t buffer_size,
		 amdgpu_device_handle h_dev,
		 uint64_t max_copy_size,
		 enum sdma_op_type type,
		 bool do_not_free);
|
|
|
|
/*
 * NOTE(review): by its name, hands out the dmabuf fd 'fd' associated with
 * 'handle'; the exact mechanism and the return value are defined by the
 * implementation.
 */
int serve_out_dmabuf_fd(int handle, int fd);
|
|
|
|
#endif /* __AMDGPU_PLUGIN_UTIL_H__ */
|