criu/plugins/amdgpu/amdgpu_plugin_util.h
David Francis d43217dadb plugin: Add DUMP_DEVICES_LATE callback
The amdgpu plugin was counting how many files were checkpointed
to determine when it should close the device files.

The number of device files is not consistent; a process may
have multiple copies of the drm device files open.

Instead of doing this counting, add a new callback after all
files are checkpointed, so plugins can clean up their
resources at an appropriate time.

Signed-off-by: David Francis <David.Francis@amd.com>
2025-11-14 18:31:37 +00:00

143 lines
4.1 KiB
C

#ifndef __AMDGPU_PLUGIN_UTIL_H__
#define __AMDGPU_PLUGIN_UTIL_H__
/*
 * Feature-test macros only take effect when they are defined before the
 * first system header is pulled in, so _GNU_SOURCE must precede every
 * #include in this header.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
/* Standard types used by the declarations below: bool, size_t, uint64_t, FILE */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <libdrm/amdgpu.h>
#ifdef COMPILE_TESTS
/*
 * When COMPILE_TESTS is defined, any existing pr_err/pr_info/pr_debug/pr_perror
 * definitions are dropped and replaced by plain fprintf() calls to stdout that
 * prefix each message with the source file and line. pr_perror additionally
 * appends the current errno value, its strerror() text and a newline.
 */
#undef pr_err
#undef pr_info
#undef pr_debug
#undef pr_perror
#define pr_err(format, ...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##__VA_ARGS__)
#define pr_info(format, ...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##__VA_ARGS__)
#define pr_debug(format, ...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##__VA_ARGS__)
#define pr_perror(format, ...)                                                                      \
	fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##__VA_ARGS__, \
		errno, strerror(errno))
#endif
/*
 * Replace whatever LOG_PREFIX was in effect with the plugin's own tag.
 * #undef on a name that is not defined is a no-op, so no guard is needed.
 * NOTE(review): presumably consumed by the pr_* logging macros -- confirm.
 */
#undef LOG_PREFIX
#define LOG_PREFIX "amdgpu_plugin: "
/*
 * plugin_log_msg() - optional plugin trace message.
 *
 * With DEBUG defined the call is forwarded to pr_debug(). Otherwise it
 * expands to an empty statement and its arguments are not evaluated.
 *
 * The no-op form is written as do { } while (0) so that
 * "plugin_log_msg(...);" is exactly one statement in both configurations
 * and stays valid as the body of an unbraced if/else.
 */
#ifdef DEBUG
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
#else
#define plugin_log_msg(fmt, ...) \
	do {                     \
	} while (0)
#endif
/* Path where KFD device is surfaced */
#define AMDGPU_KFD_DEVICE "/dev/kfd"
/*
 * printf-style path template where DRM devices are surfaced; takes one %d.
 * NOTE(review): presumably the render node minor number -- confirm at call sites.
 */
#define AMDGPU_DRM_DEVICE "/dev/dri/renderD%d"
/* Minimum version of the KFD ioctl interface that supports C&R: major 1, minor 8 */
#define KFD_IOCTL_MAJOR_VERSION 1
#define MIN_KFD_IOCTL_MINOR_VERSION 8
/* Name template (one %d) of file having serialized data of KFD device */
#define IMG_KFD_FILE "amdgpu-kfd-%d.img"
/* Name template (%d, %04x) of file having serialized data of KFD buffer objects (BOs) */
#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img"
/* Name template (one %d) of file having serialized data of DRM device */
#define IMG_DRM_FILE "amdgpu-renderD-%d.img"
/* Name template (%d, %d, %04x) of file having serialized data of DRM device buffer objects (BOs) */
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"
/*
 * Helper macros to Checkpoint and Restore a ROCm file.
 * NOTE(review): these look like the shared-memory object and semaphore names
 * used by the hsakmt library, with the *_PATH variants being their locations
 * under /dev/shm -- confirm against the code that uses them.
 */
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
#define HSAKMT_SHM "/hsakmt_shared_mem"
#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore"
#define HSAKMT_SEM "hsakmt_semaphore"
/* Helper macros to build sDMA command packets */
/*
 * SDMA_PACKET() - pack three fields into one 32-bit header word:
 *   bits  7:0   op      (masked to 8 bits)
 *   bits 15:8   sub_op  (masked to 8 bits)
 *   bits 31:16  e       (masked to 16 bits)
 *
 * The masks are unsigned so the shifts are done in unsigned arithmetic.
 * With plain int masks, any e >= 0x8000 would shift a set bit into the
 * sign bit of int, which is undefined behaviour.
 */
#define SDMA_PACKET(op, sub_op, e) ((((e) & 0xFFFFU) << 16) | (((sub_op) & 0xFFU) << 8) | (((op) & 0xFFU) << 0))
/* Opcode / sub-opcode values passed to SDMA_PACKET() */
#define SDMA_OPCODE_COPY 1
#define SDMA_COPY_SUB_OPCODE_LINEAR 0
#define SDMA_NOP 0
/*
 * 2 MiB (1 << 21).
 * NOTE(review): presumably the largest size a single linear copy may cover -- confirm.
 */
#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21)
/*
 * Operation selector taken by sdma_copy_bo() as its `type` argument.
 * NOTE(review): READ/WRITE presumably name the direction relative to VRAM -- confirm.
 */
enum sdma_op_type {
	SDMA_OP_VRAM_READ = 0,
	SDMA_OP_VRAM_WRITE = 1,
};
/*
 * One recorded file descriptor, kept as a node of a linked list.
 * The fields mirror the arguments of record_dumped_fd(fd, is_drm).
 * NOTE(review): presumably allocated by record_dumped_fd() and released by
 * clear_dumped_fds() -- confirm against the implementation.
 */
struct dumped_fd {
/* list linkage */
struct list_head l;
/* the file descriptor that was recorded */
int fd;
/* flag supplied by the caller of record_dumped_fd(); name suggests "fd is a DRM device" */
bool is_drm;
};
/*
 * Per-handle record for a shared buffer object, kept as a node of a linked list.
 * NOTE(review): presumably maintained by record_shared_bo() and queried by
 * shared_bo_has_exporter() -- confirm against the implementation.
 */
struct shared_bo {
/* list linkage */
struct list_head l;
/* identifier of the buffer object */
int handle;
/* NOTE(review): name suggests "an exporting side has been seen for this handle" -- confirm */
bool has_exporter;
};
/*
 * One (handle, id) pair, kept as a node of a linked list.
 * The fields mirror the arguments of record_completed_work(handle, id) and
 * work_already_completed(handle, id).
 * NOTE(review): presumably marks restore work that has already been done so
 * it is not repeated -- confirm against the implementation.
 */
struct restore_completed_work {
/* list linkage */
struct list_head l;
/* first half of the key */
int handle;
/* second half of the key */
int id;
};
/*
 * Helper structures to encode device topology of SRC and DEST platforms.
 * Only declared here; struct tp_system is not defined in this header.
 */
extern struct tp_system src_topology;
extern struct tp_system dest_topology;
/*
 * Helper structures to encode device maps during Checkpoint and Restore operations.
 * Only declared here; struct device_maps is not defined in this header.
 */
extern struct device_maps checkpoint_maps;
extern struct device_maps restore_maps;
/* NOTE(review): purpose not visible from this header -- name suggests the next fd number to use; confirm */
extern int fd_next;
/*
 * NOTE(review): these flags presumably enable or disable individual
 * compatibility checks (firmware version, SDMA firmware version, cache count,
 * GWS count, VRAM size, NUMA, capability) -- confirm where they are defined
 * and read.
 */
extern bool kfd_fw_version_check;
extern bool kfd_sdma_fw_version_check;
extern bool kfd_caches_count_check;
extern bool kfd_num_gws_check;
extern bool kfd_vram_size_check;
extern bool kfd_numa_check;
extern bool kfd_capability_check;
/*
 * Stream and image-file I/O helpers.
 * NOTE(review): the implementations are not visible from this header; the
 * one-line descriptions below are inferred from the signatures only. Return
 * conventions (presumably 0 on success, negative on error) must be confirmed
 * against the definitions.
 */
/* Transfer buf_len bytes from stream fp into buf */
int read_fp(FILE *fp, void *buf, const size_t buf_len);
/* Transfer buf_len bytes from buf to stream fp */
int write_fp(FILE *fp, const void *buf, const size_t buf_len);
/* Read buf_len bytes from the file at file_path into buf */
int read_file(const char *file_path, void *buf, const size_t buf_len);
/* Write buf_len bytes from buf to the image file named by path */
int write_img_file(char *path, const void *buf, const size_t buf_len);
/*
 * Open the image file named by path; `write` selects the access mode.
 * NOTE(review): *size is presumably an in/out value carrying the payload size -- confirm.
 */
FILE *open_img_file(char *path, bool write, size_t *size);
/*
 * Bookkeeping of recorded file descriptors (see struct dumped_fd).
 *
 * The zero-argument functions are declared with (void). Before C23 an empty
 * parameter list declares a function with unspecified parameters, so the
 * compiler would not reject calls that pass arguments by mistake.
 *
 * NOTE(review): behaviour below is inferred from names and signatures --
 * confirm against the implementation.
 */
/* Record fd together with its is_drm flag */
int record_dumped_fd(int fd, bool is_drm);
/* Return the list of recorded descriptors */
struct list_head *get_dumped_fds(void);
/* Drop all recorded descriptors */
void clear_dumped_fds(void);
/*
 * Shared buffer object, dmabuf fd and completed-work bookkeeping
 * (see struct shared_bo and struct restore_completed_work).
 *
 * clear_restore_state is declared with (void). Before C23 an empty parameter
 * list declares a function with unspecified parameters, so the compiler
 * would not reject calls that pass arguments by mistake.
 *
 * NOTE(review): behaviour below is inferred from names and signatures --
 * confirm against the implementation.
 */
/* Query the has_exporter state recorded for handle */
bool shared_bo_has_exporter(int handle);
/* Record a shared buffer object; is_imported is supplied by the caller */
int record_shared_bo(int handle, bool is_imported);
/* Associate dmabuf_fd with handle */
int record_shared_dmabuf_fd(int handle, int dmabuf_fd);
/* Look up the dmabuf fd associated with handle */
int dmabuf_fd_for_handle(int handle);
/* Record the (handle, id) pair as completed */
int record_completed_work(int handle, int id);
/* Report whether the (handle, id) pair has been recorded as completed */
bool work_already_completed(int handle, int id);
/* Drop the recorded restore state */
void clear_restore_state(void);
/*
 * NOTE(review): the implementations are not visible from this header; the
 * descriptions below are inferred from names and signatures -- confirm.
 */
/* Print information about the bo_cnt entries of bo_list */
void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list);
/*
 * Copy `size` bytes between a buffer object and the stream storage_fp.
 * `type` selects the operation (see enum sdma_op_type); `buffer` of
 * `buffer_size` bytes is supplied by the caller, and h_dev is the libdrm
 * amdgpu device handle to use.
 * NOTE(review): presumably shared_fd is the dmabuf fd of the buffer object,
 * max_copy_size bounds a single copy command, and do_not_free keeps a
 * resource alive after the call -- confirm each against the definition.
 */
int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free);
/* NOTE(review): presumably hands fd out as the dmabuf fd for handle -- confirm */
int serve_out_dmabuf_fd(int handle, int fd);
#endif /* __AMDGPU_PLUGIN_UTIL_H__ */