mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 02:14:37 +00:00
amdgpu_plugin: Refactor code in preparation to support C&R for DRM devices
Add a new compilation unit to host the symbols and methods that will be needed to checkpoint and restore (C&R) DRM devices. Refactor the code that indicates support for C&R and the code that checkpoints KFD and DRM devices.
Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
This commit is contained in:
parent
b689a6710c
commit
733ef96315
9 changed files with 460 additions and 247 deletions
|
|
@ -28,7 +28,7 @@ endif
|
|||
criu-amdgpu.pb-c.c: criu-amdgpu.proto
|
||||
protoc-c --proto_path=. --c_out=. criu-amdgpu.proto
|
||||
|
||||
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_topology.c criu-amdgpu.pb-c.c
|
||||
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c
|
||||
$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
|
||||
|
||||
amdgpu_plugin_clean:
|
||||
|
|
|
|||
|
|
@ -30,55 +30,14 @@
|
|||
#include "files.h"
|
||||
|
||||
#include "common/list.h"
|
||||
#include "amdgpu_plugin_drm.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
|
||||
#include "img-streamer.h"
|
||||
#include "image.h"
|
||||
#include "cr_options.h"
|
||||
|
||||
#define AMDGPU_KFD_DEVICE "/dev/kfd"
|
||||
#define PROCPIDMEM "/proc/%d/mem"
|
||||
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
|
||||
#define HSAKMT_SHM "/hsakmt_shared_mem"
|
||||
#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore"
|
||||
#define HSAKMT_SEM "hsakmt_semaphore"
|
||||
|
||||
#define KFD_IOCTL_MAJOR_VERSION 1
|
||||
#define MIN_KFD_IOCTL_MINOR_VERSION 8
|
||||
|
||||
#define IMG_KFD_FILE "amdgpu-kfd-%d.img"
|
||||
#define IMG_RENDERD_FILE "amdgpu-renderD-%d.img"
|
||||
#define IMG_PAGES_FILE "amdgpu-pages-%d-%04x.img"
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE 1
|
||||
#endif
|
||||
|
||||
#ifdef LOG_PREFIX
|
||||
#undef LOG_PREFIX
|
||||
#endif
|
||||
#define LOG_PREFIX "amdgpu_plugin: "
|
||||
|
||||
#ifdef DEBUG
|
||||
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
|
||||
#else
|
||||
#define plugin_log_msg(fmt, ...) \
|
||||
{ \
|
||||
}
|
||||
#endif
|
||||
|
||||
#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0))
|
||||
|
||||
#define SDMA_OPCODE_COPY 1
|
||||
#define SDMA_COPY_SUB_OPCODE_LINEAR 0
|
||||
#define SDMA_NOP 0
|
||||
#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21)
|
||||
|
||||
enum sdma_op_type {
|
||||
SDMA_OP_VRAM_READ,
|
||||
SDMA_OP_VRAM_WRITE,
|
||||
};
|
||||
|
||||
struct vma_metadata {
|
||||
struct list_head list;
|
||||
uint64_t old_pgoff;
|
||||
|
|
@ -89,143 +48,13 @@ struct vma_metadata {
|
|||
};
|
||||
|
||||
/************************************ Global Variables ********************************************/
|
||||
struct tp_system src_topology;
|
||||
struct tp_system dest_topology;
|
||||
|
||||
struct device_maps checkpoint_maps;
|
||||
struct device_maps restore_maps;
|
||||
|
||||
extern int fd_next;
|
||||
|
||||
static LIST_HEAD(update_vma_info_list);
|
||||
|
||||
extern bool kfd_fw_version_check;
|
||||
extern bool kfd_sdma_fw_version_check;
|
||||
extern bool kfd_caches_count_check;
|
||||
extern bool kfd_num_gws_check;
|
||||
extern bool kfd_vram_size_check;
|
||||
extern bool kfd_numa_check;
|
||||
extern bool kfd_capability_check;
|
||||
|
||||
size_t kfd_max_buffer_size;
|
||||
|
||||
/**************************************************************************************************/
|
||||
|
||||
int write_fp(FILE *fp, const void *buf, const size_t buf_len)
|
||||
{
|
||||
size_t len_write;
|
||||
|
||||
len_write = fwrite(buf, 1, buf_len, fp);
|
||||
if (len_write != buf_len) {
|
||||
pr_perror("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len);
|
||||
return -EIO;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int read_fp(FILE *fp, void *buf, const size_t buf_len)
|
||||
{
|
||||
size_t len_read;
|
||||
|
||||
len_read = fread(buf, 1, buf_len, fp);
|
||||
if (len_read != buf_len) {
|
||||
pr_perror("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len);
|
||||
return -EIO;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Open an image file
|
||||
*
|
||||
* We store the size of the actual contents in the first 8-bytes of the file. This allows us to
|
||||
* determine the file size when using criu_image_streamer when fseek and fstat are not available.
|
||||
* The FILE * returned is already at the location of the first actual contents.
|
||||
*
|
||||
* @param path The file path
|
||||
* @param write False for read, true for write
|
||||
* @param size Size of actual contents
|
||||
* @return FILE *if successful, NULL if failed
|
||||
*/
|
||||
FILE *open_img_file(char *path, bool write, size_t *size)
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
int fd, ret;
|
||||
|
||||
if (opts.stream)
|
||||
fd = img_streamer_open(path, write ? O_DUMP : O_RSTR);
|
||||
else
|
||||
fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600);
|
||||
|
||||
if (fd < 0) {
|
||||
pr_perror("%s: Failed to open for %s", path, write ? "write" : "read");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
fp = fdopen(fd, write ? "w" : "r");
|
||||
if (!fp) {
|
||||
pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read");
|
||||
close(fd);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (write)
|
||||
ret = write_fp(fp, size, sizeof(*size));
|
||||
else
|
||||
ret = read_fp(fp, size, sizeof(*size));
|
||||
|
||||
if (ret) {
|
||||
pr_perror("%s:Failed to access file size", path);
|
||||
fclose(fp);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size);
|
||||
return fp;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Write an image file
|
||||
*
|
||||
* We store the size of the actual contents in the first 8-bytes of the file. This allows us to
|
||||
* determine the file size when using criu_image_streamer when fseek and fstat are not available.
|
||||
*
|
||||
* @param path The file path
|
||||
* @param buf pointer to data to be written
|
||||
* @param buf_len size of buf
|
||||
* @return 0 if successful. -errno on failure
|
||||
*/
|
||||
int write_img_file(char *path, const void *buf, const size_t buf_len)
|
||||
{
|
||||
int ret;
|
||||
FILE *fp;
|
||||
size_t len = buf_len;
|
||||
|
||||
fp = open_img_file(path, true, &len);
|
||||
if (!fp)
|
||||
return -errno;
|
||||
|
||||
ret = write_fp(fp, buf, buf_len);
|
||||
fclose(fp); /* this will also close fd */
|
||||
return ret;
|
||||
}
|
||||
|
||||
int read_file(const char *file_path, void *buf, const size_t buf_len)
|
||||
{
|
||||
int ret;
|
||||
FILE *fp;
|
||||
|
||||
fp = fopen(file_path, "r");
|
||||
if (!fp) {
|
||||
pr_perror("Cannot fopen %s", file_path);
|
||||
return -errno;
|
||||
}
|
||||
|
||||
ret = read_fp(fp, buf, buf_len);
|
||||
fclose(fp); /* this will also close fd */
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Call ioctl, restarting if it is interrupted */
|
||||
int kmtIoctl(int fd, unsigned long request, void *arg)
|
||||
{
|
||||
|
|
@ -263,21 +92,21 @@ static void free_e(CriuKfd *e)
|
|||
|
||||
static int allocate_device_entries(CriuKfd *e, int num_of_devices)
|
||||
{
|
||||
e->device_entries = xmalloc(sizeof(DeviceEntry *) * num_of_devices);
|
||||
e->device_entries = xmalloc(sizeof(KfdDeviceEntry *) * num_of_devices);
|
||||
if (!e->device_entries) {
|
||||
pr_err("Failed to allocate device_entries\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_of_devices; i++) {
|
||||
DeviceEntry *entry = xzalloc(sizeof(*entry));
|
||||
KfdDeviceEntry *entry = xzalloc(sizeof(*entry));
|
||||
|
||||
if (!entry) {
|
||||
pr_err("Failed to allocate entry\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
device_entry__init(entry);
|
||||
kfd_device_entry__init(entry);
|
||||
|
||||
e->device_entries[i] = entry;
|
||||
e->n_device_entries++;
|
||||
|
|
@ -287,21 +116,21 @@ static int allocate_device_entries(CriuKfd *e, int num_of_devices)
|
|||
|
||||
static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr)
|
||||
{
|
||||
e->bo_entries = xmalloc(sizeof(BoEntry *) * num_bos);
|
||||
e->bo_entries = xmalloc(sizeof(KfdBoEntry *) * num_bos);
|
||||
if (!e->bo_entries) {
|
||||
pr_err("Failed to allocate bo_info\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_bos; i++) {
|
||||
BoEntry *entry = xzalloc(sizeof(*entry));
|
||||
KfdBoEntry *entry = xzalloc(sizeof(*entry));
|
||||
|
||||
if (!entry) {
|
||||
pr_err("Failed to allocate botest\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
bo_entry__init(entry);
|
||||
kfd_bo_entry__init(entry);
|
||||
|
||||
e->bo_entries[i] = entry;
|
||||
e->n_bo_entries++;
|
||||
|
|
@ -309,13 +138,13 @@ static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucke
|
|||
return 0;
|
||||
}
|
||||
|
||||
int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceEntry **deviceEntries)
|
||||
int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDeviceEntry **deviceEntries)
|
||||
{
|
||||
uint32_t devinfo_index = 0;
|
||||
struct tp_node *node;
|
||||
|
||||
list_for_each_entry(node, &sys->nodes, listm_system) {
|
||||
DeviceEntry *devinfo = deviceEntries[devinfo_index++];
|
||||
KfdDeviceEntry *devinfo = deviceEntries[devinfo_index++];
|
||||
|
||||
devinfo->node_id = node->id;
|
||||
|
||||
|
|
@ -383,11 +212,11 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceE
|
|||
return 0;
|
||||
}
|
||||
|
||||
int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys)
|
||||
int devinfo_to_topology(KfdDeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys)
|
||||
{
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
struct tp_node *node;
|
||||
DeviceEntry *devinfo = devinfos[i];
|
||||
KfdDeviceEntry *devinfo = devinfos[i];
|
||||
|
||||
node = sys_add_node(sys, devinfo->node_id, devinfo->gpu_id);
|
||||
if (!node)
|
||||
|
|
@ -549,7 +378,7 @@ struct thread_data {
|
|||
uint32_t gpu_id;
|
||||
pid_t pid;
|
||||
struct kfd_criu_bo_bucket *bo_buckets;
|
||||
BoEntry **bo_entries;
|
||||
KfdBoEntry **bo_entries;
|
||||
int drm_fd;
|
||||
int ret;
|
||||
int id; /* File ID used by CRIU to identify KFD image for this process */
|
||||
|
|
@ -557,8 +386,7 @@ struct thread_data {
|
|||
|
||||
int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
|
||||
{
|
||||
struct stat st_kfd, st_dri_min;
|
||||
char img_path[128];
|
||||
struct stat st_kfd;
|
||||
int ret = 0;
|
||||
|
||||
pr_debug("Enter %s\n", __func__);
|
||||
|
|
@ -568,27 +396,18 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
|
|||
return ret;
|
||||
}
|
||||
|
||||
snprintf(img_path, sizeof(img_path), "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE);
|
||||
|
||||
ret = stat(img_path, &st_dri_min);
|
||||
if (ret == -1) {
|
||||
pr_perror("stat error for %s", img_path);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (major(st_buf->st_rdev) == major(st_kfd.st_rdev) || ((major(st_buf->st_rdev) == major(st_dri_min.st_rdev)) &&
|
||||
(minor(st_buf->st_rdev) >= minor(st_dri_min.st_rdev) &&
|
||||
minor(st_buf->st_rdev) >= DRM_FIRST_RENDER_NODE))) {
|
||||
/* If input device is KFD return device as supported */
|
||||
if (major(st_buf->st_rdev) == major(st_kfd.st_rdev)) {
|
||||
pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev));
|
||||
pr_debug("AMD KFD(maj) = %d, DRI(maj,min) = %d:%d VMA Device fd(maj,min) = %d:%d\n",
|
||||
major(st_kfd.st_rdev), major(st_dri_min.st_rdev), minor(st_dri_min.st_rdev),
|
||||
major(st_buf->st_rdev), minor(st_buf->st_rdev));
|
||||
/* VMA belongs to kfd */
|
||||
return 0;
|
||||
}
|
||||
|
||||
pr_perror("Can't handle the VMA mapping");
|
||||
return -ENOTSUP;
|
||||
/* Determine if input is a DRM device and therefore is supported */
|
||||
ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf);
|
||||
if (ret)
|
||||
pr_perror("%s(), Can't handle VMAs of input device\n", __func__);
|
||||
|
||||
return ret;
|
||||
}
|
||||
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
|
||||
|
||||
|
|
@ -655,8 +474,9 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va,
|
|||
amdgpu_bo_free(h_bo);
|
||||
}
|
||||
|
||||
int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, void *buffer, size_t buffer_size,
|
||||
amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type)
|
||||
static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
|
||||
void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
|
||||
uint64_t max_copy_size, enum sdma_op_type type)
|
||||
{
|
||||
uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
|
||||
uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size;
|
||||
|
|
@ -954,7 +774,7 @@ void *dump_bo_contents(void *_thread_data)
|
|||
goto exit;
|
||||
}
|
||||
|
||||
snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id);
|
||||
snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id);
|
||||
bo_contents_fp = open_img_file(img_path, true, &image_size);
|
||||
if (!bo_contents_fp) {
|
||||
pr_perror("Cannot fopen %s", img_path);
|
||||
|
|
@ -1027,7 +847,7 @@ void *restore_bo_contents(void *_thread_data)
|
|||
max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
|
||||
SDMA_LINEAR_COPY_MAX_SIZE - 1;
|
||||
|
||||
snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id);
|
||||
snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id);
|
||||
bo_contents_fp = open_img_file(img_path, false, &image_size);
|
||||
if (!bo_contents_fp) {
|
||||
pr_perror("Cannot fopen %s", img_path);
|
||||
|
|
@ -1234,7 +1054,7 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
|
|||
|
||||
for (i = 0; i < e->num_of_bos; i++) {
|
||||
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
|
||||
BoEntry *boinfo = e->bo_entries[i];
|
||||
KfdBoEntry *boinfo = e->bo_entries[i];
|
||||
|
||||
boinfo->gpu_id = bo_bucket->gpu_id;
|
||||
boinfo->addr = bo_bucket->addr;
|
||||
|
|
@ -1391,7 +1211,7 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||
|
||||
criu_render_node__pack(&rd, buf);
|
||||
|
||||
snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id);
|
||||
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
|
||||
ret = write_img_file(img_path, buf, len);
|
||||
if (ret) {
|
||||
xfree(buf);
|
||||
|
|
@ -1399,6 +1219,7 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||
}
|
||||
|
||||
xfree(buf);
|
||||
|
||||
/* Need to return success here so that criu can call plugins for renderD nodes */
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -1531,7 +1352,7 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
|
|||
|
||||
for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) {
|
||||
struct kfd_criu_device_bucket *device_bucket;
|
||||
DeviceEntry *devinfo = e->device_entries[entries_i];
|
||||
KfdDeviceEntry *devinfo = e->device_entries[entries_i];
|
||||
struct tp_node *tp_node;
|
||||
|
||||
if (!devinfo->gpu_id)
|
||||
|
|
@ -1581,7 +1402,7 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
|
|||
|
||||
for (int i = 0; i < args->num_bos; i++) {
|
||||
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
|
||||
BoEntry *bo_entry = e->bo_entries[i];
|
||||
KfdBoEntry *bo_entry = e->bo_entries[i];
|
||||
|
||||
bo_bucket->gpu_id = bo_entry->gpu_id;
|
||||
bo_bucket->addr = bo_entry->addr;
|
||||
|
|
@ -1736,7 +1557,7 @@ int amdgpu_plugin_restore_file(int id)
|
|||
* TODO: Currently, this code will only work if this function is called for /dev/kfd
|
||||
* first as we assume restore_maps is already filled. Need to fix this later.
|
||||
*/
|
||||
snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id);
|
||||
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
|
||||
pr_info("Restoring RenderD %s\n", img_path);
|
||||
|
||||
img_fp = open_img_file(img_path, false, &img_size);
|
||||
|
|
|
|||
63
plugins/amdgpu/amdgpu_plugin_drm.c
Normal file
63
plugins/amdgpu/amdgpu_plugin_drm.c
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/sysmacros.h>
|
||||
#include <linux/limits.h>
|
||||
|
||||
#include <dirent.h>
|
||||
#include "common/list.h"
|
||||
|
||||
#include "criu-amdgpu.pb-c.h"
|
||||
|
||||
#include <xf86drm.h>
|
||||
#include <libdrm/amdgpu.h>
|
||||
|
||||
#include "xmalloc.h"
|
||||
#include "criu-log.h"
|
||||
#include "kfd_ioctl.h"
|
||||
#include "amdgpu_plugin_drm.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
|
||||
|
||||
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
struct stat drm;
|
||||
int ret = 0;
|
||||
|
||||
snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE);
|
||||
ret = stat(path, &drm);
|
||||
if (ret == -1) {
|
||||
pr_err("Error in getting stat for: %s", path);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if ((major(st->st_rdev) != major(drm.st_rdev)) ||
|
||||
(minor(st->st_rdev) < minor(drm.st_rdev)) ||
|
||||
(minor(st->st_rdev) > DRM_LAST_RENDER_NODE)) {
|
||||
pr_err("Can't handle VMA mapping of input device\n");
|
||||
return -ENOTSUP;
|
||||
}
|
||||
|
||||
pr_debug("AMD DRI(maj,min) = %d:%d VMA Device FD(maj,min) = %d:%d\n",
|
||||
major(drm.st_rdev), minor(drm.st_rdev),
|
||||
major(st->st_rdev), minor(st->st_rdev));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
22
plugins/amdgpu/amdgpu_plugin_drm.h
Normal file
22
plugins/amdgpu/amdgpu_plugin_drm.h
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
#ifndef __AMDGPU_PLUGIN_DRM_H__
|
||||
#define __AMDGPU_PLUGIN_DRM_H__
|
||||
|
||||
#include <dirent.h>
|
||||
#include "common/list.h"
|
||||
|
||||
#include "xmalloc.h"
|
||||
#include "criu-log.h"
|
||||
#include "kfd_ioctl.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
|
||||
|
||||
/* Forward declaration so the prototype below does not depend on include order */
struct stat;

/**
 * Determines if VMA's of input file descriptor belong to amdgpu's
 * DRM device and are therefore supported
 *
 * @param fd File descriptor of the mapped device
 * @param st stat data of the device backing the VMA
 * @return 0 if the device is supported, non-zero otherwise
 */
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st);
|
||||
|
||||
|
||||
#endif /* __AMDGPU_PLUGIN_DRM_H__ */
|
||||
|
||||
|
|
@ -16,35 +16,11 @@
|
|||
|
||||
#include "xmalloc.h"
|
||||
#include "kfd_ioctl.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
|
||||
#define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/"
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE 1
|
||||
#endif
|
||||
|
||||
#ifdef COMPILE_TESTS
|
||||
#undef pr_err
|
||||
#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg)
|
||||
#undef pr_info
|
||||
#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg)
|
||||
#undef pr_debug
|
||||
#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg)
|
||||
|
||||
#undef pr_perror
|
||||
#define pr_perror(format, arg...) \
|
||||
fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno))
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
|
||||
#else
|
||||
#define plugin_log_msg(fmt, ...) \
|
||||
{ \
|
||||
}
|
||||
#endif
|
||||
|
||||
/* User override options */
|
||||
/* Skip firmware version check */
|
||||
bool kfd_fw_version_check = true;
|
||||
|
|
@ -840,6 +816,9 @@ void topology_free(struct tp_system *sys)
|
|||
list_del(&p2pgroup->listm_system);
|
||||
xfree(p2pgroup);
|
||||
}
|
||||
|
||||
/* Update Topology as being freed */
|
||||
sys->parsed = false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -1461,3 +1440,15 @@ int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys,
|
|||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int topology_gpu_count(struct tp_system *sys)
|
||||
{
|
||||
struct tp_node *node;
|
||||
int count = 0;
|
||||
|
||||
list_for_each_entry(node, &sys->nodes, listm_system)
|
||||
if (NODE_IS_GPU(node))
|
||||
count++;
|
||||
return count;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -107,6 +107,8 @@ int topology_parse(struct tp_system *topology, const char *msg);
|
|||
int topology_determine_iolinks(struct tp_system *sys);
|
||||
void topology_print(const struct tp_system *sys, const char *msg);
|
||||
|
||||
int topology_gpu_count(struct tp_system *topology);
|
||||
|
||||
struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id);
|
||||
|
||||
struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id);
|
||||
|
|
|
|||
208
plugins/amdgpu/amdgpu_plugin_util.c
Executable file
208
plugins/amdgpu/amdgpu_plugin_util.c
Executable file
|
|
@ -0,0 +1,208 @@
|
|||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/sysmacros.h>
|
||||
#include <linux/limits.h>
|
||||
|
||||
#include <dirent.h>
|
||||
#include "common/list.h"
|
||||
|
||||
#include <xf86drm.h>
|
||||
#include <libdrm/amdgpu.h>
|
||||
|
||||
#include "criu-plugin.h"
|
||||
#include "plugin.h"
|
||||
#include "criu-amdgpu.pb-c.h"
|
||||
|
||||
#include "img-streamer.h"
|
||||
#include "image.h"
|
||||
#include "cr_options.h"
|
||||
|
||||
#include "xmalloc.h"
|
||||
#include "criu-log.h"
|
||||
#include "kfd_ioctl.h"
|
||||
#include "amdgpu_drm.h"
|
||||
#include "amdgpu_plugin_util.h"
|
||||
#include "amdgpu_plugin_topology.h"
|
||||
|
||||
/* Tracks number of device files that need to be checkpointed */
|
||||
static int dev_file_cnt = 0;
|
||||
|
||||
/* Helper structures to encode device topology of SRC and DEST platforms */
|
||||
struct tp_system src_topology;
|
||||
struct tp_system dest_topology;
|
||||
|
||||
/* Helper structures to encode device maps during Checkpoint and Restore operations */
|
||||
struct device_maps checkpoint_maps;
|
||||
struct device_maps restore_maps;
|
||||
|
||||
bool checkpoint_is_complete()
|
||||
{
|
||||
return (dev_file_cnt == 0);
|
||||
}
|
||||
|
||||
void decrement_checkpoint_count()
|
||||
{
|
||||
dev_file_cnt--;
|
||||
}
|
||||
|
||||
void init_gpu_count(struct tp_system *topo)
|
||||
{
|
||||
if (dev_file_cnt != 0)
|
||||
return;
|
||||
|
||||
/* We add ONE to include checkpointing of KFD device */
|
||||
dev_file_cnt = 1 + topology_gpu_count(topo);
|
||||
}
|
||||
|
||||
/**
 * Read exactly @buf_len bytes from @fp into @buf.
 *
 * @param fp      Stream to read from
 * @param buf     Destination buffer, at least @buf_len bytes
 * @param buf_len Number of bytes expected
 * @return 0 on success, -EIO if fewer than @buf_len bytes were read
 */
int read_fp(FILE *fp, void *buf, const size_t buf_len)
{
	size_t len_read;

	len_read = fread(buf, 1, buf_len, fp);
	if (len_read != buf_len) {
		/* size_t needs %zu; the message must end the log line itself */
		pr_err("Unable to read file (read:%zu buf_len:%zu)\n", len_read, buf_len);
		return -EIO;
	}
	return 0;
}
|
||||
|
||||
/**
 * Write exactly @buf_len bytes from @buf to @fp.
 *
 * @param fp      Stream to write to
 * @param buf     Source buffer
 * @param buf_len Number of bytes to write
 * @return 0 on success, -EIO if fewer than @buf_len bytes were accepted
 */
int write_fp(FILE *fp, const void *buf, const size_t buf_len)
{
	size_t len_write;

	len_write = fwrite(buf, 1, buf_len, fp);
	if (len_write != buf_len) {
		/* size_t needs %zu; the message must end the log line itself */
		pr_err("Unable to write file (wrote:%zu buf_len:%zu)\n", len_write, buf_len);
		return -EIO;
	}
	return 0;
}
|
||||
|
||||
/**
|
||||
* @brief Open an image file
|
||||
*
|
||||
* We store the size of the actual contents in the first 8-bytes of
|
||||
* the file. This allows us to determine the file size when using
|
||||
* criu_image_streamer when fseek and fstat are not available. The
|
||||
* FILE * returned is already at the location of the first actual
|
||||
* contents.
|
||||
*
|
||||
* @param path The file path
|
||||
* @param write False for read, true for write
|
||||
* @param size Size of actual contents
|
||||
* @return FILE *if successful, NULL if failed
|
||||
*/
|
||||
FILE *open_img_file(char *path, bool write, size_t *size)
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
int fd, ret;
|
||||
|
||||
if (opts.stream)
|
||||
fd = img_streamer_open(path, write ? O_DUMP : O_RSTR);
|
||||
else
|
||||
fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600);
|
||||
|
||||
if (fd < 0) {
|
||||
pr_err("%s: Failed to open for %s", path, write ? "write" : "read");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
fp = fdopen(fd, write ? "w" : "r");
|
||||
if (!fp) {
|
||||
pr_err("%s: Failed get pointer for %s", path, write ? "write" : "read");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (write)
|
||||
ret = write_fp(fp, size, sizeof(*size));
|
||||
else
|
||||
ret = read_fp(fp, size, sizeof(*size));
|
||||
|
||||
if (ret) {
|
||||
pr_err("%s:Failed to access file size", path);
|
||||
fclose(fp);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size);
|
||||
return fp;
|
||||
}
|
||||
|
||||
/**
 * Read exactly @buf_len bytes from the start of the file at @file_path.
 *
 * @param file_path Path of the file to read
 * @param buf       Destination buffer, at least @buf_len bytes
 * @param buf_len   Number of bytes expected
 * @return 0 on success, -errno if the file cannot be opened, otherwise the
 *         result of read_fp()
 */
int read_file(const char *file_path, void *buf, const size_t buf_len)
{
	int ret;
	FILE *fp;

	fp = fopen(file_path, "r");
	if (!fp) {
		/* Snapshot errno so logging cannot change the code we return */
		int err = errno;

		pr_err("Cannot fopen %s: %s\n", file_path, strerror(err));
		return -err;
	}

	ret = read_fp(fp, buf, buf_len);
	fclose(fp); /* this will also close fd */
	return ret;
}
|
||||
|
||||
|
||||
/**
 * @brief Write an image file
 *
 * We store the size of the actual contents in the first bytes of the file
 * (see open_img_file()). This allows us to determine the file size when
 * using criu_image_streamer when fseek and fstat are not available.
 *
 * @param path The file path
 * @param buf pointer to data to be written
 * @param buf_len size of buf
 * @return 0 if successful. Negative error code on failure
 */
int write_img_file(char *path, const void *buf, const size_t buf_len)
{
	int ret;
	FILE *fp;
	size_t len = buf_len;

	fp = open_img_file(path, true, &len);
	if (!fp) {
		/* errno is not guaranteed to be set here; never report success */
		return errno ? -errno : -EIO;
	}

	ret = write_fp(fp, buf, buf_len);

	/*
	 * fclose() flushes the stdio buffer and closes the fd. A failure
	 * here means buffered data did not reach the file.
	 */
	if (fclose(fp) != 0 && !ret)
		ret = -EIO;

	return ret;
}
|
||||
|
||||
void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list)
|
||||
{
|
||||
struct kfd_criu_bo_bucket *bo;
|
||||
|
||||
pr_info("\n");
|
||||
for (int idx = 0; idx < bo_cnt; idx++) {
|
||||
bo = &bo_list[idx];
|
||||
pr_info("\n");
|
||||
pr_info("%s(), %d. KFD BO Addr: %llx \n", __func__, idx, bo->addr);
|
||||
pr_info("%s(), %d. KFD BO Size: %llx \n", __func__, idx, bo->size);
|
||||
pr_info("%s(), %d. KFD BO Offset: %llx \n", __func__, idx, bo->offset);
|
||||
pr_info("%s(), %d. KFD BO Restored Offset: %llx \n", __func__, idx, bo->restored_offset);
|
||||
pr_info("%s(), %d. KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags);
|
||||
pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id);
|
||||
pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd);
|
||||
pr_info("\n");
|
||||
}
|
||||
pr_info("\n");
|
||||
}
|
||||
|
||||
|
||||
106
plugins/amdgpu/amdgpu_plugin_util.h
Executable file
106
plugins/amdgpu/amdgpu_plugin_util.h
Executable file
|
|
@ -0,0 +1,106 @@
|
|||
#ifndef __AMDGPU_PLUGIN_UTIL_H__
|
||||
#define __AMDGPU_PLUGIN_UTIL_H__
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE 1
|
||||
#endif
|
||||
|
||||
#ifdef COMPILE_TESTS
|
||||
#undef pr_err
|
||||
#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg)
|
||||
#undef pr_info
|
||||
#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg)
|
||||
#undef pr_debug
|
||||
#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg)
|
||||
|
||||
#undef pr_perror
|
||||
#define pr_perror(format, arg...) \
|
||||
fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno))
|
||||
#endif
|
||||
|
||||
#ifdef LOG_PREFIX
|
||||
#undef LOG_PREFIX
|
||||
#endif
|
||||
#define LOG_PREFIX "amdgpu_plugin: "
|
||||
|
||||
/*
 * plugin_log_msg() forwards to pr_debug() when DEBUG is defined and
 * expands to an empty statement otherwise. The empty form is wrapped in
 * do { } while (0) so that "if (x) plugin_log_msg(...); else ..." parses
 * the same way in both configurations.
 */
#ifdef DEBUG
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
#else
#define plugin_log_msg(fmt, ...) \
	do {                     \
	} while (0)
#endif
|
||||
|
||||
|
||||
/* Path where KFD device is surfaced */
|
||||
#define AMDGPU_KFD_DEVICE "/dev/kfd"
|
||||
|
||||
/* Path where DRM devices are surfaced */
|
||||
#define AMDGPU_DRM_DEVICE "/dev/dri/renderD%d"
|
||||
|
||||
/* Minimum version of KFD IOCTL's that supports C&R */
|
||||
#define KFD_IOCTL_MAJOR_VERSION 1
|
||||
#define MIN_KFD_IOCTL_MINOR_VERSION 8
|
||||
|
||||
/* Name of file having serialized data of KFD device */
|
||||
#define IMG_KFD_FILE "amdgpu-kfd-%d.img"
|
||||
|
||||
/* Name of file having serialized data of KFD buffer objects (BOs) */
|
||||
#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img"
|
||||
|
||||
/* Name of file having serialized data of DRM device */
|
||||
#define IMG_DRM_FILE "amdgpu-renderD-%d.img"
|
||||
|
||||
/* Name of file having serialized data of DRM device buffer objects (BOs) */
|
||||
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img"
|
||||
|
||||
/* Helper macros to Checkpoint and Restore a ROCm file */
|
||||
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
|
||||
#define HSAKMT_SHM "/hsakmt_shared_mem"
|
||||
#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore"
|
||||
#define HSAKMT_SEM "hsakmt_semaphore"
|
||||
|
||||
/*
 * Helper macros to build sDMA command packets.
 *
 * SDMA_PACKET() packs op into bits 0-7, sub_op into bits 8-15 and e into
 * bits 16-31. The masks are unsigned so that shifting a value with bit 15
 * set into bit 31 is not a signed overflow.
 */
#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFFu) << 16) | (((sub_op)&0xFFu) << 8) | (((op)&0xFFu) << 0))

#define SDMA_OPCODE_COPY	    1
#define SDMA_COPY_SUB_OPCODE_LINEAR 0
#define SDMA_NOP		    0
#define SDMA_LINEAR_COPY_MAX_SIZE   (1ULL << 21)
|
||||
|
||||
/* Kinds of sDMA operation: a VRAM read or a VRAM write */
enum sdma_op_type {
	SDMA_OP_VRAM_READ = 0,
	SDMA_OP_VRAM_WRITE = 1,
};
|
||||
|
||||
/* Helper structures to encode device topology of SRC and DEST platforms */
|
||||
extern struct tp_system src_topology;
|
||||
extern struct tp_system dest_topology;
|
||||
|
||||
/* Helper structures to encode device maps during Checkpoint and Restore operations */
|
||||
extern struct device_maps checkpoint_maps;
|
||||
extern struct device_maps restore_maps;
|
||||
|
||||
extern int fd_next;
|
||||
|
||||
extern bool kfd_fw_version_check;
|
||||
extern bool kfd_sdma_fw_version_check;
|
||||
extern bool kfd_caches_count_check;
|
||||
extern bool kfd_num_gws_check;
|
||||
extern bool kfd_vram_size_check;
|
||||
extern bool kfd_numa_check;
|
||||
extern bool kfd_capability_check;
|
||||
|
||||
/* Stream and image-file helpers; each int-returning one yields 0 on success */
int read_fp(FILE *fp, void *buf, const size_t buf_len);
int write_fp(FILE *fp, const void *buf, const size_t buf_len);
int read_file(const char *file_path, void *buf, const size_t buf_len);
int write_img_file(char *path, const void *buf, const size_t buf_len);
FILE *open_img_file(char *path, bool write, size_t *size);

/* Tracking of the device files that still need to be checkpointed */
bool checkpoint_is_complete(void);
void decrement_checkpoint_count(void);
void init_gpu_count(struct tp_system *topology);

/* Log the fields of each KFD BO bucket in bo_list */
void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list);
|
||||
|
||||
#endif /* __AMDGPU_PLUGIN_UTIL_H__ */
|
||||
|
|
@ -5,7 +5,7 @@ message dev_iolink {
|
|||
required uint32 node_to_id = 2;
|
||||
}
|
||||
|
||||
message device_entry {
|
||||
message kfd_device_entry {
|
||||
required uint32 node_id = 1;
|
||||
required uint32 gpu_id = 2;
|
||||
required uint32 cpu_cores_count = 3;
|
||||
|
|
@ -40,10 +40,10 @@ message device_entry {
|
|||
repeated dev_iolink iolinks = 32;
|
||||
}
|
||||
|
||||
message bo_entry {
|
||||
required uint64 addr = 1;
|
||||
required uint64 size = 2;
|
||||
required uint64 offset = 3;
|
||||
message kfd_bo_entry {
|
||||
required uint64 addr = 1;
|
||||
required uint64 size = 2;
|
||||
required uint64 offset = 3;
|
||||
required uint32 alloc_flags = 4;
|
||||
required uint32 gpu_id = 5;
|
||||
}
|
||||
|
|
@ -52,10 +52,10 @@ message criu_kfd {
|
|||
required uint32 pid = 1;
|
||||
required uint32 num_of_gpus = 2;
|
||||
required uint32 num_of_cpus = 3;
|
||||
repeated device_entry device_entries = 4;
|
||||
required uint64 num_of_bos = 5;
|
||||
repeated bo_entry bo_entries = 6;
|
||||
required uint32 num_of_objects = 7;
|
||||
repeated kfd_device_entry device_entries = 4;
|
||||
required uint64 num_of_bos = 5;
|
||||
repeated kfd_bo_entry bo_entries = 6;
|
||||
required uint32 num_of_objects = 7;
|
||||
required uint64 shared_mem_size = 8;
|
||||
required uint32 shared_mem_magic = 9;
|
||||
required bytes priv_data = 10;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue