criu/plugin: Store BO contents directly to file

Store BO contents directly to file (1 per GPU) instead of using
protobuf.

Bug Fix:
Fixes an issue where we could not handle BOs bigger than 4GB because
protobuf has an internal limit of 4GB for the Bytes structure.

Performance Improvements:
This significantly reduces CR duration on multi-GPU systems as it allows
reading and writing to disk in parallel. During checkpoint, we can now
start writing BO contents to disk as soon as the first BO is read from
VRAM. During restore, instead of waiting for all the BO contents to be
read from the one protobuf file, we can start writing BO contents to
VRAM as soon as the first BO is read from disk. This also reduces the
peak amount of system memory used as we only need to keep 1 BO content
in memory per GPU at a time instead of all the BO contents.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
This commit is contained in:
David Yat Sin 2021-10-25 13:57:12 -04:00 committed by Andrei Vagin
parent ecdf740fa3
commit 55370b720e
3 changed files with 126 additions and 84 deletions

View file

@ -13,6 +13,7 @@ Single and Multi GPU systems (Gfx9)
Checkpoint / Restore on different system
Checkpoint / Restore inside a docker container
Pytorch
Tensorflow
DESCRIPTION
-----------

View file

@ -42,6 +42,10 @@
#define KFD_IOCTL_MAJOR_VERSION 1
#define MIN_KFD_IOCTL_MINOR_VERSION 8
/*
 * Image file name templates, expanded with snprintf():
 *   IMG_KFD_FILE, IMG_RENDERD_FILE - %d is the CRIU file id
 *   IMG_PAGES_FILE                 - %d is the CRIU file id, %04x is the
 *                                    gpu_id (one BO-contents file per GPU)
 */
#define IMG_KFD_FILE "amdgpu-kfd-%d.img"
#define IMG_RENDERD_FILE "amdgpu-renderD-%d.img"
#define IMG_PAGES_FILE "amdgpu-pages-%d-%04x.img"
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
@ -101,11 +105,34 @@ extern bool kfd_capability_check;
/**************************************************************************************************/
/*
 * Write exactly buf_len bytes from buf to the already-open stream fp.
 *
 * The stream is neither flushed nor closed here; the caller keeps
 * ownership of fp.
 *
 * Returns 0 when all buf_len bytes were accepted by fwrite(), or -EIO on
 * a short write.
 */
int write_fp(FILE *fp, const void *buf, const size_t buf_len)
{
	size_t len_write;

	len_write = fwrite(buf, 1, buf_len, fp);
	if (len_write != buf_len) {
		/* size_t must be printed with %zu; %ld is a type mismatch */
		pr_perror("Unable to write file (wrote:%zu buf_len:%zu)", len_write, buf_len);
		return -EIO;
	}
	return 0;
}
/*
 * Read exactly buf_len bytes from the already-open stream fp into buf.
 *
 * The stream is not closed here; the caller keeps ownership of fp.
 *
 * Returns 0 when buf_len bytes were read, or -EIO on a short read. A
 * short read caused by end-of-file is reported the same way as an I/O
 * error.
 */
int read_fp(FILE *fp, void *buf, const size_t buf_len)
{
	size_t len_read;

	len_read = fread(buf, 1, buf_len, fp);
	if (len_read != buf_len) {
		/* size_t must be printed with %zu; %ld is a type mismatch */
		pr_perror("Unable to read file (read:%zu buf_len:%zu)", len_read, buf_len);
		return -EIO;
	}
	return 0;
}
int write_file(const char *file_path, const void *buf, const size_t buf_len)
{
int fd;
int fd, ret;
FILE *fp;
size_t len_wrote;
fd = openat(criu_get_image_dir(), file_path, O_WRONLY | O_CREAT, 0600);
if (fd < 0) {
@ -119,24 +146,15 @@ int write_file(const char *file_path, const void *buf, const size_t buf_len)
return -errno;
}
len_wrote = fwrite(buf, 1, buf_len, fp);
if (len_wrote != buf_len) {
pr_perror("Unable to write %s (wrote:%ld buf_len:%ld)", file_path, len_wrote, buf_len);
fclose(fp);
return -EIO;
}
pr_info("Wrote file:%s (%ld bytes)\n", file_path, buf_len);
/* this will also close fd */
fclose(fp);
return 0;
ret = write_fp(fp, buf, buf_len);
fclose(fp); /* this will also close fd */
return ret;
}
int read_file(const char *file_path, void *buf, const size_t buf_len)
{
int fd;
int fd, ret;
FILE *fp;
size_t len_read;
fd = openat(criu_get_image_dir(), file_path, O_RDONLY);
if (fd < 0) {
@ -150,18 +168,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len)
return -errno;
}
len_read = fread(buf, 1, buf_len, fp);
if (len_read != buf_len) {
pr_perror("Unable to read %s", file_path);
fclose(fp);
return -EIO;
}
pr_info("Read file:%s (%ld bytes)\n", file_path, buf_len);
/* this will also close fd */
fclose(fp);
return 0;
ret = read_fp(fp, buf, buf_len);
fclose(fp); /* this will also close fd */
return ret;
}
/* Call ioctl, restarting if it is interrupted */
@ -184,12 +193,8 @@ int kmtIoctl(int fd, unsigned long request, void *arg)
static void free_e(CriuKfd *e)
{
for (int i = 0; i < e->n_bo_entries; i++) {
if (e->bo_entries[i]) {
if (e->bo_entries[i]->rawdata.data)
xfree(e->bo_entries[i]->rawdata.data);
if (e->bo_entries[i])
xfree(e->bo_entries[i]);
}
}
for (int i = 0; i < e->n_device_entries; i++) {
@ -245,12 +250,6 @@ static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucke
bo_entry__init(entry);
if ((bo_bucket_ptr)[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM ||
(bo_bucket_ptr)[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
entry->rawdata.data = xmalloc((bo_bucket_ptr)[i].size);
entry->rawdata.len = (bo_bucket_ptr)[i].size;
}
e->bo_entries[i] = entry;
e->n_bo_entries++;
}
@ -455,6 +454,7 @@ struct thread_data {
BoEntry **bo_entries;
int drm_fd;
int ret;
int id; /* File ID used by CRIU to identify KFD image for this process */
};
int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
@ -557,7 +557,7 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va,
amdgpu_bo_free(h_bo);
}
int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, BoEntry **bo_info_test, int i, amdgpu_device_handle h_dev,
int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, amdgpu_device_handle h_dev,
uint64_t max_copy_size, enum sdma_op_type type)
{
uint64_t size, gpu_addr_src, gpu_addr_dest, gpu_addr_ib;
@ -574,7 +574,6 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, BoEntry **bo_info_test,
struct amdgpu_cs_fence fence;
uint32_t expired;
amdgpu_context_handle h_ctx;
void *userptr = NULL;
uint32_t *ib = NULL;
int err, shared_fd;
@ -586,20 +585,9 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, BoEntry **bo_info_test,
/* prepare src buffer */
switch (type) {
case SDMA_OP_VRAM_WRITE:
/* create the userptr BO and prepare the src buffer */
posix_memalign(&userptr, sysconf(_SC_PAGE_SIZE), size);
if (!userptr) {
pr_perror("failed to alloc memory for userptr");
return -ENOMEM;
}
memcpy(userptr, bo_info_test[i]->rawdata.data, size);
plugin_log_msg("data copied to userptr from protobuf buffer\n");
err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_src);
if (err) {
pr_perror("failed to create userptr for sdma");
free(userptr);
return -EFAULT;
}
@ -644,16 +632,9 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, BoEntry **bo_info_test,
break;
case SDMA_OP_VRAM_READ:
posix_memalign(&userptr, sysconf(_SC_PAGE_SIZE), size);
if (!userptr) {
pr_perror("failed to alloc memory for userptr");
goto err_dest_bo_prep;
}
memset(userptr, 0, size);
err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_dest);
if (err) {
pr_perror("failed to create userptr for sdma");
free(userptr);
goto err_dest_bo_prep;
}
break;
@ -774,11 +755,6 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, BoEntry **bo_info_test,
plugin_log_msg("done querying fence status\n");
if (type == SDMA_OP_VRAM_READ) {
memcpy(bo_info_test[i]->rawdata.data, userptr, size);
plugin_log_msg("data copied to protobuf buffer\n");
}
err_cs_submit_ib:
amdgpu_cs_ctx_free(h_ctx);
err_ctx:
@ -798,11 +774,6 @@ err_dest_va:
if (err)
pr_perror("dest bo free failed");
if (userptr && (type == SDMA_OP_VRAM_READ)) {
free(userptr);
userptr = NULL;
}
err_dest_bo_prep:
err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_UNMAP);
if (err)
@ -816,11 +787,6 @@ err_src_va:
if (err)
pr_perror("src bo free failed");
if (userptr && (type == SDMA_OP_VRAM_WRITE)) {
free(userptr);
userptr = NULL;
}
plugin_log_msg("Leaving sdma_copy_bo, err = %d\n", err);
return err;
}
@ -836,6 +802,10 @@ void *dump_bo_contents(void *_thread_data)
uint32_t major, minor;
int num_bos = 0;
int i, ret = 0;
FILE *bo_contents_fp = NULL;
void *buffer;
char img_path[40];
size_t max_bo_size = 0;
pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id);
@ -855,6 +825,30 @@ void *dump_bo_contents(void *_thread_data)
max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
SDMA_LINEAR_COPY_MAX_SIZE - 1;
for (i = 0; i < thread_data->num_of_bos; i++) {
if (bo_buckets[i].gpu_id == thread_data->gpu_id &&
(bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) &&
bo_buckets[i].size > max_bo_size) {
max_bo_size = bo_buckets[i].size;
}
}
/* Allocate buffer to fit biggest BO */
posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size);
if (!buffer) {
pr_perror("Failed to alloc aligned memory");
ret = -ENOMEM;
goto exit;
}
snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id);
bo_contents_fp = fopen(img_path, "w");
if (!bo_contents_fp) {
pr_perror("Cannot fopen %s", img_path);
ret = -EIO;
goto exit;
}
for (i = 0; i < thread_data->num_of_bos; i++) {
if (bo_buckets[i].gpu_id != thread_data->gpu_id)
continue;
@ -865,17 +859,25 @@ void *dump_bo_contents(void *_thread_data)
num_bos++;
/* perform sDMA based vram copy */
ret = sdma_copy_bo(bo_buckets, bo_info, i, h_dev, max_copy_size, SDMA_OP_VRAM_READ);
ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_READ);
if (ret) {
pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i);
break;
}
plugin_log_msg("** Successfully drained the BO using sDMA: bo_buckets[%d] **\n", i);
ret = write_fp(bo_contents_fp, buffer, bo_info[i]->size);
if (ret)
break;
}
exit:
pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret);
if (bo_contents_fp)
fclose(bo_contents_fp);
xfree(buffer);
amdgpu_device_deinitialize(h_dev);
thread_data->ret = ret;
@ -891,6 +893,10 @@ void *restore_bo_contents(void *_thread_data)
amdgpu_device_handle h_dev;
uint64_t max_copy_size;
uint32_t major, minor;
FILE *bo_contents_fp = NULL;
size_t max_bo_size = 0;
void *buffer;
char img_path[40];
int num_bos = 0;
int i, ret = 0;
@ -912,6 +918,31 @@ void *restore_bo_contents(void *_thread_data)
max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
SDMA_LINEAR_COPY_MAX_SIZE - 1;
snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id);
bo_contents_fp = fopen(img_path, "r");
if (!bo_contents_fp) {
pr_perror("Cannot fopen %s", img_path);
ret = -errno;
goto exit;
}
/* Allocate buffer to fit biggest BO */
for (i = 0; i < thread_data->num_of_bos; i++) {
if (bo_buckets[i].gpu_id == thread_data->gpu_id &&
(bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) &&
bo_buckets[i].size > max_bo_size) {
max_bo_size = bo_buckets[i].size;
}
}
/* Allocate buffer to fit biggest BO */
posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size);
if (!buffer) {
pr_perror("Failed to alloc aligned memory");
ret = -ENOMEM;
goto exit;
}
for (i = 0; i < thread_data->num_of_bos; i++) {
if (bo_buckets[i].gpu_id != thread_data->gpu_id)
continue;
@ -921,7 +952,11 @@ void *restore_bo_contents(void *_thread_data)
num_bos++;
ret = sdma_copy_bo(bo_buckets, bo_info, i, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
ret = read_fp(bo_contents_fp, buffer, bo_info[i]->size);
if (ret)
goto exit;
ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
if (ret) {
pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
break;
@ -932,6 +967,11 @@ void *restore_bo_contents(void *_thread_data)
exit:
pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret);
if (bo_contents_fp)
fclose(bo_contents_fp);
xfree(buffer);
amdgpu_device_deinitialize(h_dev);
thread_data->ret = ret;
return NULL;
@ -1059,7 +1099,7 @@ exit:
return ret;
}
static int save_bos(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
struct thread_data *thread_datas;
int ret = 0, i;
@ -1098,6 +1138,7 @@ static int save_bos(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo
goto exit;
}
thread_datas[i].id = id;
thread_datas[i].gpu_id = dev->gpu_id;
thread_datas[i].bo_buckets = bo_buckets;
thread_datas[i].bo_entries = e->bo_entries;
@ -1236,7 +1277,7 @@ int amdgpu_plugin_dump_file(int fd, int id)
criu_render_node__pack(&rd, buf);
snprintf(img_path, sizeof(img_path), "amdgpu-renderD-%d.img", id);
snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id);
ret = write_file(img_path, buf, len);
if (ret) {
xfree(buf);
@ -1307,7 +1348,7 @@ int amdgpu_plugin_dump_file(int fd, int id)
if (ret)
goto exit;
ret = save_bos(fd, &args, (struct kfd_criu_bo_bucket *)args.bos, e);
ret = save_bos(id, fd, &args, (struct kfd_criu_bo_bucket *)args.bos, e);
if (ret)
goto exit;
@ -1320,7 +1361,7 @@ int amdgpu_plugin_dump_file(int fd, int id)
if (ret)
goto exit;
snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);
snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
pr_info("amdgpu_plugin: img_path = %s\n", img_path);
len = criu_kfd__get_packed_size(e);
@ -1442,7 +1483,7 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
return 0;
}
static int restore_bo_data(struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
struct thread_data *thread_datas;
int thread_i, ret = 0;
@ -1514,6 +1555,7 @@ static int restore_bo_data(struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
goto exit;
}
thread_datas[thread_i].id = id;
thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
thread_datas[thread_i].bo_buckets = bo_buckets;
thread_datas[thread_i].bo_entries = e->bo_entries;
@ -1567,7 +1609,7 @@ int amdgpu_plugin_restore_file(int id)
pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id);
snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);
snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
if (stat(img_path, &filestat) == -1) {
struct tp_node *tp_node;
@ -1579,7 +1621,7 @@ int amdgpu_plugin_restore_file(int id)
* TODO: Currently, this code will only work if this function is called for /dev/kfd
* first as we assume restore_maps is already filled. Need to fix this later.
*/
snprintf(img_path, sizeof(img_path), "renderDXXX.%d.img", id);
snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id);
if (stat(img_path, &filestat) == -1) {
pr_perror("Failed to read file stats");
@ -1722,7 +1764,7 @@ int amdgpu_plugin_restore_file(int id)
goto exit;
}
ret = restore_bo_data((struct kfd_criu_bo_bucket *)args.bos, e);
ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e);
if (ret)
goto exit;

View file

@ -46,7 +46,6 @@ message bo_entry {
required uint64 offset = 3;
required uint32 alloc_flags = 4;
required uint32 gpu_id = 5;
required bytes rawdata = 6;
}
message criu_kfd {