amdgpu: use 64-bit offsets for parallel restore

On AMD Instinct MI300 systems, restoring a large GPU application can
fail because the checkpoint is so large that an offset stored in an
int cannot represent the required value. The problem occurs when the
total size of all buffer objects exceeds INT_MAX; no single buffer
needs to be too large, so it can also happen with a large number of
small buffers.

Fixes: #2812

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
Yanning Yang 2025-11-14 23:08:16 +00:00 committed by Andrei Vagin
parent 1db7eed69f
commit 62aadb22ab

View file

@ -1651,7 +1651,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
{
struct thread_data *thread_datas = NULL;
int thread_i, ret = 0;
int offset = 0;
uint64_t offset = 0;
for (int i = 0; i < e->num_of_bos; i++) {
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data)
continue;
entry = &restore_cmd->entries[i];
fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp,
buffer, buffer_size, h_dev,
max_copy_size, SDMA_OP_VRAM_WRITE, false);
@ -2410,4 +2410,4 @@ int amdgpu_plugin_post_forking(void)
return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result);
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)