From 62aadb22ab1efeccef7fb322f525bd1b2cb6969c Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 14 Nov 2025 23:08:16 +0000 Subject: [PATCH] amdgpu: use 64-bit offsets for parallel restore On AMD Instinct MI300 systems, restoring a large GPU application can fail because the checkpoint size is too large and the maximum value of an offset (with integer type) is insufficient. This problem occurs when the total size of all buffer objects exceeds int max, not because any single buffer is too large, but it can also happen with a large number of small buffers. Fixes: #2812 Signed-off-by: Yanning Yang Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 713ffed6e..574d7b829 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1651,7 +1651,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf { struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - int offset = 0; + uint64_t offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data) continue; entry = &restore_cmd->entries[i]; - fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET); ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE, false); @@ -2410,4 +2410,4 @@ int amdgpu_plugin_post_forking(void) return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); } -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)