amdgpu: use 64-bit offsets for parallel restore

On AMD Instinct MI300 systems, restoring a large GPU application can fail because the checkpoint is so large that the read offset into the buffer-object contents no longer fits in an `int`. The overflow is triggered when the total size of all buffer objects exceeds INT_MAX; no single buffer needs to be that large, so it can also happen with a large number of small buffers. Use a 64-bit offset and a 64-bit seek to fix this. Fixes: #2812 Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn> Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
2026-01-22 18:05:10 +00:00 · 2025-11-14 23:08:16 +00:00 · 2025-11-14 23:08:16 +00:00 · 62aadb22ab
commit 62aadb22ab
parent 1db7eed69f
1 changed files with 3 additions and 3 deletions
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@ -1651,7 +1651,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
 {
 	struct thread_data *thread_datas = NULL;
 	int thread_i, ret = 0;
-	int offset = 0;
+	uint64_t offset = 0;

 	for (int i = 0; i < e->num_of_bos; i++) {
 		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data)
 			continue;

 		entry = &restore_cmd->entries[i];
-		fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
+		fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
 		ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp,
 				   buffer, buffer_size, h_dev,
 				   max_copy_size, SDMA_OP_VRAM_WRITE, false);
@ -2410,4 +2410,4 @@ int amdgpu_plugin_post_forking(void)

 	return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result);
 }
-CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)