mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-22 18:05:10 +00:00
amdgpu: use 64-bit offsets for parallel restore
On AMD Instinct MI300 systems, restoring a large GPU application can fail because the checkpoint is so large that the read offset no longer fits in the `int` used to hold it. The failure is triggered when the total size of all buffer objects exceeds INT_MAX; it does not require any single buffer to be too large, so it can also happen with a large number of small buffers. Fixes: #2812 Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn> Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
parent
1db7eed69f
commit
62aadb22ab
1 changed file with 3 additions and 3 deletions
|
|
@ -1651,7 +1651,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
|
|||
{
|
||||
struct thread_data *thread_datas = NULL;
|
||||
int thread_i, ret = 0;
|
||||
int offset = 0;
|
||||
uint64_t offset = 0;
|
||||
|
||||
for (int i = 0; i < e->num_of_bos; i++) {
|
||||
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
|
||||
|
|
@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data)
|
|||
continue;
|
||||
|
||||
entry = &restore_cmd->entries[i];
|
||||
fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
|
||||
fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
|
||||
ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp,
|
||||
buffer, buffer_size, h_dev,
|
||||
max_copy_size, SDMA_OP_VRAM_WRITE, false);
|
||||
|
|
@ -2410,4 +2410,4 @@ int amdgpu_plugin_post_forking(void)
|
|||
|
||||
return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result);
|
||||
}
|
||||
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)
|
||||
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue