criu: add support for --gpu-device-map

This option allows specifying a GPU device map that will be used during
restore. The provided map must list all checkpointed GPU devices in the
format "oldUuid=newUuid,oldUuid=newUuid,..."; it is used to remap old
devices to new devices.

Example of migrating all processes from GPU_0 to GPU_1:

i=0
for uuid in $(nvidia-smi --list-gpus | grep -oP 'UUID: \K[^)]+'); do
    export GPU_$i=$uuid
    i=$((i+1))
done

criu restore --gpu-device-map=$GPU_0=$GPU_1,$GPU_1=$GPU_0,$GPU_2=$GPU_2,$GPU_3=$GPU_3

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
Radostin Stoyanov 2025-11-10 12:55:17 +00:00
parent 90300748ef
commit b0fb87ea04
6 changed files with 39 additions and 4 deletions

View file

@ -720,6 +720,11 @@ The 'mode' may be one of the following:
Required when dumped with this option. Refer to this option in the section
on dumping for more details.
*--gpu-device-map*::
Optional GPU device map used during restore to remap old devices onto new ones.
For more information, please refer to the documentation for the *--device-map*
option of the *cuda-checkpoint* tool.
*check*
~~~~~~~
Checks whether the kernel supports the features needed by *criu* to

View file

@ -705,6 +705,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
BOOL_OPT("unprivileged", &opts.unprivileged),
BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap),
BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes),
{ "gpu-device-map", required_argument, 0, 1101 },
{},
};
@ -1045,6 +1046,9 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
return 1;
}
break;
case 1101:
SET_CHAR_OPTS(gpu_device_map, optarg);
break;
case 'V':
pr_msg("Version: %s\n", CRIU_VERSION);
if (strcmp(CRIU_GITID, "0"))

View file

@ -521,6 +521,9 @@ usage:
" --skip-file-rwx-check\n"
" Skip checking file permissions\n"
" (r/w/x for u/g/o) on restore.\n"
" --gpu-device-map\n"
" Optional GPU device map used during restore\n"
" to remap old devices onto new ones.\n"
"\n"
"Check options:\n"
" Without options, \"criu check\" checks availability of absolutely required\n"

View file

@ -247,6 +247,11 @@ struct cr_options {
* explicitly request it as it comes with many limitations.
*/
int unprivileged;
/*
* Optional GPU device map used during restore to manually remap old devices
* onto new ones. See `cuda-checkpoint --help` for more information.
*/
char *gpu_device_map;
};
extern struct cr_options opts;

View file

@ -145,6 +145,7 @@ message criu_opts {
optional bool leave_stopped = 69;
optional bool display_stats = 70;
optional bool log_to_stderr = 71;
optional string gpu_device_map = 72;
/* optional bool check_mounts = 128; */
}

View file

@ -261,15 +261,32 @@ static int cuda_process_checkpoint_action(int pid, const char *action, unsigned
{
char pid_buf[16];
char timeout_buf[16];
int args_idx = 5;
snprintf(pid_buf, sizeof(pid_buf), "%d", pid);
const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */,
NULL /* timeout_val */, NULL };
const char *args[] = {
CUDA_CHECKPOINT,
"--action", action,
"--pid", pid_buf,
NULL /* --timeout */,
NULL /* timeout_val */,
NULL /* --device-map */,
NULL /* device_map_val */,
NULL
};
if (timeout > 0) {
snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout);
args[5] = "--timeout";
args[6] = timeout_buf;
args[args_idx] = "--timeout";
args[args_idx+1] = timeout_buf;
args_idx += 2;
}
if (opts.gpu_device_map && strncmp(action, ACTION_RESTORE, strlen(ACTION_RESTORE)) == 0) {
pr_debug("opts.gpu_device_map: %s\n", opts.gpu_device_map);
args[args_idx] = "--device-map";
args[args_idx+1] = opts.gpu_device_map;
}
return launch_cuda_checkpoint(args, msg_buf, buf_size);