mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 02:14:37 +00:00
criu: add support for --gpu-device-map
This option allows specifying a GPU device map that will be used during
restore. The provided map must contain all checkpointed GPU devices
in the format "oldUuid=newUuid,oldUuid=newUuid,..." that will be
used to remap old devices to new devices.
Example of migrating all processes from GPU_0 to GPU_1:
i=0
for uuid in $(nvidia-smi --list-gpus | grep -oP 'UUID: \K[^)]+'); do
export GPU_$i=$uuid
i=$((i+1))
done
criu restore --gpu-device-map=$GPU_0=$GPU_1,$GPU_1=$GPU_0,$GPU_2=$GPU_2,$GPU_3=$GPU_3
Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
parent
90300748ef
commit
b0fb87ea04
6 changed files with 39 additions and 4 deletions
|
|
@ -720,6 +720,11 @@ The 'mode' may be one of the following:
|
|||
Required when dumped with this option. Refer to this option in the section
|
||||
on dumping for more details.
|
||||
|
||||
*--gpu-device-map*::
|
||||
Optional GPU device map used during restore to remap old devices onto new ones.
|
||||
For more information, please refer to the documentation for the *--device-map*
|
||||
option of the *cuda-checkpoint* tool.
|
||||
|
||||
*check*
|
||||
~~~~~~~
|
||||
Checks whether the kernel supports the features needed by *criu* to
|
||||
|
|
|
|||
|
|
@ -705,6 +705,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
|
|||
BOOL_OPT("unprivileged", &opts.unprivileged),
|
||||
BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap),
|
||||
BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes),
|
||||
{ "gpu-device-map", required_argument, 0, 1101 },
|
||||
{},
|
||||
};
|
||||
|
||||
|
|
@ -1045,6 +1046,9 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
|
|||
return 1;
|
||||
}
|
||||
break;
|
||||
case 1101:
|
||||
SET_CHAR_OPTS(gpu_device_map, optarg);
|
||||
break;
|
||||
case 'V':
|
||||
pr_msg("Version: %s\n", CRIU_VERSION);
|
||||
if (strcmp(CRIU_GITID, "0"))
|
||||
|
|
|
|||
|
|
@ -521,6 +521,9 @@ usage:
|
|||
" --skip-file-rwx-check\n"
|
||||
" Skip checking file permissions\n"
|
||||
" (r/w/x for u/g/o) on restore.\n"
|
||||
" --gpu-device-map\n"
|
||||
" Optional GPU device map used during restore\n"
|
||||
" to remap old devices onto new ones.\n"
|
||||
"\n"
|
||||
"Check options:\n"
|
||||
" Without options, \"criu check\" checks availability of absolutely required\n"
|
||||
|
|
|
|||
|
|
@ -247,6 +247,11 @@ struct cr_options {
|
|||
* explicitly request it as it comes with many limitations.
|
||||
*/
|
||||
int unprivileged;
|
||||
/*
|
||||
* Optional GPU device map used during restore to manually remap old devices
|
||||
* onto new ones. See `cuda-checkpoint --help` for more information.
|
||||
*/
|
||||
char *gpu_device_map;
|
||||
};
|
||||
|
||||
extern struct cr_options opts;
|
||||
|
|
|
|||
|
|
@ -145,6 +145,7 @@ message criu_opts {
|
|||
optional bool leave_stopped = 69;
|
||||
optional bool display_stats = 70;
|
||||
optional bool log_to_stderr = 71;
|
||||
optional string gpu_device_map = 72;
|
||||
/* optional bool check_mounts = 128; */
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -261,15 +261,32 @@ static int cuda_process_checkpoint_action(int pid, const char *action, unsigned
|
|||
{
|
||||
char pid_buf[16];
|
||||
char timeout_buf[16];
|
||||
int args_idx = 5;
|
||||
|
||||
snprintf(pid_buf, sizeof(pid_buf), "%d", pid);
|
||||
|
||||
const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */,
|
||||
NULL /* timeout_val */, NULL };
|
||||
const char *args[] = {
|
||||
CUDA_CHECKPOINT,
|
||||
"--action", action,
|
||||
"--pid", pid_buf,
|
||||
NULL /* --timeout */,
|
||||
NULL /* timeout_val */,
|
||||
NULL /* --device-map */,
|
||||
NULL /* device_map_val */,
|
||||
NULL
|
||||
};
|
||||
|
||||
if (timeout > 0) {
|
||||
snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout);
|
||||
args[5] = "--timeout";
|
||||
args[6] = timeout_buf;
|
||||
args[args_idx] = "--timeout";
|
||||
args[args_idx+1] = timeout_buf;
|
||||
args_idx += 2;
|
||||
}
|
||||
|
||||
if (opts.gpu_device_map && strncmp(action, ACTION_RESTORE, strlen(ACTION_RESTORE)) == 0) {
|
||||
pr_debug("opts.gpu_device_map: %s\n", opts.gpu_device_map);
|
||||
args[args_idx] = "--device-map";
|
||||
args[args_idx+1] = opts.gpu_device_map;
|
||||
}
|
||||
|
||||
return launch_cuda_checkpoint(args, msg_buf, buf_size);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue