mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 02:14:37 +00:00
cuda: unlock on timeout error
When attempting to checkpoint a container with CUDA processes, CRIU could fail with the following error:

    Error (criu/cr-dump.c:1791): Timeout reached. Try to interrupt: 1
    Error (cuda_plugin.c:143): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call
    Error (cuda_plugin.c:384): cuda_plugin: PAUSE_DEVICES failed with

In this situation, the target process is locked, but CRIU fails due to a timeout and exits with an error. We need to make sure that the target PID is unlocked in such a case.

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
parent
dbfa450246
commit
b1b3c14b17
1 changed files with 14 additions and 8 deletions
|
|
@ -4,6 +4,7 @@
|
|||
#include "cr_options.h"
|
||||
#include "pid.h"
|
||||
#include "proc_parse.h"
|
||||
#include "seize.h"
|
||||
|
||||
#include <common/list.h>
|
||||
#include <compel/infect.h>
|
||||
|
|
@ -379,18 +380,23 @@ int cuda_plugin_pause_devices(int pid)
|
|||
int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf));
|
||||
if (status) {
|
||||
pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
|
||||
return -1;
|
||||
}
|
||||
if (add_pid_to_buf(&cuda_pids, pid)) {
|
||||
pr_err("unable to track paused pid %d\n", pid);
|
||||
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
|
||||
if (status) {
|
||||
pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
|
||||
}
|
||||
if (alarm_timeouted())
|
||||
goto unlock;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (add_pid_to_buf(&cuda_pids, pid)) {
|
||||
pr_err("unable to track paused pid %d\n", pid);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
return 0;
|
||||
unlock:
|
||||
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
|
||||
if (status) {
|
||||
pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue