mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 02:14:37 +00:00
cuda: fix check for GPU device availability
The check for `/dev/nvidiactl` to determine whether the CUDA plugin can be used is unreliable because in some cases the default path for driver installation is different [1]. This patch changes the logic to check if a GPU device is available in `/proc/driver/nvidia/gpus/`. This approach is similar to `torch.cuda.is_available()` and it is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option would confirm whether the driver supports checkpoint/restore.

[1] https://github.com/NVIDIA/gpu-operator

Fixes: #2509

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
parent
36a53fe23c
commit
b1cac7a8e5
1 changed files with 16 additions and 2 deletions
|
|
@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid)
|
|||
}
|
||||
/* NOTE(review): presumably binds cuda_plugin_resume_devices_late to the RESUME_DEVICES_LATE plugin hook; the macro is defined outside this view. */
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
|
||||
|
||||
/**
 * Probe for the NVIDIA driver's per-GPU procfs directory.
 *
 * Returns true only when stat() on "/proc/driver/nvidia/gpus/" succeeds
 * and the path is a directory; any stat() failure yields false.
 *
 * Only the directory itself is examined: its entries are not enumerated.
 */
static bool is_cuda_device_available(void)
{
	struct stat st;

	/* Short-circuit keeps S_ISDIR() from reading st when stat() failed. */
	return stat("/proc/driver/nvidia/gpus/", &st) == 0 && S_ISDIR(st.st_mode);
}
|
||||
|
||||
int cuda_plugin_init(int stage)
|
||||
{
|
||||
int ret;
|
||||
|
|
@ -481,8 +495,8 @@ int cuda_plugin_init(int stage)
|
|||
}
|
||||
}
|
||||
|
||||
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
|
||||
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
|
||||
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) {
|
||||
pr_info("No GPU device found; CUDA plugin is disabled\n");
|
||||
plugin_disabled = true;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue