cuda: fix check for GPU device availability

The check for `/dev/nvidiactl` to determine whether the CUDA plugin can be used is unreliable because, in some cases, the default path for driver installation is different [1]. This patch changes the logic to check whether a GPU device is available in `/proc/driver/nvidia/gpus/`. This approach is similar to `torch.cuda.is_available()` and is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option then confirms whether the driver supports checkpoint/restore.

[1] https://github.com/NVIDIA/gpu-operator

Fixes: #2509

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
2026-01-23 02:14:37 +00:00 · 2024-11-02 08:29:43 +00:00 · 2024-11-02 08:29:43 +00:00 · b1cac7a8e5
commit b1cac7a8e5
parent 36a53fe23c
1 changed files with 16 additions and 2 deletions
--- a/plugins/cuda/cuda_plugin.c
+++ b/plugins/cuda/cuda_plugin.c
@@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid)
 }
 CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)

+/**
+ * Check if a CUDA device is available: true iff /proc/driver/nvidia/gpus/ exists and is a directory
+ */
+static bool is_cuda_device_available(void)
+{
+	const char *gpu_path = "/proc/driver/nvidia/gpus/";
+	struct stat sb;
+
+	if (stat(gpu_path, &sb) != 0) /* any stat() failure is treated as "no device" */
+		return false;
+
+	return S_ISDIR(sb.st_mode); /* only the directory itself is tested, not its entries */
+}
+
 int cuda_plugin_init(int stage)
 {
 	int ret;
@@ -481,8 +495,8 @@ int cuda_plugin_init(int stage)
 		}
 	}

-	if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
-		pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
+	if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) {
+		pr_info("No GPU device found; CUDA plugin is disabled\n");
 		plugin_disabled = true;
 		return 0;
 	}