From ddbb3dbd8d84a785ad211be42d2ba0d034c0291f Mon Sep 17 00:00:00 2001 From: Pengda Yang Date: Wed, 15 Mar 2023 16:58:31 +0800 Subject: [PATCH 01/51] limit the field width of 'scanf' Fixes: #2121 Signed-off-by: Pengda Yang --- criu/proc_parse.c | 6 +++--- test/zdtm/lib/fs.c | 2 +- test/zdtm/static/apparmor.c | 2 +- test/zdtm/static/apparmor_stacking.c | 2 +- test/zdtm/static/cgroup01.c | 2 +- test/zdtm/static/cgroup02.c | 2 +- test/zdtm/static/change_mnt_context.c | 2 +- test/zdtm/static/file_locks01.c | 2 +- test/zdtm/static/file_locks02.c | 2 +- test/zdtm/static/file_locks03.c | 2 +- test/zdtm/static/file_locks04.c | 2 +- test/zdtm/static/netns-dev.c | 2 +- test/zdtm/static/ofd_file_locks.c | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 0d3b5b23f..f51f2e801 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1477,7 +1477,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -2302,10 +2302,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index bf8cd9cd3..efcc7a1d0 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u %4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa46..dc1636821 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b4..0bc36048c 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264..7bfb67762 100644 --- 
a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 6229a8a08..8a925c0a4 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b..8787ae5cf 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5..bfdca51d9 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa..ae4827de9 100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21..228e66892 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa7..7e0d2654e 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, 
fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea..f268f2fec 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff --git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index 68b6f22f5..a68fa38ee 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); From 63861407544172a04c8b03d3387ea6a8b23d9be2 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:36:33 +0000 Subject: [PATCH 02/51] plugins/amdgpu: Add socket operations When enabling parallel restore, the target process and the main CRIU process need an IPC interface to communicate and transfer restore commands. This patch adds a Unix domain stream socket and stores this socket in `fdstore`. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 59 ++++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 6 +++ 2 files changed, 65 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_socket_utils.c create mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 000000000..9e957ae54 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 000000000..4e7aa2aa4 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,6 @@ +#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +int
install_parallel_sock(void); + +#endif \ No newline at end of file From 33ed774c8dd13fc48955557434bad9908031379e Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:38:48 +0000 Subject: [PATCH 03/51] plugins/amdgpu: Add parallel restore command Currently the restore of buffer objects consumes a significant amount of time. However, this part has no logical dependencies on other restore operations. This patch introduces structures and helper functions that let the target process offload this task to the main CRIU process. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 261 +++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 48 +++++ 2 files changed, 309 insertions(+) diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c index 9e957ae54..c8bf6d1ba 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "amdgpu_socket_utils.h" #include "criu-log.h" @@ -53,6 +54,266 @@ int install_parallel_sock(void) ret = -1; goto err; } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret =
send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Recv GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd,
restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + ret = -1; + } + +err: + close(sock_fd); + return ret; diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h index 4e7aa2aa4..d7200c6bd 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -1,6 +1,54 @@ #ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ #define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands. */ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer objects. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. + */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + int install_parallel_sock(void); +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + #endif \ No newline at end of file From 4a3a695dfb9da7338174549b0cadcc4279cbf51a Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Wed, 15 Jan 2025 06:38:27 +0000 Subject: [PATCH 04/51] plugins/amdgpu: Implement parallel restore This patch implements the logic that offloads buffer object content restoration to the main CRIU process, so that it can run in parallel with other restore work (mainly the restoration of memory state in the restore blob, which is time-consuming) and speed up the restore phase. The restoration of buffer object content usually takes a significant amount of time for GPU applications, so parallelizing it with other operations can reduce the overall restore time.
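As an illustration, the hand-off performed in the target process is expected to look roughly like the following sketch built on the helpers from the previous patch; `num_bos`, `num_gpus`, `user_gpu_id`, `drm_render_minor`, `bo_size` and `img_offset` are placeholders, and the single add calls stand in for the real per-device and per-BO loops:

	parallel_restore_cmd cmd = { 0 };

	if (init_parallel_restore_cmd(num_bos, id, num_gpus, &cmd))	/* allocate gpu/fd/entry arrays */
		goto out;
	/* one record per GPU: map the checkpointed gpu_id to a DRM render minor */
	parallel_restore_gpu_id_add(user_gpu_id, drm_render_minor, &cmd);
	/* one entry per VRAM/GTT BO: its dmabuf fd plus its offset in the pages image */
	parallel_restore_bo_add(dmabuf_fd, user_gpu_id, bo_size, img_offset, &cmd);
	/* ship the head, the GPU table and the entries, then the dmabuf fds via SCM_RIGHTS */
	if (send_parallel_restore_cmd(&cmd))
		pr_err("Parallel restore hand-off failed\n");
out:
	free_parallel_restore_cmd(&cmd);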
It has three parts: the first replaces the restoration of buffer objects in the target process by sending a parallel restore command to the main CRIU process; the second implements the POST_FORKING hook in the amdgpu plugin to enable buffer object content restoration in the main CRIU process; the third stops the parallel thread in the RESUME_DEVICES_LATE hook. This optimization focuses only on the single-process situation (the common case). In other scenarios, CRIU falls back to the original method. This is achieved with the new `parallel_disabled` flag. Signed-off-by: Yanning Yang --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 420 +++++++++++++++++++++--- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 + 4 files changed, 374 insertions(+), 51 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 3d55f8bb4..870a039cd 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c086162..69194fbc7 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + uint64_t offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd = { 0 }; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos;
+ thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; + if (!e->device_entries[i]->gpu_id) + continue; + + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if (thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct 
kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, type); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + long offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory.
Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
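+ * A note on the mechanism: POSIX specifies that a thread created by pthread_create() inherits the creating thread's signal mask, so blocking SIGCHLD before pthread_create() guarantees the worker starts with SIGCHLD blocked, while restoring the old mask afterwards leaves the main thread's signal handling unchanged.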
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0c..730f2e028 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3dda..e19f8e7ce 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); From 920437205c4f5359e4c54765c9e23d2d57c2f4ec Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:44:35 +0000 Subject: [PATCH 05/51] plugins/amdgpu: Update `README.md` and `criu-amdgpu-plugin.txt` Signed-off-by: Yanning Yang --- Documentation/criu-amdgpu-plugin.txt | 1 + plugins/amdgpu/README.md | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3db..fe76fc3bc 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6..b808fbc4f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _<br>
_Rajneesh Bardwaj _<br>
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to From 7a4ee0ae8effdbf475804b72995912b0911ad28a Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:26:21 -0500 Subject: [PATCH 06/51] restorer: Skip non-regular VMAs amdgpu represents allocated device memory as a memory mapping of the device file. This is a non-standard VMA that must be handled by the plugin, not the normal VMA code. Ignore all VMAs on device files. Signed-off-by: David Francis --- criu/pie/restorer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5c40b0e93..008e1398d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1989,6 +1989,9 @@ __visible long __export_restore_task(struct task_restore_args *args) for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { if (vma_entry->madv & (1ul << m)) { + if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) + continue; + ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m); if (ret) { pr_err("madvise(%" PRIx64 ", %" PRIu64 ", %ld) " From fb02dbf68582c6589724d7aa3bb06ce3d588cc71 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:45:37 -0500 Subject: [PATCH 07/51] files-ext: Allow plugin files to retry amdgpu dmabuf CRIU requires the ability of the amdgpu plugin to retry. Change files_ext.c to read a response of 1 from a plugin restore function to mean retry. 
Signed-off-by: David Francis --- criu/files-ext.c | 10 +++++++--- criu/include/criu-plugin.h | 2 +- plugins/amdgpu/amdgpu_plugin.c | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/criu/files-ext.c b/criu/files-ext.c index 95ec8e37c..4cc99d921 100644 --- a/criu/files-ext.c +++ b/criu/files-ext.c @@ -45,10 +45,11 @@ static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; + bool retry_needed = false; xfi = container_of(d, struct ext_file_info, d); - fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); + fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, &retry_needed); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; @@ -57,8 +58,11 @@ static int open_fd(struct file_desc *d, int *new_fd) if (restore_fown(fd, xfi->xfe->fown)) return -1; - *new_fd = fd; - return 0; + if (!retry_needed) + *new_fd = fd; + else + *new_fd = -1; + return retry_needed; } static struct file_desc_ops ext_desc_ops = { diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 9fb21a449..ee84ccdf6 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -70,7 +70,7 @@ enum { DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 69194fbc7..e3b4ead3f 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1614,7 +1614,7 @@ exit: return ret; } -int amdgpu_plugin_restore_file(int id) +int amdgpu_plugin_restore_file(int id, bool *retry_needed) { int ret = 0, fd; char img_path[PATH_MAX]; @@ -1625,6 +1625,8 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; + *retry_needed = false; + if (plugin_disabled) return -ENOTSUP; From 0b7ca29c1944a8021c22a8e7041f047facb52e48 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 19 Feb 2025 14:30:07 -0500 Subject: [PATCH 08/51] plugin/amdgpu: Add amdgpu drm header For the amdgpu plugin to call the new amdgpu drm CRIU ioctls, it needs the amdgpu drm header file, copied from the kernel's includes. Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_drm.h | 1688 +++++++++++++++++++++++++++++++++++ 1 file changed, 1688 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_drm.h diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h new file mode 100644 index 000000000..9cebd072a --- /dev/null +++ b/plugins/amdgpu/amdgpu_drm.h @@ -0,0 +1,1688 @@ +/* amdgpu_drm.h -- Public header for the amdgpu driver -*- linux-c -*- + * + * Copyright 2000 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Fremont, California. + * Copyright 2002 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2014 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Kevin E. Martin + * Gareth Hughes + * Keith Whitwell + */ + +#ifndef __AMDGPU_DRM_H__ +#define __AMDGPU_DRM_H__ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_AMDGPU_GEM_CREATE 0x00 +#define DRM_AMDGPU_GEM_MMAP 0x01 +#define DRM_AMDGPU_CTX 0x02 +#define DRM_AMDGPU_BO_LIST 0x03 +#define DRM_AMDGPU_CS 0x04 +#define DRM_AMDGPU_INFO 0x05 +#define DRM_AMDGPU_GEM_METADATA 0x06 +#define DRM_AMDGPU_GEM_WAIT_IDLE 0x07 +#define DRM_AMDGPU_GEM_VA 0x08 +#define DRM_AMDGPU_WAIT_CS 0x09 +#define DRM_AMDGPU_GEM_OP 0x10 +#define DRM_AMDGPU_GEM_USERPTR 0x11 +#define DRM_AMDGPU_WAIT_FENCES 0x12 +#define DRM_AMDGPU_VM 0x13 +#define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 +#define DRM_AMDGPU_SCHED 0x15 +#define DRM_AMDGPU_USERQ 0x16 +#define DRM_AMDGPU_USERQ_SIGNAL 0x17 +#define DRM_AMDGPU_USERQ_WAIT 0x18 +#define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 + +#define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) +#define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) +#define DRM_IOCTL_AMDGPU_CTX DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CTX, union drm_amdgpu_ctx) +#define DRM_IOCTL_AMDGPU_BO_LIST DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_BO_LIST, union drm_amdgpu_bo_list) +#define DRM_IOCTL_AMDGPU_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CS, union drm_amdgpu_cs) +#define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info) +#define DRM_IOCTL_AMDGPU_GEM_METADATA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_METADATA, struct drm_amdgpu_gem_metadata) +#define DRM_IOCTL_AMDGPU_GEM_WAIT_IDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_WAIT_IDLE, union drm_amdgpu_gem_wait_idle) +#define DRM_IOCTL_AMDGPU_GEM_VA DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_VA, struct drm_amdgpu_gem_va) +#define DRM_IOCTL_AMDGPU_WAIT_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs) +#define DRM_IOCTL_AMDGPU_GEM_OP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_OP, struct drm_amdgpu_gem_op) +#define DRM_IOCTL_AMDGPU_GEM_USERPTR DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_USERPTR, struct drm_amdgpu_gem_userptr) +#define DRM_IOCTL_AMDGPU_WAIT_FENCES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_FENCES, union drm_amdgpu_wait_fences) +#define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm) +#define 
DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) +#define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) +#define DRM_IOCTL_AMDGPU_USERQ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq) +#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) +#define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) +#define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) + +/** + * DOC: memory domains + * + * %AMDGPU_GEM_DOMAIN_CPU System memory that is not GPU accessible. + * Memory in this pool could be swapped out to disk if there is pressure. + * + * %AMDGPU_GEM_DOMAIN_GTT GPU accessible system memory, mapped into the + * GPU's virtual address space via gart. Gart memory linearizes non-contiguous + * pages of system memory, allows GPU access system memory in a linearized + * fashion. + * + * %AMDGPU_GEM_DOMAIN_VRAM Local video memory. For APUs, it is memory + * carved out by the BIOS. + * + * %AMDGPU_GEM_DOMAIN_GDS Global on-chip data storage used to share data + * across shader threads. + * + * %AMDGPU_GEM_DOMAIN_GWS Global wave sync, used to synchronize the + * execution of all the waves on a device. + * + * %AMDGPU_GEM_DOMAIN_OA Ordered append, used by 3D or Compute engines + * for appending data. + * + * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for + * signalling user mode queues. + */ +#define AMDGPU_GEM_DOMAIN_CPU 0x1 +#define AMDGPU_GEM_DOMAIN_GTT 0x2 +#define AMDGPU_GEM_DOMAIN_VRAM 0x4 +#define AMDGPU_GEM_DOMAIN_GDS 0x8 +#define AMDGPU_GEM_DOMAIN_GWS 0x10 +#define AMDGPU_GEM_DOMAIN_OA 0x20 +#define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ + AMDGPU_GEM_DOMAIN_GTT | \ + AMDGPU_GEM_DOMAIN_VRAM | \ + AMDGPU_GEM_DOMAIN_GDS | \ + AMDGPU_GEM_DOMAIN_GWS | \ + AMDGPU_GEM_DOMAIN_OA | \ + AMDGPU_GEM_DOMAIN_DOORBELL) + +/* Flag that CPU access will be required for the case of VRAM domain */ +#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) +/* Flag that CPU access will not work, this VRAM domain is invisible */ +#define AMDGPU_GEM_CREATE_NO_CPU_ACCESS (1 << 1) +/* Flag that USWC attributes should be used for GTT */ +#define AMDGPU_GEM_CREATE_CPU_GTT_USWC (1 << 2) +/* Flag that the memory should be in VRAM and cleared */ +#define AMDGPU_GEM_CREATE_VRAM_CLEARED (1 << 3) +/* Flag that allocating the BO should use linear VRAM */ +#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (1 << 5) +/* Flag that BO is always valid in this VM */ +#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) +/* Flag that BO sharing will be explicitly synchronized */ +#define AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) +/* Flag that indicates allocating MQD gart on GFX9, where the mtype + * for the second page onward should be set to NC. It should never + * be used by user space applications. 
+ */ +#define AMDGPU_GEM_CREATE_CP_MQD_GFX9 (1 << 8) +/* Flag that BO may contain sensitive data that must be wiped before + * releasing the memory + */ +#define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) +/* Flag that BO will be encrypted and that the TMZ bit should be + * set in the PTEs when mapping this buffer via GPUVM or + * accessing it with various hw blocks + */ +#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) +/* Flag that BO will be used only in preemptible context, which does + * not require GTT memory accounting + */ +#define AMDGPU_GEM_CREATE_PREEMPTIBLE (1 << 11) +/* Flag that BO can be discarded under memory pressure without keeping the + * content. + */ +#define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) +/* Flag that BO is shared coherently between multiple devices or CPU threads. + * May depend on GPU instructions to flush caches to system scope explicitly. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_COHERENT (1 << 13) +/* Flag that BO should not be cached by GPU. Coherent without having to flush + * GPU caches explicitly + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) +/* Flag that BO should be coherent across devices when using device-level + * atomics. May depend on GPU instructions to flush caches to device scope + * explicitly, promoting them to system scope automatically. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) +/* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. */ +#define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) + +struct drm_amdgpu_gem_create_in { + /** the requested memory size */ + __u64 bo_size; + /** physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; + /** the requested memory domains */ + __u64 domains; + /** allocation flags */ + __u64 domain_flags; +}; + +struct drm_amdgpu_gem_create_out { + /** returned GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +union drm_amdgpu_gem_create { + struct drm_amdgpu_gem_create_in in; + struct drm_amdgpu_gem_create_out out; +}; + +/** Opcode to create new residency list. 
*/ +#define AMDGPU_BO_LIST_OP_CREATE 0 +/** Opcode to destroy previously created residency list */ +#define AMDGPU_BO_LIST_OP_DESTROY 1 +/** Opcode to update resource information in the list */ +#define AMDGPU_BO_LIST_OP_UPDATE 2 + +struct drm_amdgpu_bo_list_in { + /** Type of operation */ + __u32 operation; + /** Handle of list or 0 if we want to create one */ + __u32 list_handle; + /** Number of BOs in list */ + __u32 bo_number; + /** Size of each element describing BO */ + __u32 bo_info_size; + /** Pointer to array describing BOs */ + __u64 bo_info_ptr; +}; + +struct drm_amdgpu_bo_list_entry { + /** Handle of BO */ + __u32 bo_handle; + /** New (if specified) BO priority to be used during migration */ + __u32 bo_priority; +}; + +struct drm_amdgpu_bo_list_out { + /** Handle of resource list */ + __u32 list_handle; + __u32 _pad; +}; + +union drm_amdgpu_bo_list { + struct drm_amdgpu_bo_list_in in; + struct drm_amdgpu_bo_list_out out; +}; + +/* context related */ +#define AMDGPU_CTX_OP_ALLOC_CTX 1 +#define AMDGPU_CTX_OP_FREE_CTX 2 +#define AMDGPU_CTX_OP_QUERY_STATE 3 +#define AMDGPU_CTX_OP_QUERY_STATE2 4 +#define AMDGPU_CTX_OP_GET_STABLE_PSTATE 5 +#define AMDGPU_CTX_OP_SET_STABLE_PSTATE 6 + +/* GPU reset status */ +#define AMDGPU_CTX_NO_RESET 0 +/* this the context caused it */ +#define AMDGPU_CTX_GUILTY_RESET 1 +/* some other context caused it */ +#define AMDGPU_CTX_INNOCENT_RESET 2 +/* unknown cause */ +#define AMDGPU_CTX_UNKNOWN_RESET 3 + +/* indicate gpu reset occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET (1<<0) +/* indicate vram lost occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1) +/* indicate some job from this context once cause gpu hang */ +#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2) +/* indicate some errors are detected by RAS */ +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE (1<<3) +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE (1<<4) +/* indicate that the reset hasn't completed yet */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS (1<<5) + +/* Context priority level */ +#define AMDGPU_CTX_PRIORITY_UNSET -2048 +#define AMDGPU_CTX_PRIORITY_VERY_LOW -1023 +#define AMDGPU_CTX_PRIORITY_LOW -512 +#define AMDGPU_CTX_PRIORITY_NORMAL 0 +/* + * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires + * CAP_SYS_NICE or DRM_MASTER +*/ +#define AMDGPU_CTX_PRIORITY_HIGH 512 +#define AMDGPU_CTX_PRIORITY_VERY_HIGH 1023 + +/* select a stable profiling pstate for perfmon tools */ +#define AMDGPU_CTX_STABLE_PSTATE_FLAGS_MASK 0xf +#define AMDGPU_CTX_STABLE_PSTATE_NONE 0 +#define AMDGPU_CTX_STABLE_PSTATE_STANDARD 1 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK 2 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK 3 +#define AMDGPU_CTX_STABLE_PSTATE_PEAK 4 + +struct drm_amdgpu_ctx_in { + /** AMDGPU_CTX_OP_* */ + __u32 op; + /** Flags */ + __u32 flags; + __u32 ctx_id; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; +}; + +union drm_amdgpu_ctx_out { + struct { + __u32 ctx_id; + __u32 _pad; + } alloc; + + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** Number of resets caused by this context so far. */ + __u32 hangs; + /** Reset status since the last call of the ioctl. 
*/ + __u32 reset_status; + } state; + + struct { + __u32 flags; + __u32 _pad; + } pstate; +}; + +union drm_amdgpu_ctx { + struct drm_amdgpu_ctx_in in; + union drm_amdgpu_ctx_out out; +}; + +/* user queue IOCTL operations */ +#define AMDGPU_USERQ_OP_CREATE 1 +#define AMDGPU_USERQ_OP_FREE 2 + +/* queue priority levels */ +/* low < normal low < normal high < high */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK 0x3 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_LOW 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_LOW 1 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH 2 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH 3 /* admin only */ +/* for queues that need access to protected content */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE (1 << 2) + +/* + * This structure is a container to pass input configuration + * info for all supported userqueue related operations. + * For operation AMDGPU_USERQ_OP_CREATE: user is expected + * to set all fields, excep the parameter 'queue_id'. + * For operation AMDGPU_USERQ_OP_FREE: the only input parameter expected + * to be set is 'queue_id', eveything else is ignored. + */ +struct drm_amdgpu_userq_in { + /** AMDGPU_USERQ_OP_* */ + __u32 op; + /** Queue id passed for operation USERQ_OP_FREE */ + __u32 queue_id; + /** the target GPU engine to execute workload (AMDGPU_HW_IP_*) */ + __u32 ip_type; + /** + * @doorbell_handle: the handle of doorbell GEM object + * associated with this userqueue client. + */ + __u32 doorbell_handle; + /** + * @doorbell_offset: 32-bit offset of the doorbell in the doorbell bo. + * Kernel will generate absolute doorbell offset using doorbell_handle + * and doorbell_offset in the doorbell bo. + */ + __u32 doorbell_offset; + /** + * @flags: flags used for queue parameters + */ + __u32 flags; + /** + * @queue_va: Virtual address of the GPU memory which holds the queue + * object. The queue holds the workload packets. + */ + __u64 queue_va; + /** + * @queue_size: Size of the queue in bytes, this needs to be 256-byte + * aligned. + */ + __u64 queue_size; + /** + * @rptr_va : Virtual address of the GPU memory which holds the ring RPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + */ + __u64 rptr_va; + /** + * @wptr_va : Virtual address of the GPU memory which holds the ring WPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + * + * Queue, RPTR and WPTR can come from the same object, as long as the size + * and alignment related requirements are met. + */ + __u64 wptr_va; + /** + * @mqd: MQD (memory queue descriptor) is a set of parameters which allow + * the GPU to uniquely define and identify a usermode queue. + * + * MQD data can be of different size for different GPU IP/engine and + * their respective versions/revisions, so this points to a __u64 * + * which holds IP specific MQD of this usermode queue. + */ + __u64 mqd; + /** + * @size: size of MQD data in bytes, it must match the MQD structure + * size of the respective engine/revision defined in UAPI for ex, for + * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11). + */ + __u64 mqd_size; +}; + +/* The structure to carry output of userqueue ops */ +struct drm_amdgpu_userq_out { + /** + * For operation AMDGPU_USERQ_OP_CREATE: This field contains a unique + * queue ID to represent the newly created userqueue in the system, otherwise + * it should be ignored. 
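+ *
+ * Illustrative sketch (not from the upstream header): the id returned by
+ * a successful AMDGPU_USERQ_OP_CREATE is what a later teardown hands
+ * back, here assuming the DRM_IOCTL_AMDGPU_USERQ wrapper declared near
+ * the top of this header:
+ *
+ *   union drm_amdgpu_userq args = { 0 };
+ *
+ *   args.in.op = AMDGPU_USERQ_OP_FREE;
+ *   args.in.queue_id = queue_id;
+ *   ioctl(drm_fd, DRM_IOCTL_AMDGPU_USERQ, &args);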
+ */ + __u32 queue_id; + __u32 _pad; +}; + +union drm_amdgpu_userq { + struct drm_amdgpu_userq_in in; + struct drm_amdgpu_userq_out out; +}; + +/* GFX V11 IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_gfx11 { + /** + * @shadow_va: Virtual address of the GPU memory to hold the shadow buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. + */ + __u64 shadow_va; + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. + */ + __u64 csa_va; +}; + +/* GFX V11 SDMA IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_sdma_gfx11 { + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * This must be a from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 csa_va; +}; + +/* GFX V11 Compute IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_compute_gfx11 { + /** + * @eop_va: Virtual address of the GPU memory to hold the EOP buffer. + * This must be a from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 eop_va; +}; + +/* userq signal/wait ioctl */ +struct drm_amdgpu_userq_signal { + /** + * @queue_id: Queue handle used by the userq fence creation function + * to retrieve the WPTR. + */ + __u32 queue_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to be signaled. + */ + __u64 syncobj_handles; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. + */ + __u64 num_syncobj_handles; + /** + * @bo_read_handles: The list of BO handles that the submitted user queue job + * is using for read only. This will update BO fences in the kernel. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of BO handles that the submitted user queue job + * is using for write only. This will update BO fences in the kernel. + */ + __u64 bo_write_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles. + */ + __u32 num_bo_write_handles; +}; + +struct drm_amdgpu_userq_fence_info { + /** + * @va: A gpu address allocated for each queue which stores the + * read pointer (RPTR) value. + */ + __u64 va; + /** + * @value: A 64 bit value represents the write pointer (WPTR) of the + * queue commands which compared with the RPTR value to signal the + * fences. + */ + __u64 value; +}; + +struct drm_amdgpu_userq_wait { + /** + * @waitq_id: Queue handle used by the userq wait IOCTL to retrieve the + * wait queue and maintain the fence driver references in it. + */ + __u32 waitq_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 syncobj_handles; + /** + * @syncobj_timeline_handles: The list of timeline syncobj handles submitted by + * the user queue job to get the va/value pairs at given @syncobj_timeline_points. + */ + __u64 syncobj_timeline_handles; + /** + * @syncobj_timeline_points: The list of timeline syncobj points submitted by the + * user queue job for the corresponding @syncobj_timeline_handles. + */ + __u64 syncobj_timeline_points; + /** + * @bo_read_handles: The list of read BO handles submitted by the user queue + * job to get the va/value pairs. 
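+ * By analogy with @syncobj_handles, this is presumably a user-space
+ * pointer to an array of __u32 GEM handles, @num_bo_read_handles
+ * entries long.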
+ */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of write BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_write_handles; + /** + * @num_syncobj_timeline_handles: A count that represents the number of timeline + * syncobj handles in @syncobj_timeline_handles. + */ + __u16 num_syncobj_timeline_handles; + /** + * @num_fences: This field can be used both as input and output. As input it defines + * the maximum number of fences that can be returned and as output it will specify + * how many fences were actually returned from the ioctl. + */ + __u16 num_fences; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. + */ + __u32 num_syncobj_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles. + */ + __u32 num_bo_write_handles; + /** + * @out_fences: The field is a return value from the ioctl containing the list of + * address/value pairs to wait for. + */ + __u64 out_fences; +}; + +/* vm ioctl */ +#define AMDGPU_VM_OP_RESERVE_VMID 1 +#define AMDGPU_VM_OP_UNRESERVE_VMID 2 + +struct drm_amdgpu_vm_in { + /** AMDGPU_VM_OP_* */ + __u32 op; + __u32 flags; +}; + +struct drm_amdgpu_vm_out { + /** For future use, no flags defined so far */ + __u64 flags; +}; + +union drm_amdgpu_vm { + struct drm_amdgpu_vm_in in; + struct drm_amdgpu_vm_out out; +}; + +/* sched ioctl */ +#define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE 1 +#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE 2 + +struct drm_amdgpu_sched_in { + /* AMDGPU_SCHED_OP_* */ + __u32 op; + __u32 fd; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; + __u32 ctx_id; +}; + +union drm_amdgpu_sched { + struct drm_amdgpu_sched_in in; +}; + +/* + * This is not a reliable API and you should expect it to fail for any + * number of reasons and have fallback path that do not use userptr to + * perform any operation. 
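+ *
+ * Illustrative sketch (not from the upstream header), using the flags
+ * defined below; addr and size must be page-aligned, and
+ * fall_back_to_copy() stands in for whatever non-userptr path the
+ * caller keeps around:
+ *
+ *   struct drm_amdgpu_gem_userptr args = { 0 };
+ *
+ *   args.addr = (__u64)(uintptr_t)user_buffer;
+ *   args.size = user_buffer_size;
+ *   args.flags = AMDGPU_GEM_USERPTR_ANONONLY |
+ *                AMDGPU_GEM_USERPTR_VALIDATE |
+ *                AMDGPU_GEM_USERPTR_REGISTER;
+ *   if (ioctl(drm_fd, DRM_IOCTL_AMDGPU_GEM_USERPTR, &args))
+ *           fall_back_to_copy();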
+ */ +#define AMDGPU_GEM_USERPTR_READONLY (1 << 0) +#define AMDGPU_GEM_USERPTR_ANONONLY (1 << 1) +#define AMDGPU_GEM_USERPTR_VALIDATE (1 << 2) +#define AMDGPU_GEM_USERPTR_REGISTER (1 << 3) + +struct drm_amdgpu_gem_userptr { + __u64 addr; + __u64 size; + /* AMDGPU_GEM_USERPTR_* */ + __u32 flags; + /* Resulting GEM handle */ + __u32 handle; +}; + +/* SI-CI-VI: */ +/* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ +#define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 +#define AMDGPU_TILING_ARRAY_MODE_MASK 0xf +#define AMDGPU_TILING_PIPE_CONFIG_SHIFT 4 +#define AMDGPU_TILING_PIPE_CONFIG_MASK 0x1f +#define AMDGPU_TILING_TILE_SPLIT_SHIFT 9 +#define AMDGPU_TILING_TILE_SPLIT_MASK 0x7 +#define AMDGPU_TILING_MICRO_TILE_MODE_SHIFT 12 +#define AMDGPU_TILING_MICRO_TILE_MODE_MASK 0x7 +#define AMDGPU_TILING_BANK_WIDTH_SHIFT 15 +#define AMDGPU_TILING_BANK_WIDTH_MASK 0x3 +#define AMDGPU_TILING_BANK_HEIGHT_SHIFT 17 +#define AMDGPU_TILING_BANK_HEIGHT_MASK 0x3 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_SHIFT 19 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_MASK 0x3 +#define AMDGPU_TILING_NUM_BANKS_SHIFT 21 +#define AMDGPU_TILING_NUM_BANKS_MASK 0x3 + +/* GFX9 - GFX11: */ +#define AMDGPU_TILING_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_SWIZZLE_MODE_MASK 0x1f +#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT 5 +#define AMDGPU_TILING_DCC_OFFSET_256B_MASK 0xFFFFFF +#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT 29 +#define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 + +/* GFX12 and later: */ +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 +/* These are DCC recompression settings for memory management: */ +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ +/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata + * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */ +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK 0x1 +/* bit gap */ +#define AMDGPU_TILING_GFX12_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_GFX12_SCANOUT_MASK 0x1 + +/* Set/Get helpers for tiling flags. 
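+ *
+ * Illustrative sketch (not from the upstream header) of packing and
+ * unpacking a GFX9+ tiling word with the macros defined below:
+ *
+ *   __u64 tiling = AMDGPU_TILING_SET(SWIZZLE_MODE, swizzle) |
+ *                  AMDGPU_TILING_SET(SCANOUT, 1);
+ *   __u64 mode = AMDGPU_TILING_GET(tiling, SWIZZLE_MODE);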
*/ +#define AMDGPU_TILING_SET(field, value) \ + (((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT) +#define AMDGPU_TILING_GET(value, field) \ + (((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK) + +#define AMDGPU_GEM_METADATA_OP_SET_METADATA 1 +#define AMDGPU_GEM_METADATA_OP_GET_METADATA 2 + +/** The same structure is shared for input/output */ +struct drm_amdgpu_gem_metadata { + /** GEM Object handle */ + __u32 handle; + /** Do we want get or set metadata */ + __u32 op; + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** family specific tiling info */ + __u64 tiling_info; + __u32 data_size_bytes; + __u32 data[64]; + } data; +}; + +struct drm_amdgpu_gem_mmap_in { + /** the GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +struct drm_amdgpu_gem_mmap_out { + /** mmap offset from the vma offset manager */ + __u64 addr_ptr; +}; + +union drm_amdgpu_gem_mmap { + struct drm_amdgpu_gem_mmap_in in; + struct drm_amdgpu_gem_mmap_out out; +}; + +struct drm_amdgpu_gem_wait_idle_in { + /** GEM object handle */ + __u32 handle; + /** For future use, no flags defined so far */ + __u32 flags; + /** Absolute timeout to wait */ + __u64 timeout; +}; + +struct drm_amdgpu_gem_wait_idle_out { + /** BO status: 0 - BO is idle, 1 - BO is busy */ + __u32 status; + /** Returned current memory domain */ + __u32 domain; +}; + +union drm_amdgpu_gem_wait_idle { + struct drm_amdgpu_gem_wait_idle_in in; + struct drm_amdgpu_gem_wait_idle_out out; +}; + +struct drm_amdgpu_wait_cs_in { + /* Command submission handle + * handle equals 0 means none to wait for + * handle equals ~0ull means wait for the latest sequence number + */ + __u64 handle; + /** Absolute timeout to wait */ + __u64 timeout; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; +}; + +struct drm_amdgpu_wait_cs_out { + /** CS status: 0 - CS completed, 1 - CS still busy */ + __u64 status; +}; + +union drm_amdgpu_wait_cs { + struct drm_amdgpu_wait_cs_in in; + struct drm_amdgpu_wait_cs_out out; +}; + +struct drm_amdgpu_fence { + __u32 ctx_id; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u64 seq_no; +}; + +struct drm_amdgpu_wait_fences_in { + /** This points to uint64_t * which points to fences */ + __u64 fences; + __u32 fence_count; + __u32 wait_all; + __u64 timeout_ns; +}; + +struct drm_amdgpu_wait_fences_out { + __u32 status; + __u32 first_signaled; +}; + +union drm_amdgpu_wait_fences { + struct drm_amdgpu_wait_fences_in in; + struct drm_amdgpu_wait_fences_out out; +}; + +#define AMDGPU_GEM_OP_GET_GEM_CREATE_INFO 0 +#define AMDGPU_GEM_OP_SET_PLACEMENT 1 +#define AMDGPU_GEM_OP_GET_MAPPING_INFO 2 + +struct drm_amdgpu_gem_vm_entry { + /* Start of mapping (in bytes) */ + __u64 addr; + + /* Size of mapping (in bytes) */ + __u64 size; + + /* Mapping offset */ + __u64 offset; + + /* flags needed to recreate mapping */ + __u64 flags; +}; + +/* Sets or returns a value associated with a buffer. */ +struct drm_amdgpu_gem_op { + /** GEM object handle */ + __u32 handle; + /** AMDGPU_GEM_OP_* */ + __u32 op; + /** Input or return value. 
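+ * For GET_GEM_CREATE_INFO op: pointer to a struct
+ * drm_amdgpu_gem_create_in that receives the buffer's creation
+ * parameters. For SET_PLACEMENT op: the new preferred
+ * AMDGPU_GEM_DOMAIN_* mask.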
For MAPPING_INFO op: pointer to array of struct drm_amdgpu_gem_vm_entry */ + __u64 value; + /** For MAPPING_INFO op: number of mappings (in/out) */ + __u32 num_entries; + + __u32 padding; +}; + +#define AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT (1 << 0) + +struct drm_amdgpu_gem_list_handles { + /* User pointer to array of drm_amdgpu_gem_bo_info_entry */ + __u64 entries; + + /* Size of entries buffer / Number of handles in process (if larger than size of buffer, must retry) */ + __u32 num_entries; + + __u32 padding; +}; + +struct drm_amdgpu_gem_list_handles_entry { + /* gem handle of buffer object */ + __u32 gem_handle; + + /* Currently just one flag: IS_IMPORT */ + __u32 flags; + + /* Size of bo */ + __u64 size; + + /* Preferred domains for GEM_CREATE */ + __u64 preferred_domains; + + /* GEM_CREATE flags for re-creation of buffer */ + __u64 alloc_flags; + + /* physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; +}; + +#define AMDGPU_VA_OP_MAP 1 +#define AMDGPU_VA_OP_UNMAP 2 +#define AMDGPU_VA_OP_CLEAR 3 +#define AMDGPU_VA_OP_REPLACE 4 + +/* Delay the page table update till the next CS */ +#define AMDGPU_VM_DELAY_UPDATE (1 << 0) + +/* Mapping flags */ +/* readable mapping */ +#define AMDGPU_VM_PAGE_READABLE (1 << 1) +/* writable mapping */ +#define AMDGPU_VM_PAGE_WRITEABLE (1 << 2) +/* executable mapping, new for VI */ +#define AMDGPU_VM_PAGE_EXECUTABLE (1 << 3) +/* partially resident texture */ +#define AMDGPU_VM_PAGE_PRT (1 << 4) +/* MTYPE flags use bit 5 to 8 */ +#define AMDGPU_VM_MTYPE_MASK (0xf << 5) +/* Default MTYPE. Pre-AI must use this. Recommended for newer ASICs. */ +#define AMDGPU_VM_MTYPE_DEFAULT (0 << 5) +/* Use Non Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_NC (1 << 5) +/* Use Write Combine MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_WC (2 << 5) +/* Use Cache Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_CC (3 << 5) +/* Use UnCached MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_UC (4 << 5) +/* Use Read Write MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_RW (5 << 5) +/* don't allocate MALL */ +#define AMDGPU_VM_PAGE_NOALLOC (1 << 9) + +struct drm_amdgpu_gem_va { + /** GEM object handle */ + __u32 handle; + __u32 _pad; + /** AMDGPU_VA_OP_* */ + __u32 operation; + /** AMDGPU_VM_PAGE_* */ + __u32 flags; + /** va address to assign . Must be correctly aligned.*/ + __u64 va_address; + /** Specify offset inside of BO to assign. Must be correctly aligned.*/ + __u64 offset_in_bo; + /** Specify mapping size. Must be correctly aligned. */ + __u64 map_size; + /** + * vm_timeline_point is a sequence number used to add new timeline point. + */ + __u64 vm_timeline_point; + /** + * The vm page table update fence is installed in given vm_timeline_syncobj_out + * at vm_timeline_point. + */ + __u32 vm_timeline_syncobj_out; + /** the number of syncobj handles in @input_fence_syncobj_handles */ + __u32 num_syncobj_handles; + /** Array of sync object handle to wait for given input fences */ + __u64 input_fence_syncobj_handles; +}; + +#define AMDGPU_HW_IP_GFX 0 +#define AMDGPU_HW_IP_COMPUTE 1 +#define AMDGPU_HW_IP_DMA 2 +#define AMDGPU_HW_IP_UVD 3 +#define AMDGPU_HW_IP_VCE 4 +#define AMDGPU_HW_IP_UVD_ENC 5 +#define AMDGPU_HW_IP_VCN_DEC 6 +/* + * From VCN4, AMDGPU_HW_IP_VCN_ENC is re-used to support + * both encoding and decoding jobs. 
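+ * A decoder on such parts therefore targets AMDGPU_HW_IP_VCN_ENC as
+ * well; as a one-line sketch (not from the upstream header), with ib a
+ * struct drm_amdgpu_cs_chunk_ib defined further below and vcn_unified
+ * the caller's knowledge of the part:
+ *
+ *   ib.ip_type = vcn_unified ? AMDGPU_HW_IP_VCN_ENC : AMDGPU_HW_IP_VCN_DEC;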
+ */ +#define AMDGPU_HW_IP_VCN_ENC 7 +#define AMDGPU_HW_IP_VCN_JPEG 8 +#define AMDGPU_HW_IP_VPE 9 +#define AMDGPU_HW_IP_NUM 10 + +#define AMDGPU_HW_IP_INSTANCE_MAX_COUNT 1 + +#define AMDGPU_CHUNK_ID_IB 0x01 +#define AMDGPU_CHUNK_ID_FENCE 0x02 +#define AMDGPU_CHUNK_ID_DEPENDENCIES 0x03 +#define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 +#define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 +#define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 +#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT 0x08 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL 0x09 +#define AMDGPU_CHUNK_ID_CP_GFX_SHADOW 0x0a + +struct drm_amdgpu_cs_chunk { + __u32 chunk_id; + __u32 length_dw; + __u64 chunk_data; +}; + +struct drm_amdgpu_cs_in { + /** Rendering context id */ + __u32 ctx_id; + /** Handle of resource list associated with CS */ + __u32 bo_list_handle; + __u32 num_chunks; + __u32 flags; + /** this points to __u64 * which point to cs chunks */ + __u64 chunks; +}; + +struct drm_amdgpu_cs_out { + __u64 handle; +}; + +union drm_amdgpu_cs { + struct drm_amdgpu_cs_in in; + struct drm_amdgpu_cs_out out; +}; + +/* Specify flags to be used for IB */ + +/* This IB should be submitted to CE */ +#define AMDGPU_IB_FLAG_CE (1<<0) + +/* Preamble flag, which means the IB could be dropped if no context switch */ +#define AMDGPU_IB_FLAG_PREAMBLE (1<<1) + +/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ +#define AMDGPU_IB_FLAG_PREEMPT (1<<2) + +/* The IB fence should do the L2 writeback but not invalidate any shader + * caches (L2/vL1/sL1/I$). */ +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) + +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. + * This will reset wave ID counters for the IB. + */ +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) + +/* Flag the IB as secure (TMZ) + */ +#define AMDGPU_IB_FLAGS_SECURE (1 << 5) + +/* Tell KMD to flush and invalidate caches + */ +#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) + +struct drm_amdgpu_cs_chunk_ib { + __u32 _pad; + /** AMDGPU_IB_FLAG_* */ + __u32 flags; + /** Virtual address to begin IB execution */ + __u64 va_start; + /** Size of submission */ + __u32 ib_bytes; + /** HW IP to submit to */ + __u32 ip_type; + /** HW IP index of the same type to submit to */ + __u32 ip_instance; + /** Ring index to submit to */ + __u32 ring; +}; + +struct drm_amdgpu_cs_chunk_dep { + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; + __u64 handle; +}; + +struct drm_amdgpu_cs_chunk_fence { + __u32 handle; + __u32 offset; +}; + +struct drm_amdgpu_cs_chunk_sem { + __u32 handle; +}; + +struct drm_amdgpu_cs_chunk_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ 0 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD 1 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD 2 + +union drm_amdgpu_fence_to_handle { + struct { + struct drm_amdgpu_fence fence; + __u32 what; + __u32 pad; + } in; + struct { + __u32 handle; + } out; +}; + +struct drm_amdgpu_cs_chunk_data { + union { + struct drm_amdgpu_cs_chunk_ib ib_data; + struct drm_amdgpu_cs_chunk_fence fence_data; + }; +}; + +#define AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW 0x1 + +struct drm_amdgpu_cs_chunk_cp_gfx_shadow { + __u64 shadow_va; + __u64 csa_va; + __u64 gds_va; + __u64 flags; +}; + +/* + * Query h/w info: Flag that this is integrated (a.h.a. 
fusion) GPU + * + */ +#define AMDGPU_IDS_FLAGS_FUSION 0x1 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 +#define AMDGPU_IDS_FLAGS_TMZ 0x4 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8 + +/* + * Query h/w info: Flag identifying VF/PF/PT mode + * + */ +#define AMDGPU_IDS_FLAGS_MODE_MASK 0x300 +#define AMDGPU_IDS_FLAGS_MODE_SHIFT 0x8 +#define AMDGPU_IDS_FLAGS_MODE_PF 0x0 +#define AMDGPU_IDS_FLAGS_MODE_VF 0x1 +#define AMDGPU_IDS_FLAGS_MODE_PT 0x2 + +/* indicate if acceleration can be working */ +#define AMDGPU_INFO_ACCEL_WORKING 0x00 +/* get the crtc_id from the mode object id? */ +#define AMDGPU_INFO_CRTC_FROM_ID 0x01 +/* query hw IP info */ +#define AMDGPU_INFO_HW_IP_INFO 0x02 +/* query hw IP instance count for the specified type */ +#define AMDGPU_INFO_HW_IP_COUNT 0x03 +/* timestamp for GL_ARB_timer_query */ +#define AMDGPU_INFO_TIMESTAMP 0x05 +/* Query the firmware version */ +#define AMDGPU_INFO_FW_VERSION 0x0e + /* Subquery id: Query VCE firmware version */ + #define AMDGPU_INFO_FW_VCE 0x1 + /* Subquery id: Query UVD firmware version */ + #define AMDGPU_INFO_FW_UVD 0x2 + /* Subquery id: Query GMC firmware version */ + #define AMDGPU_INFO_FW_GMC 0x03 + /* Subquery id: Query GFX ME firmware version */ + #define AMDGPU_INFO_FW_GFX_ME 0x04 + /* Subquery id: Query GFX PFP firmware version */ + #define AMDGPU_INFO_FW_GFX_PFP 0x05 + /* Subquery id: Query GFX CE firmware version */ + #define AMDGPU_INFO_FW_GFX_CE 0x06 + /* Subquery id: Query GFX RLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC 0x07 + /* Subquery id: Query GFX MEC firmware version */ + #define AMDGPU_INFO_FW_GFX_MEC 0x08 + /* Subquery id: Query SMC firmware version */ + #define AMDGPU_INFO_FW_SMC 0x0a + /* Subquery id: Query SDMA firmware version */ + #define AMDGPU_INFO_FW_SDMA 0x0b + /* Subquery id: Query PSP SOS firmware version */ + #define AMDGPU_INFO_FW_SOS 0x0c + /* Subquery id: Query PSP ASD firmware version */ + #define AMDGPU_INFO_FW_ASD 0x0d + /* Subquery id: Query VCN firmware version */ + #define AMDGPU_INFO_FW_VCN 0x0e + /* Subquery id: Query GFX RLC SRLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_CNTL 0x0f + /* Subquery id: Query GFX RLC SRLG firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_GPM_MEM 0x10 + /* Subquery id: Query GFX RLC SRLS firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_SRM_MEM 0x11 + /* Subquery id: Query DMCU firmware version */ + #define AMDGPU_INFO_FW_DMCU 0x12 + #define AMDGPU_INFO_FW_TA 0x13 + /* Subquery id: Query DMCUB firmware version */ + #define AMDGPU_INFO_FW_DMCUB 0x14 + /* Subquery id: Query TOC firmware version */ + #define AMDGPU_INFO_FW_TOC 0x15 + /* Subquery id: Query CAP firmware version */ + #define AMDGPU_INFO_FW_CAP 0x16 + /* Subquery id: Query GFX RLCP firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCP 0x17 + /* Subquery id: Query GFX RLCV firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCV 0x18 + /* Subquery id: Query MES_KIQ firmware version */ + #define AMDGPU_INFO_FW_MES_KIQ 0x19 + /* Subquery id: Query MES firmware version */ + #define AMDGPU_INFO_FW_MES 0x1a + /* Subquery id: Query IMU firmware version */ + #define AMDGPU_INFO_FW_IMU 0x1b + /* Subquery id: Query VPE firmware version */ + #define AMDGPU_INFO_FW_VPE 0x1c + +/* number of bytes moved for TTM migration */ +#define AMDGPU_INFO_NUM_BYTES_MOVED 0x0f +/* the used VRAM size */ +#define AMDGPU_INFO_VRAM_USAGE 0x10 +/* the used GTT size */ +#define AMDGPU_INFO_GTT_USAGE 0x11 +/* Information about GDS, etc. 
resource configuration */ +#define AMDGPU_INFO_GDS_CONFIG 0x13 +/* Query information about VRAM and GTT domains */ +#define AMDGPU_INFO_VRAM_GTT 0x14 +/* Query information about register in MMR address space*/ +#define AMDGPU_INFO_READ_MMR_REG 0x15 +/* Query information about device: rev id, family, etc. */ +#define AMDGPU_INFO_DEV_INFO 0x16 +/* visible vram usage */ +#define AMDGPU_INFO_VIS_VRAM_USAGE 0x17 +/* number of TTM buffer evictions */ +#define AMDGPU_INFO_NUM_EVICTIONS 0x18 +/* Query memory about VRAM and GTT domains */ +#define AMDGPU_INFO_MEMORY 0x19 +/* Query vce clock table */ +#define AMDGPU_INFO_VCE_CLOCK_TABLE 0x1A +/* Query vbios related information */ +#define AMDGPU_INFO_VBIOS 0x1B + /* Subquery id: Query vbios size */ + #define AMDGPU_INFO_VBIOS_SIZE 0x1 + /* Subquery id: Query vbios image */ + #define AMDGPU_INFO_VBIOS_IMAGE 0x2 + /* Subquery id: Query vbios info */ + #define AMDGPU_INFO_VBIOS_INFO 0x3 +/* Query UVD handles */ +#define AMDGPU_INFO_NUM_HANDLES 0x1C +/* Query sensor related information */ +#define AMDGPU_INFO_SENSOR 0x1D + /* Subquery id: Query GPU shader clock */ + #define AMDGPU_INFO_SENSOR_GFX_SCLK 0x1 + /* Subquery id: Query GPU memory clock */ + #define AMDGPU_INFO_SENSOR_GFX_MCLK 0x2 + /* Subquery id: Query GPU temperature */ + #define AMDGPU_INFO_SENSOR_GPU_TEMP 0x3 + /* Subquery id: Query GPU load */ + #define AMDGPU_INFO_SENSOR_GPU_LOAD 0x4 + /* Subquery id: Query average GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_AVG_POWER 0x5 + /* Subquery id: Query northbridge voltage */ + #define AMDGPU_INFO_SENSOR_VDDNB 0x6 + /* Subquery id: Query graphics voltage */ + #define AMDGPU_INFO_SENSOR_VDDGFX 0x7 + /* Subquery id: Query GPU stable pstate shader clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_SCLK 0x8 + /* Subquery id: Query GPU stable pstate memory clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_MCLK 0x9 + /* Subquery id: Query GPU peak pstate shader clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa + /* Subquery id: Query GPU peak pstate memory clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb + /* Subquery id: Query input GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER 0xc +/* Number of VRAM page faults on CPU access. 
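+ * Roughly: how often a CPU access to a VRAM buffer forced the kernel to
+ * migrate it into the CPU-visible window.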
*/ +#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E +#define AMDGPU_INFO_VRAM_LOST_COUNTER 0x1F +/* query ras mask of enabled features*/ +#define AMDGPU_INFO_RAS_ENABLED_FEATURES 0x20 +/* RAS MASK: UMC (VRAM) */ +#define AMDGPU_INFO_RAS_ENABLED_UMC (1 << 0) +/* RAS MASK: SDMA */ +#define AMDGPU_INFO_RAS_ENABLED_SDMA (1 << 1) +/* RAS MASK: GFX */ +#define AMDGPU_INFO_RAS_ENABLED_GFX (1 << 2) +/* RAS MASK: MMHUB */ +#define AMDGPU_INFO_RAS_ENABLED_MMHUB (1 << 3) +/* RAS MASK: ATHUB */ +#define AMDGPU_INFO_RAS_ENABLED_ATHUB (1 << 4) +/* RAS MASK: PCIE */ +#define AMDGPU_INFO_RAS_ENABLED_PCIE (1 << 5) +/* RAS MASK: HDP */ +#define AMDGPU_INFO_RAS_ENABLED_HDP (1 << 6) +/* RAS MASK: XGMI */ +#define AMDGPU_INFO_RAS_ENABLED_XGMI (1 << 7) +/* RAS MASK: DF */ +#define AMDGPU_INFO_RAS_ENABLED_DF (1 << 8) +/* RAS MASK: SMN */ +#define AMDGPU_INFO_RAS_ENABLED_SMN (1 << 9) +/* RAS MASK: SEM */ +#define AMDGPU_INFO_RAS_ENABLED_SEM (1 << 10) +/* RAS MASK: MP0 */ +#define AMDGPU_INFO_RAS_ENABLED_MP0 (1 << 11) +/* RAS MASK: MP1 */ +#define AMDGPU_INFO_RAS_ENABLED_MP1 (1 << 12) +/* RAS MASK: FUSE */ +#define AMDGPU_INFO_RAS_ENABLED_FUSE (1 << 13) +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS 0x21 + /* Subquery id: Decode */ + #define AMDGPU_INFO_VIDEO_CAPS_DECODE 0 + /* Subquery id: Encode */ + #define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1 +/* Query the max number of IBs per gang per submission */ +#define AMDGPU_INFO_MAX_IBS 0x22 +/* query last page fault info */ +#define AMDGPU_INFO_GPUVM_FAULT 0x23 +/* query FW object size and alignment */ +#define AMDGPU_INFO_UQ_FW_AREAS 0x24 + +#define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 +#define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff +#define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 +#define AMDGPU_INFO_MMR_SH_INDEX_MASK 0xff + +struct drm_amdgpu_query_fw { + /** AMDGPU_INFO_FW_* */ + __u32 fw_type; + /** + * Index of the IP if there are more IPs of + * the same type. + */ + __u32 ip_instance; + /** + * Index of the engine. Whether this is used depends + * on the firmware type. (e.g. MEC, SDMA) + */ + __u32 index; + __u32 _pad; +}; + +/* Input structure for the INFO ioctl */ +struct drm_amdgpu_info { + /* Where the return value will be stored */ + __u64 return_pointer; + /* The size of the return value. Just like "size" in "snprintf", + * it limits how many bytes the kernel can write. */ + __u32 return_size; + /* The query request id. */ + __u32 query; + + union { + struct { + __u32 id; + __u32 _pad; + } mode_crtc; + + struct { + /** AMDGPU_HW_IP_* */ + __u32 type; + /** + * Index of the IP if there are more IPs of the same + * type. Ignored by AMDGPU_INFO_HW_IP_COUNT. 
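+ *
+ * Illustrative sketch (not from the upstream header) of the surrounding
+ * INFO request: querying the GFX engine through DRM_IOCTL_AMDGPU_INFO,
+ * filling a struct drm_amdgpu_info_hw_ip defined further below:
+ *
+ *   struct drm_amdgpu_info_hw_ip hw_ip = { 0 };
+ *   struct drm_amdgpu_info req = { 0 };
+ *
+ *   req.return_pointer = (__u64)(uintptr_t)&hw_ip;
+ *   req.return_size = sizeof(hw_ip);
+ *   req.query = AMDGPU_INFO_HW_IP_INFO;
+ *   req.query_hw_ip.type = AMDGPU_HW_IP_GFX;
+ *   req.query_hw_ip.ip_instance = 0;
+ *   ioctl(drm_fd, DRM_IOCTL_AMDGPU_INFO, &req);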
+ */ + __u32 ip_instance; + } query_hw_ip; + + struct { + __u32 dword_offset; + /** number of registers to read */ + __u32 count; + __u32 instance; + /** For future use, no flags defined so far */ + __u32 flags; + } read_mmr_reg; + + struct drm_amdgpu_query_fw query_fw; + + struct { + __u32 type; + __u32 offset; + } vbios_info; + + struct { + __u32 type; + } sensor_info; + + struct { + __u32 type; + } video_cap; + }; +}; + +struct drm_amdgpu_info_gds { + /** GDS GFX partition size */ + __u32 gds_gfx_partition_size; + /** GDS compute partition size */ + __u32 compute_partition_size; + /** total GDS memory size */ + __u32 gds_total_size; + /** GWS size per GFX partition */ + __u32 gws_per_gfx_partition; + /** GSW size per compute partition */ + __u32 gws_per_compute_partition; + /** OA size per GFX partition */ + __u32 oa_per_gfx_partition; + /** OA size per compute partition */ + __u32 oa_per_compute_partition; + __u32 _pad; +}; + +struct drm_amdgpu_info_vram_gtt { + __u64 vram_size; + __u64 vram_cpu_accessible_size; + __u64 gtt_size; +}; + +struct drm_amdgpu_heap_info { + /** max. physical memory */ + __u64 total_heap_size; + + /** Theoretical max. available memory in the given heap */ + __u64 usable_heap_size; + + /** + * Number of bytes allocated in the heap. This includes all processes + * and private allocations in the kernel. It changes when new buffers + * are allocated, freed, and moved. It cannot be larger than + * heap_size. + */ + __u64 heap_usage; + + /** + * Theoretical possible max. size of buffer which + * could be allocated in the given heap + */ + __u64 max_allocation; +}; + +struct drm_amdgpu_memory_info { + struct drm_amdgpu_heap_info vram; + struct drm_amdgpu_heap_info cpu_accessible_vram; + struct drm_amdgpu_heap_info gtt; +}; + +struct drm_amdgpu_info_firmware { + __u32 ver; + __u32 feature; +}; + +struct drm_amdgpu_info_vbios { + __u8 name[64]; + __u8 vbios_pn[64]; + __u32 version; + __u32 pad; + __u8 vbios_ver_str[32]; + __u8 date[32]; +}; + +#define AMDGPU_VRAM_TYPE_UNKNOWN 0 +#define AMDGPU_VRAM_TYPE_GDDR1 1 +#define AMDGPU_VRAM_TYPE_DDR2 2 +#define AMDGPU_VRAM_TYPE_GDDR3 3 +#define AMDGPU_VRAM_TYPE_GDDR4 4 +#define AMDGPU_VRAM_TYPE_GDDR5 5 +#define AMDGPU_VRAM_TYPE_HBM 6 +#define AMDGPU_VRAM_TYPE_DDR3 7 +#define AMDGPU_VRAM_TYPE_DDR4 8 +#define AMDGPU_VRAM_TYPE_GDDR6 9 +#define AMDGPU_VRAM_TYPE_DDR5 10 +#define AMDGPU_VRAM_TYPE_LPDDR4 11 +#define AMDGPU_VRAM_TYPE_LPDDR5 12 +#define AMDGPU_VRAM_TYPE_HBM3E 13 + +struct drm_amdgpu_info_device { + /** PCI Device ID */ + __u32 device_id; + /** Internal chip revision: A0, A1, etc.) */ + __u32 chip_rev; + __u32 external_rev; + /** Revision id in PCI Config space */ + __u32 pci_rev; + __u32 family; + __u32 num_shader_engines; + __u32 num_shader_arrays_per_engine; + /* in KHz */ + __u32 gpu_counter_freq; + __u64 max_engine_clock; + __u64 max_memory_clock; + /* cu information */ + __u32 cu_active_number; + /* NOTE: cu_ao_mask is INVALID, DON'T use it */ + __u32 cu_ao_mask; + __u32 cu_bitmap[4][4]; + /** Render backend pipe mask. One render backend is CB+DB. */ + __u32 enabled_rb_pipes_mask; + __u32 num_rb_pipes; + __u32 num_hw_gfx_contexts; + /* PCIe version (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_gen; + __u64 ids_flags; + /** Starting virtual address for UMDs. */ + __u64 virtual_address_offset; + /** The maximum virtual address */ + __u64 virtual_address_max; + /** Required alignment of virtual addresses. 
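+ * In bytes; GPUVM mappings placed between virtual_address_offset and
+ * virtual_address_max are expected to honour this alignment.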
*/ + __u32 virtual_address_alignment; + /** Page table entry - fragment size */ + __u32 pte_fragment_size; + __u32 gart_page_size; + /** constant engine ram size*/ + __u32 ce_ram_size; + /** video memory type info*/ + __u32 vram_type; + /** video memory bit width*/ + __u32 vram_bit_width; + /* vce harvesting instance */ + __u32 vce_harvest_config; + /* gfx double offchip LDS buffers */ + __u32 gc_double_offchip_lds_buf; + /* NGG Primitive Buffer */ + __u64 prim_buf_gpu_addr; + /* NGG Position Buffer */ + __u64 pos_buf_gpu_addr; + /* NGG Control Sideband */ + __u64 cntl_sb_buf_gpu_addr; + /* NGG Parameter Cache */ + __u64 param_buf_gpu_addr; + __u32 prim_buf_size; + __u32 pos_buf_size; + __u32 cntl_sb_buf_size; + __u32 param_buf_size; + /* wavefront size*/ + __u32 wave_front_size; + /* shader visible vgprs*/ + __u32 num_shader_visible_vgprs; + /* CU per shader array*/ + __u32 num_cu_per_sh; + /* number of tcc blocks*/ + __u32 num_tcc_blocks; + /* gs vgt table depth*/ + __u32 gs_vgt_table_depth; + /* gs primitive buffer depth*/ + __u32 gs_prim_buffer_depth; + /* max gs wavefront per vgt*/ + __u32 max_gs_waves_per_vgt; + /* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_num_lanes; + /* always on cu bitmap */ + __u32 cu_ao_bitmap[4][4]; + /** Starting high virtual address for UMDs. */ + __u64 high_va_offset; + /** The maximum high virtual address */ + __u64 high_va_max; + /* gfx10 pa_sc_tile_steering_override */ + __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; + __u64 min_engine_clock; + __u64 min_memory_clock; + /* The following fields are only set on gfx11+, older chips set 0. */ + __u32 tcp_cache_size; /* AKA GL0, VMEM cache */ + __u32 num_sqc_per_wgp; + __u32 sqc_data_cache_size; /* AKA SMEM cache */ + __u32 sqc_inst_cache_size; + __u32 gl1c_cache_size; + __u32 gl2c_cache_size; + __u64 mall_size; /* AKA infinity cache */ + /* high 32 bits of the rb pipes mask */ + __u32 enabled_rb_pipes_mask_hi; + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; + /* Userq IP mask (1 << AMDGPU_HW_IP_*) */ + __u32 userq_ip_mask; + __u32 pad; +}; + +struct drm_amdgpu_info_hw_ip { + /** Version of h/w IP */ + __u32 hw_ip_version_major; + __u32 hw_ip_version_minor; + /** Capabilities */ + __u64 capabilities_flags; + /** command buffer address start alignment*/ + __u32 ib_start_alignment; + /** command buffer size alignment*/ + __u32 ib_size_alignment; + /** Bitmask of available rings. Bit 0 means ring 0, etc. 
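+ * A ring index is usable iff its bit is set, so the ring count is a
+ * popcount (sketch, not from the upstream header):
+ *
+ *   unsigned int num_rings = __builtin_popcount(hw_ip.available_rings);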
*/ + __u32 available_rings; + /** version info: bits 23:16 major, 15:8 minor, 7:0 revision */ + __u32 ip_discovery_version; + /* Userq available slots */ + __u32 userq_num_slots; +}; + +/* GFX metadata BO sizes and alignment info (in bytes) */ +struct drm_amdgpu_info_uq_fw_areas_gfx { + /* shadow area size */ + __u32 shadow_size; + /* shadow area base virtual mem alignment */ + __u32 shadow_alignment; + /* context save area size */ + __u32 csa_size; + /* context save area base virtual mem alignment */ + __u32 csa_alignment; +}; + +/* IP specific fw related information used in the + * subquery AMDGPU_INFO_UQ_FW_AREAS + */ +struct drm_amdgpu_info_uq_fw_areas { + union { + struct drm_amdgpu_info_uq_fw_areas_gfx gfx; + }; +}; + +struct drm_amdgpu_info_num_handles { + /** Max handles as supported by firmware for UVD */ + __u32 uvd_max_handles; + /** Handles currently in use for UVD */ + __u32 uvd_used_handles; +}; + +#define AMDGPU_VCE_CLOCK_TABLE_ENTRIES 6 + +struct drm_amdgpu_info_vce_clock_table_entry { + /** System clock */ + __u32 sclk; + /** Memory clock */ + __u32 mclk; + /** VCE clock */ + __u32 eclk; + __u32 pad; +}; + +struct drm_amdgpu_info_vce_clock_table { + struct drm_amdgpu_info_vce_clock_table_entry entries[AMDGPU_VCE_CLOCK_TABLE_ENTRIES]; + __u32 num_valid_entries; + __u32 pad; +}; + +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2 0 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4 1 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1 2 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC 3 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC 4 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG 5 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9 6 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1 7 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT 8 + +struct drm_amdgpu_info_video_codec_info { + __u32 valid; + __u32 max_width; + __u32 max_height; + __u32 max_pixels_per_frame; + __u32 max_level; + __u32 pad; +}; + +struct drm_amdgpu_info_video_caps { + struct drm_amdgpu_info_video_codec_info codec_info[AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT]; +}; + +#define AMDGPU_VMHUB_TYPE_MASK 0xff +#define AMDGPU_VMHUB_TYPE_SHIFT 0 +#define AMDGPU_VMHUB_TYPE_GFX 0 +#define AMDGPU_VMHUB_TYPE_MM0 1 +#define AMDGPU_VMHUB_TYPE_MM1 2 +#define AMDGPU_VMHUB_IDX_MASK 0xff00 +#define AMDGPU_VMHUB_IDX_SHIFT 8 + +struct drm_amdgpu_info_gpuvm_fault { + __u64 addr; + __u32 status; + __u32 vmhub; +}; + +struct drm_amdgpu_info_uq_metadata_gfx { + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; +}; + +struct drm_amdgpu_info_uq_metadata { + union { + struct drm_amdgpu_info_uq_metadata_gfx gfx; + }; +}; + +/* + * Supported GPU families + */ +#define AMDGPU_FAMILY_UNKNOWN 0 +#define AMDGPU_FAMILY_SI 110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */ +#define AMDGPU_FAMILY_CI 120 /* Bonaire, Hawaii */ +#define AMDGPU_FAMILY_KV 125 /* Kaveri, Kabini, Mullins */ +#define AMDGPU_FAMILY_VI 130 /* Iceland, Tonga */ +#define AMDGPU_FAMILY_CZ 135 /* Carrizo, Stoney */ +#define AMDGPU_FAMILY_AI 141 /* Vega10 */ +#define AMDGPU_FAMILY_RV 142 /* Raven */ +#define AMDGPU_FAMILY_NV 143 /* Navi10 */ +#define AMDGPU_FAMILY_VGH 144 /* Van Gogh */ +#define AMDGPU_FAMILY_GC_11_0_0 145 /* GC 11.0.0 */ +#define AMDGPU_FAMILY_YC 146 /* Yellow Carp */ +#define AMDGPU_FAMILY_GC_11_0_1 148 /* GC 11.0.1 */ 
+#define AMDGPU_FAMILY_GC_10_3_6 149 /* GC 10.3.6 */
+#define AMDGPU_FAMILY_GC_10_3_7 151 /* GC 10.3.7 */
+#define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */
+#define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */
+
+/* FIXME wrong namespace! */
+struct drm_color_ctm_3x4 {
+	/*
+	 * Conversion matrix with 3x4 dimensions in S31.32 sign-magnitude
+	 * (not two's complement!) format.
+	 */
+	__u64 matrix[12];
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

From 5eb61e1b14959acb858fea69d45bf5c8f7f53ee5 Mon Sep 17 00:00:00 2001
From: David Francis
Date: Thu, 15 May 2025 09:49:24 -0400
Subject: [PATCH 09/51] plugin/amdgpu: Add drm header

The amdgpu plugin usually calls drm ioctls through the libdrm
wrappers. However, amdgpu restore requires dealing with dmabufs
and gem handles directly, which means drm ioctls must be called
directly.

Add the drm.h header (from the kernel's uapi).

Signed-off-by: David Francis
---
 plugins/amdgpu/drm.h | 1450 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1450 insertions(+)
 create mode 100644 plugins/amdgpu/drm.h

diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h
new file mode 100644
index 000000000..84c819c17
--- /dev/null
+++ b/plugins/amdgpu/drm.h
@@ -0,0 +1,1450 @@
+/*
+ * Header for the Direct Rendering Manager
+ *
+ * Author: Rickard E. (Rik) Faith <faith@valinux.com>
+ *
+ * Acknowledgments:
+ * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic cmpxchg.
+ */
+
+/*
+ * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_H_
+#define _DRM_H_
+
+#if defined(__KERNEL__)
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+typedef unsigned int drm_handle_t;
+
+#elif defined(__linux__)
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+typedef unsigned int drm_handle_t;
+
+#else /* One of the BSDs */
+
+#include <stdint.h>
+#include <sys/ioccom.h>
+#include <sys/types.h>
+typedef int8_t __s8;
+typedef uint8_t __u8;
+typedef int16_t __s16;
+typedef uint16_t __u16;
+typedef int32_t __s32;
+typedef uint32_t __u32;
+typedef int64_t __s64;
+typedef uint64_t __u64;
+typedef size_t __kernel_size_t;
+typedef unsigned long drm_handle_t;
+
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */
+#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */
+#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */
+#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock?
*/ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char __user *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char __user *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char __user *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). + */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char __user *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version __user *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. 
+ */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. 
+ */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc __user *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int __user *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void __user *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void __user *virt; +#else + void __user *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub __user *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int __user *send_indices; /**< List of handles to buffers */ + int __user *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int __user *request_indices; /**< Buffer information */ + int __user *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx __user *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. 
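+ *
+ * Illustrative sketch (not from the upstream header): classic
+ * primary-node authentication, where the client fetches a magic and the
+ * DRM master blesses it, using the DRM_IOCTL_GET_MAGIC and
+ * DRM_IOCTL_AUTH_MAGIC wrappers defined later in this header:
+ *
+ *   struct drm_auth auth = { 0 };
+ *
+ *   ioctl(client_fd, DRM_IOCTL_GET_MAGIC, &auth);
+ *   ioctl(master_fd, DRM_IOCTL_AUTH_MAGIC, &auth);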
+ */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). + */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. 
+ */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/* DRM_IOCTL_GEM_CHANGE_HANDLE ioctl argument type */ +struct drm_gem_change_handle { + /** Current handle of object */ + __u32 handle; + + /** Handle to change that object to */ + __u32 new_handle; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. 
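+ *
+ * A hedged sketch of querying these bits (assuming an open drm_fd; error
+ * handling omitted)::
+ *
+ *	struct drm_get_cap cap = { .capability = DRM_CAP_PRIME };
+ *	ioctl(drm_fd, DRM_IOCTL_GET_CAP, &cap);
+ *	int can_import = !!(cap.value & DRM_PRIME_CAP_IMPORT);
+ *	int can_export = !!(cap.value & DRM_PRIME_CAP_EXPORT);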
+ */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. 
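+ *
+ * A minimal opt-in sketch (assuming an open drm_fd; return value check
+ * omitted)::
+ *
+ *	struct drm_set_client_cap req = {
+ *		.capability = DRM_CLIENT_CAP_UNIVERSAL_PLANES,
+ *		.value = 1,
+ *	};
+ *	ioctl(drm_fd, DRM_IOCTL_SET_CLIENT_CAP, &req);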
+ */
+#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2
+
+/**
+ * DRM_CLIENT_CAP_ATOMIC
+ *
+ * If set to 1, the DRM core will expose atomic properties to userspace. This
+ * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and
+ * &DRM_CLIENT_CAP_ASPECT_RATIO.
+ *
+ * If the driver doesn't support atomic mode-setting, enabling this capability
+ * will fail with -EOPNOTSUPP.
+ *
+ * This capability has been introduced in kernel version 4.0. Starting from
+ * kernel version 4.2, this capability is always supported for atomic-capable
+ * drivers.
+ */
+#define DRM_CLIENT_CAP_ATOMIC 3
+
+/**
+ * DRM_CLIENT_CAP_ASPECT_RATIO
+ *
+ * If set to 1, the DRM core will provide aspect ratio information in modes.
+ * See ``DRM_MODE_FLAG_PIC_AR_*``.
+ *
+ * This capability is always supported for all drivers starting from kernel
+ * version 4.18.
+ */
+#define DRM_CLIENT_CAP_ASPECT_RATIO 4
+
+/**
+ * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS
+ *
+ * If set to 1, the DRM core will expose special connectors to be used for
+ * writing back to memory the scene setup in the commit. The client must enable
+ * &DRM_CLIENT_CAP_ATOMIC first.
+ *
+ * This capability is always supported for atomic-capable drivers starting from
+ * kernel version 4.19.
+ */
+#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5
+
+/**
+ * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT
+ *
+ * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and
+ * virtualbox) have additional restrictions for cursor planes (thus
+ * making cursor planes on those drivers not truly universal), e.g.
+ * they need cursor planes to act like one would expect from a mouse
+ * cursor and have correctly set hotspot properties.
+ * If this client cap is not set the DRM core will hide the cursor plane on
+ * those virtualized drivers because not setting it implies that the
+ * client is not capable of dealing with those extra restrictions.
+ * Clients which do set the cursor hotspot and treat the cursor plane
+ * like a mouse cursor should set this property.
+ * The client must enable &DRM_CLIENT_CAP_ATOMIC first.
+ *
+ * Setting this property on drivers which do not special-case
+ * cursor planes (i.e. non-virtualized drivers) will return
+ * EOPNOTSUPP, which can be used by userspace to gauge
+ * requirements of the hardware/drivers they're running on.
+ *
+ * This capability is always supported for atomic-capable virtualized
+ * drivers starting from kernel version 6.6.
+ */
+#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6
+
+/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */
+struct drm_set_client_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+#define DRM_RDWR O_RDWR
+#define DRM_CLOEXEC O_CLOEXEC
+struct drm_prime_handle {
+	__u32 handle;
+
+	/** Flags,
only applicable for handle->fd */
+	__u32 flags;
+
+	/** Returned dmabuf file descriptor */
+	__s32 fd;
+};
+
+struct drm_syncobj_create {
+	__u32 handle;
+#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0)
+	__u32 flags;
+};
+
+struct drm_syncobj_destroy {
+	__u32 handle;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0)
+#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0)
+struct drm_syncobj_handle {
+	__u32 handle;
+	__u32 flags;
+
+	__s32 fd;
+	__u32 pad;
+};
+
+struct drm_syncobj_transfer {
+	__u32 src_handle;
+	__u32 dst_handle;
+	__u64 src_point;
+	__u64 dst_point;
+	__u32 flags;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0)
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1)
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */
+#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */
+struct drm_syncobj_wait {
+	__u64 handles;
+	/* absolute timeout */
+	__s64 timeout_nsec;
+	__u32 count_handles;
+	__u32 flags;
+	__u32 first_signaled; /* only valid when not waiting all */
+	__u32 pad;
+	/**
+	 * @deadline_nsec - fence deadline hint
+	 *
+	 * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing
+	 * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is
+	 * set.
+	 */
+	__u64 deadline_nsec;
+};
+
+struct drm_syncobj_timeline_wait {
+	__u64 handles;
+	/* wait on specific timeline point for every handle */
+	__u64 points;
+	/* absolute timeout */
+	__s64 timeout_nsec;
+	__u32 count_handles;
+	__u32 flags;
+	__u32 first_signaled; /* only valid when not waiting all */
+	__u32 pad;
+	/**
+	 * @deadline_nsec - fence deadline hint
+	 *
+	 * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing
+	 * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is
+	 * set.
+	 */
+	__u64 deadline_nsec;
+};
+
+/**
+ * struct drm_syncobj_eventfd
+ * @handle: syncobj handle.
+ * @flags: Zero to wait for the point to be signalled, or
+ *         &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be
+ *         available for the point.
+ * @point: syncobj timeline point (set to zero for binary syncobjs).
+ * @fd: Existing eventfd to send events to.
+ * @pad: Must be zero.
+ *
+ * Register an eventfd to be signalled by a syncobj. The eventfd counter will
+ * be incremented by one.
+ */
+struct drm_syncobj_eventfd {
+	__u32 handle;
+	__u32 flags;
+	__u64 point;
+	__s32 fd;
+	__u32 pad;
+};
+
+
+struct drm_syncobj_array {
+	__u64 handles;
+	__u32 count_handles;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */
+struct drm_syncobj_timeline_array {
+	__u64 handles;
+	__u64 points;
+	__u32 count_handles;
+	__u32 flags;
+};
+
+
+/* Query current scanout sequence number */
+struct drm_crtc_get_sequence {
+	__u32 crtc_id;		/* requested crtc_id */
+	__u32 active;		/* return: crtc output is active */
+	__u64 sequence;		/* return: most recent vblank sequence */
+	__s64 sequence_ns;	/* return: most recent time of first pixel out */
+};
+
+/* Queue event to be delivered at specified sequence.
Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence. on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#define DRM_CLIENT_NAME_MAX_LEN 64 +struct drm_set_client_name { + __u64 name_len; + __u64 name; +}; + + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
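+ *
+ * A hedged sketch of the duplicate-import case described above (assuming an
+ * open drm_fd and a dmabuf_fd; error handling omitted)::
+ *
+ *	struct drm_prime_handle p1 = { .fd = dmabuf_fd };
+ *	struct drm_prime_handle p2 = { .fd = dmabuf_fd };
+ *	ioctl(drm_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &p1);
+ *	ioctl(drm_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &p2);
+ *	// p1.handle == p2.handle here: close it once, not twice
+ *	struct drm_gem_close args = { .handle = p1.handle };
+ *	ioctl(drm_fd, DRM_IOCTL_GEM_CLOSE, &args);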
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
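+ *
+ * A minimal import sketch (assuming an open drm_fd and a dmabuf_fd; error
+ * handling omitted)::
+ *
+ *	struct drm_prime_handle args = { .fd = dmabuf_fd, .flags = 0 };
+ *	ioctl(drm_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &args);
+ *	// args.handle may alias a pre-existing handle, see above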
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. 
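+ *
+ * A condensed sketch of that flow (illustrative only; error handling and the
+ * &DRM_IOCTL_MODE_ADDFB step omitted)::
+ *
+ *	struct drm_mode_create_dumb create = {
+ *		.width = 1024, .height = 768, .bpp = 32,
+ *	};
+ *	ioctl(drm_fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
+ *
+ *	struct drm_mode_map_dumb map = { .handle = create.handle };
+ *	ioctl(drm_fd, DRM_IOCTL_MODE_MAP_DUMB, &map);
+ *	void *mem = mmap(NULL, create.size, PROT_READ | PROT_WRITE,
+ *			 MAP_SHARED, drm_fd, map.offset);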
+ *
+ * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported.
+ * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate
+ * driver preferences for dumb buffers.
+ */
+#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb)
+#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb)
+#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb)
+#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res)
+#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane)
+#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane)
+#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2)
+#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties)
+#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property)
+#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2)
+#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic)
+#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob)
+#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob)
+
+#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create)
+#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy)
+#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait)
+#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array)
+#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array)
+
+#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease)
+#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees)
+#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease)
+#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease)
+
+#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait)
+#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array)
+#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer)
+#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array)
+
+/**
+ * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata.
+ *
+ * This queries metadata about a framebuffer. User-space fills
+ * &drm_mode_fb_cmd2.fb_id as the input, and the kernel fills the rest of the
+ * struct as the output.
+ *
+ * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles
+ * will be filled with GEM buffer handles. Fresh new GEM handles are always
+ * returned, even if another GEM handle referring to the same memory object
+ * already exists on the DRM file description. The caller is responsible for
+ * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same
+ * new handle will be returned for multiple planes in case they use the same
+ * memory object. Planes are valid until one has a zero handle -- this can be
+ * used to compute the number of planes.
+ *
+ * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid
+ * until one has a zero &drm_mode_fb_cmd2.pitches.
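+ *
+ * As an illustrative sketch (assuming drm_fd and fb_id; error handling
+ * omitted), the plane count can be derived from the zero terminator
+ * described above::
+ *
+ *	struct drm_mode_fb_cmd2 info = { .fb_id = fb_id };
+ *	ioctl(drm_fd, DRM_IOCTL_MODE_GETFB2, &info);
+ *	int planes = 0;
+ *	while (planes < 4 && info.handles[planes])
+ *		planes++;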
+ * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. + * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/** + * DRM_IOCTL_SET_CLIENT_NAME - Attach a name to a drm_file + * + * Having a name allows for easier tracking and debugging. + * The length of the name (without null ending char) must be + * <= DRM_CLIENT_NAME_MAX_LEN. + * The call will fail if the name contains whitespaces or non-printable chars. + */ +#define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name) + +/** + * DRM_IOCTL_GEM_CHANGE_HANDLE - Move an object to a different handle + * + * Some applications (notably CRIU) need objects to have specific gem handles. + * This ioctl changes the object at one gem handle to use a new gem handle. + */ +#define DRM_IOCTL_GEM_CHANGE_HANDLE DRM_IOWR(0xD2, struct drm_gem_change_handle) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. 
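+ *
+ * A rough consumption sketch (assumptions: a blocking read on drm_fd and a
+ * hypothetical handle_vblank() helper; a read never returns partial
+ * events)::
+ *
+ *	char buf[4096];
+ *	ssize_t len = read(drm_fd, buf, sizeof(buf));
+ *	for (ssize_t off = 0; off < len;) {
+ *		struct drm_event *e = (struct drm_event *)(buf + off);
+ *		if (e->type == DRM_EVENT_VBLANK)
+ *			handle_vblank((struct drm_event_vblank *)e);
+ *		off += e->length;
+ *	}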
+ */
+#define DRM_EVENT_VBLANK 0x01
+/**
+ * DRM_EVENT_FLIP_COMPLETE - page-flip completion event
+ *
+ * This event is sent in response to an atomic commit or legacy page-flip with
+ * the &DRM_MODE_PAGE_FLIP_EVENT flag set.
+ *
+ * The event payload is a struct drm_event_vblank.
+ */
+#define DRM_EVENT_FLIP_COMPLETE 0x02
+/**
+ * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event
+ *
+ * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE.
+ *
+ * The event payload is a struct drm_event_crtc_sequence.
+ */
+#define DRM_EVENT_CRTC_SEQUENCE 0x03
+
+struct drm_event_vblank {
+	struct drm_event base;
+	__u64 user_data;
+	__u32 tv_sec;
+	__u32 tv_usec;
+	__u32 sequence;
+	__u32 crtc_id; /* 0 on older kernels that do not support this */
+};
+
+/* Event delivered at sequence. Time stamp marks when the first pixel
+ * of the refresh cycle leaves the display engine for the display
+ */
+struct drm_event_crtc_sequence {
+	struct drm_event base;
+	__u64 user_data;
+	__s64 time_ns;
+	__u64 sequence;
+};
+
+/* typedef area */
+#ifndef __KERNEL__
+typedef struct drm_clip_rect drm_clip_rect_t;
+typedef struct drm_drawable_info drm_drawable_info_t;
+typedef struct drm_tex_region drm_tex_region_t;
+typedef struct drm_hw_lock drm_hw_lock_t;
+typedef struct drm_version drm_version_t;
+typedef struct drm_unique drm_unique_t;
+typedef struct drm_list drm_list_t;
+typedef struct drm_block drm_block_t;
+typedef struct drm_control drm_control_t;
+typedef enum drm_map_type drm_map_type_t;
+typedef enum drm_map_flags drm_map_flags_t;
+typedef struct drm_ctx_priv_map drm_ctx_priv_map_t;
+typedef struct drm_map drm_map_t;
+typedef struct drm_client drm_client_t;
+typedef enum drm_stat_type drm_stat_type_t;
+typedef struct drm_stats drm_stats_t;
+typedef enum drm_lock_flags drm_lock_flags_t;
+typedef struct drm_lock drm_lock_t;
+typedef enum drm_dma_flags drm_dma_flags_t;
+typedef struct drm_buf_desc drm_buf_desc_t;
+typedef struct drm_buf_info drm_buf_info_t;
+typedef struct drm_buf_free drm_buf_free_t;
+typedef struct drm_buf_pub drm_buf_pub_t;
+typedef struct drm_buf_map drm_buf_map_t;
+typedef struct drm_dma drm_dma_t;
+typedef union drm_wait_vblank drm_wait_vblank_t;
+typedef struct drm_agp_mode drm_agp_mode_t;
+typedef enum drm_ctx_flags drm_ctx_flags_t;
+typedef struct drm_ctx drm_ctx_t;
+typedef struct drm_ctx_res drm_ctx_res_t;
+typedef struct drm_draw drm_draw_t;
+typedef struct drm_update_draw drm_update_draw_t;
+typedef struct drm_auth drm_auth_t;
+typedef struct drm_irq_busid drm_irq_busid_t;
+typedef enum drm_vblank_seq_type drm_vblank_seq_type_t;
+
+typedef struct drm_agp_buffer drm_agp_buffer_t;
+typedef struct drm_agp_binding drm_agp_binding_t;
+typedef struct drm_agp_info drm_agp_info_t;
+typedef struct drm_scatter_gather drm_scatter_gather_t;
+typedef struct drm_set_version drm_set_version_t;
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

From db0ec806d12d1435fbf2ccbcac05ec878fe0f401 Mon Sep 17 00:00:00 2001
From: David Francis
Date: Wed, 12 Feb 2025 09:29:21 -0500
Subject: [PATCH 10/51] plugin/amdgpu: Add handling for amdgpu drm buffer objects

Buffer objects held by the amdgpu drm driver are checkpointed with the
new BO_INFO and MAPPING_INFO ioctls/ioctl options. The handling lives
in amdgpu_plugin_drm.h.

Handling of imported buffer objects may require dmabuf fds to be
transferred between processes. These are transferred over the fdstore,
with the handle-to-fdstore-id relationships kept in shared memory.

There is a new plugin callback, RESTORE_INIT, to create the shared
memory.
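For reference, a plugin opts into the new callback by exporting a
cr_plugin_restore_init function and registering it with the existing
hook macro; a minimal sketch (the hook name is real, the function body
and name are illustrative only):

    int my_plugin_restore_init(void)
    {
        /* allocate shared state before any restore work starts */
        return 0;
    }
    CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, my_plugin_restore_init)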
During checkpoint, track shared buffer objects, so that buffer objects
that are shared across processes can be identified.

During restore, track which buffer objects have been restored. Retry
restore of a drm file if a buffer object is imported and the original
has not been exported yet. Skip buffer objects that have already been
completed or cannot be completed in the current restore.

So that drm code can use sdma_copy_bo, that function no longer
requires kfd bo structs.

Update the protobuf messages with new amdgpu drm information.

Signed-off-by: David Francis
---
 criu/include/criu-plugin.h          |   3 +
 criu/plugin.c                       |  13 +-
 criu/servicefd.c                    |   2 +-
 plugins/amdgpu/amdgpu_plugin.c      | 281 ++++++++++++++--
 plugins/amdgpu/amdgpu_plugin_drm.c  | 487 +++++++++++++++++++++++++++-
 plugins/amdgpu/amdgpu_plugin_drm.h  |  12 +
 plugins/amdgpu/amdgpu_plugin_util.c |  84 +++++
 plugins/amdgpu/amdgpu_plugin_util.h |  39 ++-
 plugins/amdgpu/criu-amdgpu.proto    |  25 ++
 9 files changed, 900 insertions(+), 46 deletions(-)

diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h
index ee84ccdf6..977dad655 100644
--- a/criu/include/criu-plugin.h
+++ b/criu/include/criu-plugin.h
@@ -62,6 +62,8 @@ enum {
 
 	CR_PLUGIN_HOOK__POST_FORKING = 12,
 
+	CR_PLUGIN_HOOK__RESTORE_INIT = 13,
+
 	CR_PLUGIN_HOOK__MAX
 };
 
@@ -81,6 +83,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
 DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
 DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
 DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void);
 
 enum {
 	CR_PLUGIN_STAGE__DUMP,
diff --git a/criu/plugin.c b/criu/plugin.c
index 18da0499d..a2057e9c1 100644
--- a/criu/plugin.c
+++ b/criu/plugin.c
@@ -60,6 +60,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
 	__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
 	__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
 	__assign_hook(POST_FORKING, "cr_plugin_post_forking");
+	__assign_hook(RESTORE_INIT, "cr_plugin_restore_init");
 
 #undef __assign_hook
 
@@ -257,8 +258,16 @@ int cr_plugin_init(int stage)
 		goto err;
 	}
 
-	if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins())
-		goto err;
+	if (stage == CR_PLUGIN_STAGE__RESTORE) {
+		int ret;
+
+		if (check_inventory_plugins())
+			goto err;
+
+		ret = run_plugins(RESTORE_INIT);
+		if (ret < 0 && ret != -ENOTSUP)
+			goto err;
+	}
 
 	exit_code = 0;
 err:
diff --git a/criu/servicefd.c b/criu/servicefd.c
index 06a8d3eba..dfb019066 100644
--- a/criu/servicefd.c
+++ b/criu/servicefd.c
@@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me)
 
 	ret = 0;
 	return ret;
-}
+}
\ No newline at end of file
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index e3b4ead3f..4be8421a0 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -12,6 +12,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -23,12 +25,17 @@
 #include "criu-plugin.h"
 #include "plugin.h"
 #include "criu-amdgpu.pb-c.h"
+#include "util.h"
+#include "util-pie.h"
+#include "fdstore.h"
 
 #include "kfd_ioctl.h"
 #include "xmalloc.h"
 #include "criu-log.h"
 #include "files.h"
 #include "pstree.h"
+#include "sockets.h"
+#include "rst-malloc.h"
 
 #include "common/list.h"
 #include "amdgpu_plugin_drm.h"
@@ -66,6 +73,19 @@ bool plugin_added_to_inventory = false;
 
 bool plugin_disabled = false;
 
+struct handle_id {
+	int handle;
+	int fdstore_id;
+};
+struct shared_handle_ids {
+	int
num_handles; + struct handle_id *handles; +}; +struct shared_handle_ids *shared_memory = NULL; + +static mutex_t *shared_memory_mutex; + +int current_pid; /* * In the case of a single process (common case), this optimization can effectively * reduce the restore latency with parallel restore. In the case of multiple processes, @@ -526,11 +546,11 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, - void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free) { - uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; @@ -543,10 +563,8 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int j, err, shared_fd, packets_per_buffer; + int j, err, packets_per_buffer; - shared_fd = bo_bucket.dmabuf_fd; - size = bo_bucket.size; buffer_bo_size = min(size, buffer_size); packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size; @@ -757,7 +775,8 @@ err_dst_bo_map: if (err) pr_perror("dest range free failed"); err_dst_va: - err = amdgpu_bo_free(h_bo_dst); + if (!do_not_free) + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); err_dst_bo_prep: @@ -845,8 +864,9 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ, false); + if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -943,8 +963,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, false); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -1053,6 +1073,134 @@ exit: return ret; } +int store_dmabuf_fd(int handle, int fd) +{ + int id; + + id = fdstore_add(fd); + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if (shared_memory->handles[i].handle == handle) { + mutex_unlock(shared_memory_mutex); + return 0; + } + if (shared_memory->handles[i].handle == -1) { + shared_memory->handles[i].handle = handle; + shared_memory->handles[i].fdstore_id = id; + mutex_unlock(shared_memory_mutex); + return 0; + } + } + mutex_unlock(shared_memory_mutex); + + return -1; +} + +int amdgpu_id_for_handle(int handle) +{ + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if 
(shared_memory->handles[i].handle == handle) {
+			mutex_unlock(shared_memory_mutex);
+			return shared_memory->handles[i].fdstore_id;
+		}
+	}
+	mutex_unlock(shared_memory_mutex);
+	return -1;
+}
+
+int amdgpu_restore_init(void)
+{
+	if (!shared_memory) {
+		int protection = PROT_READ | PROT_WRITE;
+		int visibility = MAP_SHARED | MAP_ANONYMOUS;
+		size_t img_size;
+		FILE *img_fp = NULL;
+		int ret;
+		unsigned char *buf;
+		int num_handles = 0;
+		CriuRenderNode *rd = NULL;
+		CriuKfd *e = NULL;
+
+		DIR *d;
+		struct dirent *dir;
+		d = opendir(".");
+		if (d) {
+			while ((dir = readdir(d)) != NULL) {
+				if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) {
+					pr_info("Found kfd image file %s\n", dir->d_name);
+					img_fp = open_img_file(dir->d_name, false, &img_size);
+					buf = xmalloc(img_size);
+					if (!buf) {
+						fclose(img_fp);
+						return -ENOMEM;
+					}
+
+					ret = read_fp(img_fp, buf, img_size);
+					if (ret) {
+						pr_perror("Unable to read from %s", dir->d_name);
+						fclose(img_fp);
+						xfree(buf);
+						return ret;
+					}
+
+					fclose(img_fp);
+					e = criu_kfd__unpack(NULL, img_size, buf);
+					num_handles += e->num_of_bos;
+					criu_kfd__free_unpacked(e, NULL);
+					xfree(buf);
+				}
+				if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) {
+					pr_info("Found drm image file %s\n", dir->d_name);
+					img_fp = open_img_file(dir->d_name, false, &img_size);
+					buf = xmalloc(img_size);
+					if (!buf) {
+						fclose(img_fp);
+						return -ENOMEM;
+					}
+
+					ret = read_fp(img_fp, buf, img_size);
+					if (ret) {
+						pr_perror("Unable to read from %s", dir->d_name);
+						fclose(img_fp);
+						xfree(buf);
+						return ret;
+					}
+
+					fclose(img_fp);
+					rd = criu_render_node__unpack(NULL, img_size, buf);
+					num_handles += rd->num_of_bos;
+					criu_render_node__free_unpacked(rd, NULL);
+					xfree(buf);
+				}
+			}
+			closedir(d);
+		}
+
+		if (num_handles > 0) {
+			shared_memory = mmap(NULL, sizeof(*shared_memory), protection, visibility, -1, 0);
+			if (shared_memory == MAP_FAILED)
+				return -ENOMEM;
+			shared_memory->num_handles = num_handles;
+			shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0);
+			if (shared_memory->handles == MAP_FAILED)
+				return -ENOMEM;
+
+			for (int i = 0; i < num_handles; i++) {
+				shared_memory->handles[i].handle = -1;
+				shared_memory->handles[i].fdstore_id = -1;
+			}
+
+			shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex));
+			if (!shared_memory_mutex) {
+				pr_err("Can't create amdgpu mutex\n");
+				return -1;
+			}
+			mutex_init(shared_memory_mutex);
+		}
+	}
+
+	return 0;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init)
+
 static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
 			CriuKfd *e)
 {
@@ -1095,6 +1243,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
 {
 	struct thread_data *thread_datas;
 	int ret = 0, i;
+	amdgpu_device_handle h_dev;
+	uint32_t major, minor;
 
 	pr_debug("Dumping %d BOs\n", args->num_bos);
 
@@ -1118,6 +1268,19 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
 		boinfo->size = bo_bucket->size;
 		boinfo->offset = bo_bucket->offset;
 		boinfo->alloc_flags = bo_bucket->alloc_flags;
+
+		ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev);
+
+		boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd);
+
+		amdgpu_device_deinitialize(h_dev);
+	}
+	for (i = 0; i < e->num_of_bos; i++) {
+		KfdBoEntry *boinfo = e->bo_entries[i];
+
+		ret = record_shared_bo(boinfo->handle, false);
+		if (ret)
+			goto exit;
 	}
 
 	for (int i = 0; i < e->num_of_gpus; i++) {
@@ -1457,6 +1620,29 @@ static
int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) } pr_info("Restore BOs Ok\n"); + + return 0; +} + +int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd) +{ + struct vma_metadata *vma_md; + + vma_md = xmalloc(sizeof(*vma_md)); + if (!vma_md) { + return -ENOMEM; + } + + memset(vma_md, 0, sizeof(*vma_md)); + + vma_md->old_pgoff = offset; + vma_md->vma_entry = addr; + + vma_md->new_pgoff = restored_offset; + vma_md->fd = fd; + + list_add_tail(&vma_md->list, &update_vma_info_list); + return 0; } @@ -1691,8 +1877,18 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); - if (fd < 0) + if (fd < 0) { pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + return -1; + } + + ret = amdgpu_plugin_drm_restore_file(fd, rd); + if (ret == 1) + *retry_needed = true; + if (ret < 0) { + fd = ret; + goto fail; + } fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1704,12 +1900,20 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. */ - fd = dup(fd); - if (fd == -1) { - pr_perror("unable to duplicate the render fd"); - return -1; + + if (fd < 0) + return fd; + + if (!(*retry_needed)) { + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; } - return fd; + + return 0; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1753,11 +1957,13 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * This way, we know that the file descriptors we store will not conflict with file descriptors inside core * CRIU. 
*/ - fd_next = find_unused_fd_pid(e->pid); - if (fd_next <= 0) { - pr_err("Failed to find unused fd (fd:%d)\n", fd_next); - ret = -EINVAL; - goto exit; + if (fd_next == -1) { + fd_next = find_unused_fd_pid(e->pid); + if (fd_next <= 0) { + pr_err("Failed to find unused fd (fd:%d)\n", fd_next); + ret = -EINVAL; + goto exit; + } } ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology); @@ -1790,14 +1996,26 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) args.num_objects = e->num_of_objects; args.priv_data_size = e->priv_data.len; args.priv_data = (uintptr_t)e->priv_data.data; - args.op = KFD_CRIU_OP_RESTORE; + if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { pr_perror("Restore ioctl failed"); ret = -1; goto exit; } + if (ret < 0) + goto exit; + + for (int i = 0; i < args.num_bos; i++) { + struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; + + if (bo_entry->handle != -1) { + store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd); + } + } + ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e); if (ret) goto exit; @@ -1940,19 +2158,14 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } } + clear_restore_state(); + close(fd); return exit_code; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) -int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, - amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) -{ - return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); -} - int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) { int ret = 0; @@ -2061,8 +2274,10 @@ void *parallel_restore_bo_contents(void *_thread_data) entry = &restore_cmd->entries[i]; fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); - ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, + buffer, buffer_size, h_dev, + max_copy_size, SDMA_OP_VRAM_WRITE, false); + if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); goto err_sdma; diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index d54cd937d..199dad21e 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -19,19 +19,112 @@ #include #include "common/list.h" +#include "files.h" +#include "fdstore.h" #include "criu-amdgpu.pb-c.h" +#define __user +#include "drm.h" #include #include #include "xmalloc.h" -#include "criu-log.h" -#include "kfd_ioctl.h" +#include "amdgpu_drm.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "util.h" +#include "common/scm.h" + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) +{ + uint32_t handle; + int fd = amdgpu_device_get_fd(h_dev); + + if (dmabuf_fd == -1) { + return -1; + } + + drmPrimeFDToHandle(fd, dmabuf_fd, &handle); + + return handle; +} + +int drmIoctl(int fd, unsigned long request, void *arg) +{ + int ret, max_retries = 200; + + do { + ret = ioctl(fd, request, arg); + } while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret == -1 && errno == 
EBADF)
+		/* In case pthread_atfork didn't catch it, this will
+		 * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
+		 */
+		pr_perror("KFD file descriptor not valid in this process");
+	return ret;
+}
+
+static int allocate_bo_entries(CriuRenderNode *e, int num_bos)
+{
+	e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos);
+	if (!e->bo_entries) {
+		pr_err("Failed to allocate bo_entries array\n");
+		return -ENOMEM;
+	}
+
+	for (int i = 0; i < num_bos; i++) {
+		DrmBoEntry *entry = xzalloc(sizeof(*entry));
+
+		if (!entry) {
+			pr_err("Failed to allocate bo entry\n");
+			return -ENOMEM;
+		}
+
+		drm_bo_entry__init(entry);
+
+		e->bo_entries[i] = entry;
+		e->n_bo_entries++;
+	}
+	return 0;
+}
+
+static int allocate_vm_entries(DrmBoEntry *e, int num_vms)
+{
+	e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms);
+	if (!e->vm_entries) {
+		pr_err("Failed to allocate vm_entries array\n");
+		return -ENOMEM;
+	}
+
+	for (int i = 0; i < num_vms; i++) {
+		DrmVmEntry *entry = xzalloc(sizeof(*entry));
+
+		if (!entry) {
+			pr_err("Failed to allocate vm entry\n");
+			return -ENOMEM;
+		}
+
+		drm_vm_entry__init(entry);
+
+		e->vm_entries[i] = entry;
+		e->n_vm_entries++;
+	}
+	return 0;
+}
+
+static void free_e(CriuRenderNode *e)
+{
+	for (int i = 0; i < e->n_bo_entries; i++) {
+		if (e->bo_entries[i])
+			xfree(e->bo_entries[i]);
+	}
+
+	xfree(e);
+}
 
 int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
 {
@@ -60,19 +153,260 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
 	return 0;
 }
 
+static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs)
+{
+	size_t image_size = 0, max_bo_size = 0, buffer_size;
+	struct amdgpu_gpu_info gpu_info = { 0 };
+	amdgpu_device_handle h_dev;
+	uint64_t max_copy_size;
+	uint32_t major, minor;
+	FILE *bo_contents_fp = NULL;
+	void *buffer = NULL;
+	char img_path[40];
+	int num_bos = 0;
+	int i, ret = 0;
+
+	ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev);
+	if (ret) {
+		pr_perror("failed to initialize device");
+		goto exit;
+	}
+	plugin_log_msg("libdrm initialized successfully\n");
+
+	ret = amdgpu_query_gpu_info(h_dev, &gpu_info);
+	if (ret) {
+		pr_perror("failed to query gpuinfo via libdrm");
+		goto exit;
+	}
+
+	max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
+								   SDMA_LINEAR_COPY_MAX_SIZE - 1;
+
+	for (i = 0; i < rd->num_of_bos; i++) {
+		if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) {
+			if (rd->bo_entries[i]->size > max_bo_size)
+				max_bo_size = rd->bo_entries[i]->size;
+		}
+	}
+
+	buffer_size = max_bo_size;
+
+	ret = posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
+	if (ret) {
+		pr_perror("Failed to alloc aligned memory.
Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto exit; + } + + for (i = 0; i < rd->num_of_bos; i++) { + if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT))) + continue; + + if (rd->bo_entries[i]->num_of_vms == 0) + continue; + + num_bos++; + + snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); + + bo_contents_fp = open_img_file(img_path, false, &image_size); + + ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, true); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + break; + } + plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i); + + if (bo_contents_fp) + fclose(bo_contents_fp); + } + +exit: + for (int i = 0; i < rd->num_of_bos; i++) { + if (dmabufs[i] != KFD_INVALID_FD) + close(dmabufs[i]); + } + + xfree(buffer); + + amdgpu_device_deinitialize(h_dev); + return ret; +} int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) { - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; + CriuRenderNode *rd = NULL; char path[PATH_MAX]; unsigned char *buf; int minor; int len; int ret; + size_t image_size; + struct tp_node *tp_node; + struct drm_amdgpu_gem_list_handles list_handles_args = { 0 }; + struct drm_amdgpu_gem_list_handles_entry *list_handles_entries; + int num_bos; + + rd = xmalloc(sizeof(*rd)); + if (!rd) { + ret = -ENOMEM; + goto exit; + } + criu_render_node__init(rd); /* Get the topology node of the DRM device */ minor = minor(drm->st_rdev); + rd->drm_render_minor = minor; + rd->id = id; + + num_bos = 8; + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret && errno == EINVAL) { + pr_info("This kernel appears not to have AMDGPU_GEM_LIST_HANDLES ioctl. 
Consider disabling Dmabuf IPC or updating your kernel.\n"); + list_handles_args.num_entries = 0; + } else if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + + if (list_handles_args.num_entries > num_bos) { + num_bos = list_handles_args.num_entries; + xfree(list_handles_entries); + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + } else { + num_bos = list_handles_args.num_entries; + } + + rd->num_of_bos = num_bos; + ret = allocate_bo_entries(rd, num_bos); + if (ret) + goto exit; + + for (int i = 0; i < num_bos; i++) { + int num_vm_entries = 8; + struct drm_amdgpu_gem_vm_entry *vm_info_entries; + struct drm_amdgpu_gem_op vm_info_args = { 0 }; + DrmBoEntry *boinfo = rd->bo_entries[i]; + struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i]; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + int dmabuf_fd; + uint32_t major, minor; + amdgpu_device_handle h_dev; + void *buffer = NULL; + char img_path[40]; + FILE *bo_contents_fp = NULL; + int device_fd; + + boinfo->size = handle_entry.size; + + boinfo->alloc_flags = handle_entry.alloc_flags; + boinfo->preferred_domains = handle_entry.preferred_domains; + boinfo->alignment = handle_entry.alignment; + boinfo->handle = handle_entry.gem_handle; + boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle); + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + boinfo->offset = mmap_args.out.addr_ptr; + + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + + if (vm_info_args.num_entries > num_vm_entries) { + num_vm_entries = vm_info_args.num_entries; + xfree(vm_info_entries); + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + } else { + num_vm_entries = vm_info_args.num_entries; + } + + boinfo->num_of_vms = num_vm_entries; + ret = allocate_vm_entries(boinfo, num_vm_entries); + if (ret) + goto exit; + + for (int j = 0; j < num_vm_entries; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + boinfo->addr = vm_info_entries[j].addr; + vminfo->addr = vm_info_entries[j].addr; + vminfo->size = vm_info_entries[j].size; + vminfo->offset = vm_info_entries[j].offset; + vminfo->flags = vm_info_entries[j].flags; + } + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + + device_fd = amdgpu_device_get_fd(h_dev); + + drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd); + + 
snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i); + bo_contents_fp = open_img_file(img_path, true, &image_size); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size); + + ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000, + SDMA_OP_VRAM_READ, false); + + if (dmabuf_fd != KFD_INVALID_FD) + close(dmabuf_fd); + + if (bo_contents_fp) + fclose(bo_contents_fp); + + ret = amdgpu_device_deinitialize(h_dev); + if (ret) + goto exit; + + xfree(vm_info_entries); + } + xfree(list_handles_entries); + + for (int i = 0; i < num_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + + ret = record_shared_bo(boinfo->handle, boinfo->is_import); + if (ret) + goto exit; + } + tp_node = sys_get_node_by_render_minor(&src_topology, minor); if (!tp_node) { pr_err("Failed to find a device with minor number = %d\n", minor); @@ -80,21 +414,156 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) } /* Get the GPU_ID of the DRM device */ - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) { - pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id); + rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd->gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id); return -ENODEV; } - len = criu_render_node__get_packed_size(&rd); + len = criu_render_node__get_packed_size(rd); buf = xmalloc(len); if (!buf) return -ENOMEM; - criu_render_node__pack(&rd, buf); + criu_render_node__pack(rd, buf); snprintf(path, sizeof(path), IMG_DRM_FILE, id); ret = write_img_file(path, buf, len); + xfree(buf); +exit: + free_e(rd); return ret; } + +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) +{ + int ret = 0; + bool retry_needed = false; + uint32_t major, minor; + amdgpu_device_handle h_dev; + int device_fd; + int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos); + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + if (ret) { + pr_info("Error in init amdgpu device\n"); + goto exit; + } + + device_fd = amdgpu_device_get_fd(h_dev); + + for (int i = 0; i < rd->num_of_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + int dmabuf_fd = -1; + uint32_t handle; + struct drm_gem_change_handle change_args = { 0 }; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + struct drm_amdgpu_gem_va va_args = { 0 }; + int fd_id; + + if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { + continue; + } else if (boinfo->handle != -1) { + if (boinfo->is_import) { + fd_id = amdgpu_id_for_handle(boinfo->handle); + if (fd_id == -1) { + retry_needed = true; + continue; + } + dmabuf_fd = fdstore_get(fd_id); + } + } + + if (boinfo->is_import) { + drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); + } else { + union drm_amdgpu_gem_create create_args = { 0 }; + + create_args.in.bo_size = boinfo->size; + create_args.in.alignment = boinfo->alignment; + create_args.in.domains = boinfo->preferred_domains; + create_args.in.domain_flags = boinfo->alloc_flags; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) { + pr_perror("Error Failed to call create ioctl"); + ret = -1; + goto exit; + } + handle = create_args.out.handle; + + drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); + } + + change_args.handle = handle; + change_args.new_handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) { + pr_perror("Error Failed to call change ioctl; check if the 
kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support"); + ret = -1; + goto exit; + } + + if (!boinfo->is_import) + store_dmabuf_fd(boinfo->handle, dmabuf_fd); + + dmabufs[i] = dmabuf_fd; + + ret = record_completed_work(boinfo->handle, rd->drm_render_minor); + if (ret) + goto exit; + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + for (int j = 0; j < boinfo->num_of_vms; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + va_args.handle = boinfo->handle; + va_args.operation = AMDGPU_VA_OP_MAP; + va_args.flags = vminfo->flags; + va_args.va_address = vminfo->addr; + va_args.offset_in_bo = vminfo->offset; + va_args.map_size = vminfo->size; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) { + pr_perror("Error Failed to call gem va ioctl"); + ret = -1; + goto exit; + } + } + + ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd); + if (ret < 0) + goto exit; + } + + if (ret) { + pr_info("Error in deinit amdgpu device\n"); + goto exit; + } + + ret = record_completed_work(-1, rd->drm_render_minor); + if (ret) + goto exit; + + ret = amdgpu_device_deinitialize(h_dev); + + if (rd->num_of_bos > 0) { + ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs); + if (ret) + goto exit; + } + +exit: + if (ret < 0) + return ret; + xfree(dmabufs); + + return retry_needed; +} diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h index 6f0c1a9a6..c766def56 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.h +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -24,5 +24,17 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); */ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd); + +int amdgpu_plugin_drm_unpause_file(int fd); + +int amdgpu_id_for_handle(int handle); + +int store_dmabuf_fd(int handle, int fd); + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd); + +int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id); + #endif /* __AMDGPU_PLUGIN_DRM_H__ */ diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index a165fc9cd..491e7fc74 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -41,6 +41,9 @@ /* Tracks number of device files that need to be checkpointed */ static int dev_file_cnt = 0; +static LIST_HEAD(shared_bos); +static LIST_HEAD(completed_work); + /* Helper structures to encode device topology of SRC and DEST platforms */ struct tp_system src_topology; struct tp_system dest_topology; @@ -68,6 +71,87 @@ void init_gpu_count(struct tp_system *topo) dev_file_cnt = 1 + topology_gpu_count(topo); } +bool shared_bo_has_exporter(int handle) +{ + struct shared_bo *bo; + + if (handle == -1) + return false; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return bo->has_exporter; + } + } + + return false; +} + +int record_shared_bo(int handle, bool is_imported) +{ + struct shared_bo *bo; + + if (handle == -1) + return 0; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return 0; + } + } + bo = malloc(sizeof(struct shared_bo)); + if (!bo) + return -1; + bo->handle = handle; + bo->has_exporter = !is_imported; + list_add(&bo->l, &shared_bos); + + return 0; +} + +int record_completed_work(int handle, int id) +{ + struct 
restore_completed_work *work; + + work = malloc(sizeof(struct restore_completed_work)); + if (!work) + return -1; + work->handle = handle; + work->id = id; + list_add(&work->l, &completed_work); + + return 0; +} + +bool work_already_completed(int handle, int id) +{ + struct restore_completed_work *work; + + list_for_each_entry(work, &completed_work, l) { + if (work->handle == handle && work->id == id) { + return true; + } + } + + return false; +} + +void clear_restore_state() +{ + while (!list_empty(&shared_dmabuf_fds)) { + struct shared_dmabuf *st = list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l); + list_del(&st->l); + close(st->dmabuf_fd); + free(st); + } + + while (!list_empty(&completed_work)) { + struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); + list_del(&st->l); + free(st); + } +} + int read_fp(FILE *fp, void *buf, const size_t buf_len) { size_t len_read; diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index aacca3a28..046a82fb0 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -1,6 +1,8 @@ #ifndef __AMDGPU_PLUGIN_UTIL_H__ #define __AMDGPU_PLUGIN_UTIL_H__ +#include + #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif @@ -52,7 +54,7 @@ #define IMG_DRM_FILE "amdgpu-renderD-%d.img" /* Name of file having serialized data of DRM device buffer objects (BOs) */ -#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img" +#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" /* Helper macros to Checkpoint and Restore a ROCm file */ #define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" @@ -73,6 +75,24 @@ enum sdma_op_type { SDMA_OP_VRAM_WRITE, }; +struct dumped_fd { + struct list_head l; + int fd; + bool is_drm; +}; + +struct shared_bo { + struct list_head l; + int handle; + bool has_exporter; +}; + +struct restore_completed_work { + struct list_head l; + int handle; + int id; +}; + /* Helper structures to encode device topology of SRC and DEST platforms */ extern struct tp_system src_topology; extern struct tp_system dest_topology; @@ -101,6 +121,23 @@ bool checkpoint_is_complete(); void decrement_checkpoint_count(); void init_gpu_count(struct tp_system *topology); +bool shared_bo_has_exporter(int handle); +int record_shared_bo(int handle, bool is_imported); + +int record_shared_dmabuf_fd(int handle, int dmabuf_fd); +int dmabuf_fd_for_handle(int handle); + +int record_completed_work(int handle, int id); +bool work_already_completed(int handle, int id); + +void clear_restore_state(); + void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free); + +int serve_out_dmabuf_fd(int handle, int fd); + #endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 078b67650..565413c34 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -46,6 +46,7 @@ message kfd_bo_entry { required uint64 offset = 3; required uint32 alloc_flags = 4; required uint32 gpu_id = 5; + required uint32 handle = 6; } message criu_kfd { @@ -61,6 +62,30 @@ message criu_kfd { required bytes priv_data = 10; } +message drm_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 alloc_flags = 4; + required uint64 
alignment = 5; + required uint32 preferred_domains = 6; + required uint32 handle = 7; + required uint32 is_import = 8; + required uint32 num_of_vms = 9; + repeated drm_vm_entry vm_entries = 10; +} + +message drm_vm_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 flags = 4; +} + message criu_render_node { required uint32 gpu_id = 1; + required uint32 id = 2; + required uint32 drm_render_minor = 3; + required uint64 num_of_bos = 4; + repeated drm_bo_entry bo_entries = 5; } From d43217dadb9764e0342306da84f45f7a85c78bbf Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 30 Oct 2025 22:56:37 -0700 Subject: [PATCH 11/51] plugin: Add DUMP_DEVICES_LATE callback The amdgpu plugin was counting how many files were checkpointed to determine when it should close the device files. The number of device files is not consistent; a process may have multiple copies of the drm device files open. Instead of doing this counting, add a new callback after all files are checkpointed, so plugins can clean up their resources at an appropriate time. Signed-off-by: David Francis --- criu/cr-dump.c | 4 +++ criu/include/criu-plugin.h | 3 ++ criu/plugin.c | 1 + plugins/amdgpu/amdgpu_plugin.c | 55 ++++++++++++----------------- plugins/amdgpu/amdgpu_plugin_util.c | 42 +++++++++++++--------- plugins/amdgpu/amdgpu_plugin_util.h | 6 ++-- 6 files changed, 60 insertions(+), 51 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 60b8e793c..4df40e9b6 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2247,6 +2247,10 @@ int cr_dump_tasks(pid_t pid) goto err; } + ret = run_plugins(DUMP_DEVICES_LATE, pid); + if (ret && ret != -ENOTSUP) + goto err; + if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 977dad655..c3bea1385 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -64,6 +64,8 @@ enum { CR_PLUGIN_HOOK__RESTORE_INIT = 13, + CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14, + CR_PLUGIN_HOOK__MAX }; @@ -84,6 +86,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index a2057e9c1..f9322a3c2 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -61,6 +61,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); __assign_hook(POST_FORKING, "cr_plugin_post_forking"); __assign_hook(RESTORE_INIT, "cr_plugin_restore_init"); + __assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late"); #undef __assign_hook diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 4be8421a0..11e410c31 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -58,13 +58,6 @@ struct vma_metadata { /************************************ Global Variables ********************************************/ -/** - * FD of KFD device used to checkpoint. 
On a multi-process - * tree the order of checkpointing goes from parent to child - * and so on - so saving the FD will not be overwritten - */ -static int kfd_checkpoint_fd; - static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; @@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; } -static int unpause_process(int fd) +int amdgpu_unpause_processes(int pid) { int ret = 0; struct kfd_ioctl_criu_args args = { 0 }; + struct list_head *l = get_dumped_fds(); + struct dumped_fd *st; - args.op = KFD_CRIU_OP_UNPAUSE; + list_for_each_entry(st, l, l) { + if (st->is_drm) { + close(st->fd); + } else { + args.op = KFD_CRIU_OP_UNPAUSE; - ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); - if (ret) { - pr_perror("Failed to unpause process"); - goto exit; + ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args); + if (ret) { + pr_perror("Failed to unpause process"); + goto exit; + } + } } - // Reset the KFD FD - kfd_checkpoint_fd = -1; - sys_close_drm_render_devices(&src_topology); - exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret); + clear_dumped_fds(); return ret; } +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes) int store_dmabuf_fd(int handle, int fd) { @@ -1401,9 +1400,6 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } - /* Initialize number of device files that will be checkpointed */ - init_gpu_count(&src_topology); - /* Check whether this plugin was called for kfd or render nodes */ if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { @@ -1415,11 +1411,9 @@ int amdgpu_plugin_dump_file(int fd, int id) if (ret) return ret; - /* Invoke unpause process if needed */ - decrement_checkpoint_count(); - if (checkpoint_is_complete()) { - ret = unpause_process(kfd_checkpoint_fd); - } + ret = record_dumped_fd(fd, true); + if (ret) + return ret; /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; @@ -1517,14 +1511,11 @@ int amdgpu_plugin_dump_file(int fd, int id) xfree(buf); -exit: - /* Restore all queues if conditions permit */ - kfd_checkpoint_fd = fd; - decrement_checkpoint_count(); - if (checkpoint_is_complete()) { - ret = unpause_process(fd); - } + ret = record_dumped_fd(fd, false); + if (ret) + goto exit; +exit: xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index 491e7fc74..fd59c06ad 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -38,9 +38,7 @@ #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" -/* Tracks number of device files that need to be checkpointed */ -static int dev_file_cnt = 0; - +static LIST_HEAD(dumped_fds); static LIST_HEAD(shared_bos); static LIST_HEAD(completed_work); @@ -52,23 +50,25 @@ struct tp_system dest_topology; struct device_maps checkpoint_maps; struct device_maps restore_maps; -bool checkpoint_is_complete() +int record_dumped_fd(int fd, bool is_drm) { - return (dev_file_cnt == 0); + int newfd = dup(fd); + + if (newfd < 0) + return newfd; + struct dumped_fd *st = malloc(sizeof(struct dumped_fd)); + if (!st) + return -1; + st->fd = newfd; + st->is_drm = is_drm; + list_add(&st->l, &dumped_fds); + + return 0; } -void decrement_checkpoint_count() +struct list_head *get_dumped_fds() { - dev_file_cnt--; -} - -void init_gpu_count(struct tp_system *topo) -{ - if (dev_file_cnt != 0) - return; - - 
/* We add ONE to include checkpointing of KFD device */ - dev_file_cnt = 1 + topology_gpu_count(topo); + return &dumped_fds; } bool shared_bo_has_exporter(int handle) @@ -152,6 +152,16 @@ void clear_restore_state() } } +void clear_dumped_fds() +{ + while (!list_empty(&dumped_fds)) { + struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l); + list_del(&st->l); + close(st->fd); + free(st); + } +} + int read_fp(FILE *fp, void *buf, const size_t buf_len) { size_t len_read; diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index 046a82fb0..f20388efa 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -117,9 +117,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len); int write_img_file(char *path, const void *buf, const size_t buf_len); FILE *open_img_file(char *path, bool write, size_t *size); -bool checkpoint_is_complete(); -void decrement_checkpoint_count(); -void init_gpu_count(struct tp_system *topology); +int record_dumped_fd(int fd, bool is_drm); +struct list_head *get_dumped_fds(); +void clear_dumped_fds(); bool shared_bo_has_exporter(int handle); int record_shared_bo(int handle, bool is_imported); From 9e404e2083913cde0bad2d0396e6cc7c311a8ba4 Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 30 Oct 2025 22:57:04 -0700 Subject: [PATCH 12/51] plugin/amdgpu: Support for checkpoint of dmabuf fds amdgpu libraries that use dmabuf fds to share GPU memory between processes close the dmabuf fds immediately after using them. However, it is possible that the checkpoint of a process catches one of the dmabuf fds open. In that case, the amdgpu plugin needs to handle it. Checkpointing a dmabuf fd requires that the device file it was exported from has already been dumped. To identify which device this dmabuf fd was exported from, attempt to import it on each device, then record the dmabuf handle it imports as. This handle can be used to restore it. Signed-off-by: David Francis --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 34 ++++- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 207 ++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_dmabuf.h | 16 ++ plugins/amdgpu/amdgpu_plugin_drm.c | 7 +- plugins/amdgpu/amdgpu_plugin_util.c | 48 +++++- plugins/amdgpu/amdgpu_plugin_util.h | 8 +- plugins/amdgpu/criu-amdgpu.proto | 4 + 8 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 plugins/amdgpu/amdgpu_plugin_dmabuf.c create mode 100644 plugins/amdgpu/amdgpu_plugin_dmabuf.h diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 870a039cd..31e177e4a 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. 
criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 11e410c31..125aaef9a 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -38,6 +38,7 @@ #include "rst-malloc.h" #include "common/list.h" +#include "amdgpu_plugin_dmabuf.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" @@ -46,7 +47,7 @@ #include "img-streamer.h" #include "image.h" #include "cr_options.h" - +#include "util.h" struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -1064,6 +1065,9 @@ int amdgpu_unpause_processes(int pid) } } + if (post_dump_dmabuf_check() < 0) + ret = -1; + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret); clear_dumped_fds(); @@ -1400,7 +1404,17 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } - /* Check whether this plugin was called for kfd or render nodes */ + /* Check whether this plugin was called for kfd, dmabuf or render nodes */ + ret = get_dmabuf_info(fd, &st); + if (ret < 0) { + pr_perror("Failed to get dmabuf info"); + return -1; + } else if (ret == 0) { + pr_info("Dumping dmabuf fd = %d\n", fd); + ret = amdgpu_plugin_dmabuf_dump(fd, id); + return ret; + } + if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { /* This is RenderD dumper plugin, for now just save renderD @@ -1414,7 +1428,7 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = record_dumped_fd(fd, true); if (ret) return ret; - + ret = try_dump_dmabuf_list(); /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; } @@ -1538,7 +1552,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) int ret = 0, bucket_index = 0; pr_debug("Restoring %d devices\n", e->num_of_gpus); - args->num_devices = e->num_of_gpus; device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices); if (!device_buckets) @@ -1822,12 +1835,17 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * first as we assume restore_maps is already filled. Need to fix this later. 
*/ snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); - pr_info("Restoring RenderD %s\n", img_path); img_fp = open_img_file(img_path, false, &img_size); - if (!img_fp) - return -EINVAL; - + if (!img_fp) { + ret = amdgpu_plugin_dmabuf_restore(id); + if (ret == 1) { + *retry_needed = true; + return 0; + } + return ret; + } + pr_info("Restoring RenderD %s\n", img_path); pr_debug("RenderD Image file size:%ld\n", img_size); buf = xmalloc(img_size); if (!buf) { diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c new file mode 100644 index 000000000..74b5f9038 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/list.h" +#include "criu-amdgpu.pb-c.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_dmabuf.h" +#include "fdstore.h" + +#include "util.h" +#include "common/scm.h" + +struct dmabuf { + int id; + int dmabuf_fd; + struct list_head node; +}; + +static LIST_HEAD(dmabuf_list); + +/* Return < 0 for error, > 0 for "not a dmabuf" and 0 "is a dmabuf" */ +int get_dmabuf_info(int fd, struct stat *st) +{ + char path[PATH_MAX]; + + if (read_fd_link(fd, path, sizeof(path)) < 0) + return -1; + + if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0) + return 1; + + return 0; +} + +int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret = 0; + char path[PATH_MAX]; + size_t len = 0; + unsigned char *buf = NULL; + int gem_handle; + + pr_info("TWI: Dumping dmabuf fd = %d\n", dmabuf_fd); + + gem_handle = handle_for_shared_bo_fd(dmabuf_fd); + if (gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); + return -EAGAIN; /* Retry needed */ + } + + CriuDmabufNode *node = xmalloc(sizeof(*node)); + if (!node) { + pr_err("Failed to allocate memory for dmabuf node\n"); + return -ENOMEM; + } + criu_dmabuf_node__init(node); + + node->gem_handle = gem_handle; + + if (node->gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd\n"); + xfree(node); + return -EINVAL; + } + + /* Serialize metadata to a file */ + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + len = criu_dmabuf_node__get_packed_size(node); + buf = xmalloc(len); + if (!buf) { + pr_err("Failed to allocate buffer for dmabuf metadata\n"); + xfree(node); + return -ENOMEM; + } + criu_dmabuf_node__pack(node, buf); + ret = write_img_file(path, buf, len); + + xfree(buf); + xfree(node); + return ret; +} + +int amdgpu_plugin_dmabuf_restore(int id) +{ + char path[PATH_MAX]; + size_t img_size; + FILE *img_fp = NULL; + int ret = 0; + CriuDmabufNode *rd = NULL; + unsigned char *buf = NULL; + int fd_id; + + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + + pr_info("TWI: Restoring dmabuf fd, id = %d\n", id); + + /* Read serialized metadata */ + img_fp = open_img_file(path, false, &img_size); + if (!img_fp) { + pr_err("Failed to open dmabuf metadata file: %s\n", path); + return -EINVAL; + } + + pr_debug("dmabuf Image file size:%ld\n", img_size); + buf = xmalloc(img_size); + if (!buf) { + pr_perror("Failed to allocate memory"); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", path); + xfree(buf); + return ret; + } + + rd = criu_dmabuf_node__unpack(NULL, img_size, buf); + if (rd == NULL) { + pr_perror("Unable to parse the dmabuf message %d", id); + 
xfree(buf); + fclose(img_fp); + return -1; + } + fclose(img_fp); + + pr_info("TWI: dmabuf node gem_handle = %d\n", rd->gem_handle); + + /* Match GEM handle with shared_dmabuf list */ + fd_id = amdgpu_id_for_handle(rd->gem_handle); + if (fd_id == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", + rd->gem_handle); + return 1; + } + int dmabuf_fd = fdstore_get(fd_id); + pr_info("TWI: dmabuf node fd_id = %d, dmabuf_fd = %d\n", fd_id, dmabuf_fd); + if (dmabuf_fd == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", + rd->gem_handle); + return 1; /* Retry needed */ + } else { + pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", + dmabuf_fd, rd->gem_handle); + } + ret = dmabuf_fd; + + pr_info("Successfully restored dmabuf_fd %d\n", + dmabuf_fd); + criu_dmabuf_node__free_unpacked(rd, NULL); + xfree(buf); + return ret; +} + +int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret; + + ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id); + if (ret == -EAGAIN) { + struct dmabuf *b = xmalloc(sizeof(*b)); + b->id = id; + b->dmabuf_fd = dmabuf_fd; + list_add(&b->node, &dmabuf_list); + return 0; + } + return ret; +} + +int try_dump_dmabuf_list() +{ + struct dmabuf *b, *t; + list_for_each_entry_safe(b, t, &dmabuf_list, node) { + int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); + if (ret == -EAGAIN) + continue; + else if (ret) + return ret; + list_del(&b->node); + xfree(b); + } + return 0; +} + +int post_dump_dmabuf_check() +{ + if (!list_empty(&dmabuf_list)) { + pr_err("Not all dma buffers have been dumped\n"); + return -1; + } + return 1; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.h b/plugins/amdgpu/amdgpu_plugin_dmabuf.h new file mode 100644 index 000000000..f07af7ee0 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.h @@ -0,0 +1,16 @@ + +#ifndef __AMDGPU_PLUGIN_DMABUF_H__ +#define __AMDGPU_PLUGIN_DMABUF_H__ + +#include "amdgpu_plugin_util.h" +#include "criu-amdgpu.pb-c.h" + +int amdgpu_plugin_dmabuf_dump(int fd, int id); +int amdgpu_plugin_dmabuf_restore(int id); + +int try_dump_dmabuf_list(); +int post_dump_dmabuf_check(); + +int get_dmabuf_info(int fd, struct stat *st); + +#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */ \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 199dad21e..8466ca40d 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -47,7 +47,8 @@ int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) return -1; } - drmPrimeFDToHandle(fd, dmabuf_fd, &handle); + if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle)) + return -1; return handle; } @@ -465,6 +466,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { continue; } else if (boinfo->handle != -1) { + pr_info("TWI: restore bo %d\n", boinfo->handle); if (boinfo->is_import) { fd_id = amdgpu_id_for_handle(boinfo->handle); if (fd_id == -1) { @@ -472,11 +474,13 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) continue; } dmabuf_fd = fdstore_get(fd_id); + pr_info("TWI: restore bo %d: fd_id %d, dmabuf_fd %d\n", boinfo->handle, fd_id, dmabuf_fd); } } if (boinfo->is_import) { drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); + pr_info("TWI: restore bo imported to handle %d\n", handle); } else { union drm_amdgpu_gem_create create_args = { 0 }; @@ -493,6 +497,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) handle = 
create_args.out.handle; drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); + pr_info("TWI: restore bo created at handle %d and exported to fd %d\n", handle, dmabuf_fd); } change_args.handle = handle; diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index fd59c06ad..a2cafa4a3 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -37,6 +37,7 @@ #include "amdgpu_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_plugin_drm.h" static LIST_HEAD(dumped_fds); static LIST_HEAD(shared_bos); @@ -109,6 +110,46 @@ int record_shared_bo(int handle, bool is_imported) return 0; } +int handle_for_shared_bo_fd(int fd) +{ + struct dumped_fd *df; + int trial_handle; + amdgpu_device_handle h_dev; + uint32_t major, minor; + struct shared_bo *bo; + + list_for_each_entry(df, &dumped_fds, l) { + /* see if the gem handle for fd using the hdev for df->fd is the + same as bo->handle. */ + + if (!df->is_drm) { + continue; + } + + if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) { + pr_err("Failed to initialize amdgpu device\n"); + continue; + } + + trial_handle = get_gem_handle(h_dev, fd); + if (trial_handle < 0) + continue; + + pr_info("TWI: Check device %d, got handle %d\n", df->fd, trial_handle); + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == trial_handle) { + pr_info("TWI: And that handle exists\n"); + return trial_handle; + } + } + + amdgpu_device_deinitialize(h_dev); + } + + return -1; +} + int record_completed_work(int handle, int id) { struct restore_completed_work *work; @@ -138,13 +179,6 @@ bool work_already_completed(int handle, int id) void clear_restore_state() { - while (!list_empty(&shared_dmabuf_fds)) { - struct shared_dmabuf *st = list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l); - list_del(&st->l); - close(st->dmabuf_fd); - free(st); - } - while (!list_empty(&completed_work)) { struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); list_del(&st->l); diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index f20388efa..f5f752d0b 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -53,6 +53,9 @@ /* Name of file having serialized data of DRM device */ #define IMG_DRM_FILE "amdgpu-renderD-%d.img" +/* Name of file having serialized data of dmabuf meta */ +#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img" + /* Name of file having serialized data of DRM device buffer objects (BOs) */ #define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" @@ -61,6 +64,7 @@ #define HSAKMT_SHM "/hsakmt_shared_mem" #define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" #define HSAKMT_SEM "hsakmt_semaphore" +#define DMABUF_LINK "/dmabuf" /* Help macros to build sDMA command packets */ #define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) @@ -123,9 +127,7 @@ void clear_dumped_fds(); bool shared_bo_has_exporter(int handle); int record_shared_bo(int handle, bool is_imported); - -int record_shared_dmabuf_fd(int handle, int dmabuf_fd); -int dmabuf_fd_for_handle(int handle); +int handle_for_shared_bo_fd(int dmabuf_fd); int record_completed_work(int handle, int id); bool work_already_completed(int handle, int id); diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 565413c34..7682a8f21 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ 
b/plugins/amdgpu/criu-amdgpu.proto @@ -89,3 +89,7 @@ message criu_render_node { required uint64 num_of_bos = 4; repeated drm_bo_entry bo_entries = 5; } + +message criu_dmabuf_node { + required uint32 gem_handle = 1; +} From ff35a9126e3a2d4e6f5f9f2ca89b032f9ae5bc22 Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:26:44 -0800 Subject: [PATCH 13/51] plugins/amdgpu: remove excessive debug messages These pr_info lines beginning with "CC3" and "TWI" were not meant to be included in the patch. Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 5 +---- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 22 ++++++---------------- plugins/amdgpu/amdgpu_plugin_drm.c | 4 ---- plugins/amdgpu/amdgpu_plugin_util.c | 6 +----- 4 files changed, 8 insertions(+), 29 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 125aaef9a..4640ccf88 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -48,6 +48,7 @@ #include "image.h" #include "cr_options.h" #include "util.h" + struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -327,8 +328,6 @@ void getenv_size_t(const char *var, size_t *value) int sh = 0; size_t size; - pr_info("Value str: %s\n", value_str); - if (value_str) { size = (size_t)strtoul(value_str, &endp, 0); if (errno || value_str == endp) { @@ -1132,7 +1131,6 @@ int amdgpu_restore_init(void) if (d) { while ((dir = readdir(d)) != NULL) { if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) { - pr_info("CC3: Found kfd file\n"); img_fp = open_img_file(dir->d_name, false, &img_size); buf = xmalloc(img_size); if (!buf) { @@ -1155,7 +1153,6 @@ int amdgpu_restore_init(void) xfree(buf); } if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) { - pr_info("CC3: Found drm file\n"); img_fp = open_img_file(dir->d_name, false, &img_size); buf = xmalloc(img_size); if (!buf) { diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c index 74b5f9038..bdc107f64 100644 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -55,8 +55,6 @@ int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) unsigned char *buf = NULL; int gem_handle; - pr_info("TWI: Dumping dmabuf fd = %d\n", dmabuf_fd); - gem_handle = handle_for_shared_bo_fd(dmabuf_fd); if (gem_handle < 0) { pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); @@ -107,8 +105,6 @@ int amdgpu_plugin_dmabuf_restore(int id) snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); - pr_info("TWI: Restoring dmabuf fd, id = %d\n", id); - /* Read serialized metadata */ img_fp = open_img_file(path, false, &img_size); if (!img_fp) { @@ -139,29 +135,23 @@ int amdgpu_plugin_dmabuf_restore(int id) } fclose(img_fp); - pr_info("TWI: dmabuf node gem_handle = %d\n", rd->gem_handle); - /* Match GEM handle with shared_dmabuf list */ fd_id = amdgpu_id_for_handle(rd->gem_handle); if (fd_id == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", - rd->gem_handle); + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); return 1; } + int dmabuf_fd = fdstore_get(fd_id); - pr_info("TWI: dmabuf node fd_id = %d, dmabuf_fd = %d\n", fd_id, dmabuf_fd); if (dmabuf_fd == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", - rd->gem_handle); + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); return 1; /* Retry needed */ - } else { - pr_info("Restored 
dmabuf_fd = %d for GEM handle = %d\n", - dmabuf_fd, rd->gem_handle); } + + pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", dmabuf_fd, rd->gem_handle); ret = dmabuf_fd; - pr_info("Successfully restored dmabuf_fd %d\n", - dmabuf_fd); + pr_info("Successfully restored dmabuf_fd %d\n", dmabuf_fd); criu_dmabuf_node__free_unpacked(rd, NULL); xfree(buf); return ret; diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 8466ca40d..00bcb7a29 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -466,7 +466,6 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { continue; } else if (boinfo->handle != -1) { - pr_info("TWI: restore bo %d\n", boinfo->handle); if (boinfo->is_import) { fd_id = amdgpu_id_for_handle(boinfo->handle); if (fd_id == -1) { @@ -474,13 +473,11 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) continue; } dmabuf_fd = fdstore_get(fd_id); - pr_info("TWI: restore bo %d: fd_id %d, dmabuf_fd %d\n", boinfo->handle, fd_id, dmabuf_fd); } } if (boinfo->is_import) { drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); - pr_info("TWI: restore bo imported to handle %d\n", handle); } else { union drm_amdgpu_gem_create create_args = { 0 }; @@ -497,7 +494,6 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) handle = create_args.out.handle; drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); - pr_info("TWI: restore bo created at handle %d and exported to fd %d\n", handle, dmabuf_fd); } change_args.handle = handle; diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index a2cafa4a3..592562474 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -135,13 +135,9 @@ int handle_for_shared_bo_fd(int fd) if (trial_handle < 0) continue; - pr_info("TWI: Check device %d, got handle %d\n", df->fd, trial_handle); - list_for_each_entry(bo, &shared_bos, l) { - if (bo->handle == trial_handle) { - pr_info("TWI: And that handle exists\n"); + if (bo->handle == trial_handle) return trial_handle; - } } amdgpu_device_deinitialize(h_dev); From 690b6104321dc64dd2ff0c9f6aa6f7c093b24f65 Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:29:35 -0800 Subject: [PATCH 14/51] plugins/amdgpu: return 0 in post_dump_dmabuf_check Use `return 0` on success in `post_dump_dmabuf_check()` for consistency with other functions. 
Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c index bdc107f64..11c9792e3 100644 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -179,7 +179,7 @@ int try_dump_dmabuf_list() int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); if (ret == -EAGAIN) continue; - else if (ret) + if (ret) return ret; list_del(&b->node); xfree(b); @@ -193,5 +193,5 @@ int post_dump_dmabuf_check() pr_err("Not all dma buffers have been dumped\n"); return -1; } - return 1; -} \ No newline at end of file + return 0; +} From 77e6558ddb134e0e8cfbeb6ce3341bf9b3116ccd Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:32:03 -0800 Subject: [PATCH 15/51] plugins/amdgpu: apply code-style fixes Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 4640ccf88..83fa41724 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1406,10 +1406,10 @@ int amdgpu_plugin_dump_file(int fd, int id) if (ret < 0) { pr_perror("Failed to get dmabuf info"); return -1; - } else if (ret == 0) { + } + if (ret == 0) { pr_info("Dumping dmabuf fd = %d\n", fd); - ret = amdgpu_plugin_dmabuf_dump(fd, id); - return ret; + return amdgpu_plugin_dmabuf_dump(fd, id); } if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { @@ -1425,9 +1425,9 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = record_dumped_fd(fd, true); if (ret) return ret; - ret = try_dump_dmabuf_list(); + /* Need to return success here so that criu can call plugins for renderD nodes */ - return ret; + return try_dump_dmabuf_list(); } pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); From 6ed49894c5da4466cc89d2fc69afce29dedd6f2e Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:32:44 -0800 Subject: [PATCH 16/51] plugins/amdgpu: add a comment for retry_needed Add a comment that explains the purpose of `retry_needed`. Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 83fa41724..36dc0b6b0 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1837,6 +1837,10 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) if (!img_fp) { ret = amdgpu_plugin_dmabuf_restore(id); if (ret == 1) { + /* This is a dmabuf fd, but the corresponding buffer object that was + * exported to make it has not yet been restored. Need to try again + * later when the buffer object exists, so it can be re-exported. 
+ */ *retry_needed = true; return 0; } From f56ccfd2d6815b499f321abf2c95a6c7cb3a1c40 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 2 Nov 2025 17:01:31 +0000 Subject: [PATCH 17/51] plugins/amdgpu: remove unused variable amdgpu_plugin_drm.c:167:6: error: variable 'num_bos' set but not used [-Werror,-Wunused-but-set-variable] 167 | int num_bos = 0; | Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_drm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 00bcb7a29..923bfcdd1 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -164,7 +164,6 @@ static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int FILE *bo_contents_fp = NULL; void *buffer = NULL; char img_path[40]; - int num_bos = 0; int i, ret = 0; ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev); @@ -206,8 +205,6 @@ static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int if (rd->bo_entries[i]->num_of_vms == 0) continue; - num_bos++; - snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); bo_contents_fp = open_img_file(img_path, false, &image_size); From e4a5e164b4ccad7e82cef638f9510f932daea00f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 5 Nov 2025 15:12:06 +0000 Subject: [PATCH 18/51] plugins/amdgpu: update kernel headers This patch updates drm.h and amdgpu_drm.h kernel headers, and adds drm_mode.h (included by drm.h) from the rocm-7.1.0 release tag. Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_drm.h | 125 +++- plugins/amdgpu/drm.h | 58 +- plugins/amdgpu/drm_mode.h | 1362 +++++++++++++++++++++++++++++++++++ 3 files changed, 1523 insertions(+), 22 deletions(-) create mode 100644 plugins/amdgpu/drm_mode.h diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h index 9cebd072a..69227a12b 100644 --- a/plugins/amdgpu/amdgpu_drm.h +++ b/plugins/amdgpu/amdgpu_drm.h @@ -58,6 +58,11 @@ extern "C" { #define DRM_AMDGPU_USERQ_SIGNAL 0x17 #define DRM_AMDGPU_USERQ_WAIT 0x18 #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 +/* not upstream */ +#define DRM_AMDGPU_GEM_DGMA 0x5c + +/* hybrid specific ioctls */ +#define DRM_AMDGPU_SEM 0x5b #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) @@ -80,6 +85,8 @@ extern "C" { #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) +#define DRM_IOCTL_AMDGPU_GEM_DGMA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_DGMA, struct drm_amdgpu_gem_dgma) + /** * DOC: memory domains * @@ -105,7 +112,12 @@ extern "C" { * * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for * signalling user mode queues. + * + * %AMDGPU_GEM_DOMAIN_MMIO_REMAP MMIO remap page (special mapping for HDP flushing). 
*/ +/* hybrid specific ioctls */ +#define DRM_IOCTL_AMDGPU_SEM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_SEM, union drm_amdgpu_sem) + #define AMDGPU_GEM_DOMAIN_CPU 0x1 #define AMDGPU_GEM_DOMAIN_GTT 0x2 #define AMDGPU_GEM_DOMAIN_VRAM 0x4 @@ -113,13 +125,20 @@ extern "C" { #define AMDGPU_GEM_DOMAIN_GWS 0x10 #define AMDGPU_GEM_DOMAIN_OA 0x20 #define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_MMIO_REMAP 0x80 +#define AMDGPU_GEM_DOMAIN_DGMA 0x400 +#define AMDGPU_GEM_DOMAIN_DGMA_IMPORT 0x800 + #define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ AMDGPU_GEM_DOMAIN_GTT | \ AMDGPU_GEM_DOMAIN_VRAM | \ AMDGPU_GEM_DOMAIN_GDS | \ AMDGPU_GEM_DOMAIN_GWS | \ - AMDGPU_GEM_DOMAIN_OA | \ - AMDGPU_GEM_DOMAIN_DOORBELL) + AMDGPU_GEM_DOMAIN_OA |\ + AMDGPU_GEM_DOMAIN_DOORBELL |\ + AMDGPU_GEM_DOMAIN_MMIO_REMAP |\ + AMDGPU_GEM_DOMAIN_DGMA |\ + AMDGPU_GEM_DOMAIN_DGMA_IMPORT) /* Flag that CPU access will be required for the case of VRAM domain */ #define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) @@ -182,6 +201,14 @@ extern "C" { /* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. */ #define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) +/* hybrid specific */ +/* Flag that the memory should be in SPARSE resource */ +#define AMDGPU_GEM_CREATE_SPARSE (1ULL << 29) +/* Flag that the memory allocation should be from top of domain */ +#define AMDGPU_GEM_CREATE_TOP_DOWN (1ULL << 30) +/* Flag that the memory allocation should be pinned */ +#define AMDGPU_GEM_CREATE_NO_EVICT (1ULL << 31) + struct drm_amdgpu_gem_create_in { /** the requested memory size */ __u64 bo_size; @@ -581,6 +608,35 @@ struct drm_amdgpu_userq_wait { __u64 out_fences; }; +/* sem related */ +#define AMDGPU_SEM_OP_CREATE_SEM 1 +#define AMDGPU_SEM_OP_WAIT_SEM 2 +#define AMDGPU_SEM_OP_SIGNAL_SEM 3 +#define AMDGPU_SEM_OP_DESTROY_SEM 4 +#define AMDGPU_SEM_OP_IMPORT_SEM 5 +#define AMDGPU_SEM_OP_EXPORT_SEM 6 + +struct drm_amdgpu_sem_in { + /** AMDGPU_SEM_OP_* */ + uint32_t op; + uint32_t handle; + uint32_t ctx_id; + uint32_t ip_type; + uint32_t ip_instance; + uint32_t ring; + uint64_t seq; +}; + +union drm_amdgpu_sem_out { + int32_t fd; + uint32_t handle; +}; + +union drm_amdgpu_sem { + struct drm_amdgpu_sem_in in; + union drm_amdgpu_sem_out out; +}; + /* vm ioctl */ #define AMDGPU_VM_OP_RESERVE_VMID 1 #define AMDGPU_VM_OP_UNRESERVE_VMID 2 @@ -637,6 +693,15 @@ struct drm_amdgpu_gem_userptr { __u32 handle; }; +#define AMDGPU_GEM_DGMA_IMPORT 0 +#define AMDGPU_GEM_DGMA_QUERY_PHYS_ADDR 1 +struct drm_amdgpu_gem_dgma { + __u64 addr; + __u64 size; + __u32 op; + __u32 handle; +}; + /* SI-CI-VI: */ /* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ #define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 @@ -1084,10 +1149,11 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { * Query h/w info: Flag that this is integrated (a.h.a. 
fusion) GPU * */ -#define AMDGPU_IDS_FLAGS_FUSION 0x1 -#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 -#define AMDGPU_IDS_FLAGS_TMZ 0x4 -#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8 +#define AMDGPU_IDS_FLAGS_FUSION 0x01 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x02 +#define AMDGPU_IDS_FLAGS_TMZ 0x04 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x08 +#define AMDGPU_IDS_FLAGS_GANG_SUBMIT 0x10 /* * Query h/w info: Flag identifying VF/PF/PT mode @@ -1269,6 +1335,16 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { /* query FW object size and alignment */ #define AMDGPU_INFO_UQ_FW_AREAS 0x24 +/* Hybrid Stack Specific Defs*/ +/* gpu capability */ +#define AMDGPU_INFO_CAPABILITY 0x50 +/* virtual range */ +#define AMDGPU_INFO_VIRTUAL_RANGE 0x51 +/* query pin memory capability */ +#define AMDGPU_CAPABILITY_PIN_MEM_FLAG (1 << 0) +/* query direct gma capability */ +#define AMDGPU_CAPABILITY_DIRECT_GMA_FLAG (1 << 1) + #define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 #define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff #define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 @@ -1325,6 +1401,11 @@ struct drm_amdgpu_info { __u32 flags; } read_mmr_reg; + struct { + uint32_t aperture; + uint32_t _pad; + } virtual_range; + struct drm_amdgpu_query_fw query_fw; struct { @@ -1423,6 +1504,8 @@ struct drm_amdgpu_info_vbios { #define AMDGPU_VRAM_TYPE_LPDDR5 12 #define AMDGPU_VRAM_TYPE_HBM3E 13 +#define AMDGPU_VRAM_TYPE_HBM_WIDTH 4096 + struct drm_amdgpu_info_device { /** PCI Device ID */ __u32 device_id; @@ -1672,6 +1755,7 @@ struct drm_amdgpu_info_uq_metadata { #define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ +#ifndef HAVE_DRM_COLOR_CTM_3X4 /* FIXME wrong namespace! */ struct drm_color_ctm_3x4 { /* @@ -1680,6 +1764,35 @@ struct drm_color_ctm_3x4 { */ __u64 matrix[12]; }; +#endif + +/** + * Definition of System Unified Address (SUA) apertures + */ +#define AMDGPU_SUA_APERTURE_PRIVATE 1 +#define AMDGPU_SUA_APERTURE_SHARED 2 +struct drm_amdgpu_virtual_range { + uint64_t start; + uint64_t end; +}; + +struct drm_amdgpu_capability { + __u32 flag; + __u32 direct_gma_size; +}; + +/* + * Definition of free sync enter and exit signals + * We may have more options in the future + */ +#define AMDGPU_FREESYNC_FULLSCREEN_ENTER 1 +#define AMDGPU_FREESYNC_FULLSCREEN_EXIT 2 + +struct drm_amdgpu_freesync { + __u32 op; /* AMDGPU_FREESYNC_FULLSCREEN_ENTER or */ + /* AMDGPU_FREESYNC_FULLSCREEN_ENTER */ + __u32 spare[7]; +}; #if defined(__cplusplus) } diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h index 84c819c17..3cd5cf15e 100644 --- a/plugins/amdgpu/drm.h +++ b/plugins/amdgpu/drm.h @@ -597,40 +597,62 @@ struct drm_set_version { int drm_dd_minor; }; -/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +/** + * struct drm_gem_close - Argument for &DRM_IOCTL_GEM_CLOSE ioctl. + * @handle: Handle of the object to be closed. + * @pad: Padding. + * + * Releases the handle to an mm object. + */ struct drm_gem_close { - /** Handle of the object to be closed. */ __u32 handle; __u32 pad; }; -/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +/** + * struct drm_gem_flink - Argument for &DRM_IOCTL_GEM_FLINK ioctl. + * @handle: Handle for the object being named. + * @name: Returned global name. + * + * Create a global name for an object, returning the name. + * + * Note that the name does not hold a reference; when the object + * is freed, the name goes away. 
+ */ struct drm_gem_flink { - /** Handle for the object being named */ __u32 handle; - - /** Returned global name */ __u32 name; }; -/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +/** + * struct drm_gem_open - Argument for &DRM_IOCTL_GEM_OPEN ioctl. + * @name: Name of object being opened. + * @handle: Returned handle for the object. + * @size: Returned size of the object + * + * Open an object using the global name, returning a handle and the size. + * + * This handle (of course) holds a reference to the object, so the object + * will not go away until the handle is deleted. + */ struct drm_gem_open { - /** Name of object being opened */ __u32 name; - - /** Returned handle for the object */ __u32 handle; - - /** Returned size of the object */ __u64 size; }; -/* DRM_IOCTL_GEM_CHANGE_HANDLE ioctl argument type */ +/** + * struct drm_gem_change_handle - Argument for &DRM_IOCTL_GEM_CHANGE_HANDLE ioctl. + * @handle: The handle of a gem object. + * @new_handle: An available gem handle. + * + * This ioctl changes the handle of a GEM object to the specified one. + * The new handle must be unused. On success the old handle is closed + * and all further IOCTL should refer to the new handle only. + * Calls to DRM_IOCTL_PRIME_FD_TO_HANDLE will return the new handle. + */ struct drm_gem_change_handle { - /** Current handle of object */ __u32 handle; - - /** Handle to change that object to */ __u32 new_handle; }; @@ -914,13 +936,17 @@ struct drm_syncobj_destroy { }; #define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE (1 << 1) #define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE (1 << 1) struct drm_syncobj_handle { __u32 handle; __u32 flags; __s32 fd; __u32 pad; + + __u64 point; }; struct drm_syncobj_transfer { diff --git a/plugins/amdgpu/drm_mode.h b/plugins/amdgpu/drm_mode.h new file mode 100644 index 000000000..c082810c0 --- /dev/null +++ b/plugins/amdgpu/drm_mode.h @@ -0,0 +1,1362 @@ +/* + * Copyright (c) 2007 Dave Airlie + * Copyright (c) 2007 Jakob Bornecrantz + * Copyright (c) 2008 Red Hat Inc. + * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * Copyright (c) 2007-2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef _DRM_MODE_H +#define _DRM_MODE_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definitions to have a consistent + * and standardized interface with users. + * Userspace can refer to these structure definitions and UAPI formats + * to communicate to drivers. + */ + +#define DRM_CONNECTOR_NAME_LEN 32 +#define DRM_DISPLAY_MODE_LEN 32 +#define DRM_PROP_NAME_LEN 32 + +#define DRM_MODE_TYPE_BUILTIN (1<<0) /* deprecated */ +#define DRM_MODE_TYPE_CLOCK_C ((1<<1) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_CRTC_C ((1<<2) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_PREFERRED (1<<3) +#define DRM_MODE_TYPE_DEFAULT (1<<4) /* deprecated */ +#define DRM_MODE_TYPE_USERDEF (1<<5) +#define DRM_MODE_TYPE_DRIVER (1<<6) + +#define DRM_MODE_TYPE_ALL (DRM_MODE_TYPE_PREFERRED | \ + DRM_MODE_TYPE_USERDEF | \ + DRM_MODE_TYPE_DRIVER) + +/* Video mode flags */ +/* bit compatible with the xrandr RR_ definitions (bits 0-13) + * + * ABI warning: Existing userspace really expects + * the mode flags to match the xrandr definitions. Any + * changes that don't match the xrandr definitions will + * likely need a new client cap or some other mechanism + * to avoid breaking existing userspace. This includes + * allocating new flags in the previously unused bits! + */ +#define DRM_MODE_FLAG_PHSYNC (1<<0) +#define DRM_MODE_FLAG_NHSYNC (1<<1) +#define DRM_MODE_FLAG_PVSYNC (1<<2) +#define DRM_MODE_FLAG_NVSYNC (1<<3) +#define DRM_MODE_FLAG_INTERLACE (1<<4) +#define DRM_MODE_FLAG_DBLSCAN (1<<5) +#define DRM_MODE_FLAG_CSYNC (1<<6) +#define DRM_MODE_FLAG_PCSYNC (1<<7) +#define DRM_MODE_FLAG_NCSYNC (1<<8) +#define DRM_MODE_FLAG_HSKEW (1<<9) /* hskew provided */ +#define DRM_MODE_FLAG_BCAST (1<<10) /* deprecated */ +#define DRM_MODE_FLAG_PIXMUX (1<<11) /* deprecated */ +#define DRM_MODE_FLAG_DBLCLK (1<<12) +#define DRM_MODE_FLAG_CLKDIV2 (1<<13) + /* + * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX + * (define not exposed to user space). 
+ */ +#define DRM_MODE_FLAG_3D_MASK (0x1f<<14) +#define DRM_MODE_FLAG_3D_NONE (0<<14) +#define DRM_MODE_FLAG_3D_FRAME_PACKING (1<<14) +#define DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE (2<<14) +#define DRM_MODE_FLAG_3D_LINE_ALTERNATIVE (3<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL (4<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH (5<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH (6<<14) +#define DRM_MODE_FLAG_3D_TOP_AND_BOTTOM (7<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF (8<<14) + +/* Picture aspect ratio options */ +#define DRM_MODE_PICTURE_ASPECT_NONE 0 +#define DRM_MODE_PICTURE_ASPECT_4_3 1 +#define DRM_MODE_PICTURE_ASPECT_16_9 2 +#define DRM_MODE_PICTURE_ASPECT_64_27 3 +#define DRM_MODE_PICTURE_ASPECT_256_135 4 + +/* Content type options */ +#define DRM_MODE_CONTENT_TYPE_NO_DATA 0 +#define DRM_MODE_CONTENT_TYPE_GRAPHICS 1 +#define DRM_MODE_CONTENT_TYPE_PHOTO 2 +#define DRM_MODE_CONTENT_TYPE_CINEMA 3 +#define DRM_MODE_CONTENT_TYPE_GAME 4 + +/* Aspect ratio flag bitmask (4 bits 22:19) */ +#define DRM_MODE_FLAG_PIC_AR_MASK (0x0F<<19) +#define DRM_MODE_FLAG_PIC_AR_NONE \ + (DRM_MODE_PICTURE_ASPECT_NONE<<19) +#define DRM_MODE_FLAG_PIC_AR_4_3 \ + (DRM_MODE_PICTURE_ASPECT_4_3<<19) +#define DRM_MODE_FLAG_PIC_AR_16_9 \ + (DRM_MODE_PICTURE_ASPECT_16_9<<19) +#define DRM_MODE_FLAG_PIC_AR_64_27 \ + (DRM_MODE_PICTURE_ASPECT_64_27<<19) +#define DRM_MODE_FLAG_PIC_AR_256_135 \ + (DRM_MODE_PICTURE_ASPECT_256_135<<19) + +#define DRM_MODE_FLAG_ALL (DRM_MODE_FLAG_PHSYNC | \ + DRM_MODE_FLAG_NHSYNC | \ + DRM_MODE_FLAG_PVSYNC | \ + DRM_MODE_FLAG_NVSYNC | \ + DRM_MODE_FLAG_INTERLACE | \ + DRM_MODE_FLAG_DBLSCAN | \ + DRM_MODE_FLAG_CSYNC | \ + DRM_MODE_FLAG_PCSYNC | \ + DRM_MODE_FLAG_NCSYNC | \ + DRM_MODE_FLAG_HSKEW | \ + DRM_MODE_FLAG_DBLCLK | \ + DRM_MODE_FLAG_CLKDIV2 | \ + DRM_MODE_FLAG_3D_MASK) + +/* DPMS flags */ +/* bit compatible with the xorg definitions. */ +#define DRM_MODE_DPMS_ON 0 +#define DRM_MODE_DPMS_STANDBY 1 +#define DRM_MODE_DPMS_SUSPEND 2 +#define DRM_MODE_DPMS_OFF 3 + +/* Scaling mode options */ +#define DRM_MODE_SCALE_NONE 0 /* Unmodified timing (display or + software can still scale) */ +#define DRM_MODE_SCALE_FULLSCREEN 1 /* Full screen, ignore aspect */ +#define DRM_MODE_SCALE_CENTER 2 /* Centered, no scaling */ +#define DRM_MODE_SCALE_ASPECT 3 /* Full screen, preserve aspect */ + +/* Dithering mode options */ +#define DRM_MODE_DITHERING_OFF 0 +#define DRM_MODE_DITHERING_ON 1 +#define DRM_MODE_DITHERING_AUTO 2 + +/* Dirty info options */ +#define DRM_MODE_DIRTY_OFF 0 +#define DRM_MODE_DIRTY_ON 1 +#define DRM_MODE_DIRTY_ANNOTATE 2 + +/* Link Status options */ +#define DRM_MODE_LINK_STATUS_GOOD 0 +#define DRM_MODE_LINK_STATUS_BAD 1 + +/* + * DRM_MODE_ROTATE_ + * + * Signals that a drm plane is been rotated degrees in counter + * clockwise direction. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_ROTATE_0 (1<<0) +#define DRM_MODE_ROTATE_90 (1<<1) +#define DRM_MODE_ROTATE_180 (1<<2) +#define DRM_MODE_ROTATE_270 (1<<3) + +/* + * DRM_MODE_ROTATE_MASK + * + * Bitmask used to look for drm plane rotations. + */ +#define DRM_MODE_ROTATE_MASK (\ + DRM_MODE_ROTATE_0 | \ + DRM_MODE_ROTATE_90 | \ + DRM_MODE_ROTATE_180 | \ + DRM_MODE_ROTATE_270) + +/* + * DRM_MODE_REFLECT_ + * + * Signals that the contents of a drm plane is reflected along the axis, + * in the same way as mirroring. + * See kerneldoc chapter "Plane Composition Properties" for more details. 
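+ *
+ * For example, a plane rotated by 90 degrees and mirrored horizontally
+ * would carry the property value (DRM_MODE_ROTATE_90 | DRM_MODE_REFLECT_X).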
+ * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_REFLECT_X (1<<4) +#define DRM_MODE_REFLECT_Y (1<<5) + +/* + * DRM_MODE_REFLECT_MASK + * + * Bitmask used to look for drm plane reflections. + */ +#define DRM_MODE_REFLECT_MASK (\ + DRM_MODE_REFLECT_X | \ + DRM_MODE_REFLECT_Y) + +/* Content Protection Flags */ +#define DRM_MODE_CONTENT_PROTECTION_UNDESIRED 0 +#define DRM_MODE_CONTENT_PROTECTION_DESIRED 1 +#define DRM_MODE_CONTENT_PROTECTION_ENABLED 2 + +/** + * struct drm_mode_modeinfo - Display mode information. + * @clock: pixel clock in kHz + * @hdisplay: horizontal display size + * @hsync_start: horizontal sync start + * @hsync_end: horizontal sync end + * @htotal: horizontal total size + * @hskew: horizontal skew + * @vdisplay: vertical display size + * @vsync_start: vertical sync start + * @vsync_end: vertical sync end + * @vtotal: vertical total size + * @vscan: vertical scan + * @vrefresh: approximate vertical refresh rate in Hz + * @flags: bitmask of misc. flags, see DRM_MODE_FLAG_* defines + * @type: bitmask of type flags, see DRM_MODE_TYPE_* defines + * @name: string describing the mode resolution + * + * This is the user-space API display mode information structure. For the + * kernel version see struct drm_display_mode. + */ +struct drm_mode_modeinfo { + __u32 clock; + __u16 hdisplay; + __u16 hsync_start; + __u16 hsync_end; + __u16 htotal; + __u16 hskew; + __u16 vdisplay; + __u16 vsync_start; + __u16 vsync_end; + __u16 vtotal; + __u16 vscan; + + __u32 vrefresh; + + __u32 flags; + __u32 type; + char name[DRM_DISPLAY_MODE_LEN]; +}; + +struct drm_mode_card_res { + __u64 fb_id_ptr; + __u64 crtc_id_ptr; + __u64 connector_id_ptr; + __u64 encoder_id_ptr; + __u32 count_fbs; + __u32 count_crtcs; + __u32 count_connectors; + __u32 count_encoders; + __u32 min_width; + __u32 max_width; + __u32 min_height; + __u32 max_height; +}; + +struct drm_mode_crtc { + __u64 set_connectors_ptr; + __u32 count_connectors; + + __u32 crtc_id; /**< Id */ + __u32 fb_id; /**< Id of framebuffer */ + + __u32 x; /**< x Position on the framebuffer */ + __u32 y; /**< y Position on the framebuffer */ + + __u32 gamma_size; + __u32 mode_valid; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_PRESENT_TOP_FIELD (1<<0) +#define DRM_MODE_PRESENT_BOTTOM_FIELD (1<<1) + +/* Planes blend with or override other bits on the CRTC */ +struct drm_mode_set_plane { + __u32 plane_id; + __u32 crtc_id; + __u32 fb_id; /* fb object contains surface format type */ + __u32 flags; /* see above flags */ + + /* Signed dest location allows it to be partially off screen */ + __s32 crtc_x; + __s32 crtc_y; + __u32 crtc_w; + __u32 crtc_h; + + /* Source values are 16.16 fixed point */ + __u32 src_x; + __u32 src_y; + __u32 src_h; + __u32 src_w; +}; + +/** + * struct drm_mode_get_plane - Get plane metadata. + * + * Userspace can perform a GETPLANE ioctl to retrieve information about a + * plane. + * + * To retrieve the number of formats supported, set @count_format_types to zero + * and call the ioctl. @count_format_types will be updated with the value. + * + * To retrieve these formats, allocate an array with the memory needed to store + * @count_format_types formats. Point @format_type_ptr to this array and call + * the ioctl again (with @count_format_types still set to the value returned in + * the first ioctl call). 
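+ *
+ * A minimal user-space sketch of this two-call pattern (illustrative
+ * only; assumes a valid DRM fd and plane ID, no error handling):
+ *
+ *     struct drm_mode_get_plane gp = { .plane_id = plane_id };
+ *
+ *     ioctl(fd, DRM_IOCTL_MODE_GETPLANE, &gp);       /* query the count */
+ *     gp.format_type_ptr = (__u64)(uintptr_t)calloc(gp.count_format_types,
+ *                                                   sizeof(__u32));
+ *     ioctl(fd, DRM_IOCTL_MODE_GETPLANE, &gp);       /* fetch the formats */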
+ */ +struct drm_mode_get_plane { + /** + * @plane_id: Object ID of the plane whose information should be + * retrieved. Set by caller. + */ + __u32 plane_id; + + /** @crtc_id: Object ID of the current CRTC. */ + __u32 crtc_id; + /** @fb_id: Object ID of the current fb. */ + __u32 fb_id; + + /** + * @possible_crtcs: Bitmask of CRTC's compatible with the plane. CRTC's + * are created and they receive an index, which corresponds to their + * position in the bitmask. Bit N corresponds to + * :ref:`CRTC index` N. + */ + __u32 possible_crtcs; + /** @gamma_size: Never used. */ + __u32 gamma_size; + + /** @count_format_types: Number of formats. */ + __u32 count_format_types; + /** + * @format_type_ptr: Pointer to ``__u32`` array of formats that are + * supported by the plane. These formats do not require modifiers. + */ + __u64 format_type_ptr; +}; + +struct drm_mode_get_plane_res { + __u64 plane_id_ptr; + __u32 count_planes; +}; + +#define DRM_MODE_ENCODER_NONE 0 +#define DRM_MODE_ENCODER_DAC 1 +#define DRM_MODE_ENCODER_TMDS 2 +#define DRM_MODE_ENCODER_LVDS 3 +#define DRM_MODE_ENCODER_TVDAC 4 +#define DRM_MODE_ENCODER_VIRTUAL 5 +#define DRM_MODE_ENCODER_DSI 6 +#define DRM_MODE_ENCODER_DPMST 7 +#define DRM_MODE_ENCODER_DPI 8 + +struct drm_mode_get_encoder { + __u32 encoder_id; + __u32 encoder_type; + + __u32 crtc_id; /**< Id of crtc */ + + __u32 possible_crtcs; + __u32 possible_clones; +}; + +/* This is for connectors with multiple signal types. */ +/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. */ +enum drm_mode_subconnector { + DRM_MODE_SUBCONNECTOR_Automatic = 0, /* DVI-I, TV */ + DRM_MODE_SUBCONNECTOR_Unknown = 0, /* DVI-I, TV, DP */ + DRM_MODE_SUBCONNECTOR_VGA = 1, /* DP */ + DRM_MODE_SUBCONNECTOR_DVID = 3, /* DVI-I DP */ + DRM_MODE_SUBCONNECTOR_DVIA = 4, /* DVI-I */ + DRM_MODE_SUBCONNECTOR_Composite = 5, /* TV */ + DRM_MODE_SUBCONNECTOR_SVIDEO = 6, /* TV */ + DRM_MODE_SUBCONNECTOR_Component = 8, /* TV */ + DRM_MODE_SUBCONNECTOR_SCART = 9, /* TV */ + DRM_MODE_SUBCONNECTOR_DisplayPort = 10, /* DP */ + DRM_MODE_SUBCONNECTOR_HDMIA = 11, /* DP */ + DRM_MODE_SUBCONNECTOR_Native = 15, /* DP */ + DRM_MODE_SUBCONNECTOR_Wireless = 18, /* DP */ +}; + +#define DRM_MODE_CONNECTOR_Unknown 0 +#define DRM_MODE_CONNECTOR_VGA 1 +#define DRM_MODE_CONNECTOR_DVII 2 +#define DRM_MODE_CONNECTOR_DVID 3 +#define DRM_MODE_CONNECTOR_DVIA 4 +#define DRM_MODE_CONNECTOR_Composite 5 +#define DRM_MODE_CONNECTOR_SVIDEO 6 +#define DRM_MODE_CONNECTOR_LVDS 7 +#define DRM_MODE_CONNECTOR_Component 8 +#define DRM_MODE_CONNECTOR_9PinDIN 9 +#define DRM_MODE_CONNECTOR_DisplayPort 10 +#define DRM_MODE_CONNECTOR_HDMIA 11 +#define DRM_MODE_CONNECTOR_HDMIB 12 +#define DRM_MODE_CONNECTOR_TV 13 +#define DRM_MODE_CONNECTOR_eDP 14 +#define DRM_MODE_CONNECTOR_VIRTUAL 15 +#define DRM_MODE_CONNECTOR_DSI 16 +#define DRM_MODE_CONNECTOR_DPI 17 +#define DRM_MODE_CONNECTOR_WRITEBACK 18 +#define DRM_MODE_CONNECTOR_SPI 19 +#define DRM_MODE_CONNECTOR_USB 20 + +/** + * struct drm_mode_get_connector - Get connector metadata. + * + * User-space can perform a GETCONNECTOR ioctl to retrieve information about a + * connector. User-space is expected to retrieve encoders, modes and properties + * by performing this ioctl at least twice: the first time to retrieve the + * number of elements, the second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_props and @count_encoders to + * zero, set @count_modes to 1, and set @modes_ptr to a temporary struct + * drm_mode_modeinfo element. 
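+ *
+ * For instance (illustrative only, no error handling):
+ *
+ *     struct drm_mode_modeinfo tmp;
+ *     struct drm_mode_get_connector conn = {
+ *             .connector_id = id,
+ *             .count_modes = 1,
+ *             .modes_ptr = (__u64)(uintptr_t)&tmp,
+ *     };
+ *
+ *     ioctl(fd, DRM_IOCTL_MODE_GETCONNECTOR, &conn); /* fetch the counts */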
+ * + * To retrieve the elements, allocate arrays for @encoders_ptr, @modes_ptr, + * @props_ptr and @prop_values_ptr, then set @count_modes, @count_props and + * @count_encoders to their capacity. + * + * Performing the ioctl only twice may be racy: the number of elements may have + * changed with a hotplug event in-between the two ioctls. User-space is + * expected to retry the last ioctl until the number of elements stabilizes. + * The kernel won't fill any array which doesn't have the expected length. + * + * **Force-probing a connector** + * + * If the @count_modes field is set to zero and the DRM client is the current + * DRM master, the kernel will perform a forced probe on the connector to + * refresh the connector status, modes and EDID. A forced-probe can be slow, + * might cause flickering and the ioctl will block. + * + * User-space needs to force-probe connectors to ensure their metadata is + * up-to-date at startup and after receiving a hot-plug event. User-space + * may perform a forced-probe when the user explicitly requests it. User-space + * shouldn't perform a forced-probe in other situations. + */ +struct drm_mode_get_connector { + /** @encoders_ptr: Pointer to ``__u32`` array of object IDs. */ + __u64 encoders_ptr; + /** @modes_ptr: Pointer to struct drm_mode_modeinfo array. */ + __u64 modes_ptr; + /** @props_ptr: Pointer to ``__u32`` array of property IDs. */ + __u64 props_ptr; + /** @prop_values_ptr: Pointer to ``__u64`` array of property values. */ + __u64 prop_values_ptr; + + /** @count_modes: Number of modes. */ + __u32 count_modes; + /** @count_props: Number of properties. */ + __u32 count_props; + /** @count_encoders: Number of encoders. */ + __u32 count_encoders; + + /** @encoder_id: Object ID of the current encoder. */ + __u32 encoder_id; + /** @connector_id: Object ID of the connector. */ + __u32 connector_id; + /** + * @connector_type: Type of the connector. + * + * See DRM_MODE_CONNECTOR_* defines. + */ + __u32 connector_type; + /** + * @connector_type_id: Type-specific connector number. + * + * This is not an object ID. This is a per-type connector number. Each + * (type, type_id) combination is unique across all connectors of a DRM + * device. + * + * The (type, type_id) combination is not a stable identifier: the + * type_id can change depending on the driver probe order. + */ + __u32 connector_type_id; + + /** + * @connection: Status of the connector. + * + * See enum drm_connector_status. + */ + __u32 connection; + /** @mm_width: Width of the connected sink in millimeters. */ + __u32 mm_width; + /** @mm_height: Height of the connected sink in millimeters. */ + __u32 mm_height; + /** + * @subpixel: Subpixel order of the connected sink. + * + * See enum subpixel_order. + */ + __u32 subpixel; + + /** @pad: Padding, must be zero. */ + __u32 pad; +}; + +#define DRM_MODE_PROP_PENDING (1<<0) /* deprecated, do not use */ +#define DRM_MODE_PROP_RANGE (1<<1) +#define DRM_MODE_PROP_IMMUTABLE (1<<2) +#define DRM_MODE_PROP_ENUM (1<<3) /* enumerated type with text strings */ +#define DRM_MODE_PROP_BLOB (1<<4) +#define DRM_MODE_PROP_BITMASK (1<<5) /* bitmask of enumerated types */ + +/* non-extended types: legacy bitmask, one bit per type: */ +#define DRM_MODE_PROP_LEGACY_TYPE ( \ + DRM_MODE_PROP_RANGE | \ + DRM_MODE_PROP_ENUM | \ + DRM_MODE_PROP_BLOB | \ + DRM_MODE_PROP_BITMASK) + +/* extended-types: rather than continue to consume a bit per type, + * grab a chunk of the bits to use as integer type id. 
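+ * For example, DRM_MODE_PROP_OBJECT below is DRM_MODE_PROP_TYPE(1), i.e.
+ * the integer type id 1 encoded in bits 6-15 of the property flags.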
+ */ +#define DRM_MODE_PROP_EXTENDED_TYPE 0x0000ffc0 +#define DRM_MODE_PROP_TYPE(n) ((n) << 6) +#define DRM_MODE_PROP_OBJECT DRM_MODE_PROP_TYPE(1) +#define DRM_MODE_PROP_SIGNED_RANGE DRM_MODE_PROP_TYPE(2) + +/* the PROP_ATOMIC flag is used to hide properties from userspace that + * is not aware of atomic properties. This is mostly to work around + * older userspace (DDX drivers) that read/write each prop they find, + * without being aware that this could be triggering a lengthy modeset. + */ +#define DRM_MODE_PROP_ATOMIC 0x80000000 + +/** + * struct drm_mode_property_enum - Description for an enum/bitfield entry. + * @value: numeric value for this enum entry. + * @name: symbolic name for this enum entry. + * + * See struct drm_property_enum for details. + */ +struct drm_mode_property_enum { + __u64 value; + char name[DRM_PROP_NAME_LEN]; +}; + +/** + * struct drm_mode_get_property - Get property metadata. + * + * User-space can perform a GETPROPERTY ioctl to retrieve information about a + * property. The same property may be attached to multiple objects, see + * "Modeset Base Object Abstraction". + * + * The meaning of the @values_ptr field changes depending on the property type. + * See &drm_property.flags for more details. + * + * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the + * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For + * backwards compatibility, the kernel will always set @count_enum_blobs to + * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must + * ignore these two fields if the property has a different type. + * + * User-space is expected to retrieve values and enums by performing this ioctl + * at least twice: the first time to retrieve the number of elements, the + * second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_values and @count_enum_blobs + * to zero, then call the ioctl. @count_values will be updated with the number + * of elements. If the property has the type &DRM_MODE_PROP_ENUM or + * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well. + * + * To retrieve the elements themselves, allocate an array for @values_ptr and + * set @count_values to its capacity. If the property has the type + * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for + * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl + * again will fill the arrays. + */ +struct drm_mode_get_property { + /** @values_ptr: Pointer to a ``__u64`` array. */ + __u64 values_ptr; + /** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */ + __u64 enum_blob_ptr; + + /** + * @prop_id: Object ID of the property which should be retrieved. Set + * by the caller. + */ + __u32 prop_id; + /** + * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for + * a definition of the flags. + */ + __u32 flags; + /** + * @name: Symbolic property name. User-space should use this field to + * recognize properties. + */ + char name[DRM_PROP_NAME_LEN]; + + /** @count_values: Number of elements in @values_ptr. */ + __u32 count_values; + /** @count_enum_blobs: Number of elements in @enum_blob_ptr. 
*/ + __u32 count_enum_blobs; +}; + +struct drm_mode_connector_set_property { + __u64 value; + __u32 prop_id; + __u32 connector_id; +}; + +#define DRM_MODE_OBJECT_CRTC 0xcccccccc +#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0 +#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0 +#define DRM_MODE_OBJECT_MODE 0xdededede +#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0 +#define DRM_MODE_OBJECT_FB 0xfbfbfbfb +#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb +#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee +#define DRM_MODE_OBJECT_ANY 0 + +struct drm_mode_obj_get_properties { + __u64 props_ptr; + __u64 prop_values_ptr; + __u32 count_props; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_obj_set_property { + __u64 value; + __u32 prop_id; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_get_blob { + __u32 blob_id; + __u32 length; + __u64 data; +}; + +struct drm_mode_fb_cmd { + __u32 fb_id; + __u32 width; + __u32 height; + __u32 pitch; + __u32 bpp; + __u32 depth; + /* driver specific handle */ + __u32 handle; +}; + +#define DRM_MODE_FB_INTERLACED (1<<0) /* for interlaced framebuffers */ +#define DRM_MODE_FB_MODIFIERS (1<<1) /* enables ->modifier[] */ + +/** + * struct drm_mode_fb_cmd2 - Frame-buffer metadata. + * + * This struct holds frame-buffer metadata. There are two ways to use it: + * + * - User-space can fill this struct and perform a &DRM_IOCTL_MODE_ADDFB2 + * ioctl to register a new frame-buffer. The new frame-buffer object ID will + * be set by the kernel in @fb_id. + * - User-space can set @fb_id and perform a &DRM_IOCTL_MODE_GETFB2 ioctl to + * fetch metadata about an existing frame-buffer. + * + * In case of planar formats, this struct allows up to 4 buffer objects with + * offsets and pitches per plane. The pitch and offset order are dictated by + * the format FourCC as defined by ``drm_fourcc.h``, e.g. NV12 is described as: + * + * YUV 4:2:0 image with a plane of 8-bit Y samples followed by an + * interleaved U/V plane containing 8-bit 2x2 subsampled colour difference + * samples. + * + * So it would consist of a Y plane at ``offsets[0]`` and a UV plane at + * ``offsets[1]``. + * + * To accommodate tiled, compressed, etc formats, a modifier can be specified. + * For more information see the "Format Modifiers" section. Note that even + * though it looks like we have a modifier per-plane, we in fact do not. The + * modifier for each plane must be identical. Thus all combinations of + * different data layouts for multi-plane formats must be enumerated as + * separate modifiers. + * + * All of the entries in @handles, @pitches, @offsets and @modifier must be + * zero when unused. Warning, for @offsets and @modifier zero can't be used to + * figure out whether the entry is used or not since it's a valid value (a zero + * offset is common, and a zero modifier is &DRM_FORMAT_MOD_LINEAR). + */ +struct drm_mode_fb_cmd2 { + /** @fb_id: Object ID of the frame-buffer. */ + __u32 fb_id; + /** @width: Width of the frame-buffer. */ + __u32 width; + /** @height: Height of the frame-buffer. */ + __u32 height; + /** + * @pixel_format: FourCC format code, see ``DRM_FORMAT_*`` constants in + * ``drm_fourcc.h``. + */ + __u32 pixel_format; + /** + * @flags: Frame-buffer flags (see &DRM_MODE_FB_INTERLACED and + * &DRM_MODE_FB_MODIFIERS). + */ + __u32 flags; + + /** + * @handles: GEM buffer handle, one per plane. Set to 0 if the plane is + * unused. The same handle can be used for multiple planes. + */ + __u32 handles[4]; + /** @pitches: Pitch (aka. stride) in bytes, one per plane. 
*/ + __u32 pitches[4]; + /** @offsets: Offset into the buffer in bytes, one per plane. */ + __u32 offsets[4]; + /** + * @modifier: Format modifier, one per plane. See ``DRM_FORMAT_MOD_*`` + * constants in ``drm_fourcc.h``. All planes must use the same + * modifier. Ignored unless &DRM_MODE_FB_MODIFIERS is set in @flags. + */ + __u64 modifier[4]; +}; + +#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 +#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02 +#define DRM_MODE_FB_DIRTY_FLAGS 0x03 + +#define DRM_MODE_FB_DIRTY_MAX_CLIPS 256 + +/* + * Mark a region of a framebuffer as dirty. + * + * Some hardware does not automatically update display contents + * as a hardware or software draw to a framebuffer. This ioctl + * allows userspace to tell the kernel and the hardware what + * regions of the framebuffer have changed. + * + * The kernel or hardware is free to update more then just the + * region specified by the clip rects. The kernel or hardware + * may also delay and/or coalesce several calls to dirty into a + * single update. + * + * Userspace may annotate the updates, the annotates are a + * promise made by the caller that the change is either a copy + * of pixels or a fill of a single color in the region specified. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then + * the number of updated regions are half of num_clips given, + * where the clip rects are paired in src and dst. The width and + * height of each one of the pairs must match. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller + * promises that the region specified of the clip rects is filled + * completely with a single color as given in the color argument. + */ + +struct drm_mode_fb_dirty_cmd { + __u32 fb_id; + __u32 flags; + __u32 color; + __u32 num_clips; + __u64 clips_ptr; +}; + +struct drm_mode_mode_cmd { + __u32 connector_id; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_CURSOR_BO 0x01 +#define DRM_MODE_CURSOR_MOVE 0x02 +#define DRM_MODE_CURSOR_FLAGS 0x03 + +/* + * depending on the value in flags different members are used. + * + * CURSOR_BO uses + * crtc_id + * width + * height + * handle - if 0 turns the cursor off + * + * CURSOR_MOVE uses + * crtc_id + * x + * y + */ +struct drm_mode_cursor { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; +}; + +struct drm_mode_cursor2 { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; + __s32 hot_x; + __s32 hot_y; +}; + +struct drm_mode_crtc_lut { + __u32 crtc_id; + __u32 gamma_size; + + /* pointers to arrays */ + __u64 red; + __u64 green; + __u64 blue; +}; + +struct drm_color_ctm { + /* + * Conversion matrix in S31.32 sign-magnitude + * (not two's complement!) format. + * + * out matrix in + * |R| |0 1 2| |R| + * |G| = |3 4 5| x |G| + * |B| |6 7 8| |B| + */ + __u64 matrix[9]; +}; + +struct drm_color_lut { + /* + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and + * 0xffff == 1.0. + */ + __u16 red; + __u16 green; + __u16 blue; + __u16 reserved; +}; + +/** + * struct drm_plane_size_hint - Plane size hints + * @width: The width of the plane in pixel + * @height: The height of the plane in pixel + * + * The plane SIZE_HINTS property blob contains an + * array of struct drm_plane_size_hint. + */ +struct drm_plane_size_hint { + __u16 width; + __u16 height; +}; + +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. 
+ * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ +struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ + __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @display_primaries.x: X coordinate of color primary. + * @display_primaries.y: Y coordinate of color primary. + */ + struct { + __u16 x, y; + } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X coordinate of whitepoint of color primary. + * @white_point.y: Y coordinate of whitepoint of color primary. + */ + struct { + __u16 x, y; + } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ + __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_fall; +}; + +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ +struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. + */ + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + +/** + * DRM_MODE_PAGE_FLIP_EVENT + * + * Request that the kernel sends back a vblank event (see + * struct drm_event_vblank) with the &DRM_EVENT_FLIP_COMPLETE type when the + * page-flip is done. + */ +#define DRM_MODE_PAGE_FLIP_EVENT 0x01 +/** + * DRM_MODE_PAGE_FLIP_ASYNC + * + * Request that the page-flip is performed as soon as possible, ie. with no + * delay due to waiting for vblank. This may cause tearing to be visible on + * the screen. + * + * When used with atomic uAPI, the driver will return an error if the hardware + * doesn't support performing an asynchronous page-flip for this update. + * User-space should handle this, e.g. by falling back to a regular page-flip. + * + * Note, some hardware might need to perform one last synchronous page-flip + * before being able to switch to asynchronous page-flips. As an exception, + * the driver will return success even though that first page-flip is not + * asynchronous. 
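+ *
+ * A sketch of that fallback (illustrative only; drmModeAtomicCommit() is
+ * the libdrm wrapper around &DRM_IOCTL_MODE_ATOMIC):
+ *
+ *     ret = drmModeAtomicCommit(fd, req,
+ *                               flags | DRM_MODE_PAGE_FLIP_ASYNC, NULL);
+ *     if (ret) /* async flip rejected, retry bound to vblank */
+ *             ret = drmModeAtomicCommit(fd, req, flags, NULL);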
+ */ +#define DRM_MODE_PAGE_FLIP_ASYNC 0x02 +#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 +#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8 +#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \ + DRM_MODE_PAGE_FLIP_TARGET_RELATIVE) +/** + * DRM_MODE_PAGE_FLIP_FLAGS + * + * Bitmask of flags suitable for &drm_mode_crtc_page_flip_target.flags. + */ +#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \ + DRM_MODE_PAGE_FLIP_ASYNC | \ + DRM_MODE_PAGE_FLIP_TARGET) + +/* + * Request a page flip on the specified crtc. + * + * This ioctl will ask KMS to schedule a page flip for the specified + * crtc. Once any pending rendering targeting the specified fb (as of + * ioctl time) has completed, the crtc will be reprogrammed to display + * that fb after the next vertical refresh. The ioctl returns + * immediately, but subsequent rendering to the current fb will block + * in the execbuffer ioctl until the page flip happens. If a page + * flip is already pending as the ioctl is called, EBUSY will be + * returned. + * + * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank + * event (see drm.h: struct drm_event_vblank) when the page flip is + * done. The user_data field passed in with this ioctl will be + * returned as the user_data field in the vblank event struct. + * + * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen + * 'as soon as possible', meaning that it not delay waiting for vblank. + * This may cause tearing on the screen. + * + * The reserved field must be zero. + */ + +struct drm_mode_crtc_page_flip { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 reserved; + __u64 user_data; +}; + +/* + * Request a page flip on the specified crtc. + * + * Same as struct drm_mode_crtc_page_flip, but supports new flags and + * re-purposes the reserved field: + * + * The sequence field must be zero unless either of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When + * the ABSOLUTE flag is specified, the sequence field denotes the absolute + * vblank sequence when the flip should take effect. When the RELATIVE + * flag is specified, the sequence field denotes the relative (to the + * current one when the ioctl is called) vblank sequence when the flip + * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to + * make sure the vblank sequence before the target one has passed before + * calling this ioctl. The purpose of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify + * the target for when code dealing with a page flip runs during a + * vertical blank period. + */ + +struct drm_mode_crtc_page_flip_target { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 sequence; + __u64 user_data; +}; + +/** + * struct drm_mode_create_dumb - Create a KMS dumb buffer for scanout. + * @height: buffer height in pixels + * @width: buffer width in pixels + * @bpp: bits per pixel + * @flags: must be zero + * @handle: buffer object handle + * @pitch: number of bytes between two consecutive lines + * @size: size of the whole buffer in bytes + * + * User-space fills @height, @width, @bpp and @flags. If the IOCTL succeeds, + * the kernel fills @handle, @pitch and @size. + */ +struct drm_mode_create_dumb { + __u32 height; + __u32 width; + __u32 bpp; + __u32 flags; + + __u32 handle; + __u32 pitch; + __u64 size; +}; + +/* set up for mmap of a dumb scanout buffer */ +struct drm_mode_map_dumb { + /** Handle for the object being mapped. 
*/ + __u32 handle; + __u32 pad; + /** + * Fake offset to use for subsequent mmap call + * + * This is a fixed-size type for 32/64 compatibility. + */ + __u64 offset; +}; + +struct drm_mode_destroy_dumb { + __u32 handle; +}; + +/** + * DRM_MODE_ATOMIC_TEST_ONLY + * + * Do not apply the atomic commit, instead check whether the hardware supports + * this configuration. + * + * See &drm_mode_config_funcs.atomic_check for more details on test-only + * commits. + */ +#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100 +/** + * DRM_MODE_ATOMIC_NONBLOCK + * + * Do not block while applying the atomic commit. The &DRM_IOCTL_MODE_ATOMIC + * IOCTL returns immediately instead of waiting for the changes to be applied + * in hardware. Note, the driver will still check that the update can be + * applied before retuning. + */ +#define DRM_MODE_ATOMIC_NONBLOCK 0x0200 +/** + * DRM_MODE_ATOMIC_ALLOW_MODESET + * + * Allow the update to result in temporary or transient visible artifacts while + * the update is being applied. Applying the update may also take significantly + * more time than a page flip. All visual artifacts will disappear by the time + * the update is completed, as signalled through the vblank event's timestamp + * (see struct drm_event_vblank). + * + * This flag must be set when the KMS update might cause visible artifacts. + * Without this flag such KMS update will return a EINVAL error. What kind of + * update may cause visible artifacts depends on the driver and the hardware. + * User-space that needs to know beforehand if an update might cause visible + * artifacts can use &DRM_MODE_ATOMIC_TEST_ONLY without + * &DRM_MODE_ATOMIC_ALLOW_MODESET to see if it fails. + * + * To the best of the driver's knowledge, visual artifacts are guaranteed to + * not appear when this flag is not set. Some sinks might display visual + * artifacts outside of the driver's control. + */ +#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400 + +/** + * DRM_MODE_ATOMIC_FLAGS + * + * Bitfield of flags accepted by the &DRM_IOCTL_MODE_ATOMIC IOCTL in + * &drm_mode_atomic.flags. + */ +#define DRM_MODE_ATOMIC_FLAGS (\ + DRM_MODE_PAGE_FLIP_EVENT |\ + DRM_MODE_PAGE_FLIP_ASYNC |\ + DRM_MODE_ATOMIC_TEST_ONLY |\ + DRM_MODE_ATOMIC_NONBLOCK |\ + DRM_MODE_ATOMIC_ALLOW_MODESET) + +struct drm_mode_atomic { + __u32 flags; + __u32 count_objs; + __u64 objs_ptr; + __u64 count_props_ptr; + __u64 props_ptr; + __u64 prop_values_ptr; + __u64 reserved; + __u64 user_data; +}; + +struct drm_format_modifier_blob { +#define FORMAT_BLOB_CURRENT 1 + /* Version of this blob format */ + __u32 version; + + /* Flags */ + __u32 flags; + + /* Number of fourcc formats supported */ + __u32 count_formats; + + /* Where in this blob the formats exist (in bytes) */ + __u32 formats_offset; + + /* Number of drm_format_modifiers */ + __u32 count_modifiers; + + /* Where in this blob the modifiers exist (in bytes) */ + __u32 modifiers_offset; + + /* __u32 formats[] */ + /* struct drm_format_modifier modifiers[] */ +}; + +struct drm_format_modifier { + /* Bitmask of formats in get_plane format list this info applies to. The + * offset allows a sliding window of which 64 formats (bits). 
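+ * In other words, bit N of @formats corresponds to the format at index
+ * (@offset + N) in the plane's format list.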
+ * + * Some examples: + * In today's world with < 65 formats, and formats 0, and 2 are + * supported + * 0x0000000000000005 + * ^-offset = 0, formats = 5 + * + * If the number formats grew to 128, and formats 98-102 are + * supported with the modifier: + * + * 0x0000007c00000000 0000000000000000 + * ^ + * |__offset = 64, formats = 0x7c00000000 + * + */ + __u64 formats; + __u32 offset; + __u32 pad; + + /* The modifier that applies to the >get_plane format list bitmask. */ + __u64 modifier; +}; + +/** + * struct drm_mode_create_blob - Create New blob property + * + * Create a new 'blob' data property, copying length bytes from data pointer, + * and returning new blob ID. + */ +struct drm_mode_create_blob { + /** @data: Pointer to data to copy. */ + __u64 data; + /** @length: Length of data to copy. */ + __u32 length; + /** @blob_id: Return: new property ID. */ + __u32 blob_id; +}; + +/** + * struct drm_mode_destroy_blob - Destroy user blob + * @blob_id: blob_id to destroy + * + * Destroy a user-created blob property. + * + * User-space can release blobs as soon as they do not need to refer to them by + * their blob object ID. For instance, if you are using a MODE_ID blob in an + * atomic commit and you will not make another commit re-using the same ID, you + * can destroy the blob as soon as the commit has been issued, without waiting + * for it to complete. + */ +struct drm_mode_destroy_blob { + __u32 blob_id; +}; + +/** + * struct drm_mode_create_lease - Create lease + * + * Lease mode resources, creating another drm_master. + * + * The @object_ids array must reference at least one CRTC, one connector and + * one plane if &DRM_CLIENT_CAP_UNIVERSAL_PLANES is enabled. Alternatively, + * the lease can be completely empty. + */ +struct drm_mode_create_lease { + /** @object_ids: Pointer to array of object ids (__u32) */ + __u64 object_ids; + /** @object_count: Number of object ids */ + __u32 object_count; + /** @flags: flags for new FD (O_CLOEXEC, etc) */ + __u32 flags; + + /** @lessee_id: Return: unique identifier for lessee. */ + __u32 lessee_id; + /** @fd: Return: file descriptor to new drm_master file */ + __u32 fd; +}; + +/** + * struct drm_mode_list_lessees - List lessees + * + * List lesses from a drm_master. + */ +struct drm_mode_list_lessees { + /** + * @count_lessees: Number of lessees. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_lessees; + /** @pad: Padding. */ + __u32 pad; + + /** + * @lessees_ptr: Pointer to lessees. + * + * Pointer to __u64 array of lessee ids + */ + __u64 lessees_ptr; +}; + +/** + * struct drm_mode_get_lease - Get Lease + * + * Get leased objects. + */ +struct drm_mode_get_lease { + /** + * @count_objects: Number of leased objects. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_objects; + /** @pad: Padding. */ + __u32 pad; + + /** + * @objects_ptr: Pointer to objects. + * + * Pointer to __u32 array of object ids. + */ + __u64 objects_ptr; +}; + +/** + * struct drm_mode_revoke_lease - Revoke lease + */ +struct drm_mode_revoke_lease { + /** @lessee_id: Unique ID of lessee */ + __u32 lessee_id; +}; + +/** + * struct drm_mode_rect - Two dimensional rectangle. 
+ * @x1: Horizontal starting coordinate (inclusive). + * @y1: Vertical starting coordinate (inclusive). + * @x2: Horizontal ending coordinate (exclusive). + * @y2: Vertical ending coordinate (exclusive). + * + * With drm subsystem using struct drm_rect to manage rectangular area this + * export it to user-space. + * + * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS. + */ +struct drm_mode_rect { + __s32 x1; + __s32 y1; + __s32 x2; + __s32 y2; +}; + +/** + * struct drm_mode_closefb + * @fb_id: Framebuffer ID. + * @pad: Must be zero. + */ +struct drm_mode_closefb { + __u32 fb_id; + __u32 pad; +}; + +#if defined(__cplusplus) +} +#endif + +#endif From 29525f8cb3fa244b3b5ecf9fc92e42b9587fd9ef Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 4 Nov 2025 14:34:12 +0000 Subject: [PATCH 19/51] codespell: skip amdgpu kernel headers These header files are copied directly from the Linux kernel and contain typos. We skip these files in codespell to simplify maintenance. Signed-off-by: Radostin Stoyanov --- .codespellrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codespellrc b/.codespellrc index e91a6d2eb..5def594b2 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./.git,./test/pki,./tags +skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems From 1db7eed69fa974563abc6d7348ee93b679c06cc3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 4 Nov 2025 14:41:52 +0000 Subject: [PATCH 20/51] amdgpu: use local kernel headers instead of libdrm Use local copies of amdgpu and DRM headers for consistency. Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 2 +- plugins/amdgpu/amdgpu_plugin_drm.c | 2 ++ plugins/amdgpu/kfd_ioctl.h | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 36dc0b6b0..713ffed6e 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -20,7 +20,6 @@ #include #include -#include #include "criu-plugin.h" #include "plugin.h" @@ -38,6 +37,7 @@ #include "rst-malloc.h" #include "common/list.h" +#include "amdgpu_drm.h" #include "amdgpu_plugin_dmabuf.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 923bfcdd1..3520bca7a 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -23,6 +23,8 @@ #include "fdstore.h" #include "criu-amdgpu.pb-c.h" + +/* Define __user as empty for kernel headers in user-space */ #define __user #include "drm.h" diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index 1a3bcea95..a63d453f0 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,9 +23,12 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include #include +/* Define __user as empty for kernel headers in user-space */ +#define __user +#include "drm.h" + /* * - 1.1 - initial version * - 1.3 - Add SMI events support From 62aadb22ab1efeccef7fb322f525bd1b2cb6969c Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 14 Nov 2025 23:08:16 +0000 Subject: [PATCH 21/51] amdgpu: use 64-bit offsets for parallel restore On AMD Instinct MI300 systems, restoring a large GPU application can fail because the checkpoint size is too large and the maximum value of an 
offset (a plain int) is insufficient. The problem occurs when the total
size of all buffer objects exceeds INT_MAX; no single buffer needs to be
too large, as the overflow can also be triggered by a large number of
small buffers (e.g. 256 buffer objects of 8 MiB each already exceed
INT_MAX).

Fixes: #2812

Signed-off-by: Yanning Yang
Signed-off-by: Radostin Stoyanov
---
 plugins/amdgpu/amdgpu_plugin.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index 713ffed6e..574d7b829 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -1651,7 +1651,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
 {
 	struct thread_data *thread_datas = NULL;
 	int thread_i, ret = 0;
-	int offset = 0;
+	uint64_t offset = 0;
 
 	for (int i = 0; i < e->num_of_bos; i++) {
 		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
@@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data)
 			continue;
 
 		entry = &restore_cmd->entries[i];
-		fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
+		fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
 		ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, buffer,
 				   buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE, false);
@@ -2410,4 +2410,4 @@ int amdgpu_plugin_post_forking(void)
 	return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result);
 }
 
-CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)
\ No newline at end of file
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)

From 2cf8f13ca1f11a0491977e438b262e646137256c Mon Sep 17 00:00:00 2001
From: Mark Polyakov
Date: Tue, 11 Nov 2025 15:48:42 -0800
Subject: [PATCH 22/51] doc: update pipe/socket examples for --inherit-fd

The syntax of the inherit-fd functionality for unix socket and pipe
includes a colon.

Fixes: 0df3f79fc023 ("criu(8): fix --inherit-fd description")
Fixes: c37324b6d0bc ("crtools: describe the inherit-fd option")

Signed-off-by: Mark Polyakov
Signed-off-by: Radostin Stoyanov
---
 Documentation/criu.txt | 4 ++--
 criu/crtools.c         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/criu.txt b/Documentation/criu.txt
index 40ede84e2..0c9a9e527 100644
--- a/Documentation/criu.txt
+++ b/Documentation/criu.txt
@@ -502,8 +502,8 @@ Restores previously checkpointed processes.
 The 'resource' argument can be one of the following:
 +
 - **tty[**__rdev__**:**__dev__**]**
-- **pipe[**__inode__**]**
-- **socket[**__inode__*]*
+- **pipe:[**__inode__**]**
+- **socket:[**__inode__*]*
 - **file[**__mnt_id__**:**__inode__**]**
 - 'path/to/file'

diff --git a/criu/crtools.c b/criu/crtools.c
index e207133ac..4dc55a065 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -503,8 +503,8 @@ usage:
 "			Inherit file descriptors, treating fd NUM as being\n"
 "			already opened via an existing RES, which can be:\n"
 "			tty[rdev:dev]\n"
-"			pipe[inode]\n"
-"			socket[inode]\n"
+"			pipe:[inode]\n"
+"			socket:[inode]\n"
 "			file[mnt_id:inode]\n"
 "			/memfd:name\n"
 "			path/to/file\n"

From bf82389de36ef940be3640229f5a68d0e9211b71 Mon Sep 17 00:00:00 2001
From: Adrian Reber
Date: Wed, 26 Nov 2025 07:48:02 +0000
Subject: [PATCH 23/51] dump: fix "Defect type: IDENTICAL_BRANCHES"

Static code analysis reported:

criu/cr-dump.c:2328:2: identical_branches: The same code is executed
when the condition "ret" is true or false, because the code in the
if-then branch and after the if statement is identical.
Should the if statement be removed? This is a fix for the warning. Signed-off-by: Adrian Reber --- criu/cr-dump.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 4df40e9b6..98b4223ba 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2329,8 +2329,6 @@ int cr_dump_tasks(pid_t pid) } ret = write_img_inventory(&he); - if (ret) - goto err; err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); From 09bb3626646f285a3c00c9d424df3028dba9a10b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 26 Nov 2025 07:49:05 +0000 Subject: [PATCH 24/51] restore: fix "Defect type: UNINIT" Static code analysis reported: 1. criu/cr-restore.c:2438:2: var_decl: Declaring variable "end_vma" without initializer. 4. criu/cr-restore.c:2451:5: assign: Assigning: "s_vma" = "&end_vma", which points to uninitialized data. 7. criu/cr-restore.c:2449:4: uninit_use: Using uninitialized value "s_vma->list.next". This tries to fix it by initializing the variable. Signed-off-by: Adrian Reber --- criu/cr-restore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 057ec0e93..a5eda8d60 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2440,6 +2440,7 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; + INIT_LIST_HEAD(&end_vma.list); s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); From 90300748effc1cf0fe56e35d3d1cc2ddfedab246 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 26 Nov 2025 13:21:07 +0000 Subject: [PATCH 25/51] tty: fix compiler error At least on tests running on Fedora rawhide following error could be seen: ``` criu/tty.c: In function 'pts_fd_get_index': criu/tty.c:262:21: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 262 | char *pos = strrchr(link->name, '/'); | ``` This fixes it. 
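For reference, the corrected pattern reduces to the following (a minimal
sketch, not the complete CRIU code; on toolchains whose string.h exposes
const-correct prototypes, strrchr() on a const string yields a pointer to
const):

```
const struct fd_link *link = p->link;
const char *pos = strrchr(link->name, '/');	/* const qualifier preserved */
```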
Signed-off-by: Adrian Reber --- criu/tty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/tty.c b/criu/tty.c index ae23094b7..9a4520d53 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -259,7 +259,7 @@ static int pts_fd_get_index(int fd, const struct fd_parms *p) { int index; const struct fd_link *link = p->link; - char *pos = strrchr(link->name, '/'); + const char *pos = strrchr(link->name, '/'); if (!pos || pos == (link->name + link->len - 1)) { pr_err("Unexpected format on path %s\n", link->name + 1); From 501b714f76b121e66a6f91ffbd707a29bc9edd39 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 14:22:14 +0200 Subject: [PATCH 26/51] compel/aarch64: refactor fpregs handling Refactor user_fpregs_struct_t to wrap user_fpsimd_state in a dedicated struct, preparing for future extending by just adding new members Signed-off-by: Igor Svilenkov Bozic [ alex: fixes ] Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/infect-types.h | 6 +++++- compel/arch/aarch64/src/lib/infect.c | 18 +++++++++--------- criu/arch/aarch64/crtools.c | 8 ++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 9d4ce7e2e..39aed4ac5 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -16,7 +16,11 @@ */ typedef struct user_pt_regs user_regs_struct_t; -typedef struct user_fpsimd_state user_fpregs_struct_t; + +struct user_fpregs_struct { + struct user_fpsimd_state fpstate; +}; +typedef struct user_fpregs_struct user_fpregs_struct_t; #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index ec1d0d59e..503616df7 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -43,10 +43,10 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t * sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; - memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); + memcpy(fpsimd->vregs, fpregs->fpstate.vregs, 32 * sizeof(__uint128_t)); - fpsimd->fpsr = fpregs->fpsr; - fpsimd->fpcr = fpregs->fpcr; + fpsimd->fpsr = fpregs->fpstate.fpsr; + fpsimd->fpcr = fpregs->fpstate.fpcr; fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); @@ -59,7 +59,7 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { struct iovec iov; @@ -74,14 +74,14 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - iov.iov_base = fpsimd; - iov.iov_len = sizeof(*fpsimd); + iov.iov_base = &ext_regs->fpstate; + iov.iov_len = sizeof(ext_regs->fpstate); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } - ret = save(pid, arg, regs, fpsimd); + ret = save(pid, arg, regs, ext_regs); err: return ret; } @@ -92,8 +92,8 @@ int 
compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) pr_info("Restoring GP/FPU registers for %d\n", pid); - iov.iov_base = ext_regs; - iov.iov_len = sizeof(*ext_regs); + iov.iov_base = &ext_regs->fpstate; + iov.iov_len = sizeof(ext_regs->fpstate); if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { pr_perror("Failed to set FPU registers for %d", pid); return -1; diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 3ed5c9d63..3cd082a34 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -157,11 +157,11 @@ int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_str // Save the FP/SIMD state for (i = 0; i < 32; ++i) { - core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->vregs[i]; - core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->vregs[i] >> 64; + core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->fpstate.vregs[i]; + core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->fpstate.vregs[i] >> 64; } - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); + assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpsr); + assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpcr); if (save_pac_keys(pid, core)) return -1; From 73ca07148398c58e344cd2be71933836f55d93f4 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Wed, 13 Aug 2025 17:48:56 +0200 Subject: [PATCH 27/51] gcs: add GCS constants and helper macros Introduce ARM64 Guarded Control Stack (GCS) constants and macros in a new uapi header for use in both CRIU and compel. Includes: - NT_ARM_GCS type - prctl(2) constants for GCS enable/write/push modes - Capability token helpers (GCS_CAP, GCS_SIGNAL_CAP) - HWCAP_GCS definition These are based on upstream Linux definitions Signed-off-by: Igor Svilenkov Bozic Reviewed-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/gcs-types.h | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h new file mode 100644 index 000000000..9f9655e3b --- /dev/null +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h @@ -0,0 +1,47 @@ +#ifndef __UAPI_ASM_GCS_TYPES_H__ +#define __UAPI_ASM_GCS_TYPES_H__ + +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 /* ARM GCS state */ +#endif + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +/* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack */ +#ifndef PR_SHADOW_STACK_ENABLE +#define PR_SHADOW_STACK_ENABLE (1UL << 0) +#endif + +/* Allows explicit GCS stores (eg. using GCSSTR) */ +#ifndef PR_SHADOW_STACK_WRITE +#define PR_SHADOW_STACK_WRITE (1UL << 1) +#endif + +/* Allows explicit GCS pushes (eg. 
using GCSPUSHM) */ +#ifndef PR_SHADOW_STACK_PUSH +#define PR_SHADOW_STACK_PUSH (1UL << 2) +#endif + +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +#define PR_SHADOW_STACK_ALL_MODES \ + PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH + +/* copied from: arch/arm64/include/asm/sysreg.h */ +#define GCS_CAP_VALID_TOKEN 0x1 +#define GCS_CAP_ADDR_MASK 0xFFFFFFFFFFFFF000ULL +#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | GCS_CAP_VALID_TOKEN) +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +#include + +#ifndef HWCAP_GCS +#define HWCAP_GCS (1UL << 32) +#endif + +#endif /* __UAPI_ASM_GCS_TYPES_H__ */ \ No newline at end of file From 6bb856b0af85fc6c1a90f2a6f28afd9f3b0db493 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Fri, 1 Aug 2025 12:09:57 +0200 Subject: [PATCH 28/51] compel: gcs: initial GCS support for signal frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add basic prerequisites for Guarded Control Stack (GCS) state on AArch64. This adds a gcs_context to the signal frame and extends user_fpregs_struct_t to carry GCS metadata, preparing the groundwork for GCS in the parasite. For now, the GCS fields are zeroed during compel_get_task_regs(), technically ignoring GCS since it does not reach the control logic yet; that will be introduced in the next commit. The code path is gated and does not affect normal tests. Can be explicitly enabled and tested via:     make -C infect GCS_ENABLE=1 && make -C infect run Signed-off-by: Igor Svilenkov Bozic [ alex: clean up fixes ] Signed-off-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/infect-types.h | 14 +++++++ .../src/lib/include/uapi/asm/sigframe.h | 10 +++++ compel/arch/aarch64/src/lib/infect.c | 39 ++++++++++++++++++- compel/include/uapi/infect.h | 8 ++++ 4 files changed, 70 insertions(+), 1 deletion(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 39aed4ac5..e11f2910f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -2,6 +2,7 @@ #define UAPI_COMPEL_ASM_TYPES_H__ #include +#include #include #include #include @@ -17,8 +18,18 @@ typedef struct user_pt_regs user_regs_struct_t; +/* + * GCS (Guarded Control Stack) + */ +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; + struct user_fpregs_struct { struct user_fpsimd_state fpstate; + struct user_gcs gcs; }; typedef struct user_fpregs_struct user_fpregs_struct_t; @@ -43,4 +54,7 @@ typedef struct user_fpregs_struct user_fpregs_struct_t; __NR_##syscall; \ }) +extern bool __compel_host_supports_gcs(void); +#define compel_host_supports_gcs __compel_host_supports_gcs + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index a3528500d..7efee528f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -10,11 +10,20 @@ /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ #define FPSIMD_MAGIC 0x46508001 +#define GCS_MAGIC 0x47435300 typedef struct fpsimd_context fpu_state_t; +struct gcs_context { + struct _aarch64_ctx head; + __u64 gcspr; + 
__u64 features_enabled; + __u64 reserved; +}; + struct aux_context { struct fpsimd_context fpsimd; + struct gcs_context gcs; /* additional context to be added before "end" */ struct _aarch64_ctx end; }; @@ -63,6 +72,7 @@ struct cr_sigcontext { #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 +#define RT_SIGFRAME_GCS(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->gcs) #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 503616df7..0f74a023a 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,8 +2,8 @@ #include #include #include +#include #include -#include #include #include "common/page.h" @@ -13,6 +13,8 @@ #include "infect.h" #include "infect-priv.h" #include "asm/breakpoints.h" +#include "asm/gcs-types.h" +#include unsigned __page_size = 0; unsigned __page_shift = 0; @@ -33,12 +35,32 @@ static inline void __always_unused __check_code_syscall(void) BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } +bool __compel_host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool __compel_gcs_enabled(struct user_gcs *gcs) +{ + if (!compel_host_supports_gcs()) + return false; + + if (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) + return true; + + return false; +} + int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs = RT_SIGFRAME_GCS(sigframe); memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, sizeof(regs->regs)); + pr_debug("sigreturn_prep_regs_plain: sp %lx pc %lx\n", (long)regs->sp, (long)regs->pc); + sigframe->uc.uc_mcontext.sp = regs->sp; sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; @@ -51,6 +73,19 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t * fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (__compel_gcs_enabled(&fpregs->gcs)) { + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = fpregs->gcs.gcspr_el0 - 8; + gcs->features_enabled = fpregs->gcs.features_enabled; + + pr_debug("sigframe gcspr=%llx features_enabled=%llx\n", fpregs->gcs.gcspr_el0 - 8, fpregs->gcs.features_enabled); + } else { + pr_debug("sigframe gcspr=[disabled]\n"); + memset(gcs, 0, sizeof(*gcs)); + } + return 0; } @@ -81,6 +116,8 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } + memset(&ext_regs->gcs, 0, sizeof(ext_regs->gcs)); + ret = save(pid, arg, regs, ext_regs); err: return ret; diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 1f61876ff..d21c261b7 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -192,6 +192,14 @@ void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); +#ifndef compel_host_supports_gcs +static inline bool 
compel_host_supports_gcs(void) +{ + return false; +} +#define compel_host_supports_gcs +#endif + #ifndef compel_shstk_enabled static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) { From 2f676d20e41337568403e9f8ac79f5cd3af620e3 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 18:32:55 +0200 Subject: [PATCH 29/51] compel: gcs: set up GCS token/restorer for rt_sigreturn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When GCS is enabled, the kernel expects a capability token at GCSPR_EL0-8 and sa_restorer at GCSPR_EL0-16 on rt_sigreturn. The sigframe must be consistent with the kernel’s expectations, with GCSPR_EL0 advanced by -8 so that it points to the token on signal entry. On rt_sigreturn, the kernel verifies the cap at GCSPR_EL0, invalidates it and increments GCSPR_EL0 by 8 at the end of gcs_restore_signal(). Implement parasite_setup_gcs() to: - read NT_ARM_GCS via ptrace(PTRACE_GETREGSET) - write (via ptrace) the computed capability token and restorer address - update GCSPR_EL0 to point to the token's location Call parasite_setup_gcs() from parasite_start_daemon() so that the sigreturn frame satisfies the kernel's expectations. Tests with GCS remain opt-in: make -C compel/test/infect GCS_ENABLE=1 && make -C compel/test/infect run Signed-off-by: Igor Svilenkov Bozic [ alex: cleanup fixes ] Signed-off-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/infect-types.h | 5 ++ compel/arch/aarch64/src/lib/infect.c | 82 ++++++++++++++++++- compel/arch/x86/src/lib/infect.c | 2 +- 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index e11f2910f..3a34ab4f6 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -57,4 +57,9 @@ typedef struct user_fpregs_struct user_fpregs_struct_t; extern bool __compel_host_supports_gcs(void); #define compel_host_supports_gcs __compel_host_supports_gcs +struct parasite_ctl; +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 0f74a023a..39be558ea 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -46,10 +46,7 @@ static bool __compel_gcs_enabled(struct user_gcs *gcs) if (!compel_host_supports_gcs()) return false; - if (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) - return true; - - return false; + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) @@ -118,6 +115,18 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct memset(&ext_regs->gcs, 0, sizeof(ext_regs->gcs)); + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &iov) == 0) { + pr_info("gcs: GCSPR_EL0 for %d: 0x%llx, features: 0x%llx\n", + pid, ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + if (!__compel_gcs_enabled(&ext_regs->gcs)) + pr_info("gcs: GCS is NOT enabled\n"); + } else { + pr_info("gcs: GCS state not available for %d\n", pid); + } + ret = save(pid, arg, 
regs, ext_regs); err: return ret; @@ -323,3 +332,68 @@ int ptrace_flush_breakpoints(pid_t pid) return 0; } + +int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct user_gcs *gcs) +{ + struct iovec gcs_iov = { .iov_base = gcs, .iov_len = sizeof(*gcs) }; + + uint64_t token_addr = gcs->gcspr_el0 - 8; + uint64_t sigtramp_addr = gcs->gcspr_el0 - 16; + + uint64_t cap_token = ALIGN_DOWN(GCS_SIGNAL_CAP(token_addr), 8); + unsigned long restorer_addr; + + pr_info("gcs: (setup) CAP token: 0x%lx at addr: 0x%lx\n", cap_token, token_addr); + + /* Inject capability token at gcspr_el0 - 8 */ + if (ptrace(PTRACE_POKEDATA, pid, (void *)token_addr, cap_token)) { + pr_perror("gcs: (setup) Inject GCS cap token failed"); + return -1; + } + + /* Inject restorer trampoline address (gcspr_el0 - 16) */ + restorer_addr = ctl->parasite_ip; + if (ptrace(PTRACE_POKEDATA, pid, (void *)sigtramp_addr, restorer_addr)) { + pr_perror("gcs: (setup) Inject GCS restorer failed"); + return -1; + } + + /* Update GCSPR_EL0 */ + gcs->gcspr_el0 = token_addr; + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &gcs_iov)) { + pr_perror("gcs: PTRACE_SETREGS FAILED"); + return -1; + } + + pr_debug("gcs: parasite_ip=%#lx sp=%#llx gcspr_el0=%#llx\n", + ctl->parasite_ip, ctl->orig.regs.sp, gcs->gcspr_el0); + + return 0; +} + +int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +{ + struct user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + pid_t pid = ctl->rpid; + + if(!__compel_host_supports_gcs()) + return 0; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) != 0) { + pr_perror("GCS state not available for %d", pid); + return -1; + } + + if (!__compel_gcs_enabled(&gcs)) + return 0; + + if (inject_gcs_cap_token(ctl, pid, &gcs)) { + pr_perror("Failed to inject GCS cap token for %d", pid); + return -1; + } + + pr_info("gcs: GCS enabled for %d\n", pid); + + return 0; +} diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 644c483b4..afcf2c53b 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -761,7 +761,7 @@ bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) return false; } -int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +int parasite_setup_shstk(struct parasite_ctl *ctl, __maybe_unused user_fpregs_struct_t *ext_regs) { pid_t pid = ctl->rpid; unsigned long sa_restorer = ctl->parasite_ip; From 92e6e523b51f342bdc6dbaf79d9c43e915f02af0 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 14 Aug 2025 14:39:52 +0200 Subject: [PATCH 30/51] compel: gcs: add opt-in GCS test support for AArch64 Introduce an opt-in mode for building and running compel tests with Guarded Control Stack (GCS) enabled on AArch64. Changes: - Extend compel/test/infect to support `GCS_ENABLE=1` builds, adding `-mbranch-protection=standard` and `-z experimental-gcs=check` to CFLAGS/LDFLAGS. - Export required GLIBC_TUNABLES at runtime via `TEST_ENV`. Usage: make -C compel/test/infect GCS_ENABLE=1 make -C compel/test/infect GCS_ENABLE=1 run By default (`GCS_ENABLE` unset or 0), builds and runs are unchanged. 
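For reference, a test can also verify at runtime that GCS is actually live before exercising GCS-only paths. A minimal sketch (not part of this patch), assuming the PR_GET_SHADOW_STACK_STATUS prctl and the HWCAP_GCS bit from recent kernel headers:

    #include <stdbool.h>
    #include <sys/auxv.h>
    #include <sys/prctl.h>

    static bool gcs_is_live(void)
    {
            unsigned long status = 0;

            /* Hardware/kernel support advertised? */
            if (!(getauxval(AT_HWCAP) & HWCAP_GCS))
                    return false;
            /* Did the GLIBC_TUNABLES above actually enable GCS? */
            if (prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0))
                    return false;
            return status & PR_SHADOW_STACK_ENABLE;
    }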
Signed-off-by: Igor Svilenkov Bozic --- compel/test/infect/Makefile | 7 ++++++- compel/test/infect/spy.c | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/compel/test/infect/Makefile b/compel/test/infect/Makefile index bacfad962..85efa5fd9 100644 --- a/compel/test/infect/Makefile +++ b/compel/test/infect/Makefile @@ -3,6 +3,11 @@ CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host +ifeq ($(GCS_ENABLE),1) +CFLAGS += -mbranch-protection=standard -DGCS_TEST_ENABLE=1 +LDFLAGS += -z experimental-gcs=check +endif + all: victim spy run: @@ -17,7 +22,7 @@ clean: rm -f parasite.o victim: victim.c - $(CC) $(CFLAGS) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index b10db4d47..143946941 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -112,6 +112,9 @@ int main(int argc, char **argv) return -1; } +#ifdef GCS_TEST_ENABLE + setenv("GLIBC_TUNABLES", "glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2", 1); +#endif pid = vfork(); if (pid == 0) { close(p_in[1]); From 41ecb7ac71f1396d7471467d21443d144000d6c2 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 18:52:36 +0200 Subject: [PATCH 31/51] images: aarch64: add user_aarch64_gcs_entry - Define user_aarch64_gcs_entry in core-aarch64.proto to store Guarded Control Stack state (gcspr_el0, features_enabled). - Extend thread_info_aarch64 with an optional gcs field Signed-off-by: Igor Svilenkov Bozic --- images/core-aarch64.proto | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/images/core-aarch64.proto b/images/core-aarch64.proto index 64b0ee9fb..a94911c0b 100644 --- a/images/core-aarch64.proto +++ b/images/core-aarch64.proto @@ -17,6 +17,11 @@ message user_aarch64_fpsimd_context_entry { required uint32 fpcr = 3; } +message user_aarch64_gcs_entry { + required uint64 gcspr_el0 = 1 [(criu).hex = true]; + required uint64 features_enabled = 2 [(criu).hex = true]; +} + message pac_address_keys { required uint64 apiakey_lo = 1; required uint64 apiakey_hi = 2; @@ -45,4 +50,5 @@ message thread_info_aarch64 { required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; optional pac_keys pac_keys = 5; + optional user_aarch64_gcs_entry gcs = 6; } From 2429d49e677377575aa72cc35e18f96d671ad72e Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 18:56:41 +0200 Subject: [PATCH 32/51] criu/dump: gcs: save GCS state during dump Add debug and info messages to log Guarded Control Stack state when dumping AArch64 threads. 
This includes the following values: - gcspr_el0 - features_enabled Signed-off-by: Igor Svilenkov Bozic [ alex: cleanup fixes ] Signed-off-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- criu/arch/aarch64/crtools.c | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 3cd082a34..835a83400 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -12,6 +12,7 @@ #include "common/compiler.h" #include #include "asm/dump.h" +#include "asm/gcs-types.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" @@ -22,6 +23,7 @@ #include "restorer.h" #include "compel/infect.h" #include "pstree.h" +#include /* * cr_user_pac_* are a copy of the corresponding uapi structs @@ -146,6 +148,11 @@ static int save_pac_keys(int pid, CoreEntry *core) int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; + struct user_gcs gcs_live; + struct iovec gcs_iov = { + .iov_base = &gcs_live, + .iov_len = sizeof(gcs_live), + }; CoreEntry *core = x; // Save the Aarch64 CPU state @@ -165,6 +172,17 @@ int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_str if (save_pac_keys(pid, core)) return -1; + + /* Save the GCS state */ + if (compel_host_supports_gcs()) { + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_perror("Failed to get GCS for %d", pid); + return -1; + } + core->ti_aarch64->gcs->gcspr_el0 = gcs_live.gcspr_el0; + core->ti_aarch64->gcs->features_enabled = gcs_live.features_enabled; + } + return 0; } @@ -173,6 +191,7 @@ int arch_alloc_thread_info(CoreEntry *core) ThreadInfoAarch64 *ti_aarch64; UserAarch64RegsEntry *gpregs; UserAarch64FpsimdContextEntry *fpsimd; + UserAarch64GcsEntry *gcs; ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); if (!ti_aarch64) @@ -202,6 +221,15 @@ int arch_alloc_thread_info(CoreEntry *core) if (!fpsimd->vregs) goto err; + /* Allocate & init GCS */ + if (compel_host_supports_gcs()) { + gcs = xmalloc(sizeof(*gcs)); + if (!gcs) + goto err; + user_aarch64_gcs_entry__init(gcs); + ti_aarch64->gcs = gcs; + } + return 0; err: return -1; @@ -231,6 +259,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int i; struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs; if (core->ti_aarch64->fpsimd->n_vregs != 64) return 1; @@ -244,6 +273,18 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (compel_host_supports_gcs()) { + gcs = RT_SIGFRAME_GCS(sigframe); + + pr_debug("sigframe gcspr %llx enabled %llx\n", gcs->gcspr, gcs->features_enabled); + + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = core->ti_aarch64->gcs->gcspr_el0 - 8; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + } + return 0; } From d591e320e0ef3dd816a2c61a46a074e21f2b769f Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 20:13:28 +0200 Subject: [PATCH 33/51] criu/restore: gcs: adds restore implementation for Guarded Control Stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit finalizes AArch64 Guarded Control Stack (GCS) support by wiring the full dump and restore flow. 
The restore path adds the following steps: - Define shared AArch64 GCS types and constants in a dedicated header for both compel and CRIU inclusion - compel: add get/set NT_ARM_GCS via ptrace, enabling user-space GCS state save and restore. - During restore, switch to the new GCS (via GCSSTR) to place the capability token and the sa_restorer address - arch_shstk_trampoline(): enable GCS in a trampoline using prctl(PR_SET_SHADOW_STACK_STATUS, ...) via an inline SVC. The trampoline is needed because we can’t RET without a valid GCS. - restorer: map the recorded GCS VMA, populate contents top-down with GCSSTR, write the signal capability at GCSPR_EL0 and the valid token at GCSPR_EL0-8, then switch to the rebuilt GCS (GCSSS1) - Save and restore registers via ptrace - Extend restorer argument structures to carry GCS state into post-restore execution - Add shstk_set_restorer_stack(): sets tmp_gcs to temporary restorer shadow stack start - Add gcs_vma_restore implementation (required for mremap of the GCS VMA) Tested with: GCS_ENABLE=1 ./zdtm.py run -t zdtm/static/env00 Signed-off-by: Igor Svilenkov Bozic --- compel/arch/aarch64/src/lib/infect.c | 30 +++ .../arch/arm/plugins/std/syscalls/syscall.def | 1 + compel/include/infect-priv.h | 1 + criu/arch/aarch64/Makefile | 1 + criu/arch/aarch64/gcs.c | 157 ++++++++++++++ criu/arch/aarch64/include/asm/gcs.h | 196 ++++++++++++++++++ criu/arch/aarch64/include/asm/restorer.h | 1 + 7 files changed, 387 insertions(+) create mode 100644 criu/arch/aarch64/gcs.c create mode 100644 criu/arch/aarch64/include/asm/gcs.h diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 39be558ea..7450ac026 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -136,6 +136,9 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; + struct user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + pr_info("Restoring GP/FPU registers for %d\n", pid); iov.iov_base = &ext_regs->fpstate; @@ -144,6 +147,33 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) pr_perror("Failed to set FPU registers for %d", pid); return -1; } + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_warn("gcs: Failed to get GCS for %d\n", pid); + } else { + ext_regs->gcs = gcs; + compel_set_task_gcs_regs(pid, ext_regs); + } + + return 0; +} + +int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("gcs: restoring GCS registers for %d\n", pid); + pr_info("gcs: restoring GCS: gcspr=%llx features=%llx\n", + ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &iov)) { + pr_perror("gcs: Failed to set GCS registers for %d", pid); + return -1; + } + return 0; } diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 9a33009eb..819678566 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -124,3 +124,4 @@ openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) +map_shadow_stack 453 ! 
(unsigned long addr, unsigned long size, unsigned int flags) \ No newline at end of file diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h index 9d3442839..8e78a7f6c 100644 --- a/compel/include/infect-priv.h +++ b/compel/include/infect-priv.h @@ -72,6 +72,7 @@ extern bool arch_can_dump_task(struct parasite_ctl *ctl); extern int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, unsigned long flags); extern int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs); +extern int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs); extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs); diff --git a/criu/arch/aarch64/Makefile b/criu/arch/aarch64/Makefile index b26487367..b87fcaa5b 100644 --- a/criu/arch/aarch64/Makefile +++ b/criu/arch/aarch64/Makefile @@ -6,3 +6,4 @@ obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o +obj-y += gcs.o \ No newline at end of file diff --git a/criu/arch/aarch64/gcs.c b/criu/arch/aarch64/gcs.c new file mode 100644 index 000000000..4bdb9d2e4 --- /dev/null +++ b/criu/arch/aarch64/gcs.c @@ -0,0 +1,157 @@ +#include +#include + +#include +#include + +#include "asm/gcs-types.h" +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +#include +#include + +static bool task_has_gcs_enabled(UserAarch64GcsEntry *gcs) +{ + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; +} + +static bool host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool task_needs_gcs(struct pstree_item *item, CoreEntry *core) +{ + UserAarch64GcsEntry *gcs; + + if (!task_alive(item)) + return false; + + gcs = core->ti_aarch64->gcs; + + if (task_has_gcs_enabled(gcs)) { + if (!host_supports_gcs()) { + pr_warn_once("Restoring task with GCS on non-GCS host\n"); + return false; + } + + pr_info("Restoring task with GCS\n"); + return true; + } + + pr_info("Restoring a task without GCS\n"); + return false; +} + +static int gcs_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *gcs) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, gcs->gcspr_el0)) { + unsigned long premapped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + gcs->vma_start = vma->e->start; + gcs->vma_size = size; + gcs->premapped_addr = premapped_addr; + + return 0; + } + } + + pr_err("Unable to find a shadow stack vma: %lx\n", gcs->gcspr_el0); + return -1; +} + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + int i; + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *gcs = &ta->shstk; + + if (!task_needs_gcs(item, core)) + return 0; + + gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + + if (gcs_prepare_task(vmas, gcs)) { + pr_err("gcs: failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + gcs = &thread_args->shstk; + + gcs->gcspr_el0 = 
core->ti_aarch64->gcs->gcspr_el0; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + + if (gcs_prepare_task(vmas, gcs)) { + pr_err("gcs: failed to prepare GCS memory\n"); + return -1; + } + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + int fret; + unsigned long flags = PR_SHADOW_STACK_ENABLE | + PR_SHADOW_STACK_PUSH | + PR_SHADOW_STACK_WRITE; + + long ret, x1_after, x8_after; + + /* If task doesn't need GCS, just call func */ + if (!task_needs_gcs(item, core)) { + return func(arg); + } + + pr_debug("gcs: GCS enable SVC about to fire: x8=%d x0=%d x1=0x%lx\n", + __NR_prctl, PR_SET_SHADOW_STACK_STATUS, flags); + + asm volatile( + "mov x0, %3\n" // x0 = PR_SET_SHADOW_STACK_STATUS (75) + "mov x1, %4\n" // x1 = flags + "mov x2, xzr\n" // x2 = 0 + "mov x3, xzr\n" // x3 = 0 + "mov x4, xzr\n" // x4 = 0 + "mov x8, %5\n" // x8 = __NR_prctl (167) + "svc #0\n" // Invoke syscall + "mov %0, x0\n" // Capture return value + "mov %1, x1\n" // Capture x1 after + "mov %2, x8\n" // Capture x8 after + : "=r"(ret), "=r"(x1_after), "=r"(x8_after) + : "i"(PR_SET_SHADOW_STACK_STATUS), // x0 - %3rd + "r"(flags), // x1 - %4th + "i"(__NR_prctl) // x8 - %5th + : "x0", "x1", "x2", "x3", "x4", "x8", "memory", "cc"); + + pr_info("gcs: after SVC: ret=%ld x1=%ld x8=%ld\n", ret, x1_after, x8_after); + + if (ret != 0) { + int err = errno; + pr_err("gcs: failed to enable GCS: ret=%ld errno=%d (%s)\n", ret, err, strerror(err)); + return -1; + } + + fret = func(arg); + exit(fret); + + return -1; +} diff --git a/criu/arch/aarch64/include/asm/gcs.h b/criu/arch/aarch64/include/asm/gcs.h new file mode 100644 index 000000000..28faa23b7 --- /dev/null +++ b/criu/arch/aarch64/include/asm/gcs.h @@ -0,0 +1,196 @@ +#ifndef __CR_ASM_GCS_H__ +#define __CR_ASM_GCS_H__ + +#include + +struct rst_shstk_info { + unsigned long vma_start; /* start of GCS VMA */ + unsigned long vma_size; /* size of GCS VMA */ + unsigned long premapped_addr; /* premapped buffer */ + unsigned long tmp_gcs; /* temp area for GCS if needed */ + u64 gcspr_el0; /* GCS pointer */ + u64 features_enabled; /* GCS flags */ +}; + +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_gcs_prepare + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline + +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *gcs, void *ptr) +{ + gcs->tmp_gcs = (long unsigned)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size + +#ifdef CR_NOGLIBC +#include +#include +#include "vma.h" + +static inline unsigned long gcs_map(unsigned long addr, unsigned long size, unsigned int flags) +{ + long gcspr = sys_map_shadow_stack(addr, size, flags); + pr_info("gcs: syscall: map_shadow_stack at=%lx size=%ld\n", addr, size); + + if (gcspr < 0) { + pr_err("gcs: failed to map GCS at %lx: %ld\n", addr, gcspr); + return -1; + } + + if (addr && gcspr != addr) { + pr_err("gcs: address mismatch: need %lx, got %lx\n", addr, gcspr); + return -1; + } + + pr_info("gcs: mmapped GCS at %lx\n", gcspr); + + return gcspr; +} + +/* clang-format off */ +static always_inline 
void gcsss1(unsigned long *Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static always_inline unsigned long *gcsss2(void) +{ + unsigned long *Xt; + + asm volatile ( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +static inline void gcsstr(unsigned long addr, unsigned long val) +{ + asm volatile( + "mov x0, %0\n" + "mov x1, %1\n" + ".inst 0xd91f1c01\n" // GCSSTR x1, [x0] + "mov x0, #0\n" + : + : "r"(addr), "r"(val) + : "x0", "x1", "memory"); +} +/* clang-format on */ + +static always_inline int gcs_restore(struct rst_shstk_info *gcs) +{ + unsigned long gcspr, val; + + if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { + return 0; + } + + gcspr = gcs->gcspr_el0 - 8; + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8); + pr_debug("gcs: [0] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8) | GCS_CAP_VALID_TOKEN; + gcspr -= 8; + pr_debug("gcs: [1] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + pr_debug("gcs: about to switch stacks via GCSSS1 to: %lx\n", gcspr); + gcsss1((unsigned long *)gcspr); + return 0; +} +#define arch_shstk_restore gcs_restore + +static always_inline int gcs_vma_restore(VmaEntry *vma_entry) +{ + unsigned long shstk, i, ret; + unsigned long *gcs_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + + shstk = gcs_map(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", shstk, shstk); + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + gcsstr(shstk + i * 8, gcs_data[i]); + + pr_debug("unmap %lx %ld\n", (unsigned long)gcs_data, vma_size); + ret = sys_munmap(gcs_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore gcs_vma_restore + +static always_inline int gcs_switch_to_restorer(struct rst_shstk_info *gcs) +{ + int ret; + unsigned long *ssp; + unsigned long addr; + unsigned long gcspr; + + if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { + return 0; + } + + pr_debug("gcs->premapped_addr + gcs->vma_size = %lx\n", gcs->premapped_addr + gcs->vma_size); + pr_debug("gcs->tmp_gcs = %lx\n", gcs->tmp_gcs); + addr = gcs->tmp_gcs; + + if (addr % PAGE_SIZE != 0) { + pr_err("gcs: 0x%lx not page-aligned to size 0x%lx\n", addr, PAGE_SIZE); + return -1; + } + + ret = sys_munmap((void *)addr, PAGE_SIZE); + if (ret < 0) { + pr_err("gcs: Failed to unmap aarea for dumpee GCS VMAs\n"); + return -1; + } + + gcspr = gcs_map(addr, PAGE_SIZE, SHADOW_STACK_SET_TOKEN); + + if (gcspr == -1) { + pr_err("gcs: failed to gcs_map(%lx, %lx)\n", (unsigned long)addr, PAGE_SIZE); + return -1; + } + + ssp = (unsigned long *)(addr + PAGE_SIZE - 8); + gcsss1(ssp); + + return 0; +} +#define arch_shstk_switch_to_restorer gcs_switch_to_restorer + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_GCS_H__ */ diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 2174df4fa..8f3edc257 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -5,6 +5,7 @@ #include #include "asm/types.h" +#include "asm/gcs.h" #include "images/core.pb-c.h" #include From f78bea8d34cb54a53ae976f5abb2091f0a45a90a Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sun, 23 Nov 
2025 17:27:49 +0100 Subject: [PATCH 34/51] zdtm: gcs: add opt-in GCS test support for AArch64 Introduce an opt-in mode for building and running ZDTM static tests with Guarded Control Stack (GCS) enabled on AArch64. Changes: - Support `GCS_ENABLE=1` builds, adding `-mbranch-protection=standard` and `-z experimental-gcs=check` to CFLAGS/LDFLAGS. - Export required GLIBC_TUNABLES at runtime via `TEST_ENV`. - %.pid rules to prefix test binaries with `$(TEST_ENV)` so the tunables are set when running tests. - Makefile rules for selectively enabling GCS in tests Usage: # Build and run with GCS enabled make -C zdtm/static GCS_ENABLE=1 posix_timers GCS_ENABLE=1 ./zdtm.py run --keep-img=always \ -t zdtm/static/posix_timers By default (`GCS_ENABLE` unset or 0), test builds and runs are unchanged. NOTE: This assumes that the test victim was compiled also using GCS_ENABLE=1 so that the proper GCS AArch64 ELF headers are present Signed-off-by: Igor Svilenkov Bozic Reviewed-by: Alexander Mikhalitsyn aleksandr.mikhalitsyn@canonical.com --- test/zdtm/Makefile.inc | 8 ++++++++ test/zdtm/static/Makefile | 18 +++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 465285f08..c95b4ef6a 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -79,6 +79,14 @@ define pkg-cflags $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --cflags $(1)) endef +ifeq ($(GCS_ENABLE),1) + CFLAGS += -mbranch-protection=standard + LDFLAGS += -z experimental-gcs=check + TEST_ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2 +else + TEST_ENV = +endif + %.d: %.c $(E) " DEP " $@ $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP $< -o $@ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ea901a805..e1df2e5fa 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -520,30 +520,30 @@ install: all .PHONY: all install $(TST_NOFILE:%=%.pid): %.pid: % - $( Date: Tue, 9 Dec 2025 19:26:03 +0000 Subject: [PATCH 35/51] cr-dump: fix error handling Commit "plugin: Add DUMP_DEVICES_LATE callback" introduced a new plugin callback that is invoked in cr_dump_tasks(). The return value of this callback was assigned to the variable ret. However, this variable is later used as the return value when goto err is triggered in subsequent conditions. As a result, CRIU exits with "Dumping finished successfully" even when some actions have failed and inventory.img has not been created. To fix this, we replace ret with exit_code and use it only when it is actually needed. 
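The shape of the fix, condensed from the cr_dump_tasks() hunks below: ret remains a scratch status for early failures that are reported where they happen, while exit_code is only assigned by the final step:

    int ret;            /* scratch status, reported where it happens */
    int exit_code = -1; /* stays -1 unless the final step is reached */

    ret = run_scripts(ACT_PRE_DUMP);
    if (ret != 0) {
            pr_err("Pre dump script failed with %d!\n", ret);
            goto err;
    }
    if (cr_dump_shmem())    /* intermediate checks never touch exit_code */
            goto err;
    /* ... */
    exit_code = write_img_inventory(&he);
    err:
            return cr_dump_finish(exit_code);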
Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 98b4223ba..a58aaf34a 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2138,8 +2138,8 @@ int cr_dump_tasks(pid_t pid) InventoryEntry he = INVENTORY_ENTRY__INIT; InventoryEntry *parent_ie = NULL; struct pstree_item *item; - int pre_dump_ret = 0; - int ret = -1; + int ret; + int exit_code = -1; kerndat_warn_about_madv_guards(); @@ -2159,9 +2159,9 @@ int cr_dump_tasks(pid_t pid) goto err; root_item->pid->real = pid; - pre_dump_ret = run_scripts(ACT_PRE_DUMP); - if (pre_dump_ret != 0) { - pr_err("Pre dump script failed with %d!\n", pre_dump_ret); + ret = run_scripts(ACT_PRE_DUMP); + if (ret != 0) { + pr_err("Pre dump script failed with %d!\n", ret); goto err; } if (init_stats(DUMP_STATS)) @@ -2287,39 +2287,32 @@ int cr_dump_tasks(pid_t pid) * ipc shared memory, but an ipc namespace is dumped in a child * process. */ - ret = cr_dump_shmem(); - if (ret) + if (cr_dump_shmem()) goto err; if (root_ns_mask) { - ret = dump_namespaces(root_item, root_ns_mask); - if (ret) + if (dump_namespaces(root_item, root_ns_mask)) goto err; } if ((root_ns_mask & CLONE_NEWTIME) == 0) { - ret = dump_time_ns(0); - if (ret) + if (dump_time_ns(0)) goto err; } if (dump_aa_namespaces() < 0) goto err; - ret = dump_cgroups(); - if (ret) + if (dump_cgroups()) goto err; - ret = fix_external_unix_sockets(); - if (ret) + if (fix_external_unix_sockets()) goto err; - ret = tty_post_actions(); - if (ret) + if (tty_post_actions()) goto err; - ret = inventory_save_uptime(&he); - if (ret) + if (inventory_save_uptime(&he)) goto err; he.has_pre_dump_mode = false; @@ -2328,10 +2321,10 @@ int cr_dump_tasks(pid_t pid) he.allow_uprobes = true; } - ret = write_img_inventory(&he); + exit_code = write_img_inventory(&he); err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); - return cr_dump_finish(ret); + return cr_dump_finish(exit_code); } From 30acbabcddbad502660f2e4b1aab9f16f99bca4f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 26 Nov 2025 13:49:25 +0000 Subject: [PATCH 36/51] ci: also exclude docker version 29 Docker version 28 broke container restore in combination with network namespaces. The workaround in the CI script was excluding Docker version 28. Now that there is also Docker version 29, which is still broken, this also excludes Docker version 29. Signed-off-by: Adrian Reber --- scripts/ci/docker-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index bc5a74667..c1c745544 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,7 +2,7 @@ set -x -e -o pipefail -# Workaround: Docker 28.x has a known regression that breaks the checkpoint and +# Workaround: Docker 28.x and 29.x has a known regression that breaks the checkpoint and # restore (C/R) feature. Let's install previous, or next major version. See # https://github.com/moby/moby/issues/50750 for details on the bug. export DEBIAN_FRONTEND=noninteractive @@ -17,7 +17,7 @@ echo \ $(. 
/etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list apt update -y apt-cache madison docker-ce | awk '{ print $3 }' -verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -Ev ':(28|29)\.'| tail -n 1)" ../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" # docker checkpoint and restore is an experimental feature From d4e8114130bd97ad280f85568e9feeb93c27cd53 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 17 Dec 2025 00:53:59 +0000 Subject: [PATCH 37/51] readme: use a local copy of the CRIU logo The README currently uses an external link to criu.org for the embedded CRIU logo. Loading this URL when viewing the README on GitHub sometimes fails with "Error Fetching Resource". Using a local copy of the logo fixes this issue. Signed-off-by: Radostin Stoyanov --- Documentation/logo.svg | 136 +++++++++++++++++++++++++++++++++++++++++ README.md | 2 +- 2 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 Documentation/logo.svg diff --git a/Documentation/logo.svg b/Documentation/logo.svg new file mode 100644 index 000000000..f713e72b7 --- /dev/null +++ b/Documentation/logo.svg @@ -0,0 +1,136 @@ + + + + + + + diff --git a/README.md b/README.md index f578e745c..6e2a0de9e 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)]( https://circleci.com/gh/checkpoint-restore/criu) -
<img src="https://criu.org/logo.svg">
+<img src="Documentation/logo.svg">

## CRIU -- A project to implement checkpoint/restore functionality for Linux From 2e5f9facf92e90a81f06f3afd3fa214a9fa7bf1c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 26 Dec 2025 12:28:45 +0800 Subject: [PATCH 38/51] util: Make close_safe() reset fd to -1 even on close() failure The "man 2 close":"Dealing with error returns from close()" says: "Retrying the close() after a failure return is the wrong thing to do" We should not leave the fd there, attempting to close it again on next close()/close_safe() may lead to accidentally closing something else. It confirms with the kernel code where sys_close() removes fd from fdtable in this stack: +-> sys_close +-> file_close_fd +-> file_close_fd_locked +-> rcu_assign_pointer(fdt->fd[fd], NULL) If there was an fd this stack is always reached and fd is always removed. Let's replace the fd with -1 after close no matter what. Signed-off-by: Pavel Tikhomirov --- criu/util.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/criu/util.c b/criu/util.c index e2f80e4c6..2eaad35bb 100644 --- a/criu/util.c +++ b/criu/util.c @@ -222,10 +222,9 @@ int close_safe(int *fd) if (*fd > -1) { ret = close(*fd); - if (!ret) - *fd = -1; - else - pr_perror("Unable to close fd %d", *fd); + if (ret) + pr_perror("Failed closing fd %d", *fd); + *fd = -1; } return ret; From fc1867c44d1b6b0771deb2ff317331fc30b7ac78 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 2 Jan 2026 13:15:43 +0800 Subject: [PATCH 39/51] kerndat: Fix error handling for kerndat_has_timer_cr_ids() fail After commit [1] we accidentally stopped reporting the errors from kerndat_has_timer_cr_ids(), let's fix that. Fixes: 1eaa870cc ("kerndat: check that hardware breakpoints work") [1] Signed-off-by: Pavel Tikhomirov --- criu/kerndat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index 997181ce7..2dc2f77d5 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -2116,6 +2116,7 @@ int kerndat_init(void) } if (!ret && kerndat_has_timer_cr_ids()) { pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; } if (!ret && kerndat_breakpoints()) { pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); From b1a51489ddfce210ed6f64024fbd5ea823bacfd1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 11:00:15 +0000 Subject: [PATCH 40/51] compel: fix sys_clock_gettime function signature The initialization of the struct timespec used as clockid input parameter was removed in commit: b4441d1bd8a56ed9ec08603e1d4acf5c779fe935 ("restorer.c: rm unneded struct init") This causes the build to fail on Alpine with clang version 21.1.2: GEN criu/pie/parasite-blob.h criu/pie/restorer.c:1230:39: error: variable 'ts' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 1230 | if (sys_clock_gettime(t->clockid, &ts)) { | ^~ 1 error generated. make[2]: *** [/criu/scripts/nmk/scripts/build.mk:118: criu/pie/restorer.o] Error 1 make[1]: *** [criu/Makefile:59: pie] Error 2 make: *** [Makefile:278: criu] Error 2 To fix this, we remove the "const" from the declaration of clock_gettime. Since the kernel writes the current time into the struct timespec provided by the caller, the pointer must be writable. 
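To illustrate, the restorer call site from the error message above treats the struct as an output buffer; a sketch of correct usage (with CLOCK_MONOTONIC standing in for the timer's clockid):

    struct timespec ts; /* intentionally uninitialized: the kernel fills it */

    /* The kernel writes the current time through the pointer,
     * hence the parameter must not be const-qualified. */
    if (sys_clock_gettime(CLOCK_MONOTONIC, &ts))
            return -1;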
Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- compel/arch/arm/plugins/std/syscalls/syscall.def | 2 +- compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl | 2 +- compel/arch/mips/plugins/std/syscalls/syscall_64.tbl | 2 +- compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 2 +- compel/arch/riscv64/plugins/std/syscalls/syscall.def | 2 +- compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl | 2 +- compel/arch/x86/plugins/std/syscalls/syscall_64.tbl | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 819678566..f4deb02b2 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -85,7 +85,7 @@ timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimer timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index aa6ffb44d..83dcdab4a 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -46,7 +46,7 @@ __NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimer __NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) __NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) __NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 113 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_restart_syscall 128 sys_restart_syscall (void) __NR_kill 129 sys_kill (long pid, int sig) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 85faca5a9..ad3d44634 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -84,7 +84,7 @@ __NR_sys_timer_settime 5217 sys_timer_settime (kernel_timer_t timer_id, int fl __NR_sys_timer_gettime 5218 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 5219 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 5220 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 5222 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 5222 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 5205 sys_exit_group (int error_code) __NR_set_thread_area 5242 sys_set_thread_area (unsigned long *addr) __NR_openat 5247 sys_openat (int dfd, const char *filename, int flags, int mode) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl 
b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index c56b4e6de..3deb41cf7 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 246 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 234 sys_exit_group (int error_code) __NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def index 17f763e90..967f097f9 100644 --- a/compel/arch/riscv64/plugins/std/syscalls/syscall.def +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall.def @@ -85,7 +85,7 @@ timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimer timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 018d58a59..ff2f33006 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 260 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 260 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 248 sys_exit_group (int error_code) __NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, size_t len) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 7fbfd69ad..8c3620c2a 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -85,7 +85,7 @@ __NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int fla __NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, 
const struct timespec *tp) +__NR_clock_gettime 228 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 231 sys_exit_group (int error_code) __NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) From 974c1bc898bb4f5104a44d19029c5800ae431686 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 14:05:20 +0000 Subject: [PATCH 41/51] zdtm/tempfs_subns: fix uninitialized variable DEP tempfs_subns.d CC tempfs_subns.o tempfs_subns.c:50:23: error: variable 'fd' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 50 | if (write(fds[1], &fd, sizeof(fd)) != sizeof(fd)) { | ^~ 1 error generated. make[1]: *** [../Makefile.inc:96: tempfs_subns.o] Error 1 Signed-off-by: Radostin Stoyanov --- test/zdtm/static/tempfs_subns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/tempfs_subns.c b/test/zdtm/static/tempfs_subns.c index ed3ef9a3a..490fdad6e 100644 --- a/test/zdtm/static/tempfs_subns.c +++ b/test/zdtm/static/tempfs_subns.c @@ -20,7 +20,7 @@ int main(int argc, char **argv) { int fds[2], i; pid_t pid; - int fd, status; + int status, fd = -1; test_init(argc, argv); From 2dd66866e3e07c2cebeaa1713bc310c98d5027e7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 14:13:05 +0000 Subject: [PATCH 42/51] zdtm/cgroup_stray: fix uninitialized variable 51.04 DEP cgroup_stray.d 51.07 CC cgroup_stray.o 51.11 cgroup_stray.c:164:18: error: variable 'c' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 51.11 164 | if (write(sk, &c, 1) != 1) { 51.11 | ^ 51.11 1 error generated. 51.12 make[1]: *** [../Makefile.inc:96: cgroup_stray.o] Error 1 51.12 make[1]: Leaving directory '/criu/test/zdtm/static' 51.12 make: *** [Makefile:7: static] Error 2 Signed-off-by: Radostin Stoyanov --- test/zdtm/static/cgroup_stray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/cgroup_stray.c b/test/zdtm/static/cgroup_stray.c index 0c0ed93cf..f5754410f 100644 --- a/test/zdtm/static/cgroup_stray.c +++ b/test/zdtm/static/cgroup_stray.c @@ -135,7 +135,7 @@ out: int main(int argc, char **argv) { int ret = -1, sk_pair[2], sk, status; - char path[PATH_MAX], c; + char path[PATH_MAX], c = 0; pid_t pid = 0; test_init(argc, argv); From ddf7a170ff74befbe7da2114054e4b1c85ea2d3d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 12:12:53 +0000 Subject: [PATCH 43/51] infect-types: fix user_gcs redefine error In file included from compel/arch/aarch64/src/lib/infect.c:10: compel/include/uapi/compel/asm/infect-types.h:24:8: error: redefinition of 'user_gcs' 24 | struct user_gcs { | ^ /usr/include/asm/ptrace.h:329:8: note: previous definition is here 329 | struct user_gcs { | ^ 1 error generated. 
make[1]: *** [/criu/scripts/nmk/scripts/build.mk:215: compel/arch/aarch64/src/lib/infect.o] Error 1 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- .../arch/aarch64/src/lib/include/uapi/asm/infect-types.h | 7 +++++-- compel/arch/aarch64/src/lib/infect.c | 8 ++++---- criu/arch/aarch64/crtools.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 3a34ab4f6..606c92ffe 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -20,8 +20,11 @@ typedef struct user_pt_regs user_regs_struct_t; /* * GCS (Guarded Control Stack) + * + * This mirrors the kernel definition but renamed to cr_user_gcs + * to avoid conflict with kernel headers (/usr/include/asm/ptrace.h). */ -struct user_gcs { +struct cr_user_gcs { __u64 features_enabled; __u64 features_locked; __u64 gcspr_el0; @@ -29,7 +32,7 @@ struct user_gcs { struct user_fpregs_struct { struct user_fpsimd_state fpstate; - struct user_gcs gcs; + struct cr_user_gcs gcs; }; typedef struct user_fpregs_struct user_fpregs_struct_t; diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 7450ac026..42f593c79 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -41,7 +41,7 @@ bool __compel_host_supports_gcs(void) return (hwcap & HWCAP_GCS) != 0; } -static bool __compel_gcs_enabled(struct user_gcs *gcs) +static bool __compel_gcs_enabled(struct cr_user_gcs *gcs) { if (!compel_host_supports_gcs()) return false; @@ -136,7 +136,7 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; - struct user_gcs gcs; + struct cr_user_gcs gcs; struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; pr_info("Restoring GP/FPU registers for %d\n", pid); @@ -363,7 +363,7 @@ int ptrace_flush_breakpoints(pid_t pid) return 0; } -int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct user_gcs *gcs) +int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct cr_user_gcs *gcs) { struct iovec gcs_iov = { .iov_base = gcs, .iov_len = sizeof(*gcs) }; @@ -403,7 +403,7 @@ int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct user_gcs *g int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) { - struct user_gcs gcs; + struct cr_user_gcs gcs; struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; pid_t pid = ctl->rpid; diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 835a83400..2e89f9ce3 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -148,7 +148,7 @@ static int save_pac_keys(int pid, CoreEntry *core) int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; - struct user_gcs gcs_live; + struct cr_user_gcs gcs_live; struct iovec gcs_iov = { .iov_base = &gcs_live, .iov_len = sizeof(gcs_live), From 36f1e9d38c3b697ca38f405df666929bcd483034 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 15:26:22 +0000 Subject: [PATCH 44/51] amdgpu: use fseeko with large-file support instead of fseeko64 As of Alpine Linux 3.19, musl libc no longer contains separate fopen64(), fseeko64(), or ftello64() functions. 
This causes building CRIU with amdgpu plugin to fail with the following error: amdgpu_plugin.c: In function 'parallel_restore_bo_contents': amdgpu_plugin.c:2286:17: error: implicit declaration of function 'fseeko64'; did you mean 'fseeko'? [-Wimplicit-function-declaration] 2286 | fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET); | ^~~~~~~~ | fseeko make[2]: *** [Makefile:31: amdgpu_plugin.so] Error 1 make[1]: *** [Makefile:363: amdgpu_plugin] Error 2 To fix this, add the missing $(DEFINES) to plugin builds, and since we always compile with _FILE_OFFSET_BITS=64, we don't need the 64 suffix. Fixes: #2826 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 2 +- plugins/cuda/Makefile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 31e177e4a..250e7b0e7 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -28,7 +28,7 @@ criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. criu-amdgpu.proto amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c - $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) + $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: $(call msg-clean, $@) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 574d7b829..ee55bde0a 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data) continue; entry = &restore_cmd->entries[i]; - fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + fseeko(bo_contents_fp, entry->read_offset + offset, SEEK_SET); ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE, false); diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile index cc3d98ac9..2c1944a34 100644 --- a/plugins/cuda/Makefile +++ b/plugins/cuda/Makefile @@ -19,7 +19,7 @@ all: $(DEPS_CUDA) cuda_plugin.so: cuda_plugin.c $(call msg-gen, $@) - $(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) + $(Q) $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) clean: $(call msg-clean, $@) From 71fe85ec90871ffcfb57060f28760f607e0e6d08 Mon Sep 17 00:00:00 2001 From: ImranullahKhann Date: Thu, 8 Jan 2026 19:55:04 +0500 Subject: [PATCH 45/51] ci: add iproute2 to the list of packages in apt-packages.sh When running the command 'make docker-test', almost all zdtm tests fail, logging 'ip: not found'. 'ip' command of the iproute2 package was missing. So added the package to the list of dependencies in 'apt-packages.sh'. 
From 71fe85ec90871ffcfb57060f28760f607e0e6d08 Mon Sep 17 00:00:00 2001
From: ImranullahKhann
Date: Thu, 8 Jan 2026 19:55:04 +0500
Subject: [PATCH 45/51] ci: add iproute2 to the list of packages in apt-packages.sh

When running the command 'make docker-test', almost all zdtm tests fail, logging 'ip: not found'. The 'ip' command of the iproute2 package was missing, so the package was added to the list of dependencies in 'apt-packages.sh'. Now the tests run successfully.

Signed-off-by: ImranullahKhann
---
 contrib/dependencies/apt-packages.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh
index 1fd42d4e6..7963be7b4 100755
--- a/contrib/dependencies/apt-packages.sh
+++ b/contrib/dependencies/apt-packages.sh
@@ -13,6 +13,7 @@ fi
 	build-essential \
 	gdb \
 	git-core \
+	iproute2 \
 	iptables \
 	kmod \
 	libaio-dev \

From 9885fb3c75ee6d18e554ec1ddddf6ec2c89ea848 Mon Sep 17 00:00:00 2001
From: Radostin Stoyanov
Date: Sun, 11 Jan 2026 14:56:49 +0000
Subject: [PATCH 46/51] crit: fix incorrect task state decoding

CRIU defines the following constants for task state in compel/include/uapi/task-state.h:

COMPEL_TASK_ALIVE = 0x01
COMPEL_TASK_STOPPED = 0x03
COMPEL_TASK_ZOMBIE = 0x06

Thus, we need to swap the values for "zombie" and "stopped" used in CRIT.

Signed-off-by: Radostin Stoyanov
---
 lib/pycriu/images/pb2dict.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py
index a35dd3c3f..b80e3475f 100644
--- a/lib/pycriu/images/pb2dict.py
+++ b/lib/pycriu/images/pb2dict.py
@@ -154,8 +154,8 @@ flags_maps = {
 gen_maps = {
     'task_state': {
         1: 'Alive',
-        3: 'Zombie',
-        6: 'Stopped'
+        3: 'Stopped',
+        6: 'Zombie',
     },
 }

From b208bec12d5bc68b47b2a1f026f527e7f3141445 Mon Sep 17 00:00:00 2001
From: Radostin Stoyanov
Date: Mon, 12 Jan 2026 05:47:02 +0000
Subject: [PATCH 47/51] crit: show dead task_state

In some cases, CRIU can observe tasks that exit during checkpointing, and sets the state of these tasks to COMPEL_TASK_DEAD. This patch adds a string representation of this value that can be used by CRIT when decoding the images.

Signed-off-by: Radostin Stoyanov
---
 lib/pycriu/images/pb2dict.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py
index b80e3475f..f22887a52 100644
--- a/lib/pycriu/images/pb2dict.py
+++ b/lib/pycriu/images/pb2dict.py
@@ -154,6 +154,7 @@ flags_maps = {
 gen_maps = {
     'task_state': {
         1: 'Alive',
+        2: 'Dead',
         3: 'Stopped',
         6: 'Zombie',
     },

From fb59ae504e90edf08c133b4d2d938f645421c396 Mon Sep 17 00:00:00 2001
From: Adrian Reber
Date: Fri, 9 Jan 2026 08:15:05 +0000
Subject: [PATCH 48/51] test: fix GCC 16 compile error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fedora rawhide ships a pre-release of GCC 16 which produces the following error:

uprobes.c:34:22: error: variable ‘dummy’ set but not used [-Werror=unused-but-set-variable=]
   34 |         volatile int dummy = 0;
      |                      ^~~~~

Mark this variable as "__maybe_unused" to fix the error.

Signed-off-by: Adrian Reber
---
 test/zdtm/static/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c
index 4164375b7..6ef9a56bc 100644
--- a/test/zdtm/static/uprobes.c
+++ b/test/zdtm/static/uprobes.c
@@ -31,7 +31,7 @@ const char *test_author = "Shashank Balaji ";
  * compiler optimization) and use it (to prevent "unused variable" warning) */
 void UPROBED_FUNCTION(void)
 {
-	volatile int dummy = 0;
+	volatile int dummy __maybe_unused = 0;
 	dummy += 1;
 }
 /* Calling via volatile function pointer ensures noinline at callsite */
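[Editorial context on the attribute behind the fix above: a minimal sketch of the pattern; the macro definition here is an assumption, as CRIU's common headers provide an equivalent __maybe_unused macro.]

/* Assumed definition; in CRIU it comes from the shared compiler headers. */
#ifndef __maybe_unused
#define __maybe_unused __attribute__((unused))
#endif

/* GCC's -Wunused-but-set-variable fires when a variable is written but its
 * value is never consumed elsewhere: "dummy += 1" reads dummy only to store
 * it back, which GCC 16 still counts as set-but-unused. The attribute mutes
 * the warning while keeping the volatile stores the test relies on. */
void probed_function(void)
{
	volatile int dummy __maybe_unused = 0;
	dummy += 1;
}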
From 07af3304fdce72b479d8670fced93604c295461f Mon Sep 17 00:00:00 2001
From: liqiang2020
Date: Tue, 16 Dec 2025 18:14:13 +0800
Subject: [PATCH 49/51] restore/pie: check return value of sys_rseq on unregister

The return value of sys_rseq was previously ignored during unregistration, under the assumption that it would not fail if the rseq structure was properly registered. However, if sys_rseq fails, the kernel retains the registration. If the memory containing the rseq structure is subsequently unmapped or reused, kernel updates to the rseq area can cause the process to crash (e.g., via SIGSEGV).

Check the return value of sys_rseq. If it fails, log the error code and abort the restoration process. This makes rseq unregistration failures fatal and explicit, aiding debugging and preventing obscure crashes later.

Signed-off-by: liqiang2020
---
 criu/pie/restorer.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 008e1398d..0a8aba41b 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -1363,13 +1363,19 @@ __visible void __export_unmap(void)
 	sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size);
 }
 
-static void unregister_libc_rseq(struct rst_rseq_param *rseq)
+static int unregister_libc_rseq(struct rst_rseq_param *rseq)
 {
-	if (!rseq->rseq_abi_pointer)
-		return;
+	long ret;
 
-	/* can't fail if rseq is registered */
-	sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 1, rseq->signature);
+	if (!rseq->rseq_abi_pointer)
+		return 0;
+
+	ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 1, rseq->signature);
+	if (ret) {
+		pr_err("Failed to unregister libc rseq %ld\n", ret);
+		return -1;
+	}
+	return 0;
 }
 
 /*
@@ -1803,7 +1809,8 @@ __visible long __export_restore_task(struct task_restore_args *args)
 	 * for instance once the kernel will want to update (struct rseq).cpu_id field:
 	 * https://github.com/torvalds/linux/blob/ce522ba9ef7e/kernel/rseq.c#L89
 	 */
-	unregister_libc_rseq(&args->libc_rseq);
+	if (unregister_libc_rseq(&args->libc_rseq))
+		goto core_restore_end;
 
 	if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len, bootstrap_start, bootstrap_len,
 			   args->task_size))
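[Editorial context: the same check can be sketched from plain user space with the raw syscall. This is a sketch only, assuming struct rseq and RSEQ_FLAG_UNREGISTER from the kernel UAPI header <linux/rseq.h>; inside the restorer blob, CRIU must use its own sys_rseq wrapper instead of libc's syscall().]

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/rseq.h>

/* Unregister an rseq area and propagate failure. If the syscall fails,
 * the kernel keeps the registration, so unmapping the area afterwards
 * would leave the kernel writing into freed memory -- which is exactly
 * why the return value must not be ignored. */
static int rseq_unregister(struct rseq *rs, uint32_t len, uint32_t sig)
{
	long ret = syscall(__NR_rseq, rs, len, RSEQ_FLAG_UNREGISTER, sig);

	if (ret) {
		perror("rseq unregister");
		return -1;
	}
	return 0;
}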
From 21a6758268e9230ae258d3c70aa12ae1a4da6750 Mon Sep 17 00:00:00 2001
From: Pavel Tikhomirov
Date: Fri, 16 Jan 2026 12:33:10 +0800
Subject: [PATCH 50/51] cr-restore/shstk: Make arch_shstk_unlock use correct pid

In the simple case where the parent process and the child are in one pid namespace, we can safely use vpid(item) to ptrace the child. But for the cases where the child is a pid namespace init, or the child is put into an external pid namespace, the parent and the child have different pid namespaces, and using pid vpid(item) (which e.g. for init will always be 1 here) to ptrace the child process is incorrect.

Let's use the pid reported to us from clone, as it is always the right pid of the child from the parent's point of view.

Fixes: 7dd583002 ("restore: add infrastructure to enable shadow stack")
Signed-off-by: Pavel Tikhomirov
---
 criu/cr-restore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index a5eda8d60..b92b92715 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1238,7 +1238,7 @@ static inline int fork_with_pid(struct pstree_item *item)
 		pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item));
 	}
 
-	arch_shstk_unlock(item, ca.core, pid);
+	arch_shstk_unlock(item, ca.core, ret);
 
 err_unlock:
 	if (!(ca.clone_flags & CLONE_NEWPID))

From 9e5fbcd668709a79a876e4c6eff13b975b0f3631 Mon Sep 17 00:00:00 2001
From: unichronic
Date: Sun, 18 Jan 2026 00:37:44 +0530
Subject: [PATCH 51/51] pycriu: Fix self-dump failure with explicit PID

When `opts.pid` is explicitly set to `os.getpid()`, `pycriu` fails to daemonize the `criu` process. This causes `criu` to run as a child of the dumped process, leading to the error "The criu itself is within dumped tree".

This can be fixed by modifying `_send_req_and_recv_resp` to check whether the target PID matches the current process PID. If so, it enables daemon mode, ensuring `criu` is detached and the dump succeeds.

Signed-off-by: unichronic
---
 lib/pycriu/criu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py
index 760d2be78..51a5c2902 100644
--- a/lib/pycriu/criu.py
+++ b/lib/pycriu/criu.py
@@ -242,7 +242,7 @@ class criu:
         # process resources from its own if criu is located in a same
         # process tree it is trying to dump.
         daemon = False
-        if req.type == rpc.DUMP and not req.opts.HasField('pid'):
+        if req.type == rpc.DUMP and (not req.opts.HasField('pid') or req.opts.pid == os.getpid()):
             daemon = True
         try: