mirror of
https://github.com/checkpoint-restore/criu.git
synced 2026-01-23 10:16:41 +00:00
To support Checkpoint Restore with AMDGPUs for ROCm workloads, introduce
a new plugin to assist CRIU with the help of AMD KFD kernel driver. This
initial commit just provides the basic framework to build up further
capabilities. Like CRIU, the amdgpu plugin also uses protobuf to serialize
and save the amdkfd data, which is mostly VRAM contents with some metadata.
We generate a data file "amdgpu-kfd-<id>.img" during the dump stage. On restore
this file is read and extracted to re-create various types of buffer
objects that belonged to the previously checkpointed process. Upon
restore, the mmap page offset within a device file might change, so we use
the new hook to update and adjust the mmap offsets for the newly created
target process. This is needed for the sys_mmap call in the PIE restorer phase.
Support for queues and events is added in future patches of this series.
With the current implementation (amdgpu_plugin), we support:
- Only compute (non-Gfx) workloads
- GPU visible inside a container
- AMD GPU Gfx 9 Family
- Pytorch Benchmarks such as BERT Base
The amdgpu plugin depends on libdrm and libdrm_amdgpu, which are typically
installed with the libdrm-dev package. We build amdgpu_plugin only when the
dependencies are met on the target system and when the user intends to
install the amdgpu plugin, not by default with the criu build.
Suggested-by: Felix Kuehling <felix.kuehling@amd.com>
Co-authored-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
63 lines
1.6 KiB
Text
63 lines
1.6 KiB
Text
#
# Default installation locations.
#
# Every path is assigned with ?=, so a value that is already set
# (environment or make command line) is left untouched.

# Root of the install tree, and the directories derived from it.
PREFIX		?= /usr/local
BINDIR		?= $(PREFIX)/bin
SBINDIR		?= $(PREFIX)/sbin
LIBEXECDIR	?= $(PREFIX)/libexec
INCLUDEDIR	?= $(PREFIX)/include
MANDIR		?= $(PREFIX)/share/man

# Absolute locations that do not follow PREFIX.
RUNDIR		?= /run
PLUGINDIR	?= /var/lib/criu
|
|
|
|
#
# Choose the default LIBDIR (only if it is not already set):
#   1. dpkg-architecture reports a DEB_HOST_MULTIARCH value
#      (recent Debian/Ubuntu with multiarch support):
#          $(PREFIX)/lib/$(DEB_HOST_MULTIARCH)
#   2. otherwise, `uname -m` prints x86_64:
#          $(PREFIX)/lib64
#   3. anything else falls back to the standard path:
#          $(PREFIX)/lib
#
# stderr of dpkg-architecture is discarded, so the variable simply
# ends up empty when the query produces no output.
DEB_HOST_MULTIARCH := $(shell dpkg-architecture -qDEB_HOST_MULTIARCH 2>/dev/null)

ifneq ($(DEB_HOST_MULTIARCH),)
  LIBDIR ?= $(PREFIX)/lib/$(DEB_HOST_MULTIARCH)
else ifeq ($(shell uname -m),x86_64)
  LIBDIR ?= $(PREFIX)/lib64
else
  LIBDIR ?= $(PREFIX)/lib
endif
|
|
|
|
# Export the installation paths into the environment of recipe commands,
# so the sub-makes started by the rules below see the same values.
export PREFIX BINDIR SBINDIR LIBDIR LIBEXECDIR
export INCLUDEDIR MANDIR RUNDIR PLUGINDIR
|
|
|
|
# Run the 'install' goal of the makefile in the Documentation directory.
.PHONY: install-man
install-man:
	$(Q) $(MAKE) -C Documentation install
|
|
|
|
# Requires the 'lib' target first, then runs the 'install' goal of the
# lib sub-build. $(build) is defined outside this file.
.PHONY: install-lib
install-lib: lib
	$(Q) $(MAKE) $(build)=lib install
|
|
|
|
# Requires the 'criu' target first, then runs the 'install' goal of the
# criu sub-build. $(build) is defined outside this file.
.PHONY: install-criu
install-criu: criu
	$(Q) $(MAKE) $(build)=criu install
|
|
|
|
# Requires the 'amdgpu_plugin' target first, then runs the 'install' goal
# of the makefile in plugins/amdgpu.
.PHONY: install-amdgpu_plugin
install-amdgpu_plugin: amdgpu_plugin
	$(Q) $(MAKE) -C plugins/amdgpu install
|
|
|
|
# Requires everything listed in $(compel-install-targets) (defined outside
# this file), then runs the 'install' goal of the compel sub-build followed
# by the compel/plugins sub-build.
.PHONY: install-compel
install-compel: $(compel-install-targets)
	$(Q) $(MAKE) $(build)=compel install
	$(Q) $(MAKE) $(build)=compel/plugins install
|
|
|
|
# Umbrella target: it has no commands of its own (the trailing ';' is an
# empty recipe) and only pulls in the individual install-* targets.
.PHONY: install
install: install-man install-lib install-criu \
	 install-compel install-amdgpu_plugin ;
|
|
|
|
# Forward the 'uninstall' goal ($@ expands to this target's name) to each
# sub-build that has an install rule above: Documentation, lib, criu,
# compel, compel/plugins and plugins/amdgpu, in that order.
.PHONY: uninstall
uninstall:
	$(Q) $(MAKE) -C Documentation $@
	$(Q) $(MAKE) $(build)=lib $@
	$(Q) $(MAKE) $(build)=criu $@
	$(Q) $(MAKE) $(build)=compel $@
	$(Q) $(MAKE) $(build)=compel/plugins $@
	$(Q) $(MAKE) -C plugins/amdgpu $@
|