Mirror of https://github.com/checkpoint-restore/criu.git (synced 2026-01-23 02:14:37 +00:00)

Merge branch 'criu-dev' into criu-cow

Commit f060d6d938 — 36 changed files with 7091 additions and 484 deletions
@@ -1,3 +1,3 @@
 [codespell]
-skip = ./.git,./test/pki,./tags
+skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h
 ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems
COW_DUMP_DESIGN.md (new file, 299 lines)
@@ -0,0 +1,299 @@
# COW-Based Live Migration Design Document

## Introduction

This feature implements COW (copy-on-write) based live migration for CRIU, enabling a process to be duplicated to a remote instance with the following goals:

1. Minimize downtime at the source.
2. Bring the destination alive as soon as possible, as in the current lazy-dump design.
3. Transfer the data at high speed, so the migration completes quickly and the number of COW operations stays low.

The approach uses userfaultfd write-protection to track memory modifications while the process keeps running at the source; the destination is populated the same way as in the lazy-dump implementation. This overcomes the main issue of lazy dump, where the source stays frozen for the whole duration of the dump.
## Architecture Overview

### Data Flow: Source

**Phase 1: Setup via Parasite RPC** (a sketch of the userfaultfd setup follows this list)
- Create a userfaultfd in the target process
- Register the VMAs with UFFDIO_REGISTER_MODE_WP
- Apply write-protection (UFFDIO_WRITEPROTECT)
- Send the userfaultfd back to CRIU
- Create a monitor thread to receive write-fault events
- The process resumes with COW protection active
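A minimal sketch of this setup, assuming a single VMA at `vma_start`/`vma_len` (the real parasite_cow_dump_init() walks every registerable VMA; error cleanup and the fd transfer back to CRIU are elided here):

```c
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int setup_wp_tracking(unsigned long vma_start, unsigned long vma_len)
{
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
    };
    struct uffdio_register reg = {
        .range = { .start = vma_start, .len = vma_len },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    struct uffdio_writeprotect wp = {
        .range = { .start = vma_start, .len = vma_len },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP,
    };
    int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
        return -1;
    /* Register the VMA for write-protect tracking ... */
    if (ioctl(uffd, UFFDIO_REGISTER, &reg))
        return -1;
    /* ... and arm the protection: the next write anywhere in the range
     * produces a UFFD_EVENT_PAGEFAULT message with the WP flag set. */
    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
        return -1;

    return uffd; /* sent back to CRIU over the parasite channel */
}
```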
**Phase 2: Monitor Thread (Background)**
- read() from the userfaultfd (blocking)
- On a write fault:
  1. Read the page from /proc/pid/mem (before the modification lands)
  2. Copy the page and store it in a hash table
  3. Unprotect the page
  4. Wake the faulting thread in the source process
**Phase 3: Page Transfer (page_server_get_pages)**
- Look up COW pages in the hash table
- Fast path: no COW pages → splice (zero-copy)
- Slow path: COW pages present → buffer + overlay
- Bulk unprotect after the transfer
#### Detailed Design: Source

##### 1. cow-dump.c (CRIU-side Coordinator)

Main coordinator for COW tracking on the CRIU side; manages the lifecycle of COW dump operations.

*Key Data Structures*
```c
/* Per-process COW dump state */
struct cow_dump_info {
    struct pstree_item *item;
    int uffd;                  /* userfaultfd from target */
    int proc_mem_fd;           /* /proc/pid/mem handle */
    unsigned long total_pages; /* Total pages tracked */
    unsigned long dirty_pages; /* Modified pages count */

    /* Hash table: 65536 buckets for O(1) lookup */
    struct hlist_head cow_hash[COW_HASH_SIZE];        /* 2^16 buckets */
    pthread_spinlock_t cow_hash_locks[COW_HASH_SIZE]; /* Per-bucket lock for fine-grained locking */
};

/* Hash table entry for copied pages */
struct cow_page {
    unsigned long vaddr;    /* Virtual address */
    void *data;             /* 4KB page content */
    struct hlist_node hash; /* Hash linkage */
};

#define COW_HASH_SIZE (1 << 16) /* 65536 buckets */
```
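Pages are bucketed by page frame number; the index computation used throughout the patch is:

```c
/* Mask off the in-page offset, then index by the PFN's low 16 bits */
unsigned long page_addr = addr & ~(PAGE_SIZE - 1);
unsigned int hash = (page_addr >> PAGE_SHIFT) & (COW_HASH_SIZE - 1);
```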
*Key Functions*

**cow_dump_init()** - Initialize COW tracking
- Opens `/proc/pid/mem` for reading page contents
- Calls the parasite RPC to set up the userfaultfd
- Receives the userfaultfd from the parasite
- Initializes the hash table and spinlocks
- Starts the COW monitoring thread
**cow_monitor_thread()** - Background monitoring
- Continuously reads from the userfaultfd
- Processes write-fault events

**cow_handle_write_fault()** - Handle a write-fault event
```
Input: fault address
1. Allocate a cow_page structure
2. Read the page from /proc/pid/mem (BEFORE the modification)
3. Add it to the hash table (thread-safe)
4. Unprotect the page (UFFDIO_WRITEPROTECT, mode=0)
5. Wake the faulting thread (UFFDIO_WAKE)
```
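Condensed from the cow-dump.c implementation in this commit (statistics updates and the allocation-failure paths are trimmed):

```c
static int cow_handle_write_fault(struct cow_dump_info *cdi, unsigned long addr)
{
    unsigned long page_addr = addr & ~(PAGE_SIZE - 1);
    struct uffdio_range range = { .start = page_addr, .len = PAGE_SIZE };
    struct uffdio_writeprotect wp = { .range = range, .mode = 0 };
    struct cow_page *cp;
    unsigned int hash;

    /* 1. Allocate the entry and a buffer for the page copy */
    cp = xmalloc(sizeof(*cp));
    cp->data = xmalloc(PAGE_SIZE);
    cp->vaddr = page_addr;

    /* 2. Snapshot the pre-modification content of the page */
    if (pread(cdi->proc_mem_fd, cp->data, PAGE_SIZE, page_addr) != PAGE_SIZE)
        return -1;

    /* 3. Publish it under the per-bucket spinlock */
    hash = (page_addr >> PAGE_SHIFT) & (COW_HASH_SIZE - 1);
    pthread_spin_lock(&cdi->cow_hash_locks[hash]);
    hlist_add_head(&cp->hash, &cdi->cow_hash[hash]);
    pthread_spin_unlock(&cdi->cow_hash_locks[hash]);

    /* 4. Drop write-protection (mode = 0 means "unprotect") ... */
    if (ioctl(cdi->uffd, UFFDIO_WRITEPROTECT, &wp))
        return -1;

    /* 5. ... and let the faulting thread continue */
    return ioctl(cdi->uffd, UFFDIO_WAKE, &range);
}
```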
**cow_lookup_and_remove_page()** - Thread-safe page lookup
- Hash-based O(1) lookup
- Removes the entry from the hash table atomically
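The lookup side, mirroring the patch: the bucket spinlock covers both the search and the unlink, so the monitor thread can never race with a half-removed entry:

```c
struct cow_page *cow_lookup_and_remove_page(unsigned long vaddr)
{
    unsigned long page_addr = vaddr & ~(PAGE_SIZE - 1);
    unsigned int h = (page_addr >> PAGE_SHIFT) & (COW_HASH_SIZE - 1);
    struct cow_page *cp;
    struct hlist_node *n;

    pthread_spin_lock(&g_cow_info->cow_hash_locks[h]);
    hlist_for_each_entry_safe(cp, n, &g_cow_info->cow_hash[h], hash) {
        if (cp->vaddr == page_addr) {
            /* Unlink while holding the bucket lock */
            hlist_del(&cp->hash);
            pthread_spin_unlock(&g_cow_info->cow_hash_locks[h]);
            return cp;
        }
    }
    pthread_spin_unlock(&g_cow_info->cow_hash_locks[h]);
    return NULL;
}
```

The caller frees the returned cow_page and its data once the page has been transferred.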
##### 2. pie/parasite.c (In-Process Setup)

Runs inside the target process to set up the userfaultfd with write-protection.

**Purpose:** The parasite code is injected into the target process and executes in its context to create and configure the userfaultfd.

*Key Function: parasite_cow_dump_init()*
**Why Parasite-Based?**
1. **Context requirement:** the userfaultfd must be created in the target process's context
2. **Inheritance:** it is automatically inherited by all threads
3. **Permissions:** avoids ptrace permission issues
4. **Atomic setup:** all VMAs are protected before the process resumes
##### 3. page-xfer.c (Page Server Integration)

Integrates COW tracking with the page transfer path, overlaying modified pages during the transfer.

*Key Function: page_server_get_pages()*
Step 1: Read the pages from the page_pipe:

```
page_pipe_read(pp, &pipe_read_dest, vaddr, &nr_pages)
```

Step 2: Check the hash table for COW pages — recall that each modified page is stored there (single pass):

```
for each page:
    cow_pages[i] = cow_lookup_and_remove_page(addr)
cow_count = number of non-NULL entries
```

Fast path (cow_count is zero, same as in the current lazy implementation): zero-copy splice, `splice(pipe -> sock)` — no memory copies!

Slow path (cow_count is above zero):

1. read(pipe -> buffer)
2. overlay the COW pages

Step 3: Bulk unprotect (a sketch of the two transfer paths follows below):

```
wp.range.start = vaddr
wp.range.len = len
wp.mode = 0
ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)
```
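A condensed sketch of the two paths. The helper names `xfer_fast()`/`xfer_slow()` are illustrative, `sk` is the page-server socket, `pipe_rd` the page_pipe's read end and `buf` a staging buffer; the real page_server_get_pages() also handles short splices, partial reads and error reporting:

```c
/* Fast path: no COW copies recorded for this range — move the bytes
 * kernel-to-kernel, pipe -> socket, without touching user memory. */
static int xfer_fast(int pipe_rd, int sk, size_t len)
{
    while (len > 0) {
        ssize_t n = splice(pipe_rd, NULL, sk, NULL, len, SPLICE_F_MOVE);
        if (n <= 0)
            return -1;
        len -= n;
    }
    return 0;
}

/* Slow path: some pages were modified after the dump point, so the pipe
 * holds post-fault content. Drain it into a buffer, overlay the saved
 * pre-fault copies, then send the patched buffer. */
static int xfer_slow(int pipe_rd, int sk, char *buf, unsigned long vaddr, int nr_pages)
{
    int i;

    if (read(pipe_rd, buf, nr_pages * PAGE_SIZE) != nr_pages * PAGE_SIZE)
        return -1;

    for (i = 0; i < nr_pages; i++) {
        struct cow_page *cp = cow_lookup_and_remove_page(vaddr + i * PAGE_SIZE);
        if (cp) {
            memcpy(buf + i * PAGE_SIZE, cp->data, PAGE_SIZE);
            xfree(cp->data);
            xfree(cp);
        }
    }
    return write(sk, buf, nr_pages * PAGE_SIZE) == nr_pages * PAGE_SIZE ? 0 : -1;
}
```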
### Data Flow: Destination

No changes were made at the destination; it is almost the same as the original code. I implemented a single performance improvement that handles lazy-page requests from the destination with aggressive pipelining:
```
┌─────────────────────────────────────────────────────────┐
│ Traditional: Sequential (1 request at a time)           │
│                                                         │
│ Request → Wait → Response → Request → Wait → Response   │
│                                                         │
│ Throughput: Limited by RTT                              │
└─────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────┐
│ Aggressive: Pipeline (256 requests in-flight)           │
│                                                         │
│ Request ─┐                                              │
│ Request ─┤                                              │
│ Request ─┤                                              │
│   ...    ├─► In Flight (256 concurrent)                 │
│ Request ─┤                                              │
│ Request ─┤                                              │
│ Request ─┘                                              │
│                                                         │
│ Response → IMMEDIATELY refill pipeline                  │
│                                                         │
│ Throughput: Near maximum network bandwidth              │
└─────────────────────────────────────────────────────────┘
```
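The refill discipline, as pseudo-C (all names here — `pump_pages()`, `send_page_request()`, `handle_page_response()`, `struct request_queue` — are illustrative placeholders, not the daemon's actual API):

```c
#define MAX_PIPELINE_DEPTH 256

/* Keep the request window full: top the window up to the depth limit,
 * then issue one new request for every response that comes back. */
static int pump_pages(struct request_queue *q, int sk)
{
    int in_flight = 0;

    while (have_pending_requests(q) || in_flight > 0) {
        while (in_flight < MAX_PIPELINE_DEPTH && have_pending_requests(q)) {
            if (send_page_request(sk, next_request(q)))
                return -1;
            in_flight++;
        }
        if (handle_page_response(sk)) /* blocks for one response */
            return -1;
        in_flight--;                  /* a slot opened; the loop refills it */
    }
    return 0;
}
```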
## Kernel Requirements

### Minimum Kernel Version
**Linux 5.7+** (released May 2020)

### Required Features

| Feature | Flag | Purpose | Since |
|---------|------|---------|-------|
| WP flag | `UFFD_FEATURE_PAGEFAULT_FLAG_WP` | Identify write faults | 5.7 |
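A runtime probe along the lines of the patch's cow_check_kernel_support(): a throwaway userfaultfd is opened, the WP feature is requested in the UFFDIO_API handshake, and the fd is closed again.

```c
#include <fcntl.h>
#include <stdbool.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static bool uffd_wp_supported(void)
{
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
    };
    int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    if (uffd < 0)
        return false; /* no userfaultfd, or it is restricted */

    /* The handshake fails (EINVAL) on kernels without WP support */
    if (ioctl(uffd, UFFDIO_API, &api) ||
        !(api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
        close(uffd);
        return false;
    }

    close(uffd);
    return true;
}
```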
### System Configuration

**Unprivileged Access:**
```bash
# Allow unprivileged userfaultfd
echo 1 > /proc/sys/vm/unprivileged_userfaultfd

# Otherwise CAP_SYS_PTRACE is required
```
---

## Future Work

#### 1. Explore UFFD_FEATURE_WP_ASYNC

We should explore how to use this feature. It only marks pages as touched, so we could do a second pass that copies just the touched pages. I will dive deeper to see whether it is more efficient.

#### 2. Reduce communication overhead between source and destination

Currently the transfer is driven by the destination, which sends requests. We could improve this by having the source push the data, with the destination asking only when a read page fault occurs. That way we reduce the amount of work at the source.

#### 3. Make the source multithreaded

Can we make the source multithreaded to reduce the overall time? This should be explored.

#### 4. Non-Registerable VMAs

**Issue:** Some VMAs cannot be write-protected.

I will be happy to get advice here.
### Next Steps

For maintainers reviewing this code:

1. **Testing:** Extensive testing with various workloads, plus new regression tests.
2. **Documentation:** Update the user-facing documentation.
3. **Performance tuning:** Try the different techniques discussed in the Future Work section.
### Usage
```bash
criu dump --cow-dump --lazy-pages ...
```
## Appendix - Statistics and Monitoring

### COW Tracking Statistics

**Per-Second Logging:**
```
[COW_STATS] events: wr=1234 fork=0 remap=0 unk=0 |
            ops: copied=1234 unprot=1234 woken=1234 |
            errs: alloc=0 read=0 unprot_err=0 wake_err=0
                  read_err=0 eagain_err=0
```
**Metrics:**

| Metric | Description | Good Value | Alert If |
|--------|-------------|------------|----------|
| `wr` | Write faults | Varies | - |
| `copied` | Pages copied | = wr | < wr |
| `unprot` | Pages unprotected | = wr | < wr |
| `woken` | Threads woken | = wr | < wr |
| `alloc_failures` | Allocation failures | 0 | > 0 |
| `read_failures` | Read failures | 0 | > 0 |
| `eagain_errors` | EAGAIN on read | Low | High |
### Page Server Statistics

**Per-Second Logging:**
```
[PAGE_SERVER_STATS] get_pages: reqs=500 with_cow=50 no_cow=450
                    pages=8000 cow=400 errs=0 |
                    serve: open2=1 parent=0 add_f=7950 get=500
                    close=1
```
**Metrics:**

| Metric | Description | Indicates |
|--------|-------------|-----------|
| `reqs` | Total requests | Transfer activity |
| `with_cow` | Slow path taken | COW overlay needed |
| `no_cow` | Fast path taken | Zero-copy efficiency |
| `pages` | Total pages transferred | Bandwidth |
| `cow` | COW pages overlaid | Write activity |
### UFFD Daemon Statistics

**Per-Second Logging:**
```
[UFFD_STATS] reqs=1000(pf:50,bg:950) pages=8000 pipe_avg=180
             PF: 4K=30 64K=15 128K=5
             BG: 4K=100 64K=500 128K=200 256K=100 512K=50
```

**Histograms:**
- **PF (page fault):** destination-initiated requests
- **BG (background):** proactive prefetch

**Pipeline Depth:**
- `pipe_avg`: average in-flight requests
- Target: close to `max_pipeline_depth` (256)
@@ -1,10 +1,10 @@
 #
 # CRIU version.
 CRIU_VERSION_MAJOR := 4
-CRIU_VERSION_MINOR := 0
+CRIU_VERSION_MINOR := 2
 CRIU_VERSION_SUBLEVEL :=
 CRIU_VERSION_EXTRA :=
-CRIU_VERSION_NAME := CRIUDA
+CRIU_VERSION_NAME := CRIUTIBILITY
 CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA))
 
 export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL
compel/arch/arm/plugins/std/parasite-head.S (new file, 8 lines)
@@ -0,0 +1,8 @@
#include "common/asm/linkage.h"

    .section .head.text, "ax"
ENTRY(__export_parasite_head_start)
    bl parasite_service
    .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux

END(__export_parasite_head_start)
@@ -794,7 +794,8 @@ class coredump_generator:
         off = 0  # in pages
         for m in pagemap[1:]:
             found = False
-            num_pages = m.get("nr_pages", m.compat_nr_pages)
+            num_pages = m.get("nr_pages", m["compat_nr_pages"])
+
             for i in range(num_pages):
                 if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE:
                     found = True
criu/arch/arm/aeabi-helpers.S (new file, 96 lines)
@@ -0,0 +1,96 @@
/*
 * Code borrowed from gcc, arm/lib1funcs.S
 * and adapted to CRIU macros.
 */

#if defined(__thumb__)
/*
 * We don't support compiling PIEs in Thumb mode,
 * see top Makefile for details (ARM CFLAGS_PIE section).
 */
#error Unsupported Thumb mode
#endif

#include "common/asm/linkage.h"

#define RET     bx lr
#define RETc(x) bx##x lr
#define LSYM(x) .x

.macro do_it cond, suffix=""
.endm

.macro ARM_DIV2_ORDER divisor, order
    clz \order, \divisor
    rsb \order, \order, #31
.endm

.macro ARM_DIV_BODY dividend, divisor, result, curbit
    clz \curbit, \dividend
    clz \result, \divisor
    sub \curbit, \result, \curbit
    rsbs \curbit, \curbit, #31
    addne \curbit, \curbit, \curbit, lsl #1
    mov \result, #0
    addne pc, pc, \curbit, lsl #2
    nop
    .set shift, 32
    .rept 32
    .set shift, shift - 1
    cmp \dividend, \divisor, lsl #shift
    adc \result, \result, \result
    subcs \dividend, \dividend, \divisor, lsl #shift
    .endr
.endm

/*
 * XXX: as an optimization add udiv instruction based version.
 * It's possible to check if CPU supports the instruction by
 * reading Instruction Set Attribute Register (ID_ISAR0)
 * and checking fields "Divide_instrs".
 */
ENTRY(__aeabi_uidiv)
    /* Note: if called via udivsi3_skip_div0_test, this will unnecessarily
       check for division-by-zero a second time. */
LSYM(udivsi3_skip_div0_test):
    subs r2, r1, #1
    do_it eq
    RETc(eq)
    bcc LSYM(Ldiv0)
    cmp r0, r1
    bls 11f
    tst r1, r2
    beq 12f

    ARM_DIV_BODY r0, r1, r2, r3

    mov r0, r2
    RET

11: do_it eq, e
    moveq r0, #1
    movne r0, #0
    RET

12: ARM_DIV2_ORDER r1, r2

    mov r0, r0, lsr r2
    RET

LSYM(Ldiv0):
    .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux

END(__aeabi_uidiv)
ALIAS(__udivsi3, __aeabi_uidiv)

ENTRY(__aeabi_uidivmod)
    cmp r1, #0
    beq LSYM(Ldiv0)
    stmfd sp!, { r0, r1, lr }
    bl LSYM(udivsi3_skip_div0_test)
    ldmfd sp!, { r1, r2, lr }
    mul r3, r2, r0
    sub r1, r1, r3
    RET
END(__aeabi_uidivmod)
ALIAS(__umodsi3, __aeabi_uidiv)
criu/arch/arm/bitops.S (new file, 24 lines)
@@ -0,0 +1,24 @@
#include "common/asm/linkage.h"

.syntax unified

ENTRY(test_and_set_bit)
    ands ip, r1, #3
    strbne r1, [ip]        @ assert word-aligned
    mov r2, #1
    and r3, r0, #31        @ Get bit offset
    mov r0, r0, lsr #5
    add r1, r1, r0, lsl #2 @ Get word offset
    mov r3, r2, lsl r3     @ create mask
    dmb ish
1:  ldrex r2, [r1]
    ands r0, r2, r3        @ save old value of bit
    orreq r2, r2, r3       @ toggle bit
    strex ip, r2, [r1]
    cmp ip, #0
    bne 1b
    dmb ish
    cmp r0, #0
    movne r0, #1
2:  bx lr
END(test_and_set_bit)
@@ -672,7 +672,6 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
 		{ "external", required_argument, 0, 1073 },
 		{ "empty-ns", required_argument, 0, 1074 },
 		{ "lazy-pages", no_argument, 0, 1076 },
-		{ "cow-dump", no_argument, 0, 1101 },
 		BOOL_OPT("extra", &opts.check_extra_features),
 		BOOL_OPT("experimental", &opts.check_experimental_features),
 		{ "all", no_argument, 0, 1079 },

@@ -706,6 +705,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
 		BOOL_OPT("unprivileged", &opts.unprivileged),
 		BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap),
 		BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes),
+		{ "cow-dump", no_argument, 0, 1105 },
 		{},
 	};
 
@@ -943,9 +943,6 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
 		case 1076:
 			opts.lazy_pages = true;
 			break;
-		case 1101:
-			opts.cow_dump = true;
-			break;
 		case 'M': {
 			char *aux;
 
@@ -1049,6 +1046,9 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
 				return 1;
 			}
 			break;
+		case 1105:
+			opts.cow_dump = true;
+			break;
 		case 'V':
 			pr_msg("Version: %s\n", CRIU_VERSION);
 			if (strcmp(CRIU_GITID, "0"))
criu/cow-dump.c (363 lines changed)

@@ -8,6 +8,9 @@
#include <errno.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <time.h>
#include <string.h>
#include <poll.h>

#include "types.h"
#include "cr_options.h"

@@ -37,9 +40,8 @@ struct cow_dump_info {
    unsigned long dirty_pages_dumped; /* Pages already written to disk */
    unsigned long iteration;          /* Current iteration number */
    struct list_head dirty_list;      /* List of dirty page ranges */
    struct page_xfer xfer;            /* Page transfer context */
    struct page_pipe *pp;             /* Page pipe for batching writes */
    bool xfer_initialized;            /* Whether xfer was opened */
    struct hlist_head cow_hash[COW_HASH_SIZE];        /* Hash table for copied pages */
    pthread_spinlock_t cow_hash_locks[COW_HASH_SIZE]; /* Per-bucket spinlocks */
};

/* Dirty page range */

@@ -57,6 +59,56 @@ static volatile bool g_stop_monitoring = false;
#define COW_CONVERGENCE_THRESHOLD 100  /* Stop if < 100 pages dirty per iteration */
#define COW_FLUSH_THRESHOLD       1000 /* Flush to disk every 1000 pages */

/* Statistics tracking structure */
static struct {
    /* Event counters */
    unsigned long write_faults;
    unsigned long fork_events;
    unsigned long remap_events;
    unsigned long unknown_events;

    /* Operation counters */
    unsigned long pages_copied;
    unsigned long pages_unprotected;
    unsigned long pages_woken;

    /* Error counters */
    unsigned long alloc_failures;
    unsigned long read_failures;
    unsigned long unprotect_failures;
    unsigned long wake_failures;
    unsigned long eagain_errors;
    unsigned long read_errors;

    time_t last_print_time;
} cow_stats;

static void check_and_print_cow_stats(void)
{
    time_t now = time(NULL);

    if (now - cow_stats.last_print_time >= 1) {
        pr_warn("[COW_STATS] events: wr=%lu fork=%lu remap=%lu unk=%lu | ops: copied=%lu unprot=%lu woken=%lu | errs: alloc=%lu read=%lu unprot_err=%lu wake_err=%lu read_err=%lu eagain_err=%lu\n",
                cow_stats.write_faults,
                cow_stats.fork_events,
                cow_stats.remap_events,
                cow_stats.unknown_events,
                cow_stats.pages_copied,
                cow_stats.pages_unprotected,
                cow_stats.pages_woken,
                cow_stats.alloc_failures,
                cow_stats.read_failures,
                cow_stats.unprotect_failures,
                cow_stats.wake_failures,
                cow_stats.read_errors,
                cow_stats.eagain_errors);

        /* Reset all counters */
        memset(&cow_stats, 0, sizeof(cow_stats));
        cow_stats.last_print_time = now;
    }
}

bool cow_check_kernel_support(void)
{
    unsigned long features = UFFD_FEATURE_WP_ASYNC |

@@ -133,6 +185,12 @@ int cow_dump_init(struct pstree_item *item, struct vm_area_list *vma_area_list,
    INIT_LIST_HEAD(&cdi->dirty_list);
    cdi->uffd = -1; /* Will be received from parasite */

    /* Initialize hash table for COW pages */
    for (int i = 0; i < COW_HASH_SIZE; i++) {
        INIT_HLIST_HEAD(&cdi->cow_hash[i]);
        pthread_spin_init(&cdi->cow_hash_locks[i], PTHREAD_PROCESS_PRIVATE);
    }

    /* Open /proc/pid/mem for reading pages */
    cdi->proc_mem_fd = open_proc_mem(item->pid->real);
    if (cdi->proc_mem_fd < 0)

@@ -204,26 +262,7 @@ int cow_dump_init(struct pstree_item *item, struct vm_area_list *vma_area_list,
    cdi->total_pages = args->total_pages;
    cdi->dirty_pages_dumped = 0;
    cdi->xfer_initialized = false;

    /* Initialize page_xfer for writing pages to disk */
    ret = open_page_xfer(&cdi->xfer, CR_FD_PAGEMAP, vpid(item));
    if (ret < 0) {
        pr_err("Failed to open page_xfer\n");
        close(cdi->uffd);
        goto err_close_mem;
    }
    cdi->xfer_initialized = true;

    /* Create page_pipe for batching page writes */
    cdi->pp = create_page_pipe(cdi->total_pages, NULL, 0);
    if (!cdi->pp) {
        pr_err("Failed to create page_pipe\n");
        cdi->xfer.close(&cdi->xfer);
        close(cdi->uffd);
        goto err_close_mem;
    }

    pr_info("COW dump initialized: tracking %lu pages, uffd=%d\n",
            cdi->total_pages, cdi->uffd);

@@ -241,31 +280,36 @@ err_free:
void cow_dump_fini(void)
{
    struct dirty_range *dr, *tmp;
    struct cow_page *cp;
    struct hlist_node *n;
    int i, remaining = 0;

    if (!g_cow_info)
        return;

    pr_info("Cleaning up COW dump\n");

    /* Flush any remaining dirty pages before cleanup */
    if (g_cow_info->pp && g_cow_info->xfer_initialized) {
        pr_info("Flushing remaining dirty pages: %lu dumped so far\n",
                g_cow_info->dirty_pages_dumped);
        if (page_xfer_dump_pages(&g_cow_info->xfer, g_cow_info->pp) < 0)
            pr_err("Failed to flush remaining pages during cleanup\n");
    /* Clean up any remaining COW pages */
    for (i = 0; i < COW_HASH_SIZE; i++) {
        pthread_spin_lock(&g_cow_info->cow_hash_locks[i]);
        hlist_for_each_entry_safe(cp, n, &g_cow_info->cow_hash[i], hash) {
            hlist_del(&cp->hash);
            xfree(cp->data);
            xfree(cp);
            remaining++;
        }
        pthread_spin_unlock(&g_cow_info->cow_hash_locks[i]);
        pthread_spin_destroy(&g_cow_info->cow_hash_locks[i]);
    }

    if (remaining > 0)
        pr_warn("Freed %d remaining COW pages\n", remaining);

    list_for_each_entry_safe(dr, tmp, &g_cow_info->dirty_list, list) {
        list_del(&dr->list);
        xfree(dr);
    }

    if (g_cow_info->pp)
        destroy_page_pipe(g_cow_info->pp);

    if (g_cow_info->xfer_initialized)
        g_cow_info->xfer.close(&g_cow_info->xfer);

    if (g_cow_info->proc_mem_fd >= 0)
        close(g_cow_info->proc_mem_fd);

@@ -275,127 +319,59 @@ void cow_dump_fini(void)
    xfree(g_cow_info);
    g_cow_info = NULL;
}
#if 0
/* Flush accumulated dirty pages to disk */
static int cow_flush_dirty_pages(struct cow_dump_info *cdi)
{
    int ret;

    if (!cdi->pp || !cdi->xfer_initialized)
        return 0;

    /* Check if there are pages to flush */
    if (cdi->pp->nr_pipes == 0)
        return 0;

    pr_info("Flushing %lu dirty pages to disk\n",
            cdi->dirty_pages_dumped - (cdi->dirty_pages_dumped - cdi->pp->nr_pipes));

    ret = page_xfer_dump_pages(&cdi->xfer, cdi->pp);
    if (ret < 0) {
        pr_err("Failed to flush dirty pages to disk\n");
        return ret;
    }

    /* Reset page_pipe for next batch */
    page_pipe_reinit(cdi->pp);

    return 0;
}

/* Write a single page to the page_pipe */
static int cow_write_page_to_pipe(struct cow_dump_info *cdi, unsigned long page_addr)
{
    unsigned char page_buf[PAGE_SIZE];
    ssize_t ret;
    int pipe_fd;

    /* Read the page from /proc/pid/mem */
    ret = pread(cdi->proc_mem_fd, page_buf, PAGE_SIZE, page_addr);
    if (ret != PAGE_SIZE) {
        if (ret < 0)
            pr_perror("Failed to read page at 0x%lx from /proc/pid/mem", page_addr);
        else
            pr_err("Short read from /proc/pid/mem at 0x%lx: %zd\n", page_addr, ret);
        return -1;
    }

    /* Add page to page_pipe - this creates the iov entry */
    ret = page_pipe_add_page(cdi->pp, page_addr, 0);
    if (ret < 0) {
        if (ret == -EAGAIN) {
            /* Page pipe is full, flush it */
            if (cow_flush_dirty_pages(cdi) < 0)
                return -1;
            /* Try again after flush */
            ret = page_pipe_add_page(cdi->pp, page_addr, 0);
            if (ret < 0) {
                pr_err("Failed to add page to pipe even after flush\n");
                return -1;
            }
        } else {
            pr_err("Failed to add page 0x%lx to page_pipe: %d\n", page_addr, (int)ret);
            return -1;
        }
    }

    /* Write page data to the pipe */
    /* The page_pipe has buffers, we need to write to the last buffer's write end */
    if (!list_empty(&cdi->pp->bufs)) {
        struct page_pipe_buf *ppb = list_entry(cdi->pp->bufs.prev, struct page_pipe_buf, l);
        pipe_fd = ppb->p[1]; /* Write end of pipe */

        ret = write(pipe_fd, page_buf, PAGE_SIZE);
        if (ret != PAGE_SIZE) {
            if (ret < 0)
                pr_perror("Failed to write page to pipe");
            else
                pr_err("Short write to pipe: %zd\n", ret);
            return -1;
        }
    } else {
        pr_err("No page_pipe buffers available\n");
        return -1;
    }

    cdi->dirty_pages_dumped++;

    /* Check if we should flush */
    if (cdi->dirty_pages_dumped % COW_FLUSH_THRESHOLD == 0) {
        pr_debug("Reached flush threshold, flushing pages\n");
        return cow_flush_dirty_pages(cdi);
    }

    return 0;
}
#endif
static int cow_handle_write_fault(struct cow_dump_info *cdi, unsigned long addr)
{
    struct dirty_range *dr;
    struct cow_page *cp;
    unsigned long page_addr = addr & ~(PAGE_SIZE - 1);
    void *page;
    struct uffdio_writeprotect wp;
    struct uffdio_range range;
    ssize_t ret;
    unsigned int hash;

    pr_debug("Write fault at 0x%lx\n", page_addr);
    pr_info("Write fault at 0x%lx\n", page_addr);

    cow_stats.write_faults++;
    cdi->dirty_pages++;

    /* Add to dirty list for tracking */
    dr = xmalloc(sizeof(*dr));
    if (!dr) {
    /* Allocate cow_page structure */
    cp = xmalloc(sizeof(*cp));
    if (!cp) {
        pr_err("Failed to allocate cow_page structure\n");
        cow_stats.alloc_failures++;
        return -1;
    }

    page = xmalloc(PAGE_SIZE);
    //memcpy(page,(void*)page_addr, PAGE_SIZE);
    cp->data = xmalloc(PAGE_SIZE);
    if (!cp->data) {
        pr_err("Failed to allocate page data\n");
        xfree(cp);
        cow_stats.alloc_failures++;
        return -1;
    }

    dr->start = (unsigned long)page;
    dr->len = PAGE_SIZE;
    INIT_LIST_HEAD(&dr->list);
    list_add_tail(&dr->list, &cdi->dirty_list);
    cp->vaddr = page_addr;
    INIT_HLIST_NODE(&cp->hash);

    /* Read original page content from /proc/pid/mem */
    ret = pread(cdi->proc_mem_fd, cp->data, PAGE_SIZE, page_addr);
    if (ret != PAGE_SIZE) {
        pr_perror("Failed to read page at 0x%lx (read %zd bytes)", page_addr, ret);
        xfree(cp->data);
        xfree(cp);
        cow_stats.read_failures++;
        return -1;
    }

    /* Add to hash table (thread-safe with per-bucket spinlock) */
    hash = (page_addr >> PAGE_SHIFT) & (COW_HASH_SIZE - 1);

    pthread_spin_lock(&cdi->cow_hash_locks[hash]);
    hlist_add_head(&cp->hash, &cdi->cow_hash[hash]);
    pthread_spin_unlock(&cdi->cow_hash_locks[hash]);

    cow_stats.pages_copied++;
    pr_debug("Copied page at 0x%lx to hash bucket %u\n", page_addr, hash);

    /* Unprotect the page so the process can continue */
    wp.range.start = page_addr;

@@ -404,18 +380,23 @@ static int cow_handle_write_fault(struct cow_dump_info *cdi, unsigned long addr)
    if (ioctl(cdi->uffd, UFFDIO_WRITEPROTECT, &wp)) {
        pr_perror("Failed to unprotect page at 0x%lx", page_addr);
        cow_stats.unprotect_failures++;
        return -1;
    }

    cow_stats.pages_unprotected++;

    /* Wake up the faulting thread */
    range.start = page_addr;
    range.len = PAGE_SIZE;

    if (ioctl(cdi->uffd, UFFDIO_WAKE, &range)) {
        pr_perror("Failed to wake thread after unprotect");
        cow_stats.wake_failures++;
        return -1;
    }

    cow_stats.pages_woken++;
    cdi->total_pages--;
    return 0;
}

@@ -423,20 +404,52 @@ static int cow_handle_write_fault(struct cow_dump_info *cdi, unsigned long addr)
static int cow_process_events(struct cow_dump_info *cdi, bool blocking)
{
    struct uffd_msg msg;
    int ret;
    //int flags = blocking ? MSG_WAITALL : MSG_DONTWAIT;
    struct pollfd pfd;
    int ret, poll_ret;

    while (1) {
        /* Check and print stats */
        check_and_print_cow_stats();

        /* Try reading directly first - avoids poll() overhead when data is ready */
        ret = read(cdi->uffd, &msg, sizeof(msg));

        if (ret < 0 && errno == EAGAIN && blocking) {
            /* No data available and we want to block - use poll() with timeout */
            pfd.fd = cdi->uffd;
            pfd.events = POLLIN;
            pfd.revents = 0;

            poll_ret = poll(&pfd, 1, 500); /* 500ms timeout */
            if (poll_ret < 0) {
                pr_perror("poll() failed on uffd");
                cow_stats.read_errors++;
                return -1;
            }

            if (poll_ret == 0) {
                /* Timeout - no events within 500ms */
                return 0;
            }

            /* Data ready after poll - retry read */
            ret = read(cdi->uffd, &msg, sizeof(msg));
        }

        if (ret < 0) {
            if (errno == EAGAIN && !blocking)
                return 0; /* No more events */
            if (errno == EAGAIN && !blocking) {
                /* Non-blocking mode and no data */
                cow_stats.eagain_errors++;
                return 0;
            }
            pr_perror("Failed to read uffd event");
            cow_stats.read_errors++;
            return -1;
        }

        if (ret != sizeof(msg)) {
            pr_err("Short read from uffd: %d\n", ret);
            cow_stats.read_errors++;
            return -1;
        }

@@ -450,14 +463,17 @@ static int cow_process_events(struct cow_dump_info *cdi, bool blocking)
            break;

        case UFFD_EVENT_FORK:
            cow_stats.fork_events++;
            pr_warn("Process forked during COW dump (not fully supported)\n");
            break;

        case UFFD_EVENT_REMAP:
            cow_stats.remap_events++;
            pr_info("Memory remap event\n");
            break;

        default:
            cow_stats.unknown_events++;
            pr_err("Unexpected uffd event: %u\n", msg.event);
            return -1;
        }

@@ -469,28 +485,19 @@ static int cow_process_events(struct cow_dump_info *cdi, bool blocking)
/* Background thread that monitors for write faults */
static void *cow_monitor_thread(void *arg)
{
    int iteration_count = 0;
    struct cow_dump_info *cdi = (struct cow_dump_info *)arg;

    pr_info("COW monitor thread started\n");

    while (g_cow_info->total_pages != 0) {

    pr_warn("PAGE SERVER READY TO SERVE\n");

    while (!g_stop_monitoring) {

        /* Process events with short timeout */
        if (cow_process_events(cdi, false) < 0) {
        if (cow_process_events(cdi, true) < 0) {
            pr_err("Error processing COW events in monitor thread\n");
            break;
        }
        /* Small delay to avoid busy-waiting */
        //usleep(1000); /* 1ms */
        /* Print total pages once per second */
        iteration_count++;
        if (iteration_count >= 10000) { /* 1000 * 1ms = 1 second */
            pr_info("COW monitor: %lu pages remaining\n", g_cow_info->total_pages);
            iteration_count = 0;
        }

    }

    pr_info("COW monitor thread stopped\n");

@@ -538,3 +545,39 @@ int cow_stop_monitor_thread(void)
    pr_info("COW monitor thread stopped successfully\n");
    return 0;
}

int cow_get_uffd(void)
{
    if (!g_cow_info)
        return -1;

    return g_cow_info->uffd;
}

struct cow_page *cow_lookup_and_remove_page(unsigned long vaddr)
{
    struct cow_page *cp;
    struct hlist_node *n;
    unsigned int hash;
    unsigned long page_addr = vaddr & ~(PAGE_SIZE - 1);

    if (!g_cow_info)
        return NULL;

    hash = (page_addr >> PAGE_SHIFT) & (COW_HASH_SIZE - 1);

    pthread_spin_lock(&g_cow_info->cow_hash_locks[hash]);

    hlist_for_each_entry_safe(cp, n, &g_cow_info->cow_hash[hash], hash) {
        if (cp->vaddr == page_addr) {
            hlist_del(&cp->hash);
            pthread_spin_unlock(&g_cow_info->cow_hash_locks[hash]);
            pr_debug("Found and removed COW page at 0x%lx from hash bucket %u\n",
                     page_addr, hash);
            return cp;
        }
    }

    pthread_spin_unlock(&g_cow_info->cow_hash_locks[hash]);
    return NULL;
}
criu/cr-dump.c (153 lines changed)

@@ -1711,125 +1711,73 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
    mdc.stat = &pps_buf;
    mdc.parent_ie = parent_ie;

    if (!opts.cow_dump) {
        /* Normal dump - dump all pages */
        ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
        if (ret)
            goto err_cure;
    } else {
        /* COW dump mode: split VMAs by size */
        unsigned long threshold_pages = 25000; /* 25K pages ~= 100MB */
        unsigned long large_pages = 0;
        struct vma_area *vma, *tmp;

        pr_info("COW dump: splitting VMAs (threshold=%lu pages) vmas.\n", threshold_pages);
        pr_info("COW dump: splitting VMAs (threshold=%lu pages) vmas.nr=%u nr_aios=%u rst_priv_size=%lu nr_priv_pages_longest=%lu nr_shared_pages_longest=%lu\n", threshold_pages,
                vmas.nr, vmas.nr_aios, vmas.rst_priv_size, vmas.nr_priv_pages_longest, vmas.nr_shared_pages_longest);

        /* Split VMAs by size */
        list_for_each_entry_safe(vma, tmp, &vmas.h, list) {

            if (vma_area_is(vma, VMA_AREA_GUARD)) {
                pr_info("COW dump: splitting VMAs VMA_AREA_GUARDVMA_AREA_GUARDVMA_AREA_GUARDVMA_AREA_GUARDVMA_AREA_GUARD(threshold=%lu pages) vmas.\n", threshold_pages);
                continue;
            }

            if ((vma_area_len(vma) / PAGE_SIZE) >= threshold_pages) {
                vma->e->status |= VMA_AREA_GUARD;
                large_pages += 1;
            }
        }

        ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
        if (ret) {
            pr_err("Failed to dump small VMAs\n");
            goto err_cure;
        }

        /* Initialize COW tracking for large VMAs only */
        if (large_pages > 0) {
            /* Rebuild the list for large VMAs */
            list_for_each_entry_safe(vma, tmp, &vmas.h, list) {
                unsigned long nr_pages;

                nr_pages = vma_area_len(vma) / PAGE_SIZE;
                if (nr_pages >= threshold_pages) {
                    vma->e->status = vma->e->status & (~VMA_AREA_GUARD);
                }
            }

            ret = cow_dump_init(item, &vmas, parasite_ctl);
            if (ret) {
                pr_err("Failed to initialize COW dump for large VMAs\n");
                goto err_cure;
            }

            /* Start background thread to monitor page faults */
            ret = cow_start_monitor_thread();
            if (ret) {
                pr_err("Failed to start COW monitor thread\n");
                goto err_cure;
            }
            ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
            if (ret)
                goto err_cure;

        } else {
            pr_info("No large VMAs found, skipping COW tracking\n");
        }
    }

    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);
    ret = parasite_dump_sigacts_seized(parasite_ctl, item);
    if (ret) {
        pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid);
        goto err_cure;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    ret = parasite_dump_itimers_seized(parasite_ctl, item);
    if (ret) {
        pr_err("Can't dump itimers (pid: %d)\n", pid);
        goto err_cure;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item);
    if (ret) {
        pr_err("Can't dump posix timers (pid: %d)\n", pid);
        goto err_cure;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset, &misc);
    if (ret) {
        pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
        goto err_cure;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    ret = dump_task_cgroup(parasite_ctl, item);
    if (ret) {
        pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret);
        goto err_cure;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    if (opts.cow_dump) {
        /* COW dump mode: split VMAs by size */
        ret = cow_dump_init(item, &vmas, parasite_ctl);
        if (ret) {
            pr_err("Failed to initialize COW dump for VMAs\n");
            goto err_cure;
        }

        /* Start background thread to monitor page faults */
        ret = cow_start_monitor_thread();
        if (ret) {
            pr_err("Failed to start COW monitor thread\n");
            goto err_cure;
        }
    }

    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);
    ret = compel_stop_daemon(parasite_ctl);
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);
    if (ret) {
        pr_err("Can't stop daemon in parasite (pid: %d)\n", pid);
        goto err_cure;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    ret = dump_task_threads(parasite_ctl, item);
    if (ret) {
        pr_err("Can't dump threads\n");
        goto err_cure;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    /*
     * On failure local map will be cured in cr_dump_finish()
     * for lazy pages.
     */
    if (opts.lazy_pages)
        ret = compel_cure_remote(parasite_ctl);
    else

@@ -1838,20 +1786,19 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
        pr_err("Can't cure (pid: %d) from parasite\n", pid);
        goto err;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset);
    if (ret) {
        pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret);
        goto err;
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    ret = dump_task_fs(pid, &misc, cr_imgset);
    if (ret) {
        pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret);
        goto err;
    }

    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    exit_code = 0;
err:
    close_cr_imgset(&cr_imgset);

@@ -2114,7 +2061,7 @@ static int cr_lazy_mem_dump(void)
static int cr_dump_finish(int ret)
{
    int post_dump_ret = 0;
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);

    if (disconnect_from_page_server())
        ret = -1;

@@ -2170,27 +2117,39 @@ static int cr_dump_finish(int ret)
        delete_link_remaps();
        clean_cr_time_mounts();
    }
    pr_info("file = %s, line = %d\n", __FILE__, __LINE__);
    if (!ret && opts.lazy_pages)
        ret = cr_lazy_mem_dump();

    if (arch_set_thread_regs(root_item, true) < 0)
        return -1;

    cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret);

    pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state);
    timing_stop(TIME_FROZEN);

    if (!ret && opts.cow_dump) {
        pr_info("file = %s, line = %d\n", __FILE__, __LINE__);
    /* Resume process early if using COW dump with lazy pages */
    if (!ret && opts.lazy_pages && opts.cow_dump) {
        pr_info("Resuming process with COW protection active\n");

        /* Stop the monitor thread before final dump */
        if (arch_set_thread_regs(root_item, true) < 0)
            return -1;

        cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret);

        pstree_switch_state(root_item, TASK_ALIVE);
        timing_stop(TIME_FROZEN);

        /* Now start lazy page transfer with process running */
        ret = cr_lazy_mem_dump();

        /* Stop the monitor thread after lazy dump completes */
        if (cow_stop_monitor_thread()) {
            pr_err("Failed to stop COW monitor thread\n");
            ret = -1;
        }
    } else {
        /* Standard path: transfer pages then resume */
        if (!ret && opts.lazy_pages)
            ret = cr_lazy_mem_dump();

        if (arch_set_thread_regs(root_item, true) < 0)
            return -1;

        cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret);

        pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state);
        timing_stop(TIME_FROZEN);
    }

    free_pstree(root_item);

@@ -2330,6 +2289,10 @@ int cr_dump_tasks(pid_t pid)
        goto err;
    }

    ret = run_plugins(DUMP_DEVICES_LATE, pid);
    if (ret && ret != -ENOTSUP)
        goto err;

    if (parent_ie) {
        inventory_entry__free_unpacked(parent_ie, NULL);
        parent_ie = NULL;
@@ -439,12 +439,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
    if (req->has_unprivileged)
        opts.unprivileged = req->unprivileged;

    if (check_caps())
        return 1;

    if (kerndat_init())
        return 1;

    if (log_keep_err()) {
        pr_perror("Can't tune log");
        goto err;

@@ -738,9 +732,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
        }
    }

    if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk))
        goto err;

    if (req->orphan_pts_master)
        opts.orphan_pts_master = true;

@@ -817,6 +808,16 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
    if (setup_logging_from_req(req, output_changed_by_rpc_conf))
        goto err;

    if (check_caps())
        goto err;

    if (kerndat_init())
        goto err;

    /* init_pidfd_store_sk must be called after kerndat_init. */
    if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk))
        goto err;

    if (req->mntns_compat_mode)
        opts.mntns_compat_mode = true;
@@ -45,10 +45,11 @@ static int open_fd(struct file_desc *d, int *new_fd)
{
    struct ext_file_info *xfi;
    int fd;
    bool retry_needed;

    xfi = container_of(d, struct ext_file_info, d);

    fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id);
    fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, &retry_needed);
    if (fd < 0) {
        pr_err("Unable to restore %#x\n", xfi->xfe->id);
        return -1;

@@ -57,8 +58,11 @@ static int open_fd(struct file_desc *d, int *new_fd)
    if (restore_fown(fd, xfi->xfe->fown))
        return -1;

    *new_fd = fd;
    return 0;
    if (!retry_needed)
        *new_fd = fd;
    else
        *new_fd = -1;
    return retry_needed;
}

static struct file_desc_ops ext_desc_ops = {
@@ -2,11 +2,21 @@
#define __CR_COW_DUMP_H_

#include "types.h"
#include "common/list.h"

struct pstree_item;
struct vm_area_list;
struct parasite_ctl;

#define COW_HASH_BITS 16
#define COW_HASH_SIZE (1 << COW_HASH_BITS)

struct cow_page {
    unsigned long vaddr;
    void *data;
    struct hlist_node hash;
};

/**
 * cow_dump_init - Initialize COW dump for a process

@@ -59,4 +69,25 @@ extern int cow_start_monitor_thread(void);
 */
extern int cow_stop_monitor_thread(void);

/**
 * cow_get_uffd - Get the userfaultfd file descriptor
 *
 * Returns the userfaultfd associated with the current COW dump session.
 *
 * Returns: userfaultfd on success, -1 if COW dump not initialized
 */
extern int cow_get_uffd(void);

/**
 * cow_lookup_and_remove_page - Look up and remove a COW page
 * @vaddr: Virtual address of the page
 *
 * Thread-safe lookup and removal of a copied page from the hash table.
 * The caller is responsible for freeing the returned cow_page structure
 * and its data.
 *
 * Returns: cow_page structure on success, NULL if not found
 */
extern struct cow_page *cow_lookup_and_remove_page(unsigned long vaddr);

#endif /* __CR_COW_DUMP_H_ */
@@ -62,6 +62,10 @@ enum {
    CR_PLUGIN_HOOK__POST_FORKING = 12,

    CR_PLUGIN_HOOK__RESTORE_INIT = 13,

    CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14,

    CR_PLUGIN_HOOK__MAX
};

@@ -70,7 +74,7 @@ enum {
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind);

@@ -81,6 +85,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id);

enum {
    CR_PLUGIN_STAGE__DUMP,
criu/log.c (22 lines changed)

@@ -190,7 +190,7 @@ void flush_early_log_buffer(int fd)
         * with reading the log_level.
         */
        struct early_log_hdr *hdr = (void *)early_log_buffer + pos;
        pos += sizeof(hdr);
        pos += sizeof(*hdr);
        if (hdr->level <= current_loglevel) {
            size_t size = 0;
            while (size < hdr->len) {

@@ -202,7 +202,7 @@ void flush_early_log_buffer(int fd)
        }
        pos += hdr->len;
    }
    if (early_log_buf_off == EARLY_LOG_BUF_LEN)
    if ((early_log_buf_off + sizeof(struct early_log_hdr)) >= EARLY_LOG_BUF_LEN)
        pr_warn("The early log buffer is full, some messages may have been lost\n");
    early_log_buf_off = 0;
}

@@ -320,10 +320,10 @@ unsigned int log_get_loglevel(void)
static void early_vprint(const char *format, unsigned int loglevel, va_list params)
{
    unsigned int log_size = 0;
    int log_size = 0, log_space;
    struct early_log_hdr *hdr;

    if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN)
    if ((early_log_buf_off + sizeof(*hdr)) >= EARLY_LOG_BUF_LEN)
        return;

    /* Save loglevel */

@@ -331,7 +331,8 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para
    hdr = (void *)early_log_buffer + early_log_buf_off;
    hdr->level = loglevel;
    /* Skip the log entry size */
    early_log_buf_off += sizeof(hdr);
    early_log_buf_off += sizeof(*hdr);
    log_space = EARLY_LOG_BUF_LEN - early_log_buf_off;
    if (loglevel >= LOG_TIMESTAMP) {
        /*
         * If logging is not yet setup we just write zeros

@@ -339,12 +340,17 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para
         * keep the same format as the other messages on
         * log levels with timestamps (>=LOG_TIMESTAMP).
         */
        log_size = snprintf(early_log_buffer + early_log_buf_off, sizeof(early_log_buffer) - early_log_buf_off,
        log_size = snprintf(early_log_buffer + early_log_buf_off, log_space,
                            "(00.000000) ");
    }

    log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size,
                          sizeof(early_log_buffer) - early_log_buf_off - log_size, format, params);
    if (log_size < log_space)
        log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size,
                              log_space - log_size, format, params);
    if (log_size > log_space) {
        /* vsnprintf always add the terminating null byte. */
        log_size = log_space - 1;
    }

    /* Save log entry size */
    hdr->len = log_size;
45
criu/mem.c
45
criu/mem.c
|
|
@ -290,15 +290,13 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis
|
|||
struct parasite_dump_pages_args *args;
|
||||
struct parasite_vma_entry *p_vma;
|
||||
struct vma_area *vma;
|
||||
pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
args = compel_parasite_args_s(ctl, dump_pages_args_size(vma_area_list));
|
||||
pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
p_vma = pargs_vmas(args);
|
||||
pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
args->nr_vmas = 0;
|
||||
|
||||
list_for_each_entry(vma, &vma_area_list->h, list) {
|
||||
pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
if (!vma_area_is_private(vma, kdat.task_size))
|
||||
continue;
|
||||
/*
|
||||
|
|
@ -319,12 +317,10 @@ pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__)
|
|||
p_vma->start = vma->e->start;
|
||||
p_vma->len = vma_area_len(vma);
|
||||
p_vma->prot = vma->e->prot;
|
||||
pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
args->nr_vmas++;
|
||||
p_vma++;
|
||||
}
|
||||
pr_info("parasite_dump_pages_seized file = %s, line = %d args->nr_vmas=%u\n", __FILE__, __LINE__,args->nr_vmas);
|
||||
|
||||
return args;
|
||||
}
|
||||
|
|
@ -333,7 +329,6 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa
|
|||
{
|
||||
struct page_pipe_buf *ppb;
|
||||
int ret = 0;
|
||||
pr_info("drain_pages file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
debug_show_page_pipe(pp);
|
||||
|
||||
|
|
@ -343,20 +338,15 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa
|
|||
args->nr_pages = ppb->pages_in;
|
||||
pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size,
|
||||
args->off);
|
||||
pr_info("drain_pages file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
|
||||
pr_info("drain_pages file = %s, line = %d ret=%d\n", __FILE__, __LINE__, ret);
|
||||
if (ret < 0)
|
||||
return -1;
|
||||
pr_info("drain_pages file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
ret = compel_util_send_fd(ctl, ppb->p[1]);
|
||||
pr_info("drain_pages file = %s, line = %d ret=%d\n", __FILE__, __LINE__, ret);
|
||||
if (ret)
|
||||
return -1;
|
||||
|
||||
ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
|
||||
pr_info("drain_pages file = %s, line = %d ret=%d\n", __FILE__, __LINE__, ret);
|
||||
if (ret < 0)
|
||||
return -1;
|
||||
|
||||
|
|
@ -553,18 +543,16 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit
|
|||
|
||||
timing_start(TIME_MEMDUMP);
|
||||
|
||||
pr_info(" Private vmas %lu/%lu pages\n", vma_area_list->nr_priv_pages_longest, vma_area_list->nr_priv_pages);
|
||||
pr_debug(" Private vmas %lu/%lu pages\n", vma_area_list->nr_priv_pages_longest, vma_area_list->nr_priv_pages);
|
||||
|
||||
/*
|
||||
* Step 0 -- prepare
|
||||
*/
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
pmc_size = max(vma_area_list->nr_priv_pages_longest, vma_area_list->nr_shared_pages_longest);
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d pmc_size=%lu\n", __FILE__, __LINE__,pmc_size);
|
||||
|
||||
if (pmc_init(&pmc, item->pid->real, &vma_area_list->h, pmc_size * PAGE_SIZE))
|
||||
return -1;
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
if (!(mdc->pre_dump || mdc->lazy))
|
||||
/*
|
||||
* Chunk mode pushes pages portion by portion. This mode
|
||||
|
|
@ -575,7 +563,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit
|
|||
pp = create_page_pipe(vma_area_list->nr_priv_pages, mdc->lazy ? NULL : pargs_iovs(args), cpp_flags);
|
||||
if (!pp)
|
||||
goto out;
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
if (!mdc->pre_dump) {
|
||||
/*
|
||||
* Regular dump -- create xfer object and send pages to it
|
||||
|
|
@ -595,13 +583,13 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit
|
|||
if (ret)
|
||||
xfer.parent = NULL + 1;
|
||||
}
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
if (xfer.parent) {
|
||||
possible_pid_reuse = detect_pid_reuse(item, mdc->stat, mdc->parent_ie);
|
||||
if (possible_pid_reuse == -1)
|
||||
goto out_xfer;
|
||||
}
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
/*
|
||||
* Step 1 -- generate the pagemap
|
||||
*/
|
||||
|
|
@ -619,10 +607,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit
|
|||
if (ret < 0)
|
||||
goto out_xfer;
|
||||
}
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
if (mdc->lazy)
|
||||
memcpy(pargs_iovs(args), pp->iovs, sizeof(struct iovec) * pp->nr_iovs);
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
/*
|
||||
* Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
|
||||
|
|
@ -634,14 +621,14 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit
|
|||
ret = 0;
|
||||
else
|
||||
ret = drain_pages(pp, ctl, args);
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
if (!ret && !mdc->pre_dump)
|
||||
ret = xfer_pages(pp, &xfer);
|
||||
if (ret)
|
||||
goto out_xfer;
|
||||
|
||||
timing_stop(TIME_MEMDUMP);
|
||||
pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);
|
||||
|
||||
/*
|
||||
* Step 4 -- clean up
|
||||
*/
|
||||
|
|
@@ -669,9 +656,9 @@ int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vm
{
	int ret;
	struct parasite_dump_pages_args *pargs;
	pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);

	pargs = prep_dump_pages_args(ctl, vma_area_list, mdc->pre_dump);
	pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);

	/*
	 * Add PROT_READ protection for all VMAs we're about to
	 * dump if they don't have one. Otherwise we'll not be
@@ -711,19 +698,15 @@ int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vm
	 * 9. syscall fails to copy
	 *    data from M
	 */
	pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);

	if ((pargs->nr_vmas != 0) && (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE)) {
		pargs->add_prot = PROT_READ;
		pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);

		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
		if (ret) {
			pr_err("Can't dump unprotect vmas with parasite\n");
			return ret;
		}
	}
	pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__);

	if (fault_injected(FI_DUMP_PAGES)) {
		pr_err("fault: Dump VMA pages failure!\n");
criu/page-xfer.c (238 changed lines)
@@ -8,6 +8,10 @@
 #include <sys/wait.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+#include <time.h>
+#include <string.h>

 #undef LOG_PREFIX
 #define LOG_PREFIX "page-xfer: "
@@ -27,6 +31,11 @@
 #include "rst_info.h"
 #include "stats.h"
 #include "tls.h"
+#include "uffd.h"
+#include "cow-dump.h"
+#include "criu-plugin.h"
+#include "plugin.h"
+#include "dump.h"

 static int page_server_sk = -1;
@@ -1067,6 +1076,55 @@ static int prep_loc_xfer(struct page_server_iov *pi)
	return 0;
}

/* Statistics tracking structure */
static struct {
	/* page_server_get_pages counters */
	unsigned long get_total_requests;
	unsigned long get_with_cow;
	unsigned long get_no_cow;
	unsigned long get_total_pages;
	unsigned long get_cow_pages;
	unsigned long get_errors;

	/* page_server_serve counters */
	unsigned long serve_open;
	unsigned long serve_open2;
	unsigned long serve_parent;
	unsigned long serve_add_f;
	unsigned long serve_add;
	unsigned long serve_hole;
	unsigned long serve_close;
	unsigned long serve_force_close;
	unsigned long serve_get;
	unsigned long serve_unknown;

	time_t last_print_time;
} ps_stats;

static void check_and_print_stats(void)
{
	time_t now = time(NULL);

	if (now - ps_stats.last_print_time >= 1) {
		pr_warn("[PAGE_SERVER_STATS] get_pages: reqs=%lu with_cow=%lu no_cow=%lu pages=%lu cow=%lu errs=%lu | serve: open2=%lu parent=%lu add_f=%lu get=%lu close=%lu\n",
			ps_stats.get_total_requests,
			ps_stats.get_with_cow,
			ps_stats.get_no_cow,
			ps_stats.get_total_pages,
			ps_stats.get_cow_pages,
			ps_stats.get_errors,
			ps_stats.serve_open2,
			ps_stats.serve_parent,
			ps_stats.serve_add_f,
			ps_stats.serve_get,
			ps_stats.serve_close + ps_stats.serve_force_close);

		/* Reset all counters */
		memset(&ps_stats, 0, sizeof(ps_stats));
		ps_stats.last_print_time = now;
	}
}

static int page_server_add(int sk, struct page_server_iov *pi, u32 flags)
{
	size_t len;
@@ -1141,6 +1199,16 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
	struct page_pipe *pp;
	unsigned long len, nr_pages;
	int ret;
	struct uffdio_writeprotect wp;
	int uffd = -1;
	void *buffer = NULL;
	unsigned long i;
	struct cow_page **cow_pages = NULL;
	unsigned long cow_count = 0;

	/* Update statistics */
	ps_stats.get_total_requests++;
	check_and_print_stats();

	item = pstree_item_by_virt(pi->dst_id);
	pp = dmpi(item)->mem_pp;
@@ -1150,8 +1218,11 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
	 * on 32-bit platforms (e.g. armv7). */
	nr_pages = pi->nr_pages;
	ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY);
-	if (ret)
-		return ret;
+	if (ret) {
+		ps_stats.get_errors++;
+		return ret;
+	}

	/*
	 * The pi is reused for send_psi here, so .nr_pages, .vaddr and
@@ -1161,29 +1232,142 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi)
	pi->nr_pages = nr_pages;
	if (pi->nr_pages == 0) {
		pr_debug("no iovs found, zero pages\n");
+		ps_stats.get_errors++;
		return -1;
	}

-	pi->cmd = encode_ps_cmd(PS_IOV_ADD_F, PE_PRESENT);
-	if (send_psi(sk, pi))
-		return -1;
-
	len = pi->nr_pages * PAGE_SIZE;
+	ps_stats.get_total_pages += pi->nr_pages;

-	if (opts.tls) {
-		if (tls_send_data_from_fd(pipe_read_dest.p[0], len))
-			return -1;
+	/* Single-pass lookup - collect all COW pages */
+	cow_pages = xzalloc(pi->nr_pages * sizeof(struct cow_page *));
+	if (!cow_pages) {
+		pr_err("Failed to allocate COW pages array\n");
+		ps_stats.get_errors++;
+		return -1;
+	}
+
+	for (i = 0; i < pi->nr_pages; i++) {
+		unsigned long page_addr = pi->vaddr + (i * PAGE_SIZE);
+		cow_pages[i] = cow_lookup_and_remove_page(page_addr);
+		if (cow_pages[i])
+			cow_count++;
+	}
+
+	/* Send response header */
+	pi->cmd = encode_ps_cmd(PS_IOV_ADD_F, PE_PRESENT);
+	if (send_psi(sk, pi)) {
+		xfree(cow_pages);
+		ps_stats.get_errors++;
+		return -1;
+	}
+
+	/* Choose fast or slow path based on COW presence */
+	if (cow_count == 0) {
+		/* FAST PATH: Zero-copy splice from pipe to socket */
+		pr_debug("Zero-copy path: splicing %lu pages directly\n", pi->nr_pages);
+		ps_stats.get_no_cow++;
+
+		if (opts.tls) {
+			ret = tls_send_data_from_fd(pipe_read_dest.p[0], len);
+			if (ret) {
+				pr_err("Failed to send via TLS from pipe\n");
+				xfree(cow_pages);
+				ps_stats.get_errors++;
+				return -1;
+			}
+		} else {
+			ssize_t spliced = 0;
+			while (spliced < len) {
+				ret = splice(pipe_read_dest.p[0], NULL, sk, NULL,
+					     len - spliced, SPLICE_F_MOVE);
+				if (ret <= 0) {
+					pr_perror("Failed to splice pipe to socket");
+					xfree(cow_pages);
+					ps_stats.get_errors++;
+					return -1;
+				}
+				spliced += ret;
+			}
+		}
	} else {
-		ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, SPLICE_F_MOVE);
-		if (ret != len)
+		/* SLOW PATH: Buffer + overlay COW pages */
+		pr_debug("Buffered path: overlaying %lu COW pages out of %lu total\n",
+			 cow_count, pi->nr_pages);
+		ps_stats.get_with_cow++;
+		ps_stats.get_cow_pages += cow_count;
+
+		buffer = xmalloc(len);
+		if (!buffer) {
+			pr_err("Failed to allocate buffer for %lu pages\n", pi->nr_pages);
+			goto err_free_cow;
+		}
+
+		ret = read(pipe_read_dest.p[0], buffer, len);
+		if (ret != len) {
+			pr_err("Short read from pipe: %d vs %lu\n", ret, len);
+			goto err_free_all;
+		}
+
+		/* Overlay COW pages */
+		for (i = 0; i < pi->nr_pages; i++) {
+			if (cow_pages[i]) {
+				pr_debug("Overlaying COW page at index %lu\n", i);
+				memcpy(buffer + (i * PAGE_SIZE), cow_pages[i]->data, PAGE_SIZE);
+				xfree(cow_pages[i]->data);
+				xfree(cow_pages[i]);
+			}
+		}
+
+		/* Send buffered data */
+		if (opts.tls) {
+			if (__send(sk, buffer, len, 0) != len) {
+				pr_perror("Failed to send page buffer via TLS");
+				goto err_free_all;
+			}
+		} else {
+			if (send(sk, buffer, len, 0) != len) {
+				pr_perror("Failed to send page buffer");
+				goto err_free_all;
+			}
+		}
+
+		xfree(buffer);
	}

+	xfree(cow_pages);
+
+	/* Step 5: Unprotect all pages in one operation */
+	uffd = cow_get_uffd();
+	if (uffd >= 0) {
+		wp.range.start = pi->vaddr;
+		wp.range.len = len;
+		wp.mode = 0; /* Clear write-protect */
+
+		if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) {
+			pr_perror("Failed to unprotect pages at 0x%llx", wp.range.start);
+			ps_stats.get_errors++;
+			return -1;
+		}
+	}

	tcp_nodelay(sk, true);

	return 0;

+err_free_all:
+	xfree(buffer);
+err_free_cow:
+	for (i = 0; i < pi->nr_pages; i++) {
+		if (cow_pages[i]) {
+			xfree(cow_pages[i]->data);
+			xfree(cow_pages[i]);
+		}
+	}
+	xfree(cow_pages);
+	ps_stats.get_errors++;
+	return -1;
}

extern void pstree_switch_state(struct pstree_item *root_item, int st);
static int page_server_serve(int sk)
{
	int ret = -1;
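The fast/slow split above leans on two helpers that live in cow-dump.c and are not part of this hunk: `cow_lookup_and_remove_page()`, which pops a saved pre-modification copy out of the COW hash table, and `cow_get_uffd()`, which returns the tracked process's userfaultfd (or -1 when COW tracking is off). A minimal sketch of the lookup, assuming a bucketed hash keyed by virtual address with one spinlock per bucket - the names and layout here are assumptions, not the committed implementation:

```c
#include <pthread.h>
#include "common/list.h"

#define COW_HASH_SIZE (1 << 16)

/* Assumed entry layout: the call sites above only rely on ->data. */
struct cow_page {
	unsigned long vaddr;
	void *data;
	struct hlist_node hash;
};

static struct hlist_head cow_hash[COW_HASH_SIZE];
static pthread_spinlock_t cow_hash_locks[COW_HASH_SIZE];

/* Remove and return the saved copy of the page at vaddr, or NULL.
 * Ownership of the entry and its data transfers to the caller. */
static struct cow_page *cow_lookup_and_remove_page(unsigned long vaddr)
{
	unsigned long b = (vaddr / PAGE_SIZE) % COW_HASH_SIZE;
	struct hlist_node *n;
	struct cow_page *cp = NULL;

	pthread_spin_lock(&cow_hash_locks[b]);
	for (n = cow_hash[b].first; n; n = n->next) {
		struct cow_page *p = container_of(n, struct cow_page, hash);

		if (p->vaddr == vaddr) {
			hlist_del(&p->hash);
			cp = p;
			break;
		}
	}
	pthread_spin_unlock(&cow_hash_locks[b]);

	return cp;
}
```

Removing the entry (rather than just reading it) means each saved copy is sent at most once; after the bulk UFFDIO_WRITEPROTECT clear at the end of the function, further writes to those pages no longer fault.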
@@ -1228,36 +1412,54 @@ static int page_server_serve(int sk)
		flushed = false;
		cmd = decode_ps_cmd(pi.cmd);

+		/* Check and print stats on each iteration */
+		check_and_print_stats();
+
		switch (cmd) {
		case PS_IOV_OPEN:
+			ps_stats.serve_open++;
			ret = page_server_open(-1, &pi);
			break;
		case PS_IOV_OPEN2:
+			ps_stats.serve_open2++;
			ret = page_server_open(sk, &pi);
			break;
		case PS_IOV_PARENT:
+			ps_stats.serve_parent++;
			ret = page_server_check_parent(sk, &pi);
			break;
		case PS_IOV_ADD_F:
		case PS_IOV_ADD:
		case PS_IOV_HOLE: {
			u32 flags;

-			if (likely(cmd == PS_IOV_ADD_F))
+			if (likely(cmd == PS_IOV_ADD_F)) {
				flags = decode_ps_flags(pi.cmd);
-			else if (cmd == PS_IOV_ADD)
+				ps_stats.serve_add_f++;
+			} else if (cmd == PS_IOV_ADD) {
				flags = PE_PRESENT;
-			else /* PS_IOV_HOLE */
+				ps_stats.serve_add++;
+			} else { /* PS_IOV_HOLE */
				flags = PE_PARENT;
+				ps_stats.serve_hole++;
+			}

			ret = page_server_add(sk, &pi, flags);
			break;
		}
		case PS_IOV_CLOSE:
		case PS_IOV_FORCE_CLOSE: {
			int32_t status = 0;

			ret = 0;

+			if (cmd == PS_IOV_CLOSE)
+				ps_stats.serve_close++;
+			else
+				ps_stats.serve_force_close++;
+
			/*
			 * An answer must be sent back to inform another side,
@@ -1272,10 +1474,12 @@ static int page_server_serve(int sk)
			break;
		}
		case PS_IOV_GET:
+			ps_stats.serve_get++;
			ret = page_server_get_pages(sk, &pi);
			break;
		default:
			pr_err("Unknown command %u\n", pi.cmd);
+			ps_stats.serve_unknown++;
			ret = -1;
			break;
		}
@@ -50,7 +50,6 @@ static int mprotect_vmas(struct parasite_dump_pages_args *args)
{
	struct parasite_vma_entry *vmas, *vma;
	int ret = 0, i;
-	pr_info("mprotect_vmas file = %s, line = %d\n", __FILE__, __LINE__);

	vmas = pargs_vmas(args);
	for (i = 0; i < args->nr_vmas; i++) {
@@ -76,19 +75,18 @@ static int dump_pages(struct parasite_dump_pages_args *args)
	struct iovec *iovs;
	int off, nr_segs;
	unsigned long spliced_bytes = 0;
	pr_err("dump_pages file = %s, line = %d\n", __FILE__, __LINE__);

	tsock = parasite_get_rpc_sock();
	p = recv_fd(tsock);
	if (p < 0)
		return -1;
	pr_info("dump_pages file = %s, line = %d\n", __FILE__, __LINE__);

	iovs = pargs_iovs(args);
	off = 0;
	nr_segs = args->nr_segs;
	if (nr_segs > UIO_MAXIOV)
		nr_segs = UIO_MAXIOV;
	while (1) {
		pr_info("dump_pages file = %s, line = %d\n", __FILE__, __LINE__);
		ret = sys_vmsplice(p, &iovs[args->off + off], nr_segs, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
		if (ret < 0) {
			sys_close(p);
@@ -868,10 +866,7 @@ static int parasite_cow_dump_init(struct parasite_cow_dump_args *args)
	unsigned long addr, len;
	unsigned long total_pages = 0;
	unsigned int *failed_indices;
	unsigned long threshold_pages = 25000; /* 25K pages ~= 100MB */
-	/*unsigned long features = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
-				   UFFD_FEATURE_EVENT_FORK |
-				   UFFD_FEATURE_EVENT_REMAP;*/

	pr_info("COW dump init: registering %d VMAs\n", args->nr_vmas);
@@ -879,10 +874,14 @@ static int parasite_cow_dump_init(struct parasite_cow_dump_args *args)
	failed_indices = cow_dump_failed_indices(args);

	/* Create userfaultfd in target process context */
	uffd = sys_userfaultfd(O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
-		pr_err("Failed to create userfaultfd: %d\n", uffd);
-		return -1;
+		int err = -uffd; /* Convert negative errno to positive */
+		pr_err("Failed to create userfaultfd: %d (%s)\n", err,
+		       err == ENOSYS ? "not supported" :
+		       err == EPERM  ? "permission denied" :
+		       err == EINVAL ? "invalid flags" : "unknown error");
+		return -1;
	}

	/* Initialize userfaultfd API with WP features */
@@ -913,10 +912,6 @@ static int parasite_cow_dump_init(struct parasite_cow_dump_args *args)
		pr_info("Registering VMA %d: %lx-%lx prot=%x len=%lu\n",
			i, addr, addr + len, vma->prot, len);

-		if ((len / PAGE_SIZE) < threshold_pages) {
-			pr_info("Skipping small VMA: %lx-%lx len=%lu\n", addr, addr + len, len);
-		}
-
		/* Skip non-writable VMAs */
		if (!(vma->prot & PROT_WRITE)) {
			pr_info("Skipping non-writable VMA: %lx-%lx len=%lu\n", addr, addr + len, len);
@@ -1989,6 +1989,9 @@ __visible long __export_restore_task(struct task_restore_args *args)
	for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
		if (vma_entry->madv & (1ul << m)) {
+			if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
+				continue;
+
			ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m);
			if (ret) {
				pr_err("madvise(%" PRIx64 ", %" PRIu64 ", %ld) "
@@ -60,6 +60,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
	__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
	__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
	__assign_hook(POST_FORKING, "cr_plugin_post_forking");
+	__assign_hook(RESTORE_INIT, "cr_plugin_restore_init");
+	__assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late");

#undef __assign_hook
@@ -257,8 +259,16 @@ int cr_plugin_init(int stage)
		goto err;
	}

-	if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins())
-		goto err;
+	if (stage == CR_PLUGIN_STAGE__RESTORE) {
+		int ret;
+
+		if (check_inventory_plugins())
+			goto err;
+
+		ret = run_plugins(RESTORE_INIT);
+		if (ret < 0 && ret != -ENOTSUP)
+			goto err;
+	}

	exit_code = 0;
err:
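With `run_plugins(RESTORE_INIT)` wired into `cr_plugin_init()`, a plugin gets a one-shot callback at the start of the restore stage (the amdgpu plugin uses it further down to set up its shared handle table). A sketch of a plugin opting in, with a hypothetical plugin name:

```c
#include "criu-plugin.h"

/* Hypothetical plugin: one-time setup before any files are restored. */
int my_plugin_restore_init(void)
{
	/* Allocate shared state, scan images, etc. */
	return 0; /* -ENOTSUP is also tolerated by the caller above */
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, my_plugin_restore_init)
```

Note the caller treats -ENOTSUP as "hook not applicable" rather than as a failure, matching the convention of the other plugin hooks.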
@@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me)
	ret = 0;

	return ret;
}
criu/uffd.c (192 changed lines)
@@ -98,6 +98,10 @@ struct lazy_pages_info {
	unsigned long buf_size;
	void *buf;

+	/* Pipeline control */
+	unsigned int pipeline_depth;	 /* Current in-flight requests */
+	unsigned int max_pipeline_depth; /* Max allowed concurrent requests */
};

/* global lazy-pages daemon state */
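The two new fields implement a simple credit scheme: a slot is reserved before each asynchronous page request goes out, released once its response has been consumed, and the window is topped back up immediately afterwards. A sketch of the contract, with `issue_async_page_request()` standing in for the real senders:

```c
/* Sketch only - the real accounting is spread across xfer_pages(),
 * handle_page_fault() and uffd_io_complete() further down. */
static int send_one_request(struct lazy_pages_info *lpi)
{
	if (lpi->pipeline_depth >= lpi->max_pipeline_depth)
		return 0;                 /* window full: wait for a response */

	lpi->pipeline_depth++;            /* reserve the slot BEFORE sending */
	if (issue_async_page_request(lpi) < 0) { /* hypothetical sender */
		lpi->pipeline_depth--;    /* roll back the reservation */
		return -1;
	}
	return 1;
}
```

On completion the depth is decremented and the refill logic runs straight away, so the window never drains just because the main loop is busy.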
@@ -110,6 +114,108 @@ static struct epoll_rfd lazy_sk_rfd;
/* socket for communication with lazy-pages daemon */
static int lazy_pages_sk_id = -1;

+/* Histogram statistics structure */
+static struct {
+	/* Histogram buckets by page count: 1, 16, 32, 64, 128, 256, 512, 1024, >1024 */
+	unsigned long pf_hist[9]; /* Page fault histogram */
+	unsigned long bg_hist[9]; /* Background transfer histogram */
+
+	unsigned long total_pf_reqs;
+	unsigned long total_bg_reqs;
+	unsigned long total_pages;
+
+	/* Pipeline statistics */
+	unsigned long pipeline_depth_sum;
+	unsigned long pipeline_samples;
+
+	time_t last_print_time;
+} uffd_stats;
+
+static int get_histogram_bucket(unsigned long nr_pages)
+{
+	if (nr_pages == 1) return 0;    /* 4KB */
+	if (nr_pages <= 16) return 1;   /* 64KB */
+	if (nr_pages <= 32) return 2;   /* 128KB */
+	if (nr_pages <= 64) return 3;   /* 256KB */
+	if (nr_pages <= 128) return 4;  /* 512KB */
+	if (nr_pages <= 256) return 5;  /* 1MB */
+	if (nr_pages <= 512) return 6;  /* 2MB */
+	if (nr_pages <= 1024) return 7; /* 4MB */
+	return 8;                       /* >4MB */
+}
+
+static const char *get_bucket_label(int bucket)
+{
+	switch (bucket) {
+	case 0: return "4K";
+	case 1: return "64K";
+	case 2: return "128K";
+	case 3: return "256K";
+	case 4: return "512K";
+	case 5: return "1M";
+	case 6: return "2M";
+	case 7: return "4M";
+	case 8: return ">4M";
+	default: return "?";
+	}
+}
+
+static void check_and_print_uffd_stats(void)
+{
+	time_t now = time(NULL);
+	int i;
+	bool has_pf = false, has_bg = false;
+	unsigned long avg_pipeline = 0;
+
+	if (now - uffd_stats.last_print_time >= 1) {
+		/* Check if we have any data to print */
+		for (i = 0; i < 9; i++) {
+			if (uffd_stats.pf_hist[i] > 0) has_pf = true;
+			if (uffd_stats.bg_hist[i] > 0) has_bg = true;
+		}
+
+		if (!has_pf && !has_bg && uffd_stats.total_pf_reqs == 0 && uffd_stats.total_bg_reqs == 0) {
+			uffd_stats.last_print_time = now;
+			return;
+		}
+
+		/* Calculate average pipeline depth */
+		if (uffd_stats.pipeline_samples > 0)
+			avg_pipeline = uffd_stats.pipeline_depth_sum / uffd_stats.pipeline_samples;
+
+		pr_warn("[UFFD_STATS] reqs=%lu(pf:%lu,bg:%lu) pages=%lu pipe_avg=%lu\n",
+			uffd_stats.total_pf_reqs + uffd_stats.total_bg_reqs,
+			uffd_stats.total_pf_reqs,
+			uffd_stats.total_bg_reqs,
+			uffd_stats.total_pages,
+			avg_pipeline);
+
+		/* Print page fault histogram */
+		if (has_pf) {
+			pr_warn(" PF: ");
+			for (i = 0; i < 9; i++) {
+				if (uffd_stats.pf_hist[i] > 0)
+					pr_warn(" %s=%lu", get_bucket_label(i), uffd_stats.pf_hist[i]);
+			}
+			pr_warn("\n");
+		}
+
+		/* Print background transfer histogram */
+		if (has_bg) {
+			pr_warn(" BG: ");
+			for (i = 0; i < 9; i++) {
+				if (uffd_stats.bg_hist[i] > 0)
+					pr_warn(" %s=%lu", get_bucket_label(i), uffd_stats.bg_hist[i]);
+			}
+			pr_warn("\n");
+		}
+
+		/* Reset all counters */
+		memset(&uffd_stats, 0, sizeof(uffd_stats));
+		uffd_stats.last_print_time = now;
+	}
+}
+
static int handle_uffd_event(struct epoll_rfd *lpfd);

static struct lazy_pages_info *lpi_init(void)
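Buckets are indexed by page count, so with 4KB pages the labels line up as 1 page = 4K, up to 16 pages = 64K, and so on up to the >1024-page catch-all. For example:

```c
/* Worked examples for the bucketing above (4KB pages): */
get_histogram_bucket(1);    /* -> 0, printed as "4K"   */
get_histogram_bucket(100);  /* -> 4, printed as "512K" (falls in the <=128-page bucket) */
get_histogram_bucket(2048); /* -> 8, printed as ">4M"  */
```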
@@ -127,6 +233,10 @@ static struct lazy_pages_info *lpi_init(void)
	lpi->lpfd.read_event = handle_uffd_event;
	lpi->xfer_len = DEFAULT_XFER_LEN;
	lpi->ref_cnt = 1;

+	/* Initialize pipeline control - start with aggressive pipelining */
+	lpi->pipeline_depth = 0;
+	lpi->max_pipeline_depth = 256; /* 256 concurrent requests for maximum throughput */

	return lpi;
}
@@ -843,6 +953,27 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsi
	return 0;
}

+static int xfer_pages(struct lazy_pages_info *lpi);
+
+/*
+ * Aggressively refill pipeline to maximum capacity.
+ * Called immediately when a response arrives to keep pipeline saturated.
+ */
+static int refill_pipeline(struct lazy_pages_info *lpi)
+{
+	int ret;
+
+	/* Keep filling until pipeline is full or we run out of data */
+	while (!list_empty(&lpi->iovs) &&
+	       lpi->pipeline_depth < lpi->max_pipeline_depth) {
+		ret = xfer_pages(lpi);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}

static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages)
{
	struct uffdio_copy uffdio_copy;
@@ -916,7 +1047,20 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsign
	 * list and let drop_iovs do the range math, free memory etc.
	 */
	iov_list_insert(req, &lpi->iovs);
-	return drop_iovs(lpi, addr, nr * PAGE_SIZE);
+	ret = drop_iovs(lpi, addr, nr * PAGE_SIZE);
+
+	/*
+	 * Decrement pipeline depth now that response is processed.
+	 * IMMEDIATELY refill pipeline to keep it saturated - don't wait for main loop!
+	 * This is the key to aggressive pipelining and reducing source EAGAIN.
+	 */
+	lpi->pipeline_depth--;
+
+	if (!lpi->exited && !list_empty(&lpi->iovs))
+		refill_pipeline(lpi);
+
+	return ret;
}

static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages)
@@ -990,6 +1134,8 @@ static struct lazy_iov *pick_next_range(struct lazy_pages_info *lpi)
 */
static void update_xfer_len(struct lazy_pages_info *lpi, bool pf)
{
+	lpi->xfer_len = 8 * 1024; /* MAX_XFER_LEN */
+	return; /* TODO remove */
	if (pf)
		lpi->xfer_len = DEFAULT_XFER_LEN;
	else
@@ -1005,6 +1151,7 @@ static int xfer_pages(struct lazy_pages_info *lpi)
	unsigned long nr_pages;
	unsigned long len;
	int err;
+	int bucket;

	iov = pick_next_range(lpi);
	if (!iov)
@@ -1019,17 +1166,29 @@ static int xfer_pages(struct lazy_pages_info *lpi)
	nr_pages = (iov->end - iov->start) / PAGE_SIZE;

+	/* Update statistics */
+	uffd_stats.total_bg_reqs++;
+	uffd_stats.total_pages += nr_pages;
+	bucket = get_histogram_bucket(nr_pages);
+	uffd_stats.bg_hist[bucket]++;
+
	update_xfer_len(lpi, false);

+	/* Increment pipeline depth BEFORE sending request */
+	lpi->pipeline_depth++;
+
	err = uffd_handle_pages(lpi, iov->img_start, nr_pages, PR_ASYNC | PR_ASAP);
	if (err < 0) {
		lp_err(lpi, "Error during UFFD copy\n");
+		lpi->pipeline_depth--; /* Rollback on error */
		return -1;
	}

	return 0;
}

static int handle_remove(struct lazy_pages_info *lpi, struct uffd_msg *msg)
{
	struct uffdio_range unreg;
@@ -1154,6 +1313,8 @@ static int handle_page_fault(struct lazy_pages_info *lpi, struct uffd_msg *msg)
	struct lazy_iov *iov;
	__u64 address;
	int ret;
+	unsigned long nr_pages;
+	int bucket;

	/* Align requested address to the next page boundary */
	address = msg->arg.pagefault.address & ~(page_size() - 1);
@@ -1172,11 +1333,23 @@ static int handle_page_fault(struct lazy_pages_info *lpi, struct uffd_msg *msg)
	list_move(&iov->l, &lpi->reqs);

+	nr_pages = (iov->end - iov->start) / PAGE_SIZE;
+
+	/* Update statistics */
+	uffd_stats.total_pf_reqs++;
+	uffd_stats.total_pages += nr_pages;
+	bucket = get_histogram_bucket(nr_pages);
+	uffd_stats.pf_hist[bucket]++;
+
	update_xfer_len(lpi, true);

-	ret = uffd_handle_pages(lpi, iov->img_start, 1, PR_ASYNC | PR_ASAP);
+	/* Increment pipeline depth BEFORE sending request (just like background transfers) */
+	lpi->pipeline_depth++;
+
+	ret = uffd_handle_pages(lpi, iov->img_start, nr_pages, PR_ASYNC | PR_ASAP);
	if (ret < 0) {
		lp_err(lpi, "Error during regular page copy\n");
+		lpi->pipeline_depth--; /* Rollback on error */
		return -1;
	}
@@ -1248,6 +1421,15 @@ static int handle_requests(int epollfd, struct epoll_event **events, int nr_fds)
	int ret;

	for (;;) {
+		/* Sample pipeline depth for statistics */
+		list_for_each_entry_safe(lpi, n, &lpis, l) {
+			uffd_stats.pipeline_depth_sum += lpi->pipeline_depth;
+			uffd_stats.pipeline_samples++;
+		}
+
+		/* Check and print statistics every second */
+		check_and_print_uffd_stats();
+
		ret = epoll_run_rfds(epollfd, *events, nr_fds, poll_timeout);
		if (ret < 0)
			goto out;
@@ -1265,11 +1447,11 @@ static int handle_requests(int epollfd, struct epoll_event **events, int nr_fds)
		ret = 0;

		list_for_each_entry_safe(lpi, n, &lpis, l) {
-			if (!list_empty(&lpi->iovs) && list_empty(&lpi->reqs)) {
-				ret = xfer_pages(lpi);
+			/* Aggressively refill pipeline to keep it saturated at all times */
+			if (!list_empty(&lpi->iovs)) {
+				ret = refill_pipeline(lpi);
				if (ret < 0)
					goto out;
				break;
			}

			if (list_empty(&lpi->reqs)) {
@@ -27,7 +27,7 @@ endif
criu-amdgpu.pb-c.c: criu-amdgpu.proto
	protoc --proto_path=. --c_out=. criu-amdgpu.proto

-amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
+amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
	$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)

amdgpu_plugin_clean:
plugins/amdgpu/amdgpu_drm.h (new file, 1801 lines - diff suppressed because it is too large)
@@ -12,25 +12,33 @@
#include <sys/sysmacros.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdint.h>
#include <pthread.h>
#include <semaphore.h>

#include <xf86drm.h>
#include <libdrm/amdgpu.h>
#include <libdrm/amdgpu_drm.h>

#include "criu-plugin.h"
#include "plugin.h"
#include "criu-amdgpu.pb-c.h"
#include "util.h"
#include "util-pie.h"
#include "fdstore.h"

#include "kfd_ioctl.h"
#include "xmalloc.h"
#include "criu-log.h"
#include "files.h"
#include "pstree.h"
#include "sockets.h"
#include "rst-malloc.h"

#include "common/list.h"
#include "amdgpu_drm.h"
#include "amdgpu_plugin_dmabuf.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
@@ -39,6 +47,7 @@
#include "img-streamer.h"
#include "image.h"
#include "cr_options.h"
#include "util.h"

struct vma_metadata {
	struct list_head list;
@@ -51,13 +60,6 @@ struct vma_metadata {
/************************************ Global Variables ********************************************/

-/**
- * FD of KFD device used to checkpoint. On a multi-process
- * tree the order of checkpointing goes from parent to child
- * and so on - so saving the FD will not be overwritten
- */
-static int kfd_checkpoint_fd;
-
static LIST_HEAD(update_vma_info_list);

size_t kfd_max_buffer_size;
@@ -66,6 +68,19 @@ bool plugin_added_to_inventory = false;
bool plugin_disabled = false;

+struct handle_id {
+	int handle;
+	int fdstore_id;
+};
+
+struct shared_handle_ids {
+	int num_handles;
+	struct handle_id *handles;
+};
+
+struct shared_handle_ids *shared_memory = NULL;
+
+static mutex_t *shared_memory_mutex;
+
+int current_pid;
/*
 * In the case of a single process (common case), this optimization can effectively
 * reduce the restore latency with parallel restore. In the case of multiple processes,
@@ -313,8 +328,6 @@ void getenv_size_t(const char *var, size_t *value)
	int sh = 0;
	size_t size;

-	pr_info("Value str: %s\n", value_str);
-
	if (value_str) {
		size = (size_t)strtoul(value_str, &endp, 0);
		if (errno || value_str == endp) {
@@ -526,11 +539,11 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va,
	amdgpu_bo_free(h_bo);
}

-static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp,
-			void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
-			uint64_t max_copy_size, enum sdma_op_type type)
+int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
+		 void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
+		 uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free)
{
-	uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
+	uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain;
	uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size;
	amdgpu_va_handle h_va_src, h_va_dst, h_va_ib;
	amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib;
@@ -543,10 +556,8 @@ int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
	uint32_t expired;
	amdgpu_context_handle h_ctx;
	uint32_t *ib = NULL;
-	int j, err, shared_fd, packets_per_buffer;
+	int j, err, packets_per_buffer;

-	shared_fd = bo_bucket.dmabuf_fd;
-	size = bo_bucket.size;
	buffer_bo_size = min(size, buffer_size);
	packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1;
	src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size;
@@ -757,7 +768,8 @@ err_dst_bo_map:
	if (err)
		pr_perror("dest range free failed");
err_dst_va:
-	err = amdgpu_bo_free(h_bo_dst);
+	if (!do_not_free)
+		err = amdgpu_bo_free(h_bo_dst);
	if (err)
		pr_perror("dest bo free failed");
err_dst_bo_prep:
@@ -845,8 +857,9 @@ void *dump_bo_contents(void *_thread_data)
	num_bos++;

	/* perform sDMA based vram copy */
-	ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
-			   SDMA_OP_VRAM_READ);
+	ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev,
+			   max_copy_size, SDMA_OP_VRAM_READ, false);

	if (ret) {
		pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i);
		break;
@@ -943,8 +956,8 @@ void *restore_bo_contents(void *_thread_data)
	num_bos++;

-	ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
-			   SDMA_OP_VRAM_WRITE);
+	ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev,
+			   max_copy_size, SDMA_OP_VRAM_WRITE, false);
	if (ret) {
		pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
		break;
@@ -1030,28 +1043,163 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
	return 0;
}

-static int unpause_process(int fd)
+int amdgpu_unpause_processes(int pid)
{
	int ret = 0;
	struct kfd_ioctl_criu_args args = { 0 };
+	struct list_head *l = get_dumped_fds();
+	struct dumped_fd *st;

-	args.op = KFD_CRIU_OP_UNPAUSE;
+	list_for_each_entry(st, l, l) {
+		if (st->is_drm) {
+			close(st->fd);
+		} else {
+			args.op = KFD_CRIU_OP_UNPAUSE;

-	ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
-	if (ret) {
-		pr_perror("Failed to unpause process");
-		goto exit;
+			ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args);
+			if (ret) {
+				pr_perror("Failed to unpause process");
+				goto exit;
+			}
+		}
	}

-	/* Reset the KFD FD */
-	kfd_checkpoint_fd = -1;
	sys_close_drm_render_devices(&src_topology);
+	if (post_dump_dmabuf_check() < 0)
+		ret = -1;

exit:
	pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
+	clear_dumped_fds();

	return ret;
}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes)

+int store_dmabuf_fd(int handle, int fd)
+{
+	int id;
+
+	id = fdstore_add(fd);
+	mutex_lock(shared_memory_mutex);
+	for (int i = 0; i < shared_memory->num_handles; i++) {
+		if (shared_memory->handles[i].handle == handle) {
+			mutex_unlock(shared_memory_mutex);
+			return 0;
+		}
+		if (shared_memory->handles[i].handle == -1) {
+			shared_memory->handles[i].handle = handle;
+			shared_memory->handles[i].fdstore_id = id;
+			mutex_unlock(shared_memory_mutex);
+			return 0;
+		}
+	}
+	mutex_unlock(shared_memory_mutex);
+
+	return -1;
+}
+
+int amdgpu_id_for_handle(int handle)
+{
+	mutex_lock(shared_memory_mutex);
+	for (int i = 0; i < shared_memory->num_handles; i++) {
+		if (shared_memory->handles[i].handle == handle) {
+			mutex_unlock(shared_memory_mutex);
+			return shared_memory->handles[i].fdstore_id;
+		}
+	}
+	mutex_unlock(shared_memory_mutex);
+	return -1;
+}
+
+int amdgpu_restore_init(void)
+{
+	if (!shared_memory) {
+		int protection = PROT_READ | PROT_WRITE;
+		int visibility = MAP_SHARED | MAP_ANONYMOUS;
+		size_t img_size;
+		FILE *img_fp = NULL;
+		int ret;
+		unsigned char *buf;
+		int num_handles = 0;
+		char img_path[PATH_MAX];
+		CriuRenderNode *rd = NULL;
+		CriuKfd *e = NULL;
+
+		DIR *d;
+		struct dirent *dir;
+		d = opendir(".");
+		if (d) {
+			while ((dir = readdir(d)) != NULL) {
+				if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) {
+					img_fp = open_img_file(dir->d_name, false, &img_size);
+					buf = xmalloc(img_size);
+					if (!buf) {
+						fclose(img_fp);
+						return -ENOMEM;
+					}
+
+					ret = read_fp(img_fp, buf, img_size);
+					if (ret) {
+						pr_perror("Unable to read from %s", img_path);
+						fclose(img_fp);
+						xfree(buf);
+						return ret;
+					}
+
+					fclose(img_fp);
+					e = criu_kfd__unpack(NULL, img_size, buf);
+					num_handles += e->num_of_bos;
+					criu_kfd__free_unpacked(e, NULL);
+					xfree(buf);
+				}
+				if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) {
+					img_fp = open_img_file(dir->d_name, false, &img_size);
+					buf = xmalloc(img_size);
+					if (!buf) {
+						fclose(img_fp);
+						return -ENOMEM;
+					}
+
+					ret = read_fp(img_fp, buf, img_size);
+					if (ret) {
+						pr_perror("Unable to read from %s", img_path);
+						fclose(img_fp);
+						xfree(buf);
+						return ret;
+					}
+
+					fclose(img_fp);
+					rd = criu_render_node__unpack(NULL, img_size, buf);
+					num_handles += rd->num_of_bos;
+					criu_render_node__free_unpacked(rd, NULL);
+					xfree(buf);
+				}
+			}
+			closedir(d);
+		}
+
+		if (num_handles > 0) {
+			shared_memory = mmap(NULL, sizeof(*shared_memory), protection, visibility, -1, 0);
+			shared_memory->num_handles = num_handles;
+			shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0);
+
+			for (int i = 0; i < num_handles; i++) {
+				shared_memory->handles[i].handle = -1;
+				shared_memory->handles[i].fdstore_id = -1;
+			}
+
+			shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex));
+			if (!shared_memory_mutex) {
+				pr_err("Can't create amdgpu mutex\n");
+				return -1;
+			}
+			mutex_init(shared_memory_mutex);
+		}
+	}
+
+	return 0;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init)

static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
			CriuKfd *e)
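`get_dumped_fds()`, `record_dumped_fd()` (called from amdgpu_plugin_dump_file() below) and `clear_dumped_fds()` are provided by the plugin's util code, not by this hunk. The call sites imply a simple list of {fd, is_drm} records; a sketch under that assumption:

```c
#include <errno.h>
#include <stdbool.h>
#include "common/list.h"
#include "xmalloc.h"

/* Assumed bookkeeping for fds dumped during this checkpoint;
 * the committed helpers may differ. */
struct dumped_fd {
	struct list_head l;
	int fd;
	bool is_drm;
};

static LIST_HEAD(dumped_fds);

int record_dumped_fd(int fd, bool is_drm)
{
	struct dumped_fd *st = xmalloc(sizeof(*st));

	if (!st)
		return -ENOMEM;
	st->fd = fd;
	st->is_drm = is_drm;
	list_add_tail(&st->l, &dumped_fds);
	return 0;
}

struct list_head *get_dumped_fds(void)
{
	return &dumped_fds;
}

void clear_dumped_fds(void)
{
	struct dumped_fd *st, *tmp;

	list_for_each_entry_safe(st, tmp, &dumped_fds, l) {
		list_del(&st->l);
		xfree(st);
	}
}
```

Deferring the KFD unpause ioctl to the DUMP_DEVICES_LATE hook and walking this list there replaces the old checkpoint-counter dance around unpause_process().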
@@ -1095,6 +1243,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
{
	struct thread_data *thread_datas;
	int ret = 0, i;
+	amdgpu_device_handle h_dev;
+	uint32_t major, minor;

	pr_debug("Dumping %d BOs\n", args->num_bos);
@@ -1118,6 +1268,19 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd
		boinfo->size = bo_bucket->size;
		boinfo->offset = bo_bucket->offset;
		boinfo->alloc_flags = bo_bucket->alloc_flags;
+
+		ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev);
+
+		boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd);
+
+		amdgpu_device_deinitialize(h_dev);
	}
+	for (i = 0; i < e->num_of_bos; i++) {
+		KfdBoEntry *boinfo = e->bo_entries[i];
+
+		ret = record_shared_bo(boinfo->handle, false);
+		if (ret)
+			goto exit;
+	}

	for (int i = 0; i < e->num_of_gpus; i++) {
@@ -1238,10 +1401,17 @@ int amdgpu_plugin_dump_file(int fd, int id)
		return -1;
	}

	/* Initialize number of device files that will be checkpointed */
	init_gpu_count(&src_topology);

+	/* Check whether this plugin was called for kfd, dmabuf or render nodes */
+	ret = get_dmabuf_info(fd, &st);
+	if (ret < 0) {
+		pr_perror("Failed to get dmabuf info");
+		return -1;
+	}
+	if (ret == 0) {
+		pr_info("Dumping dmabuf fd = %d\n", fd);
+		return amdgpu_plugin_dmabuf_dump(fd, id);
+	}
+
-	/* Check whether this plugin was called for kfd or render nodes */
	if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {

		/* This is RenderD dumper plugin, for now just save renderD
@@ -1252,14 +1422,12 @@ int amdgpu_plugin_dump_file(int fd, int id)
		if (ret)
			return ret;

-		/* Invoke unpause process if needed */
-		decrement_checkpoint_count();
-		if (checkpoint_is_complete()) {
-			ret = unpause_process(kfd_checkpoint_fd);
-		}
+		ret = record_dumped_fd(fd, true);
+		if (ret)
+			return ret;

		/* Need to return success here so that criu can call plugins for renderD nodes */
-		return ret;
+		return try_dump_dmabuf_list();
	}

	pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev));
@@ -1354,14 +1522,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
	xfree(buf);

-exit:
-	/* Restore all queues if conditions permit */
-	kfd_checkpoint_fd = fd;
-	decrement_checkpoint_count();
-	if (checkpoint_is_complete()) {
-		ret = unpause_process(fd);
-	}
+	ret = record_dumped_fd(fd, false);
+	if (ret)
+		goto exit;

+exit:
	xfree((void *)args.devices);
	xfree((void *)args.bos);
	xfree((void *)args.priv_data);
@@ -1384,7 +1549,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
	int ret = 0, bucket_index = 0;

	pr_debug("Restoring %d devices\n", e->num_of_gpus);
-
	args->num_devices = e->num_of_gpus;
	device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices);
	if (!device_buckets)
@@ -1457,6 +1621,29 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
	}

	pr_info("Restore BOs Ok\n");

	return 0;
}

+int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd)
+{
+	struct vma_metadata *vma_md;
+
+	vma_md = xmalloc(sizeof(*vma_md));
+	if (!vma_md)
+		return -ENOMEM;
+
+	memset(vma_md, 0, sizeof(*vma_md));
+
+	vma_md->old_pgoff = offset;
+	vma_md->vma_entry = addr;
+
+	vma_md->new_pgoff = restored_offset;
+	vma_md->fd = fd;
+
+	list_add_tail(&vma_md->list, &update_vma_info_list);
+
+	return 0;
+}
@@ -1614,7 +1801,7 @@ exit:
	return ret;
}

-int amdgpu_plugin_restore_file(int id)
+int amdgpu_plugin_restore_file(int id, bool *retry_needed)
{
	int ret = 0, fd;
	char img_path[PATH_MAX];
@@ -1625,6 +1812,8 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
	size_t img_size;
	FILE *img_fp = NULL;

+	*retry_needed = false;
+
	if (plugin_disabled)
		return -ENOTSUP;
@@ -1643,12 +1832,21 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
	 * first as we assume restore_maps is already filled. Need to fix this later.
	 */
	snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
-	pr_info("Restoring RenderD %s\n", img_path);

	img_fp = open_img_file(img_path, false, &img_size);
-	if (!img_fp)
-		return -EINVAL;
+	if (!img_fp) {
+		ret = amdgpu_plugin_dmabuf_restore(id);
+		if (ret == 1) {
+			/* This is a dmabuf fd, but the corresponding buffer object that was
+			 * exported to make it has not yet been restored. Need to try again
+			 * later when the buffer object exists, so it can be re-exported.
+			 */
+			*retry_needed = true;
+			return 0;
+		}
+		return ret;
+	}
+	pr_info("Restoring RenderD %s\n", img_path);
	pr_debug("RenderD Image file size:%ld\n", img_size);
	buf = xmalloc(img_size);
	if (!buf) {
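The `retry_needed` out-parameter changes the restore contract: returning 0 with `*retry_needed == true` means "the exporting BO is not restored yet, call me again". The caller therefore needs a retry loop along these lines (a sketch of the assumed caller, not code from this commit; `MAX_RESTORE_PASSES` is hypothetical):

```c
#include <errno.h>
#include <stdbool.h>

#define MAX_RESTORE_PASSES 8 /* hypothetical cap on retry passes */

/* Hypothetical driver loop for plugin file restore with retries. */
int restore_with_retries(int id)
{
	bool retry = true;
	int pass = 0, fd = -1;

	while (retry && pass++ < MAX_RESTORE_PASSES) {
		retry = false;
		fd = amdgpu_plugin_restore_file(id, &retry);
		if (fd < 0)
			return fd; /* hard error */
		/* retry == true: dependency missing, run another pass */
	}
	return retry ? -EAGAIN : fd;
}
```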
@@ -1689,8 +1887,18 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
	pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);

	fd = node_get_drm_render_device(tp_node);
-	if (fd < 0)
+	if (fd < 0) {
+		pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor);
		return -1;
+	}

+	ret = amdgpu_plugin_drm_restore_file(fd, rd);
+	if (ret == 1)
+		*retry_needed = true;
	if (ret < 0) {
		fd = ret;
		goto fail;
	}
fail:
	criu_render_node__free_unpacked(rd, NULL);
	xfree(buf);
@@ -1702,12 +1910,20 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
	 * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in
	 * tp_node.
	 */
-	fd = dup(fd);
-	if (fd == -1) {
-		pr_perror("unable to duplicate the render fd");
-		return -1;
-	}
-	return fd;
+	if (fd < 0)
+		return fd;
+
+	if (!(*retry_needed)) {
+		fd = dup(fd);
+		if (fd == -1) {
+			pr_perror("unable to duplicate the render fd");
+			return -1;
+		}
+		return fd;
+	}
+
+	return 0;
}

	fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
@@ -1751,11 +1967,13 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
	 * This way, we know that the file descriptors we store will not conflict with file descriptors inside core
	 * CRIU.
	 */
-	fd_next = find_unused_fd_pid(e->pid);
-	if (fd_next <= 0) {
-		pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
-		ret = -EINVAL;
-		goto exit;
+	if (fd_next == -1) {
+		fd_next = find_unused_fd_pid(e->pid);
+		if (fd_next <= 0) {
+			pr_err("Failed to find unused fd (fd:%d)\n", fd_next);
+			ret = -EINVAL;
+			goto exit;
+		}
	}

	ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology);
@@ -1788,14 +2006,26 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
	args.num_objects = e->num_of_objects;
	args.priv_data_size = e->priv_data.len;
	args.priv_data = (uintptr_t)e->priv_data.data;

	args.op = KFD_CRIU_OP_RESTORE;
	if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
		pr_perror("Restore ioctl failed");
		ret = -1;
		goto exit;
	}

+	if (ret < 0)
+		goto exit;
+
+	for (int i = 0; i < args.num_bos; i++) {
+		struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i];
+		KfdBoEntry *bo_entry = e->bo_entries[i];
+
+		if (bo_entry->handle != -1)
+			store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd);
+	}
+
	ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e);
	if (ret)
		goto exit;
@@ -1938,19 +2168,14 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
		}
	}

+	clear_restore_state();
+
	close(fd);
	return exit_code;
}

CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)

-int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size,
-			amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type)
-{
-	return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer,
-			    buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
-}
-
int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size)
{
	int ret = 0;
@@ -2059,8 +2284,10 @@ void *parallel_restore_bo_contents(void *_thread_data)
		entry = &restore_cmd->entries[i];
		fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
-		ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer,
-					  buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
+		ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp,
+				   buffer, buffer_size, h_dev,
+				   max_copy_size, SDMA_OP_VRAM_WRITE, false);

		if (ret) {
			pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
			goto err_sdma;
plugins/amdgpu/amdgpu_plugin_dmabuf.c (new file, 197 lines)
@@ -0,0 +1,197 @@
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <linux/limits.h>

#include "common/list.h"
#include "criu-amdgpu.pb-c.h"

#include "xmalloc.h"
#include "criu-log.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_dmabuf.h"
#include "fdstore.h"

#include "util.h"
#include "common/scm.h"

struct dmabuf {
	int id;
	int dmabuf_fd;
	struct list_head node;
};

static LIST_HEAD(dmabuf_list);

/* Return < 0 for error, > 0 for "not a dmabuf" and 0 for "is a dmabuf" */
int get_dmabuf_info(int fd, struct stat *st)
{
	char path[PATH_MAX];

	if (read_fd_link(fd, path, sizeof(path)) < 0)
		return -1;

	if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0)
		return 1;

	return 0;
}

int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id)
{
	int ret = 0;
	char path[PATH_MAX];
	size_t len = 0;
	unsigned char *buf = NULL;
	int gem_handle;

	gem_handle = handle_for_shared_bo_fd(dmabuf_fd);
	if (gem_handle < 0) {
		pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd);
		return -EAGAIN; /* Retry needed */
	}

	CriuDmabufNode *node = xmalloc(sizeof(*node));
	if (!node) {
		pr_err("Failed to allocate memory for dmabuf node\n");
		return -ENOMEM;
	}
	criu_dmabuf_node__init(node);

	node->gem_handle = gem_handle;

	if (node->gem_handle < 0) {
		pr_err("Failed to get handle for dmabuf_fd\n");
		xfree(node);
		return -EINVAL;
	}

	/* Serialize metadata to a file */
	snprintf(path, sizeof(path), IMG_DMABUF_FILE, id);
	len = criu_dmabuf_node__get_packed_size(node);
	buf = xmalloc(len);
	if (!buf) {
		pr_err("Failed to allocate buffer for dmabuf metadata\n");
		xfree(node);
		return -ENOMEM;
	}
	criu_dmabuf_node__pack(node, buf);
	ret = write_img_file(path, buf, len);

	xfree(buf);
	xfree(node);
	return ret;
}

int amdgpu_plugin_dmabuf_restore(int id)
{
	char path[PATH_MAX];
	size_t img_size;
	FILE *img_fp = NULL;
	int ret = 0;
	CriuDmabufNode *rd = NULL;
	unsigned char *buf = NULL;
	int fd_id;

	snprintf(path, sizeof(path), IMG_DMABUF_FILE, id);

	/* Read serialized metadata */
	img_fp = open_img_file(path, false, &img_size);
	if (!img_fp) {
		pr_err("Failed to open dmabuf metadata file: %s\n", path);
		return -EINVAL;
	}

	pr_debug("dmabuf Image file size:%ld\n", img_size);
	buf = xmalloc(img_size);
	if (!buf) {
		pr_perror("Failed to allocate memory");
		fclose(img_fp);
		return -ENOMEM;
	}

	ret = read_fp(img_fp, buf, img_size);
	if (ret) {
		pr_perror("Unable to read from %s", path);
		fclose(img_fp);
		xfree(buf);
		return ret;
	}

	rd = criu_dmabuf_node__unpack(NULL, img_size, buf);
	if (rd == NULL) {
		pr_perror("Unable to parse the dmabuf message %d", id);
		xfree(buf);
		fclose(img_fp);
		return -1;
	}
	fclose(img_fp);

	/* Match GEM handle with shared_dmabuf list */
	fd_id = amdgpu_id_for_handle(rd->gem_handle);
	if (fd_id == -1) {
		pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle);
		return 1; /* Retry needed */
	}

	int dmabuf_fd = fdstore_get(fd_id);
	if (dmabuf_fd == -1) {
		pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle);
		return 1; /* Retry needed */
	}

	pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", dmabuf_fd, rd->gem_handle);
	ret = dmabuf_fd;

	criu_dmabuf_node__free_unpacked(rd, NULL);
	xfree(buf);
	return ret;
}

int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id)
{
	int ret;

	ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id);
	if (ret == -EAGAIN) {
		struct dmabuf *b = xmalloc(sizeof(*b));

		if (!b)
			return -ENOMEM;
		b->id = id;
		b->dmabuf_fd = dmabuf_fd;
		list_add(&b->node, &dmabuf_list);
		return 0;
	}
	return ret;
}

int try_dump_dmabuf_list()
{
	struct dmabuf *b, *t;

	list_for_each_entry_safe(b, t, &dmabuf_list, node) {
		int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id);

		if (ret == -EAGAIN)
			continue;
		if (ret)
			return ret;
		list_del(&b->node);
		xfree(b);
	}
	return 0;
}

int post_dump_dmabuf_check()
{
	if (!list_empty(&dmabuf_list)) {
		pr_err("Not all dma buffers have been dumped\n");
		return -1;
	}
	return 0;
}
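`handle_for_shared_bo_fd()` is the dump-side bridge to the BO bookkeeping built up by `record_shared_bo()` in save_bos() earlier; neither lives in this file. One plausible shape, assuming the recorded handles are kept in a list and the fd-to-handle translation reuses `get_gem_handle()` against a cached device handle - every name below that does not appear in the diff is an assumption:

```c
#include "common/list.h"

struct shared_bo {
	struct list_head l;
	int handle;
};

static LIST_HEAD(shared_bos);
static amdgpu_device_handle shared_h_dev; /* assumed cached at dump time */

/* Resolve a dmabuf fd to a GEM handle recorded by record_shared_bo();
 * a miss tells the caller to retry once the exporter has been dumped. */
int handle_for_shared_bo_fd(int dmabuf_fd)
{
	struct shared_bo *bo;
	int handle = get_gem_handle(shared_h_dev, dmabuf_fd);

	if (handle < 0)
		return -1;

	list_for_each_entry(bo, &shared_bos, l)
		if (bo->handle == handle)
			return handle;

	return -1; /* maps to -EAGAIN in __amdgpu_plugin_dmabuf_dump() */
}
```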
plugins/amdgpu/amdgpu_plugin_dmabuf.h (new file, 16 lines)
@@ -0,0 +1,16 @@
#ifndef __AMDGPU_PLUGIN_DMABUF_H__
#define __AMDGPU_PLUGIN_DMABUF_H__

#include "amdgpu_plugin_util.h"
#include "criu-amdgpu.pb-c.h"

int amdgpu_plugin_dmabuf_dump(int fd, int id);
int amdgpu_plugin_dmabuf_restore(int id);

int try_dump_dmabuf_list();
int post_dump_dmabuf_check();

int get_dmabuf_info(int fd, struct stat *st);

#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */
@@ -19,19 +19,115 @@
#include <dirent.h>
#include "common/list.h"
#include "files.h"
#include "fdstore.h"

#include "criu-amdgpu.pb-c.h"

/* Define __user as empty for kernel headers in user-space */
#define __user
#include "drm.h"

#include <xf86drm.h>
#include <libdrm/amdgpu.h>

#include "xmalloc.h"
#include "criu-log.h"
#include "kfd_ioctl.h"
#include "amdgpu_drm.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"

#include "util.h"
#include "common/scm.h"

int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd)
{
	uint32_t handle;
	int fd = amdgpu_device_get_fd(h_dev);

	if (dmabuf_fd == -1)
		return -1;

	if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle))
		return -1;

	return handle;
}

int drmIoctl(int fd, unsigned long request, void *arg)
{
	int ret, max_retries = 200;

	do {
		ret = ioctl(fd, request, arg);
	} while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN));

	if (ret == -1 && errno == EBADF)
		/* In case pthread_atfork didn't catch it, this will
		 * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
		 */
		pr_perror("KFD file descriptor not valid in this process");
	return ret;
}

static int allocate_bo_entries(CriuRenderNode *e, int num_bos)
{
	e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos);
	if (!e->bo_entries) {
		pr_err("Failed to allocate bo_entries\n");
		return -ENOMEM;
	}

	for (int i = 0; i < num_bos; i++) {
		DrmBoEntry *entry = xzalloc(sizeof(*entry));

		if (!entry) {
			pr_err("Failed to allocate bo entry\n");
			return -ENOMEM;
		}

		drm_bo_entry__init(entry);

		e->bo_entries[i] = entry;
		e->n_bo_entries++;
	}
	return 0;
}

static int allocate_vm_entries(DrmBoEntry *e, int num_vms)
{
	e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms);
	if (!e->vm_entries) {
		pr_err("Failed to allocate vm_entries\n");
		return -ENOMEM;
	}

	for (int i = 0; i < num_vms; i++) {
		DrmVmEntry *entry = xzalloc(sizeof(*entry));

		if (!entry) {
			pr_err("Failed to allocate vm entry\n");
			return -ENOMEM;
		}

		drm_vm_entry__init(entry);

		e->vm_entries[i] = entry;
		e->n_vm_entries++;
	}
	return 0;
}

static void free_e(CriuRenderNode *e)
{
	for (int i = 0; i < e->n_bo_entries; i++) {
		if (e->bo_entries[i])
			xfree(e->bo_entries[i]);
	}

	xfree(e);
}

int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
{
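get_gem_handle() maps fd to handle with drmPrimeFDToHandle(); restore has to run the same PRIME machinery in the other direction, re-exporting a restored BO as a dmabuf fd before store_dmabuf_fd() can stash it. The libdrm counterpart is drmPrimeHandleToFD(); a sketch of that re-export step (the helper name is assumed):

```c
/* Restore-side mirror of get_gem_handle(): GEM handle -> dmabuf fd. */
int export_gem_handle(amdgpu_device_handle h_dev, uint32_t handle)
{
	int dmabuf_fd;
	int fd = amdgpu_device_get_fd(h_dev);

	if (drmPrimeHandleToFD(fd, handle, DRM_CLOEXEC | DRM_RDWR, &dmabuf_fd))
		return -1;

	return dmabuf_fd;
}
```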
@ -60,19 +156,257 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs)
{
        size_t image_size = 0, max_bo_size = 0, buffer_size;
        struct amdgpu_gpu_info gpu_info = { 0 };
        amdgpu_device_handle h_dev;
        uint64_t max_copy_size;
        uint32_t major, minor;
        FILE *bo_contents_fp = NULL;
        void *buffer = NULL;
        char img_path[40];
        int i, ret = 0;

        ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev);
        if (ret) {
                pr_perror("failed to initialize device");
                goto exit;
        }
        plugin_log_msg("libdrm initialized successfully\n");

        ret = amdgpu_query_gpu_info(h_dev, &gpu_info);
        if (ret) {
                pr_perror("failed to query gpuinfo via libdrm");
                goto exit;
        }

        max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
                                                                   SDMA_LINEAR_COPY_MAX_SIZE - 1;

        for (i = 0; i < rd->num_of_bos; i++) {
                if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) {
                        if (rd->bo_entries[i]->size > max_bo_size)
                                max_bo_size = rd->bo_entries[i]->size;
                }
        }

        buffer_size = max_bo_size;

        posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
        if (!buffer) {
                pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.");
                ret = -ENOMEM;
                goto exit;
        }

        for (i = 0; i < rd->num_of_bos; i++) {
                if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)))
                        continue;

                if (rd->bo_entries[i]->num_of_vms == 0)
                        continue;

                snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i);

                bo_contents_fp = open_img_file(img_path, false, &image_size);

                ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size,
                                   SDMA_OP_VRAM_WRITE, true);
                if (ret) {
                        pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
                        break;
                }
                plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i);

                if (bo_contents_fp)
                        fclose(bo_contents_fp);
        }

exit:
        for (int i = 0; i < rd->num_of_bos; i++) {
                if (dmabufs[i] != KFD_INVALID_FD)
                        close(dmabufs[i]);
        }

        xfree(buffer);

        amdgpu_device_deinitialize(h_dev);
        return ret;
}

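The staging flow above mirrors the dump side: one page-aligned buffer, sized to the largest VRAM/GTT BO, is reused for every sDMA transfer, and only the direction and the do_not_free flag differ between the two paths. A hedged sketch of both call shapes behind a single helper; the wrapper name and argument names are illustrative, sdma_copy_bo() is the helper declared in amdgpu_plugin_util.h below:

```c
/* Sketch: move one BO's contents through the shared staging buffer.
 * to_vram = false: dump path (VRAM -> image file, as in the dump code).
 * to_vram = true:  restore path (image file -> VRAM, as above). */
static int copy_one_bo(int dmabuf_fd, uint64_t bo_size, FILE *img_fp,
                       void *staging, size_t staging_size,
                       amdgpu_device_handle h_dev, uint64_t max_copy,
                       bool to_vram)
{
        return sdma_copy_bo(dmabuf_fd, bo_size, img_fp, staging, staging_size,
                            h_dev, max_copy,
                            to_vram ? SDMA_OP_VRAM_WRITE : SDMA_OP_VRAM_READ,
                            /* do_not_free, as in the two call sites */ to_vram);
}
```
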
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
{
        CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
        struct tp_node *tp_node;
        CriuRenderNode *rd = NULL;
        char path[PATH_MAX];
        unsigned char *buf;
        int minor;
        int len;
        int ret;
        size_t image_size;
        struct tp_node *tp_node;
        struct drm_amdgpu_gem_list_handles list_handles_args = { 0 };
        struct drm_amdgpu_gem_list_handles_entry *list_handles_entries;
        int num_bos;

        rd = xmalloc(sizeof(*rd));
        if (!rd) {
                ret = -ENOMEM;
                goto exit;
        }
        criu_render_node__init(rd);

        /* Get the topology node of the DRM device */
        minor = minor(drm->st_rdev);
        rd->drm_render_minor = minor;
        rd->id = id;

        num_bos = 8;
        list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos);
        list_handles_args.num_entries = num_bos;
        list_handles_args.entries = (uintptr_t)list_handles_entries;

        ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args);
        if (ret && errno == EINVAL) {
                pr_info("This kernel appears not to have AMDGPU_GEM_LIST_HANDLES ioctl. Consider disabling Dmabuf IPC or updating your kernel.\n");
                list_handles_args.num_entries = 0;
        } else if (ret) {
                pr_perror("Failed to call bo info ioctl");
                goto exit;
        }

        if (list_handles_args.num_entries > num_bos) {
                num_bos = list_handles_args.num_entries;
                xfree(list_handles_entries);
                list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos);
                list_handles_args.num_entries = num_bos;
                list_handles_args.entries = (uintptr_t)list_handles_entries;
                ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args);
                if (ret) {
                        pr_perror("Failed to call bo info ioctl");
                        goto exit;
                }
        } else {
                num_bos = list_handles_args.num_entries;
        }

        rd->num_of_bos = num_bos;
        ret = allocate_bo_entries(rd, num_bos);
        if (ret)
                goto exit;

        for (int i = 0; i < num_bos; i++) {
                int num_vm_entries = 8;
                struct drm_amdgpu_gem_vm_entry *vm_info_entries;
                struct drm_amdgpu_gem_op vm_info_args = { 0 };
                DrmBoEntry *boinfo = rd->bo_entries[i];
                struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i];
                union drm_amdgpu_gem_mmap mmap_args = { 0 };
                int dmabuf_fd;
                uint32_t major, minor;
                amdgpu_device_handle h_dev;
                void *buffer = NULL;
                char img_path[40];
                FILE *bo_contents_fp = NULL;
                int device_fd;

                boinfo->size = handle_entry.size;

                boinfo->alloc_flags = handle_entry.alloc_flags;
                boinfo->preferred_domains = handle_entry.preferred_domains;
                boinfo->alignment = handle_entry.alignment;
                boinfo->handle = handle_entry.gem_handle;
                boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle);

                mmap_args.in.handle = boinfo->handle;

                if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) {
                        pr_perror("Failed to call mmap ioctl");
                        ret = -1;
                        goto exit;
                }

                boinfo->offset = mmap_args.out.addr_ptr;

                vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries);
                vm_info_args.handle = handle_entry.gem_handle;
                vm_info_args.num_entries = num_vm_entries;
                vm_info_args.value = (uintptr_t)vm_info_entries;
                vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO;
                ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args);
                if (ret) {
                        pr_perror("Failed to call vm info ioctl");
                        goto exit;
                }

                if (vm_info_args.num_entries > num_vm_entries) {
                        num_vm_entries = vm_info_args.num_entries;
                        xfree(vm_info_entries);
                        vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries);
                        vm_info_args.handle = handle_entry.gem_handle;
                        vm_info_args.num_entries = num_vm_entries;
                        vm_info_args.value = (uintptr_t)vm_info_entries;
                        vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO;
                        ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args);
                        if (ret) {
                                pr_perror("Failed to call vm info ioctl");
                                goto exit;
                        }
                } else {
                        num_vm_entries = vm_info_args.num_entries;
                }

                boinfo->num_of_vms = num_vm_entries;
                ret = allocate_vm_entries(boinfo, num_vm_entries);
                if (ret)
                        goto exit;

                for (int j = 0; j < num_vm_entries; j++) {
                        DrmVmEntry *vminfo = boinfo->vm_entries[j];

                        boinfo->addr = vm_info_entries[j].addr;
                        vminfo->addr = vm_info_entries[j].addr;
                        vminfo->size = vm_info_entries[j].size;
                        vminfo->offset = vm_info_entries[j].offset;
                        vminfo->flags = vm_info_entries[j].flags;
                }

                ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);

                device_fd = amdgpu_device_get_fd(h_dev);

                drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd);

                snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i);
                bo_contents_fp = open_img_file(img_path, true, &image_size);

                posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size);

                ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000,
                                   SDMA_OP_VRAM_READ, false);

                if (dmabuf_fd != KFD_INVALID_FD)
                        close(dmabuf_fd);

                if (bo_contents_fp)
                        fclose(bo_contents_fp);

                ret = amdgpu_device_deinitialize(h_dev);
                if (ret)
                        goto exit;

                xfree(vm_info_entries);
        }
        xfree(list_handles_entries);

        for (int i = 0; i < num_bos; i++) {
                DrmBoEntry *boinfo = rd->bo_entries[i];

                ret = record_shared_bo(boinfo->handle, boinfo->is_import);
                if (ret)
                        goto exit;
        }

        tp_node = sys_get_node_by_render_minor(&src_topology, minor);
        if (!tp_node) {
                pr_err("Failed to find a device with minor number = %d\n", minor);

@@ -80,21 +414,156 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
        }

        /* Get the GPU_ID of the DRM device */
        rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
        if (!rd.gpu_id) {
                pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id);
        rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
        if (!rd->gpu_id) {
                pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id);
                return -ENODEV;
        }

        len = criu_render_node__get_packed_size(&rd);
        len = criu_render_node__get_packed_size(rd);
        buf = xmalloc(len);
        if (!buf)
                return -ENOMEM;

        criu_render_node__pack(&rd, buf);
        criu_render_node__pack(rd, buf);

        snprintf(path, sizeof(path), IMG_DRM_FILE, id);
        ret = write_img_file(path, buf, len);

        xfree(buf);
exit:
        free_e(rd);
        return ret;
}

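Both queries in amdgpu_plugin_drm_dump_file() use the same two-pass sizing pattern: call the ioctl with a guessed capacity, and if the kernel reports more entries than fit, reallocate and call again. A generic, self-contained sketch of that pattern; the struct layout, ioctl request, and helper name are hypothetical:

```c
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Hypothetical query arguments: the kernel writes the real entry
 * count back even when the supplied array is too small. */
struct query_args {
        uint64_t entries;       /* user pointer to the entry array */
        uint32_t num_entries;   /* in: capacity, out: real count */
};

static int query_grow(int fd, unsigned long req, void **buf,
                      size_t entry_size, uint32_t *count)
{
        struct query_args args = { 0 };
        uint32_t cap = 8;       /* initial guess, as in the plugin */

        *buf = calloc(cap, entry_size);
        if (!*buf)
                return -ENOMEM;
        args.entries = (uintptr_t)*buf;
        args.num_entries = cap;

        if (ioctl(fd, req, &args))
                return -errno;

        if (args.num_entries > cap) {
                /* First pass undersized: grow to the reported count and retry. */
                cap = args.num_entries;
                free(*buf);
                *buf = calloc(cap, entry_size);
                if (!*buf)
                        return -ENOMEM;
                args.entries = (uintptr_t)*buf;
                args.num_entries = cap;
                if (ioctl(fd, req, &args))
                        return -errno;
        }

        *count = args.num_entries;
        return 0;
}
```
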
int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
{
        int ret = 0;
        bool retry_needed = false;
        uint32_t major, minor;
        amdgpu_device_handle h_dev;
        int device_fd;
        int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos);

        ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev);
        if (ret) {
                pr_info("Error initializing amdgpu device\n");
                goto exit;
        }

        device_fd = amdgpu_device_get_fd(h_dev);

        for (int i = 0; i < rd->num_of_bos; i++) {
                DrmBoEntry *boinfo = rd->bo_entries[i];
                int dmabuf_fd = -1;
                uint32_t handle;
                struct drm_gem_change_handle change_args = { 0 };
                union drm_amdgpu_gem_mmap mmap_args = { 0 };
                struct drm_amdgpu_gem_va va_args = { 0 };
                int fd_id;

                if (work_already_completed(boinfo->handle, rd->drm_render_minor)) {
                        continue;
                } else if (boinfo->handle != -1) {
                        if (boinfo->is_import) {
                                fd_id = amdgpu_id_for_handle(boinfo->handle);
                                if (fd_id == -1) {
                                        retry_needed = true;
                                        continue;
                                }
                                dmabuf_fd = fdstore_get(fd_id);
                        }
                }

                if (boinfo->is_import) {
                        drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle);
                } else {
                        union drm_amdgpu_gem_create create_args = { 0 };

                        create_args.in.bo_size = boinfo->size;
                        create_args.in.alignment = boinfo->alignment;
                        create_args.in.domains = boinfo->preferred_domains;
                        create_args.in.domain_flags = boinfo->alloc_flags;

                        if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) {
                                pr_perror("Failed to call create ioctl");
                                ret = -1;
                                goto exit;
                        }
                        handle = create_args.out.handle;

                        drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd);
                }

                change_args.handle = handle;
                change_args.new_handle = boinfo->handle;

                if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) {
                        pr_perror("Failed to call change ioctl; check if the kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support");
                        ret = -1;
                        goto exit;
                }

                if (!boinfo->is_import)
                        store_dmabuf_fd(boinfo->handle, dmabuf_fd);

                dmabufs[i] = dmabuf_fd;

                ret = record_completed_work(boinfo->handle, rd->drm_render_minor);
                if (ret)
                        goto exit;

                mmap_args.in.handle = boinfo->handle;

                if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) {
                        pr_perror("Failed to call mmap ioctl");
                        ret = -1;
                        goto exit;
                }

                for (int j = 0; j < boinfo->num_of_vms; j++) {
                        DrmVmEntry *vminfo = boinfo->vm_entries[j];

                        va_args.handle = boinfo->handle;
                        va_args.operation = AMDGPU_VA_OP_MAP;
                        va_args.flags = vminfo->flags;
                        va_args.va_address = vminfo->addr;
                        va_args.offset_in_bo = vminfo->offset;
                        va_args.map_size = vminfo->size;

                        if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) {
                                pr_perror("Failed to call gem va ioctl");
                                ret = -1;
                                goto exit;
                        }
                }

                ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd);
                if (ret < 0)
                        goto exit;
        }

        if (ret) {
                pr_info("Error in deinit amdgpu device\n");
                goto exit;
        }

        ret = record_completed_work(-1, rd->drm_render_minor);
        if (ret)
                goto exit;

        ret = amdgpu_device_deinitialize(h_dev);

        if (rd->num_of_bos > 0) {
                ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs);
                if (ret)
                        goto exit;
        }

exit:
        if (ret < 0)
                return ret;
        xfree(dmabufs);

        return retry_needed;
}

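On success, amdgpu_plugin_drm_restore_file() returns retry_needed rather than 0: an imported BO whose exporter has not been restored yet cannot be resolved on this pass. A hedged sketch of how a caller might drive this; the loop and function name are illustrative, not the plugin's actual call site:

```c
/* Sketch: keep re-running restore passes until no BO import is left
 * pending. A real caller would bound the number of passes. */
static int restore_until_done(int fd, CriuRenderNode *rd)
{
        int ret;

        do {
                ret = amdgpu_plugin_drm_restore_file(fd, rd);
                if (ret < 0)
                        return ret;     /* hard failure */
        } while (ret > 0);              /* > 0: another pass needed */

        return 0;
}
```
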
@@ -24,5 +24,17 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
 */
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);

int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd);

int amdgpu_plugin_drm_unpause_file(int fd);

int amdgpu_id_for_handle(int handle);

int store_dmabuf_fd(int handle, int fd);

int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd);

int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id);

#endif /* __AMDGPU_PLUGIN_DRM_H__ */

@@ -37,9 +37,11 @@
#include "amdgpu_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
#include "amdgpu_plugin_drm.h"

/* Tracks number of device files that need to be checkpointed */
static int dev_file_cnt = 0;
static LIST_HEAD(dumped_fds);
static LIST_HEAD(shared_bos);
static LIST_HEAD(completed_work);

/* Helper structures to encode device topology of SRC and DEST platforms */
struct tp_system src_topology;

@@ -49,23 +51,145 @@ struct tp_system dest_topology;
struct device_maps checkpoint_maps;
struct device_maps restore_maps;

bool checkpoint_is_complete()
int record_dumped_fd(int fd, bool is_drm)
{
        return (dev_file_cnt == 0);
        int newfd = dup(fd);

        if (newfd < 0)
                return newfd;
        struct dumped_fd *st = malloc(sizeof(struct dumped_fd));
        if (!st)
                return -1;
        st->fd = newfd;
        st->is_drm = is_drm;
        list_add(&st->l, &dumped_fds);

        return 0;
}

void decrement_checkpoint_count()
struct list_head *get_dumped_fds()
{
        dev_file_cnt--;
        return &dumped_fds;
}

void init_gpu_count(struct tp_system *topo)
bool shared_bo_has_exporter(int handle)
{
        if (dev_file_cnt != 0)
                return;
        struct shared_bo *bo;

        /* We add ONE to include checkpointing of KFD device */
        dev_file_cnt = 1 + topology_gpu_count(topo);
        if (handle == -1)
                return false;

        list_for_each_entry(bo, &shared_bos, l) {
                if (bo->handle == handle) {
                        return bo->has_exporter;
                }
        }

        return false;
}

int record_shared_bo(int handle, bool is_imported)
{
        struct shared_bo *bo;

        if (handle == -1)
                return 0;

        list_for_each_entry(bo, &shared_bos, l) {
                if (bo->handle == handle) {
                        return 0;
                }
        }
        bo = malloc(sizeof(struct shared_bo));
        if (!bo)
                return -1;
        bo->handle = handle;
        bo->has_exporter = !is_imported;
        list_add(&bo->l, &shared_bos);

        return 0;
}

int handle_for_shared_bo_fd(int fd)
{
        struct dumped_fd *df;
        int trial_handle;
        amdgpu_device_handle h_dev;
        uint32_t major, minor;
        struct shared_bo *bo;

        list_for_each_entry(df, &dumped_fds, l) {
                /* See if the GEM handle for fd, resolved via the device
                 * behind df->fd, matches a recorded shared BO handle. */

                if (!df->is_drm) {
                        continue;
                }

                if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) {
                        pr_err("Failed to initialize amdgpu device\n");
                        continue;
                }

                trial_handle = get_gem_handle(h_dev, fd);
                if (trial_handle < 0)
                        continue;

                list_for_each_entry(bo, &shared_bos, l) {
                        if (bo->handle == trial_handle)
                                return trial_handle;
                }

                amdgpu_device_deinitialize(h_dev);
        }

        return -1;
}

int record_completed_work(int handle, int id)
{
        struct restore_completed_work *work;

        work = malloc(sizeof(struct restore_completed_work));
        if (!work)
                return -1;
        work->handle = handle;
        work->id = id;
        list_add(&work->l, &completed_work);

        return 0;
}

bool work_already_completed(int handle, int id)
{
        struct restore_completed_work *work;

        list_for_each_entry(work, &completed_work, l) {
                if (work->handle == handle && work->id == id) {
                        return true;
                }
        }

        return false;
}

void clear_restore_state()
{
        while (!list_empty(&completed_work)) {
                struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l);
                list_del(&st->l);
                free(st);
        }
}

void clear_dumped_fds()
{
        while (!list_empty(&dumped_fds)) {
                struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l);
                list_del(&st->l);
                close(st->fd);
                free(st);
        }
}

int read_fp(FILE *fp, void *buf, const size_t buf_len)

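record_completed_work() and work_already_completed() are what make repeated restore passes idempotent: each (handle, minor) pair is restored once, and later passes skip it. A short sketch of the pattern as the restore loop uses it; restore_one_bo() is a hypothetical helper standing in for the per-BO work:

```c
/* Sketch: one idempotent restore step guarded by the completed-work
 * list; restore_one_bo() is hypothetical. */
static int restore_step(int handle, int minor)
{
        if (work_already_completed(handle, minor))
                return 0;               /* done on an earlier pass */

        if (restore_one_bo(handle))     /* hypothetical per-BO work */
                return -1;

        return record_completed_work(handle, minor);
}
```

Once all passes have finished, clear_restore_state() drops the accumulated list.
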
@@ -1,6 +1,8 @@
#ifndef __AMDGPU_PLUGIN_UTIL_H__
#define __AMDGPU_PLUGIN_UTIL_H__

#include <libdrm/amdgpu.h>

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif

@@ -51,14 +53,18 @@
/* Name of file having serialized data of DRM device */
#define IMG_DRM_FILE "amdgpu-renderD-%d.img"

/* Name of file having serialized data of dmabuf meta */
#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img"

/* Name of file having serialized data of DRM device buffer objects (BOs) */
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img"
#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img"

/* Helper macros to Checkpoint and Restore a ROCm file */
#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem"
#define HSAKMT_SHM "/hsakmt_shared_mem"
#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore"
#define HSAKMT_SEM "hsakmt_semaphore"
#define DMABUF_LINK "/dmabuf"

/* Help macros to build sDMA command packets */
#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0))

@@ -73,6 +79,24 @@ enum sdma_op_type {
        SDMA_OP_VRAM_WRITE,
};

struct dumped_fd {
        struct list_head l;
        int fd;
        bool is_drm;
};

struct shared_bo {
        struct list_head l;
        int handle;
        bool has_exporter;
};

struct restore_completed_work {
        struct list_head l;
        int handle;
        int id;
};

/* Helper structures to encode device topology of SRC and DEST platforms */
extern struct tp_system src_topology;
extern struct tp_system dest_topology;

@@ -97,10 +121,25 @@ int read_file(const char *file_path, void *buf, const size_t buf_len);
int write_img_file(char *path, const void *buf, const size_t buf_len);
FILE *open_img_file(char *path, bool write, size_t *size);

bool checkpoint_is_complete();
void decrement_checkpoint_count();
void init_gpu_count(struct tp_system *topology);
int record_dumped_fd(int fd, bool is_drm);
struct list_head *get_dumped_fds();
void clear_dumped_fds();

bool shared_bo_has_exporter(int handle);
int record_shared_bo(int handle, bool is_imported);
int handle_for_shared_bo_fd(int dmabuf_fd);

int record_completed_work(int handle, int id);
bool work_already_completed(int handle, int id);

void clear_restore_state();

void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list);

int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp,
                 void *buffer, size_t buffer_size, amdgpu_device_handle h_dev,
                 uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free);

int serve_out_dmabuf_fd(int handle, int fd);

#endif /* __AMDGPU_PLUGIN_UTIL_H__ */

@@ -46,6 +46,7 @@ message kfd_bo_entry {
        required uint64 offset = 3;
        required uint32 alloc_flags = 4;
        required uint32 gpu_id = 5;
        required uint32 handle = 6;
}

message criu_kfd {

@@ -61,6 +62,34 @@ message criu_kfd {
        required bytes priv_data = 10;
}

message drm_bo_entry {
        required uint64 addr = 1;
        required uint64 size = 2;
        required uint64 offset = 3;
        required uint64 alloc_flags = 4;
        required uint64 alignment = 5;
        required uint32 preferred_domains = 6;
        required uint32 handle = 7;
        required uint32 is_import = 8;
        required uint32 num_of_vms = 9;
        repeated drm_vm_entry vm_entries = 10;
}

message drm_vm_entry {
        required uint64 addr = 1;
        required uint64 size = 2;
        required uint64 offset = 3;
        required uint64 flags = 4;
}

message criu_render_node {
        required uint32 gpu_id = 1;
        required uint32 id = 2;
        required uint32 drm_render_minor = 3;
        required uint64 num_of_bos = 4;
        repeated drm_bo_entry bo_entries = 5;
}

message criu_dmabuf_node {
        required uint32 gem_handle = 1;
}

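These messages nest one criu_render_node per DRM render fd, with per-BO entries and per-mapping vm entries. The dump path serializes it with the protobuf-c generated API shown in amdgpu_plugin_drm_dump_file() above; a hedged, self-contained sketch of that packing step (buffer handling is illustrative):

```c
#include <stdlib.h>
#include "criu-amdgpu.pb-c.h"

/* Sketch: serialize a CriuRenderNode the same way the dump path does,
 * returning a malloc'ed buffer the caller writes to an image file. */
static int pack_render_node(CriuRenderNode *rd, void **out, size_t *out_len)
{
        size_t len = criu_render_node__get_packed_size(rd);
        void *buf = malloc(len);

        if (!buf)
                return -1;

        criu_render_node__pack(rd, buf);
        *out = buf;
        *out_len = len;
        return 0;
}
```
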
plugins/amdgpu/drm.h: new file, 1476 lines (diff suppressed because it is too large)
plugins/amdgpu/drm_mode.h: new file, 1362 lines (diff suppressed because it is too large)

@@ -23,9 +23,12 @@
#ifndef KFD_IOCTL_H_INCLUDED
#define KFD_IOCTL_H_INCLUDED

#include <libdrm/drm.h>
#include <linux/ioctl.h>

/* Define __user as empty for kernel headers in user-space */
#define __user
#include "drm.h"

/*
 * - 1.1 - initial version
 * - 1.3 - Add SMI events support

@@ -70,6 +70,7 @@ static int parse_maps(struct vm_area *vmas)
#endif
                v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL;
                v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL;
                v->is_vvar_or_vdso |= strstr(buf, "[vvar_vclock]") != NULL;
                test_msg("[NOTE]\tVMA: [%#" PRIx64 ", %#" PRIx64 "]\n", v->start, v->end);
        }

@@ -86,42 +87,35 @@ static int parse_maps(struct vm_area *vmas)
        return i;
}

int compare_vmas(struct vm_area *vmax, struct vm_area *vmay)
{
        if (vmax->start > vmay->start)
                return 1;
        if (vmax->start < vmay->start)
                return -1;
        if (vmax->end > vmay->end)
                return 1;
        if (vmax->end < vmay->end)
                return -1;

        return 0;
}

static int check_vvar_vdso(struct vm_area *before, struct vm_area *after)
static int check_vvar_vdso(struct vm_area *before, int nr_before, struct vm_area *after, int nr_after)
{
        int i, j = 0;

        for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) {
                int cmp = compare_vmas(&before[i], &after[j]);

                if (cmp == 0)
                        continue;

                if (cmp < 0) { /* Lost mapping */
        for (i = 0, j = 0; i < nr_before || j < nr_after;) {
                if (j == nr_after || before[i].start < after[j].start) {
                        test_msg("[NOTE]\tLost mapping: %#" PRIx64 "-%#" PRIx64 "\n", before[i].start, before[i].end);
                        j--;
                        if (before[i].is_vvar_or_vdso) {
                                fail("Lost vvar/vdso mapping");
                                return -1;
                        }
                        i++;
                        continue;
                }

                test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end);
                i--;
                if (i == nr_before || before[i].start > after[j].start) {
                        test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end);
                        j++;
                        continue;
                }
                if (before[i].end == after[j].end) {
                        i++;
                        j++;
                } else if (before[i].end > after[j].end) {
                        before[i].start = after[j].end;
                        j++;
                } else {
                        after[j].start = before[i].end;
                        i++;
                }
        }

        return 0;

@@ -129,11 +123,10 @@ static int check_vvar_vdso(struct vm_area *before, struct vm_area *after)

static struct vm_area vmas_before[MAX_VMAS];
static struct vm_area vmas_after[MAX_VMAS];
static int nr_before, nr_after;

int main(int argc, char *argv[])
{
        int nr_before, nr_after;

        test_init(argc, argv);

        test_msg("[NOTE]\tMappings before:\n");

@@ -154,7 +147,7 @@ int main(int argc, char *argv[])
        }

        /* After restore vDSO/VVAR blobs must remain in the old place. */
        if (check_vvar_vdso(vmas_before, vmas_after))
        if (check_vvar_vdso(vmas_before, nr_before, vmas_after, nr_after))
                return -1;

        if (nr_before + 2 < nr_after) {

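The reworked check_vvar_vdso() walks both sorted VMA lists with two cursors, trimming partial overlaps so that mappings split or merged across checkpoint/restore are not reported as lost. A standalone sketch of the same sweep on hypothetical data (vm_area simplified to a range, with an extra bounds guard added that the test's static arrays make unnecessary):

```c
#include <stdio.h>

struct range { unsigned long start, end; };

static void sweep(struct range *before, int nb, struct range *after, int na)
{
        int i = 0, j = 0;

        while (i < nb || j < na) {
                if (j == na || (i < nb && before[i].start < after[j].start)) {
                        printf("lost: %#lx-%#lx\n", before[i].start, before[i].end);
                        i++;
                } else if (i == nb || before[i].start > after[j].start) {
                        printf("new:  %#lx-%#lx\n", after[j].start, after[j].end);
                        j++;
                } else if (before[i].end == after[j].end) {
                        i++;            /* exact match, advance both */
                        j++;
                } else if (before[i].end > after[j].end) {
                        before[i].start = after[j].end; /* old VMA was split */
                        j++;
                } else {
                        after[j].start = before[i].end; /* new VMA extends past old */
                        i++;
                }
        }
}

int main(void)
{
        /* One VMA split in two across C/R: matched, nothing printed. */
        struct range before[] = { { 0x1000, 0x3000 } };
        struct range after[] = { { 0x1000, 0x2000 }, { 0x2000, 0x3000 } };

        sweep(before, 1, after, 2);
        return 0;
}
```
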