diff --git a/COW_DUMP_DESIGN.md b/COW_DUMP_DESIGN.md new file mode 100644 index 000000000..1f3f90a5c --- /dev/null +++ b/COW_DUMP_DESIGN.md @@ -0,0 +1,299 @@ +# COW-Based Live Migration Design Document + + +## Introduction +This feature implements COW (Copy-On-Write) based live migration for CRIU, enabling process duplication to remote instances to achieve the goal of: +1. Minimized downtime at the source. +2. Making the destination alive ASAP like in the current design of lazy dump. +3. Transfer the data at high speed to complete the process soon and reduce the amount of COW operations. + + +The approach uses userfaultfd write-protection to track memory modifications while the process continues running at the source and the destination is loaded same as in the lazy dump implementation. It overcomes the main issue with the lazy dump where the source is frozen during the dump. + +## Architecture Overview + +### Data Flow Source + + +**Phase 1: Setup via Parasite RPC** + - Create userfaultfd in target process + - Register VMAs with UFFDIO_REGISTER_MODE_WP + - Apply write-protection (UFFDIO_WRITEPROTECT) + - Send userfaultfd back to CRIU + - Create Monitor thread to get write faults events + - Process resumes with COW protection active + +**Phase 2: Monitor Thread (Background)** + - read() from userfaultfd (blocking) + - On write fault: + 1. Read page from /proc/pid/mem (before modification) + 2. Copy the page and store it in hash table + 3. Unprotect page + 4. Wake faulting thread at the source process + +**Phase 3: Page Transfer (page_server_get_pages)** + - Lookup COW pages in hash table + - Fast path: No COW → splice (zero-copy) + - Slow path: COW present → buffer + overlay + - Bulk unprotect after transfer + +#### Detailed design source + +##### 1. cow-dump.c (CRIU-side Coordinator) + +Main coordinator for COW tracking on the CRIU side. Manages the lifecycle of COW dump operations. + +*Key Data Structures* + +```c +/* Per-process COW dump state */ +struct cow_dump_info { + struct pstree_item *item; + int uffd; /* userfaultfd from target */ + int proc_mem_fd; /* /proc/pid/mem handle */ + unsigned long total_pages; /* Total pages tracked */ + unsigned long dirty_pages; /* Modified pages count */ + + /* Hash table: 65K buckets for O(1) lookup */ + struct hlist_head cow_hash[COW_HASH_SIZE]; /* 2^16 buckets */ + pthread_spinlock_t cow_hash_locks[COW_HASH_SIZE]; //Lock for each hash entry to have fine grain locking. +}; + +/* Hash table entry for copied pages */ +struct cow_page { + unsigned long vaddr; /* Virtual address */ + void *data; /* 4KB page content */ + struct hlist_node hash; /* Hash linkage */ +}; + +#define COW_HASH_SIZE (1 << 16) /* 65536 buckets */ +``` + +*Key Functions* + +**Init- Initialize COW tracking** +- Opens `/proc/pid/mem` for reading page contents +- Calls parasite RPC to setup userfaultfd +- Receives userfaultfd from parasite +- Initializes hash table and spinlocks +- Init COW monitoring thread + + +**cow_monitor_thread()** - Background monitoring +- Continuously reads from userfaultfd +- Processes write fault events + +**cow_handle_write_fault()** - Handle write fault event +``` +Input: fault address +1. Allocate cow_page structure +2. Read page from /proc/pid/mem (BEFORE modification) +3. Add to hash table (thread-safe) +4. Unprotect page (UFFDIO_WRITEPROTECT mode=0) +5. Wake faulting thread (UFFDIO_WAKE) +``` + + +**cow_lookup_and_remove_page()** - Thread-safe page lookup +- Hash-based O(1) lookup +- Removes from hash table atomically + +##### 2. pie/parasite.c (In-Process Setup) + +Runs inside the target process to setup userfaultfd with write-protection. + +**Purpose:** The parasite code is injected into the target process and executes in its context to create and configure the userfaultfd. + +*Key Function: parasite_cow_dump_init()* + + +**Why Parasite-Based?** +1. **Context Requirement:** userfaultfd must be created in target process context +2. **Inheritance:** Automatically inherited by all threads +3. **Permissions:** Avoids ptrace permission issues +4. **Atomic Setup:** All VMAs protected before process resumes + + +##### 3. page-xfer.c (Page Server Integration) + +Integrates COW tracking with page transfer, overlaying modified pages during transfer. + +Key Function: page_server_get_pages() + +Step 1: Read pages from page_pipe + page_pipe_read(pp, &pipe_read_dest, vaddr, &nr_pages) + +Step 2: Check for COW pages at the hash table, recall each modified page is stored in the hash table (single pass) + for each page: + cow_pages[i] = cow_lookup_and_remove_page(addr) + cow_count = number of non-NULL entries + +Fast path: (cow_count is zero, same as in the current lazy implementation) +Zero-copy splice: splice(pipe -> sock) +No memory copies! + + +Slow path: (cow_count is above zero) +1. read(pipe -> buffer) +2. overlay COW pages + +Step 3: Bulk unprotect +wp.range.start = vaddr +wp.range.len = len +wp.mode = 0 +ioctl(uffd, UFFDIO_WRITEPROTECT) + + +### Data Flow Destination + +No changes were made at the destination and it is almost the same as in the original code. I implemented a single performance improvement that handles lazy page requests from destination with aggressive pipelining. + +``` +┌─────────────────────────────────────────────────────────┐ +│ Traditional: Sequential (1 request at a time) │ +│ │ +│ Request → Wait → Response → Request → Wait → Response │ +│ │ +│ Throughput: Limited by RTT │ +└─────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────┐ +│ Aggressive: Pipeline (256 requests in-flight) │ +│ │ +│ Request ─┐ │ +│ Request ─┤ │ +│ Request ─┤ │ +│ ... ├─► In Flight (256 concurrent) │ +│ Request ─┤ │ +│ Request ─┤ │ +│ Request ─┘ │ +│ │ +│ Response → IMMEDIATELY refill pipeline │ +│ │ +│ Throughput: Near maximum network bandwidth │ +└─────────────────────────────────────────────────────────┘ +``` + + +## Kernel Requirements + +### Minimum Kernel Version +**Linux 5.7+** (released May 2020) + +### Required Features + +| Feature | Flag | Purpose | Since | +|---------|------|---------|-------| +| WP Flag | `UFFD_FEATURE_PAGEFAULT_FLAG_WP` | Identify write faults | 5.7 | + + +### System Configuration + +**Unprivileged Access:** +```bash +# Allow unprivileged userfaultfd +echo 1 > /proc/sys/vm/unprivileged_userfaultfd + +# Or require CAP_SYS_PTRACE +``` + + + +--- + +## Future Work + +#### 1. Explore UFFD_FEATURE_WP_ASYNC + +We should explore how to use this feature. It should only mark the page as touched and then we can do a second pass to copy only the touched pages. I will dive deeper to see if it is more efficient. + +#### 2. Reduce communication overhead between source and destination + +Currently the communication is driven by the destination which sends requests. We can improve this by making the source send the data and the destination only asks if there is a read page fault. That way, we reduce the amount of work from the source. + +#### 3. Make the source multithreaded + +Can we make the source multithreaded to reduce the overall time? Should be explored. + +#### 4. Non-Registerable VMAs + +**Issue:** Some VMAs cannot be write-protected. + +I will be happy to get advice. + + + +### Next Steps + +For maintainers reviewing this code: + +1. **Testing:** Extensive testing with various workloads + add regression tests. +2. **Documentation:** Update user-facing documentation +3. **Performance Tuning:** Try differnt techniques discussed at the Future Work section. + + +### Usage +```bash +criu dump --cow-dump --lazy-pages ... +``` + +## Appendix - Statistics and Monitoring + +### COW Tracking Statistics + +**Per-Second Logging:** +``` +[COW_STATS] events: wr=1234 fork=0 remap=0 unk=0 | + ops: copied=1234 unprot=1234 woken=1234 | + errs: alloc=0 read=0 unprot_err=0 wake_err=0 + read_err=0 eagain_err=0 +``` + +**Metrics:** + +| Metric | Description | Good Value | Alert If | +|--------|-------------|------------|----------| +| `wr` | Write faults | Varies | - | +| `copied` | Pages copied | = wr | < wr | +| `unprot` | Pages unprotected | = wr | < wr | +| `woken` | Threads woken | = wr | < wr | +| `alloc_failures` | Allocation failures | 0 | > 0 | +| `read_failures` | Read failures | 0 | > 0 | +| `eagain_errors` | EAGAIN on read | Low | High | + +### Page Server Statistics + +**Per-Second Logging:** +``` +[PAGE_SERVER_STATS] get_pages: reqs=500 with_cow=50 no_cow=450 + pages=8000 cow=400 errs=0 | + serve: open2=1 parent=0 add_f=7950 get=500 + close=1 +``` + +**Metrics:** + +| Metric | Description | Indicates | +|--------|-------------|-----------| +| `reqs` | Total requests | Transfer activity | +| `with_cow` | Slow path taken | COW overlay needed | +| `no_cow` | Fast path taken | Zero-copy efficiency | +| `pages` | Total pages transferred | Bandwidth | +| `cow` | COW pages overlaid | Write activity | + +### UFFD Daemon Statistics + +**Per-Second Logging:** +``` +[UFFD_STATS] reqs=1000(pf:50,bg:950) pages=8000 pipe_avg=180 + PF: 4K=30 64K=15 128K=5 + BG: 4K=100 64K=500 128K=200 256K=100 512K=50 +``` + +**Histograms:** +- **PF (Page Fault):** Destination-initiated requests +- **BG (Background):** Proactive prefetch + +**Pipeline Depth:** +- `pipe_avg`: Average in-flight requests +- Target: Close to `max_pipeline_depth` (256) diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index ba6132d2f..b9b0f5bdd 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -87,6 +87,7 @@ obj-y += path.o obj-y += autofs.o obj-y += fdstore.o obj-y += uffd.o +obj-y += cow-dump.o obj-y += config.o obj-y += servicefd.o obj-y += pie-util-vdso.o diff --git a/criu/config.c b/criu/config.c index d7ef3f8e8..b4f390dd8 100644 --- a/criu/config.c +++ b/criu/config.c @@ -705,6 +705,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, BOOL_OPT("unprivileged", &opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), + { "cow-dump", no_argument, 0, 1105 }, {}, }; @@ -1045,6 +1046,9 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, return 1; } break; + case 1105: + opts.cow_dump = true; + break; case 'V': pr_msg("Version: %s\n", CRIU_VERSION); if (strcmp(CRIU_GITID, "0")) diff --git a/criu/cow-dump.c b/criu/cow-dump.c new file mode 100644 index 000000000..a1b75c7fb --- /dev/null +++ b/criu/cow-dump.c @@ -0,0 +1,583 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "cr_options.h" +#include "pstree.h" +#include "cow-dump.h" +#include "uffd.h" +#include "page-xfer.h" +#include "page-pipe.h" +#include "parasite-syscall.h" +#include "mem.h" +#include "vma.h" +#include "util.h" +#include "kerndat.h" +#include "criu-log.h" +#include "parasite.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cow-dump: " + +/* COW dump state for a single process */ +struct cow_dump_info { + struct pstree_item *item; + int uffd; /* userfaultfd for write tracking */ + int proc_mem_fd; /* /proc/pid/mem for reading pages */ + unsigned long total_pages; /* Total pages being tracked */ + unsigned long dirty_pages; /* Pages modified in current iteration */ + unsigned long dirty_pages_dumped; /* Pages already written to disk */ + unsigned long iteration; /* Current iteration number */ + struct list_head dirty_list; /* List of dirty page ranges */ + struct hlist_head cow_hash[COW_HASH_SIZE]; /* Hash table for copied pages */ + pthread_spinlock_t cow_hash_locks[COW_HASH_SIZE]; /* Per-bucket spinlocks */ +}; + +/* Dirty page range */ +struct dirty_range { + unsigned long start; + unsigned long len; + struct list_head list; +}; + +static struct cow_dump_info *g_cow_info = NULL; +static pthread_t g_monitor_thread; +static volatile bool g_stop_monitoring = false; + +#define COW_MAX_ITERATIONS 10 +#define COW_CONVERGENCE_THRESHOLD 100 /* Stop if < 100 pages dirty per iteration */ +#define COW_FLUSH_THRESHOLD 1000 /* Flush to disk every 1000 pages */ + +/* Statistics tracking structure */ +static struct { + /* Event counters */ + unsigned long write_faults; + unsigned long fork_events; + unsigned long remap_events; + unsigned long unknown_events; + + /* Operation counters */ + unsigned long pages_copied; + unsigned long pages_unprotected; + unsigned long pages_woken; + + /* Error counters */ + unsigned long alloc_failures; + unsigned long read_failures; + unsigned long unprotect_failures; + unsigned long wake_failures; + unsigned long eagain_errors; + unsigned long read_errors; + + time_t last_print_time; +} cow_stats; + +static void check_and_print_cow_stats(void) +{ + time_t now = time(NULL); + + if (now - cow_stats.last_print_time >= 1) { + pr_warn("[COW_STATS] events: wr=%lu fork=%lu remap=%lu unk=%lu | ops: copied=%lu unprot=%lu woken=%lu | errs: alloc=%lu read=%lu unprot_err=%lu wake_err=%lu read_err=%lu eagain_err=%lu\n", + cow_stats.write_faults, + cow_stats.fork_events, + cow_stats.remap_events, + cow_stats.unknown_events, + cow_stats.pages_copied, + cow_stats.pages_unprotected, + cow_stats.pages_woken, + cow_stats.alloc_failures, + cow_stats.read_failures, + cow_stats.unprotect_failures, + cow_stats.wake_failures, + cow_stats.read_errors, + cow_stats.eagain_errors); + + /* Reset all counters */ + memset(&cow_stats, 0, sizeof(cow_stats)); + cow_stats.last_print_time = now; + } +} + +bool cow_check_kernel_support(void) +{ + unsigned long features = UFFD_FEATURE_WP_ASYNC | + UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_EVENT_FORK | + UFFD_FEATURE_EVENT_REMAP; + int uffd, err = 0; + + uffd = uffd_open(0, &features, &err); + if (uffd < 0) { + if (err == ENOSYS) { + pr_info("userfaultfd not supported by kernel\n"); + } else if (err == EPERM) { + pr_info("userfaultfd requires CAP_SYS_PTRACE or sysctl vm.unprivileged_userfaultfd=1\n"); + } + return false; + } + + if (!(features & UFFD_FEATURE_WP_ASYNC)) { + pr_info("userfaultfd write-protect feature not supported (need kernel 5.7+)\n"); + close(uffd); + return false; + } + + if (!(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { + pr_info("userfaultfd WP pagefault flag not supported (need kernel 5.7+)\n"); + close(uffd); + return false; + } + + close(uffd); + pr_info("COW dump kernel support detected\n"); + return true; +} + +static int open_proc_mem(pid_t pid) +{ + char path[64]; + int fd; + + snprintf(path, sizeof(path), "/proc/%d/mem", pid); + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open %s", path); + return -1; + } + + return fd; +} + +int cow_dump_init(struct pstree_item *item, struct vm_area_list *vma_area_list, struct parasite_ctl *ctl) +{ + struct cow_dump_info *cdi; + struct vma_area *vma; + struct parasite_cow_dump_args *args; + struct parasite_vma_entry *p_vma; + + int ret; + unsigned long args_size; + unsigned int nr_vmas = 0; + + pr_info("Initializing COW dump for pid %d (via parasite)\n", item->pid->real); + + if (!cow_check_kernel_support()) { + pr_err("Kernel doesn't support COW dump\n"); + return -1; + } + + cdi = xzalloc(sizeof(*cdi)); + if (!cdi) + return -1; + + cdi->item = item; + INIT_LIST_HEAD(&cdi->dirty_list); + cdi->uffd = -1; /* Will be received from parasite */ + + /* Initialize hash table for COW pages */ + for (int i = 0; i < COW_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&cdi->cow_hash[i]); + pthread_spin_init(&cdi->cow_hash_locks[i], PTHREAD_PROCESS_PRIVATE); + } + + /* Open /proc/pid/mem for reading pages */ + cdi->proc_mem_fd = open_proc_mem(item->pid->real); + if (cdi->proc_mem_fd < 0) + goto err_free; + + /* Prepare parasite arguments - count writable VMAs */ + nr_vmas = 0; + list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->prot & PROT_WRITE) + nr_vmas++; + } + + /* Allocate parasite args - includes space for VMAs and failed indices */ + args_size = sizeof(*args) + + nr_vmas * sizeof(struct parasite_vma_entry) + + nr_vmas * sizeof(unsigned int); /* Space for failed indices */ + args = compel_parasite_args_s(ctl, args_size); + if (!args) { + pr_err("Failed to allocate parasite args\n"); + goto err_close_mem; + } + + args->nr_vmas = nr_vmas; + args->total_pages = 0; + args->nr_failed_vmas = 0; + args->ret = -1; + + /* Fill VMA entries */ + p_vma = cow_dump_vmas(args); + nr_vmas = 0; + list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!(vma->e->prot & PROT_WRITE)) + continue; + + p_vma[nr_vmas].start = vma->e->start; + p_vma[nr_vmas].len = vma->e->end - vma->e->start; + p_vma[nr_vmas].prot = vma->e->prot; + nr_vmas++; + } + + pr_info("Calling parasite to register %u VMAs\n", args->nr_vmas); + + /* Call parasite to create uffd and perform registration (async) */ + ret = compel_rpc_call(PARASITE_CMD_COW_DUMP_INIT, ctl); + if (ret < 0) { + pr_err("Failed to initiate COW dump RPC\n"); + goto err_close_mem; + } + + /* Receive userfaultfd from parasite */ + compel_util_recv_fd(ctl, &cdi->uffd); + if (cdi->uffd < 0) { + pr_err("Failed to receive userfaultfd from parasite: %d\n", cdi->uffd); + goto err_close_mem; + } + pr_info("Got fd %d VMAs\n", cdi->uffd); + /* Wait for parasite to complete */ + ret = compel_rpc_sync(PARASITE_CMD_COW_DUMP_INIT, ctl); + if (ret < 0 || args->ret != 0) { + pr_err("Parasite COW dump init failed: %d (ret=%d)\n", ret, args->ret); + close(cdi->uffd); + cdi->uffd = -1; + goto err_close_mem; + } + + cdi->total_pages = args->total_pages; + cdi->dirty_pages_dumped = 0; + + pr_info("COW dump initialized: tracking %lu pages, uffd=%d\n", + cdi->total_pages, cdi->uffd); + + + g_cow_info = cdi; + return 0; + +err_close_mem: + close(cdi->proc_mem_fd); +err_free: + xfree(cdi); + return -1; +} + +void cow_dump_fini(void) +{ + struct dirty_range *dr, *tmp; + struct cow_page *cp; + struct hlist_node *n; + int i, remaining = 0; + + if (!g_cow_info) + return; + + pr_info("Cleaning up COW dump\n"); + + /* Clean up any remaining COW pages */ + for (i = 0; i < COW_HASH_SIZE; i++) { + pthread_spin_lock(&g_cow_info->cow_hash_locks[i]); + hlist_for_each_entry_safe(cp, n, &g_cow_info->cow_hash[i], hash) { + hlist_del(&cp->hash); + xfree(cp->data); + xfree(cp); + remaining++; + } + pthread_spin_unlock(&g_cow_info->cow_hash_locks[i]); + pthread_spin_destroy(&g_cow_info->cow_hash_locks[i]); + } + + if (remaining > 0) + pr_warn("Freed %d remaining COW pages\n", remaining); + + list_for_each_entry_safe(dr, tmp, &g_cow_info->dirty_list, list) { + list_del(&dr->list); + xfree(dr); + } + + if (g_cow_info->proc_mem_fd >= 0) + close(g_cow_info->proc_mem_fd); + + if (g_cow_info->uffd >= 0) + close(g_cow_info->uffd); + + xfree(g_cow_info); + g_cow_info = NULL; +} + +static int cow_handle_write_fault(struct cow_dump_info *cdi, unsigned long addr) +{ + struct cow_page *cp; + unsigned long page_addr = addr & ~(PAGE_SIZE - 1); + struct uffdio_writeprotect wp; + struct uffdio_range range; + ssize_t ret; + unsigned int hash; + + pr_info("Write fault at 0x%lx\n", page_addr); + + cow_stats.write_faults++; + cdi->dirty_pages++; + + /* Allocate cow_page structure */ + cp = xmalloc(sizeof(*cp)); + if (!cp) { + pr_err("Failed to allocate cow_page structure\n"); + cow_stats.alloc_failures++; + return -1; + } + + cp->data = xmalloc(PAGE_SIZE); + if (!cp->data) { + pr_err("Failed to allocate page data\n"); + xfree(cp); + cow_stats.alloc_failures++; + return -1; + } + + cp->vaddr = page_addr; + INIT_HLIST_NODE(&cp->hash); + + /* Read original page content from /proc/pid/mem */ + ret = pread(cdi->proc_mem_fd, cp->data, PAGE_SIZE, page_addr); + if (ret != PAGE_SIZE) { + pr_perror("Failed to read page at 0x%lx (read %zd bytes)", page_addr, ret); + xfree(cp->data); + xfree(cp); + cow_stats.read_failures++; + return -1; + } + + /* Add to hash table (thread-safe with per-bucket spinlock) */ + hash = (page_addr >> PAGE_SHIFT) & (COW_HASH_SIZE - 1); + + pthread_spin_lock(&cdi->cow_hash_locks[hash]); + hlist_add_head(&cp->hash, &cdi->cow_hash[hash]); + pthread_spin_unlock(&cdi->cow_hash_locks[hash]); + + cow_stats.pages_copied++; + pr_debug("Copied page at 0x%lx to hash bucket %u\n", page_addr, hash); + + /* Unprotect the page so the process can continue */ + wp.range.start = page_addr; + wp.range.len = PAGE_SIZE; + wp.mode = 0; /* Clear write-protect */ + + if (ioctl(cdi->uffd, UFFDIO_WRITEPROTECT, &wp)) { + pr_perror("Failed to unprotect page at 0x%lx", page_addr); + cow_stats.unprotect_failures++; + return -1; + } + + cow_stats.pages_unprotected++; + + /* Wake up the faulting thread */ + range.start = page_addr; + range.len = PAGE_SIZE; + + if (ioctl(cdi->uffd, UFFDIO_WAKE, &range)) { + pr_perror("Failed to wake thread after unprotect"); + cow_stats.wake_failures++; + return -1; + } + + cow_stats.pages_woken++; + cdi->total_pages--; + return 0; +} + +static int cow_process_events(struct cow_dump_info *cdi, bool blocking) +{ + struct uffd_msg msg; + struct pollfd pfd; + int ret, poll_ret; + + while (1) { + /* Check and print stats */ + check_and_print_cow_stats(); + + /* Try reading directly first - avoids poll() overhead when data is ready */ + ret = read(cdi->uffd, &msg, sizeof(msg)); + + if (ret < 0 && errno == EAGAIN && blocking) { + /* No data available and we want to block - use poll() with timeout */ + pfd.fd = cdi->uffd; + pfd.events = POLLIN; + pfd.revents = 0; + + poll_ret = poll(&pfd, 1, 500); /* 500ms timeout */ + if (poll_ret < 0) { + pr_perror("poll() failed on uffd"); + cow_stats.read_errors++; + return -1; + } + + if (poll_ret == 0) { + /* Timeout - no events within 500ms */ + return 0; + } + + /* Data ready after poll - retry read */ + ret = read(cdi->uffd, &msg, sizeof(msg)); + } + + if (ret < 0) { + if (errno == EAGAIN && !blocking) { + /* Non-blocking mode and no data */ + cow_stats.eagain_errors++; + return 0; + } + pr_perror("Failed to read uffd event"); + cow_stats.read_errors++; + return -1; + } + + if (ret != sizeof(msg)) { + pr_err("Short read from uffd: %d\n", ret); + cow_stats.read_errors++; + return -1; + } + + switch (msg.event) { + case UFFD_EVENT_PAGEFAULT: + if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write fault - track it */ + if (cow_handle_write_fault(cdi, msg.arg.pagefault.address)) + return -1; + } + break; + + case UFFD_EVENT_FORK: + cow_stats.fork_events++; + pr_warn("Process forked during COW dump (not fully supported)\n"); + break; + + case UFFD_EVENT_REMAP: + cow_stats.remap_events++; + pr_info("Memory remap event\n"); + break; + + default: + cow_stats.unknown_events++; + pr_err("Unexpected uffd event: %u\n", msg.event); + return -1; + } + } + + return 0; +} + +/* Background thread that monitors for write faults */ +static void *cow_monitor_thread(void *arg) +{ + struct cow_dump_info *cdi = (struct cow_dump_info *)arg; + + pr_info("COW monitor thread started\n"); + pr_warn("PAGE SERVER READY TO SERVE\n"); + + while (!g_stop_monitoring) { + + + /* Process events with short timeout */ + if (cow_process_events(cdi, true) < 0) { + pr_err("Error processing COW events in monitor thread\n"); + break; + } + } + + pr_info("COW monitor thread stopped\n"); + return NULL; +} + +int cow_start_monitor_thread(void) +{ + int ret; + + if (!g_cow_info) { + pr_err("COW dump not initialized\n"); + return -1; + } + + g_stop_monitoring = false; + + ret = pthread_create(&g_monitor_thread, NULL, cow_monitor_thread, g_cow_info); + if (ret) { + pr_perror("Failed to create COW monitor thread"); + return -1; + } + + pr_info("COW monitor thread created successfully\n"); + return 0; +} + +int cow_stop_monitor_thread(void) +{ + void *retval; + + if (!g_cow_info) { + return 0; /* Nothing to stop */ + } + + pr_info("Stopping COW monitor thread\n"); + g_stop_monitoring = true; + + /* Wait for thread to finish */ + if (pthread_join(g_monitor_thread, &retval)) { + pr_perror("Failed to join COW monitor thread"); + return -1; + } + + pr_info("COW monitor thread stopped successfully\n"); + return 0; +} + +int cow_get_uffd(void) +{ + if (!g_cow_info) + return -1; + + return g_cow_info->uffd; +} + +struct cow_page *cow_lookup_and_remove_page(unsigned long vaddr) +{ + struct cow_page *cp; + struct hlist_node *n; + unsigned int hash; + unsigned long page_addr = vaddr & ~(PAGE_SIZE - 1); + + if (!g_cow_info) + return NULL; + + hash = (page_addr >> PAGE_SHIFT) & (COW_HASH_SIZE - 1); + + pthread_spin_lock(&g_cow_info->cow_hash_locks[hash]); + + hlist_for_each_entry_safe(cp, n, &g_cow_info->cow_hash[hash], hash) { + if (cp->vaddr == page_addr) { + hlist_del(&cp->hash); + pthread_spin_unlock(&g_cow_info->cow_hash_locks[hash]); + pr_debug("Found and removed COW page at 0x%lx from hash bucket %u\n", + page_addr, hash); + return cp; + } + } + + pthread_spin_unlock(&g_cow_info->cow_hash_locks[hash]); + return NULL; +} diff --git a/criu/cr-dump.c b/criu/cr-dump.c index a58aaf34a..32bcd25e9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -65,6 +65,7 @@ #include "stats.h" #include "mem.h" #include "page-pipe.h" +#include "cow-dump.h" #include "posix-timer.h" #include "vdso.h" #include "vma.h" @@ -1744,6 +1745,23 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; } + if (opts.cow_dump) { + /* COW dump mode: split VMAs by size */ + ret = cow_dump_init(item, &vmas, parasite_ctl); + if (ret) { + pr_err("Failed to initialize COW dump for VMAs\n"); + goto err_cure; + } + + /* Start background thread to monitor page faults */ + ret = cow_start_monitor_thread(); + if (ret) { + pr_err("Failed to start COW monitor thread\n"); + goto err_cure; + } + } + + ret = compel_stop_daemon(parasite_ctl); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); @@ -2100,16 +2118,40 @@ static int cr_dump_finish(int ret) clean_cr_time_mounts(); } - if (!ret && opts.lazy_pages) + /* Resume process early if using COW dump with lazy pages */ + if (!ret && opts.lazy_pages && opts.cow_dump) { + pr_info("Resuming process with COW protection active\n"); + + if (arch_set_thread_regs(root_item, true) < 0) + return -1; + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + + pstree_switch_state(root_item, TASK_ALIVE); + timing_stop(TIME_FROZEN); + + /* Now start lazy page transfer with process running */ ret = cr_lazy_mem_dump(); + + /* Stop the monitor thread after lazy dump completes */ + if (cow_stop_monitor_thread()) { + pr_err("Failed to stop COW monitor thread\n"); + ret = -1; + } + } else { + /* Standard path: transfer pages then resume */ + if (!ret && opts.lazy_pages) + ret = cr_lazy_mem_dump(); + + if (arch_set_thread_regs(root_item, true) < 0) + return -1; - if (arch_set_thread_regs(root_item, true) < 0) - return -1; + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); - - pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state); - timing_stop(TIME_FROZEN); + pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state); + timing_stop(TIME_FROZEN); + } + free_pstree(root_item); seccomp_free_entries(); free_file_locks(); diff --git a/criu/include/cow-dump.h b/criu/include/cow-dump.h new file mode 100644 index 000000000..b4c77aec7 --- /dev/null +++ b/criu/include/cow-dump.h @@ -0,0 +1,93 @@ +#ifndef __CR_COW_DUMP_H_ +#define __CR_COW_DUMP_H_ + +#include "types.h" +#include "common/list.h" + +struct pstree_item; +struct vm_area_list; +struct parasite_ctl; + +#define COW_HASH_BITS 16 +#define COW_HASH_SIZE (1 << COW_HASH_BITS) + +struct cow_page { + unsigned long vaddr; + void *data; + struct hlist_node hash; +}; + + +/** + * cow_dump_init - Initialize COW dump for a process + * @item: Process tree item to set up COW tracking for + * @vma_area_list: List of VMAs to track + * @ctl: Parasite control structure for RPC + * + * Sets up userfaultfd with write-protection for all writable memory + * regions of the target process. The registration is performed via + * parasite RPC to ensure it runs in the target process's context. + * + * Returns: 0 on success, -1 on error + */ +extern int cow_dump_init(struct pstree_item *item, struct vm_area_list *vma_area_list, struct parasite_ctl *ctl); + +/** + * cow_dump_fini - Clean up COW dump resources + * + * Releases all resources allocated for COW tracking. + */ +extern void cow_dump_fini(void); + +/** + * cow_check_kernel_support - Check if kernel supports COW dump + * + * Verifies that the kernel has necessary userfaultfd write-protect + * features (requires Linux 5.7+). + * + * Returns: true if supported, false otherwise + */ +extern bool cow_check_kernel_support(void); + +/** + * cow_start_monitor_thread - Start background thread to monitor page faults + * + * Creates a pthread that continuously monitors the userfaultfd for + * write faults and handles them immediately, preventing the target + * process from blocking during the dump phase. + * + * Returns: 0 on success, -1 on error + */ +extern int cow_start_monitor_thread(void); + +/** + * cow_stop_monitor_thread - Stop the monitoring thread + * + * Signals the monitor thread to stop and waits for it to complete. + * + * Returns: 0 on success, -1 on error + */ +extern int cow_stop_monitor_thread(void); + +/** + * cow_get_uffd - Get the userfaultfd file descriptor + * + * Returns the userfaultfd associated with the current COW dump session. + * + * Returns: userfaultfd on success, -1 if COW dump not initialized + */ +extern int cow_get_uffd(void); + +/** + * cow_lookup_and_remove_page - Look up and remove a COW page + * @vaddr: Virtual address of the page + * + * Thread-safe lookup and removal of a copied page from the hash table. + * The caller is responsible for freeing the returned cow_page structure + * and its data. + * + * Returns: cow_page structure on success, NULL if not found + */ +extern struct cow_page *cow_lookup_and_remove_page(unsigned long vaddr); + +#endif /* __CR_COW_DUMP_H_ */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 8c5707b41..98063b929 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -193,6 +193,7 @@ struct cr_options { unsigned int empty_ns; int tcp_skip_in_flight; bool lazy_pages; + bool cow_dump; char *work_dir; int network_lock_method; int skip_file_rwx_check; diff --git a/criu/include/linux/userfaultfd.h b/criu/include/linux/userfaultfd.h index cfcf48571..2eb9f327a 100644 --- a/criu/include/linux/userfaultfd.h +++ b/criu/include/linux/userfaultfd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * include/linux/userfaultfd.h * @@ -11,6 +12,10 @@ #include +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + /* * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In @@ -18,12 +23,44 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES \ - (UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE | UFFD_FEATURE_EVENT_UNMAP | \ - UFFD_FEATURE_MISSING_HUGETLBFS | UFFD_FEATURE_MISSING_SHMEM) -#define UFFD_API_IOCTLS ((__u64)1 << _UFFDIO_REGISTER | (__u64)1 << _UFFDIO_UNREGISTER | (__u64)1 << _UFFDIO_API) -#define UFFD_API_RANGE_IOCTLS ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY | (__u64)1 << _UFFDIO_ZEROPAGE) -#define UFFD_API_RANGE_IOCTLS_BASIC ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) /* * Valid ioctl command number range with this API is from 0x00 to @@ -33,56 +70,77 @@ * which ioctl the running kernel implements through the ioctl command * bitmask written by the UFFDIO_API. */ -#define _UFFDIO_REGISTER (0x00) -#define _UFFDIO_UNREGISTER (0x01) -#define _UFFDIO_WAKE (0x02) -#define _UFFDIO_COPY (0x03) -#define _UFFDIO_ZEROPAGE (0x04) -#define _UFFDIO_API (0x3F) +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) +#define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) +#define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ -#define UFFDIO 0xAA -#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, struct uffdio_api) -#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, struct uffdio_register) -#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, struct uffdio_range) -#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, struct uffdio_range) -#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, struct uffdio_copy) -#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, struct uffdio_zeropage) +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) /* read() structure */ struct uffd_msg { - __u8 event; + __u8 event; - __u8 reserved1; - __u16 reserved2; - __u32 reserved3; + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; union { struct { - __u64 flags; - __u64 address; + __u64 flags; + __u64 address; + union { + __u32 ptid; + } feat; } pagefault; struct { - __u32 ufd; + __u32 ufd; } fork; struct { - __u64 from; - __u64 to; - __u64 len; + __u64 from; + __u64 to; + __u64 len; } remap; struct { - __u64 start; - __u64 end; + __u64 start; + __u64 end; } remove; struct { /* unused reserved fields */ - __u64 reserved1; - __u64 reserved2; - __u64 reserved3; + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; } reserved; } arg; } __packed; @@ -90,15 +148,16 @@ struct uffd_msg { /* * Start at 0x12 and not at 0 to be more strict against bugs. */ -#define UFFD_EVENT_PAGEFAULT 0x12 -#define UFFD_EVENT_FORK 0x13 -#define UFFD_EVENT_REMAP 0x14 -#define UFFD_EVENT_REMOVE 0x15 -#define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ -#define UFFD_PAGEFAULT_FLAG_WRITE (1 << 0) /* If this was a write fault */ -#define UFFD_PAGEFAULT_FLAG_WP (1 << 1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -136,14 +195,59 @@ struct uffdio_api { * UFFD_FEATURE_MISSING_SHMEM works the same as * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem * (i.e. tmpfs and other shmem based APIs). + * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. */ -#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1 << 0) -#define UFFD_FEATURE_EVENT_FORK (1 << 1) -#define UFFD_FEATURE_EVENT_REMAP (1 << 2) -#define UFFD_FEATURE_EVENT_REMOVE (1 << 3) -#define UFFD_FEATURE_MISSING_HUGETLBFS (1 << 4) -#define UFFD_FEATURE_MISSING_SHMEM (1 << 5) -#define UFFD_FEATURE_EVENT_UNMAP (1 << 6) +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) __u64 features; __u64 ioctls; @@ -156,8 +260,9 @@ struct uffdio_range { struct uffdio_register { struct uffdio_range range; -#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1 << 0) -#define UFFDIO_REGISTER_MODE_WP ((__u64)1 << 1) +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) __u64 mode; /* @@ -171,13 +276,14 @@ struct uffdio_copy { __u64 dst; __u64 src; __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) /* - * There will be a wrprotection flag later that allows to map - * pages wrprotected on the fly. And such a flag will be - * available if the wrprotection ioctl are implemented for the - * range according to the uffdio_register.ioctls. + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. */ -#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1 << 0) +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) __u64 mode; /* @@ -189,7 +295,7 @@ struct uffdio_copy { struct uffdio_zeropage { struct uffdio_range range; -#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1 << 0) +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) __u64 mode; /* @@ -199,4 +305,82 @@ struct uffdio_zeropage { __s64 zeropage; }; -#endif /* _LINUX_USERFAULTFD_H */ +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. + */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + +#endif /* _LINUX_USERFAULTFD_H */ \ No newline at end of file diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 176357711..76c88ff23 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -37,6 +37,7 @@ enum { PARASITE_CMD_CHECK_VDSO_MARK, PARASITE_CMD_CHECK_AIOS, PARASITE_CMD_DUMP_CGROUP, + PARASITE_CMD_COW_DUMP_INIT, PARASITE_CMD_MAX, }; @@ -254,6 +255,28 @@ struct parasite_dump_cgroup_args { char thread_cgrp[32]; }; +/* + * COW dump initialization arguments + * VMAs are stored after this structure, similar to parasite_dump_pages_args + * Failed VMA indices stored after VMAs + */ +struct parasite_cow_dump_args { + unsigned int nr_vmas; + unsigned long total_pages; /* Output: total pages registered */ + unsigned int nr_failed_vmas; /* Output: number of VMAs that couldn't be registered */ + int ret; /* Output: return code */ +}; + +static inline struct parasite_vma_entry *cow_dump_vmas(struct parasite_cow_dump_args *a) +{ + return (struct parasite_vma_entry *)(a + 1); +} + +static inline unsigned int *cow_dump_failed_indices(struct parasite_cow_dump_args *a) +{ + return (unsigned int *)(cow_dump_vmas(a) + a->nr_vmas); +} + #endif /* !__ASSEMBLY__ */ #endif /* __CR_PARASITE_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 9e8740c07..00d309ec2 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -698,8 +698,8 @@ int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vm * 9. syscall fails to copy * data from M */ - - if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + + if ((pargs->nr_vmas != 0) &&(!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE)) { pargs->add_prot = PROT_READ; ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); if (ret) { @@ -719,8 +719,7 @@ int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vm /* Parasite will unprotect VMAs after fail in fini() */ return ret; } - - if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + if ((pargs->nr_vmas != 0) &&(!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE)) { pargs->add_prot = 0; if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { pr_err("Can't rollback unprotected vmas with parasite\n"); diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 463d4c506..8cfa5cd75 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -8,6 +8,10 @@ #include #include #include +#include +#include +#include +#include #undef LOG_PREFIX #define LOG_PREFIX "page-xfer: " @@ -27,6 +31,11 @@ #include "rst_info.h" #include "stats.h" #include "tls.h" +#include "uffd.h" +#include "cow-dump.h" +#include "criu-plugin.h" +#include "plugin.h" +#include "dump.h" static int page_server_sk = -1; @@ -1067,6 +1076,55 @@ static int prep_loc_xfer(struct page_server_iov *pi) return 0; } +/* Statistics tracking structure */ +static struct { + /* page_server_get_pages counters */ + unsigned long get_total_requests; + unsigned long get_with_cow; + unsigned long get_no_cow; + unsigned long get_total_pages; + unsigned long get_cow_pages; + unsigned long get_errors; + + /* page_server_serve counters */ + unsigned long serve_open; + unsigned long serve_open2; + unsigned long serve_parent; + unsigned long serve_add_f; + unsigned long serve_add; + unsigned long serve_hole; + unsigned long serve_close; + unsigned long serve_force_close; + unsigned long serve_get; + unsigned long serve_unknown; + + time_t last_print_time; +} ps_stats; + +static void check_and_print_stats(void) +{ + time_t now = time(NULL); + + if (now - ps_stats.last_print_time >= 1) { + pr_warn("[PAGE_SERVER_STATS] get_pages: reqs=%lu with_cow=%lu no_cow=%lu pages=%lu cow=%lu errs=%lu | serve: open2=%lu parent=%lu add_f=%lu get=%lu close=%lu\n", + ps_stats.get_total_requests, + ps_stats.get_with_cow, + ps_stats.get_no_cow, + ps_stats.get_total_pages, + ps_stats.get_cow_pages, + ps_stats.get_errors, + ps_stats.serve_open2, + ps_stats.serve_parent, + ps_stats.serve_add_f, + ps_stats.serve_get, + ps_stats.serve_close + ps_stats.serve_force_close); + + /* Reset all counters */ + memset(&ps_stats, 0, sizeof(ps_stats)); + ps_stats.last_print_time = now; + } +} + static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) { size_t len; @@ -1141,6 +1199,16 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) struct page_pipe *pp; unsigned long len, nr_pages; int ret; + struct uffdio_writeprotect wp; + int uffd = -1; + void *buffer = NULL; + unsigned long i; + struct cow_page **cow_pages = NULL; + unsigned long cow_count = 0; + + /* Update statistics */ + ps_stats.get_total_requests++; + check_and_print_stats(); item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; @@ -1150,8 +1218,11 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * on 32-bit platforms (e.g. armv7). */ nr_pages = pi->nr_pages; ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); - if (ret) - return ret; + + if (ret) { + ps_stats.get_errors++; + return ret; + } /* * The pi is reused for send_psi here, so .nr_pages, .vaddr and @@ -1161,29 +1232,142 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); + ps_stats.get_errors++; return -1; } - pi->cmd = encode_ps_cmd(PS_IOV_ADD_F, PE_PRESENT); - if (send_psi(sk, pi)) - return -1; - len = pi->nr_pages * PAGE_SIZE; + ps_stats.get_total_pages += pi->nr_pages; - if (opts.tls) { - if (tls_send_data_from_fd(pipe_read_dest.p[0], len)) - return -1; + /* Single-pass lookup - collect all COW pages */ + cow_pages = xzalloc(pi->nr_pages * sizeof(struct cow_page *)); + if (!cow_pages) { + pr_err("Failed to allocate COW pages array\n"); + ps_stats.get_errors++; + return -1; + } + + for (i = 0; i < pi->nr_pages; i++) { + unsigned long page_addr = pi->vaddr + (i * PAGE_SIZE); + cow_pages[i] = cow_lookup_and_remove_page(page_addr); + if (cow_pages[i]) + cow_count++; + } + + /* Send response header */ + pi->cmd = encode_ps_cmd(PS_IOV_ADD_F, PE_PRESENT); + if (send_psi(sk, pi)) { + xfree(cow_pages); + ps_stats.get_errors++; + return -1; + } + + /* Choose fast or slow path based on COW presence */ + if (cow_count == 0) { + /* FAST PATH: Zero-copy splice from pipe to socket */ + pr_debug("Zero-copy path: splicing %lu pages directly\n", pi->nr_pages); + ps_stats.get_no_cow++; + + if (opts.tls) { + ret = tls_send_data_from_fd(pipe_read_dest.p[0], len); + if (ret) { + pr_err("Failed to send via TLS from pipe\n"); + xfree(cow_pages); + ps_stats.get_errors++; + return -1; + } + } else { + ssize_t spliced = 0; + while (spliced < len) { + ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, + len - spliced, SPLICE_F_MOVE); + if (ret <= 0) { + pr_perror("Failed to splice pipe to socket"); + xfree(cow_pages); + ps_stats.get_errors++; + return -1; + } + spliced += ret; + } + } } else { - ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, SPLICE_F_MOVE); - if (ret != len) + /* SLOW PATH: Buffer + overlay COW pages */ + pr_debug("Buffered path: overlaying %lu COW pages out of %lu total\n", + cow_count, pi->nr_pages); + ps_stats.get_with_cow++; + ps_stats.get_cow_pages += cow_count; + + buffer = xmalloc(len); + if (!buffer) { + pr_err("Failed to allocate buffer for %lu pages\n", pi->nr_pages); + goto err_free_cow; + } + + ret = read(pipe_read_dest.p[0], buffer, len); + if (ret != len) { + pr_err("Short read from pipe: %d vs %lu\n", ret, len); + goto err_free_all; + } + + /* Overlay COW pages */ + for (i = 0; i < pi->nr_pages; i++) { + if (cow_pages[i]) { + pr_debug("Overlaying COW page at index %lu\n", i); + memcpy(buffer + (i * PAGE_SIZE), cow_pages[i]->data, PAGE_SIZE); + xfree(cow_pages[i]->data); + xfree(cow_pages[i]); + } + } + + /* Send buffered data */ + if (opts.tls) { + if (__send(sk, buffer, len, 0) != len) { + pr_perror("Failed to send page buffer via TLS"); + goto err_free_all; + } + } else { + if (send(sk, buffer, len, 0) != len) { + pr_perror("Failed to send page buffer"); + goto err_free_all; + } + } + + xfree(buffer); + } + + xfree(cow_pages); + + /* Step 5: Unprotect all pages in one operation */ + uffd = cow_get_uffd(); + if (uffd >= 0) { + wp.range.start = pi->vaddr; + wp.range.len = len; + wp.mode = 0; /* Clear write-protect */ + + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) { + pr_perror("Failed to unprotect pages at 0x%llx", wp.range.start); + ps_stats.get_errors++; return -1; + } } tcp_nodelay(sk, true); - return 0; -} +err_free_all: + xfree(buffer); +err_free_cow: + for (i = 0; i < pi->nr_pages; i++) { + if (cow_pages[i]) { + xfree(cow_pages[i]->data); + xfree(cow_pages[i]); + } + } + xfree(cow_pages); + ps_stats.get_errors++; + return -1; +} +extern void pstree_switch_state(struct pstree_item *root_item, int st); static int page_server_serve(int sk) { int ret = -1; @@ -1228,36 +1412,54 @@ static int page_server_serve(int sk) flushed = false; cmd = decode_ps_cmd(pi.cmd); + /* Check and print stats on each iteration */ + check_and_print_stats(); + switch (cmd) { case PS_IOV_OPEN: + ps_stats.serve_open++; ret = page_server_open(-1, &pi); break; case PS_IOV_OPEN2: + ps_stats.serve_open2++; ret = page_server_open(sk, &pi); break; case PS_IOV_PARENT: + ps_stats.serve_parent++; ret = page_server_check_parent(sk, &pi); break; case PS_IOV_ADD_F: case PS_IOV_ADD: case PS_IOV_HOLE: { u32 flags; - - if (likely(cmd == PS_IOV_ADD_F)) + + if (likely(cmd == PS_IOV_ADD_F)) { flags = decode_ps_flags(pi.cmd); - else if (cmd == PS_IOV_ADD) + ps_stats.serve_add_f++; + } + else if (cmd == PS_IOV_ADD){ flags = PE_PRESENT; + ps_stats.serve_add++; + } else /* PS_IOV_HOLE */ + { flags = PE_PARENT; + ps_stats.serve_hole++; + } ret = page_server_add(sk, &pi, flags); break; - } + } case PS_IOV_CLOSE: case PS_IOV_FORCE_CLOSE: { int32_t status = 0; ret = 0; + + if (cmd == PS_IOV_CLOSE) + ps_stats.serve_close++; + else + ps_stats.serve_force_close++; /* * An answer must be sent back to inform another side, @@ -1272,10 +1474,12 @@ static int page_server_serve(int sk) break; } case PS_IOV_GET: + ps_stats.serve_get++; ret = page_server_get_pages(sk, &pi); break; default: pr_err("Unknown command %u\n", pi.cmd); + ps_stats.serve_unknown++; ret = -1; break; } diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index c966e9e62..dc9a6fb5b 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "linux/rseq.h" @@ -854,6 +855,141 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) return 0; } +static int parasite_cow_dump_init(struct parasite_cow_dump_args *args) +{ + struct parasite_vma_entry *vmas, *vma; + struct uffdio_register reg; + struct uffdio_writeprotect wp; + struct uffdio_api api; + int uffd, tsock, i; + int ret = 0; + unsigned long addr, len; + unsigned long total_pages = 0; + unsigned int *failed_indices; + + + pr_info("COW dump init: registering %d VMAs\n", args->nr_vmas); + + args->nr_failed_vmas = 0; + failed_indices = cow_dump_failed_indices(args); + + /* Create userfaultfd in target process context */ + uffd = sys_userfaultfd(O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + int err = -uffd; // Convert negative errno to positive + pr_err("Failed to create userfaultfd: %d (%s)\n", err, + err == ENOSYS ? "not supported" : + err == EPERM ? "permission denied" : + err == EINVAL ? "invalid flags" : "unknown error"); + return -1; + } + + /* Initialize userfaultfd API with WP features */ + memset(&api, 0, sizeof(api)); + api.api = UFFD_API; + api.features = 0; + api.ioctls = 0; + + ret = sys_ioctl(uffd, UFFDIO_API, (unsigned long)&api); + if (ret < 0) { + int e = (ret < 0) ? -ret : ret; /* convert to +errno code */ + + pr_err("Failed to initialize userfaultfd API: %d uffd=%d but continue\n", e, uffd); + sys_close(uffd); + return -1; + } + + pr_info("UFFD created with features: 0x%llx\n", (unsigned long long)api.features); + + vmas = cow_dump_vmas(args); + + /* Register each VMA with write-protection */ + for (i = 0; i < args->nr_vmas; i++) { + vma = vmas + i; + addr = vma->start; + len = vma->len; + + pr_info("Registering VMA %d: %lx-%lx prot=%x len=%lu\n", + i, addr, addr + len, vma->prot, len); + + /* Skip non-writable VMAs */ + if (!(vma->prot & PROT_WRITE)) { + pr_info("Skipping non-writable VMA: %lx-%lx len=%lu\n", addr, addr + len, len); + + /* Mark for later dump by CRIU */ + failed_indices[args->nr_failed_vmas++] = i; + continue; + } + + + /* Register VMA for write-protect tracking */ + reg.range.start = addr; + reg.range.len = len; + reg.mode = UFFDIO_REGISTER_MODE_WP; + ret = sys_ioctl(uffd, UFFDIO_REGISTER, (unsigned long)®); + if (ret) { + /* Some VMAs may not support WP - record index for CRIU to dump */ + if (ret == EINVAL) { + pr_warn("Cannot WP-register VMA %lx-%lx len=%lu (unsupported), marking for later dump\n", + addr, addr + len, len); + + /* Record the index of this failed VMA */ + failed_indices[args->nr_failed_vmas++] = i; + pr_info("Marked VMA index %d for later dump (%u failed VMAs total)\n", + i, args->nr_failed_vmas); + continue; + } else { + /* Any failure to register - just dump instead of trying to track */ + pr_err("Failed to register VMA %lx-%lx: ret=%d len=%lu\n", + addr, addr + len, ret, len); + + failed_indices[args->nr_failed_vmas++] = i; + pr_info("Marked VMA index %d for immediate dump (%u total)\n", + i, args->nr_failed_vmas); + continue; + } + + } + + /* Apply write-protection */ + wp.range.start = addr; + wp.range.len = len; + wp.mode = UFFDIO_WRITEPROTECT_MODE_WP; + ret = sys_ioctl(uffd, UFFDIO_WRITEPROTECT, (unsigned long)&wp); + if (ret) { + pr_err("Failed to write-protect VMA %lx-%lx: ret=%d\n", + addr, addr + len, ret); + sys_close(uffd); + return -1; + } + + total_pages += len / PAGE_SIZE; + pr_info("Successfully registered and WP'd VMA: %lx-%lx (%lu pages)\n", + addr, addr + len, len / PAGE_SIZE); + } + + pr_info("COW dump init complete: %lu total pages\n", total_pages); + + /* Send userfaultfd back to CRIU before setting return status */ + tsock = parasite_get_rpc_sock(); + ret = send_fd(tsock, NULL, 0, uffd); + if (ret) { + pr_err("Failed to send userfaultfd back to CRIU: %d\n", ret); + sys_close(uffd); + args->ret = -1; + return -1; + } + + pr_info("Sent uffd=%d back to CRIU\n", uffd); + + /* Set success status after fd is sent */ + args->total_pages = total_pages; + args->ret = 0; + + /* Don't close uffd - it will remain open for the process */ + return 0; +} + void parasite_cleanup(void) { if (mprotect_args) { @@ -906,6 +1042,9 @@ int parasite_daemon_cmd(int cmd, void *args) case PARASITE_CMD_DUMP_CGROUP: ret = parasite_dump_cgroup(args); break; + case PARASITE_CMD_COW_DUMP_INIT: + ret = parasite_cow_dump_init(args); + break; default: pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd); ret = -1; diff --git a/criu/uffd.c b/criu/uffd.c index 8e12dcd63..77c62400a 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -98,6 +98,10 @@ struct lazy_pages_info { unsigned long buf_size; void *buf; + + /* Pipeline control */ + unsigned int pipeline_depth; /* Current in-flight requests */ + unsigned int max_pipeline_depth; /* Max allowed concurrent requests */ }; /* global lazy-pages daemon state */ @@ -110,6 +114,108 @@ static struct epoll_rfd lazy_sk_rfd; /* socket for communication with lazy-pages daemon */ static int lazy_pages_sk_id = -1; +/* Histogram statistics structure */ +static struct { + /* Histogram buckets by page count: 1, 16, 32, 64, 128, 256, 512, 1024, >1024 */ + unsigned long pf_hist[9]; /* Page fault histogram */ + unsigned long bg_hist[9]; /* Background transfer histogram */ + + unsigned long total_pf_reqs; + unsigned long total_bg_reqs; + unsigned long total_pages; + + /* Pipeline statistics */ + unsigned long pipeline_depth_sum; + unsigned long pipeline_samples; + + time_t last_print_time; +} uffd_stats; + +static int get_histogram_bucket(unsigned long nr_pages) +{ + if (nr_pages == 1) return 0; /* 4KB */ + if (nr_pages <= 16) return 1; /* 64KB */ + if (nr_pages <= 32) return 2; /* 128KB */ + if (nr_pages <= 64) return 3; /* 256KB */ + if (nr_pages <= 128) return 4; /* 512KB */ + if (nr_pages <= 256) return 5; /* 1MB */ + if (nr_pages <= 512) return 6; /* 2MB */ + if (nr_pages <= 1024) return 7; /* 4MB */ + return 8; /* >4MB */ +} + +static const char *get_bucket_label(int bucket) +{ + switch (bucket) { + case 0: return "4K"; + case 1: return "64K"; + case 2: return "128K"; + case 3: return "256K"; + case 4: return "512K"; + case 5: return "1M"; + case 6: return "2M"; + case 7: return "4M"; + case 8: return ">4M"; + default: return "?"; + } +} + +static void check_and_print_uffd_stats(void) +{ + time_t now = time(NULL); + int i; + bool has_pf = false, has_bg = false; + unsigned long avg_pipeline = 0; + + if (now - uffd_stats.last_print_time >= 1) { + /* Check if we have any data to print */ + for (i = 0; i < 9; i++) { + if (uffd_stats.pf_hist[i] > 0) has_pf = true; + if (uffd_stats.bg_hist[i] > 0) has_bg = true; + } + + if (!has_pf && !has_bg && uffd_stats.total_pf_reqs == 0 && uffd_stats.total_bg_reqs == 0) { + uffd_stats.last_print_time = now; + return; + } + + /* Calculate average pipeline depth */ + if (uffd_stats.pipeline_samples > 0) + avg_pipeline = uffd_stats.pipeline_depth_sum / uffd_stats.pipeline_samples; + + pr_warn("[UFFD_STATS] reqs=%lu(pf:%lu,bg:%lu) pages=%lu pipe_avg=%lu\n", + uffd_stats.total_pf_reqs + uffd_stats.total_bg_reqs, + uffd_stats.total_pf_reqs, + uffd_stats.total_bg_reqs, + uffd_stats.total_pages, + avg_pipeline); + + /* Print page fault histogram */ + if (has_pf) { + pr_warn(" PF: "); + for (i = 0; i < 9; i++) { + if (uffd_stats.pf_hist[i] > 0) + pr_warn(" %s=%lu", get_bucket_label(i), uffd_stats.pf_hist[i]); + } + pr_warn("\n"); + } + + /* Print background transfer histogram */ + if (has_bg) { + pr_warn(" BG: "); + for (i = 0; i < 9; i++) { + if (uffd_stats.bg_hist[i] > 0) + pr_warn(" %s=%lu", get_bucket_label(i), uffd_stats.bg_hist[i]); + } + pr_warn("\n"); + } + + /* Reset all counters */ + memset(&uffd_stats, 0, sizeof(uffd_stats)); + uffd_stats.last_print_time = now; + } +} + static int handle_uffd_event(struct epoll_rfd *lpfd); static struct lazy_pages_info *lpi_init(void) @@ -127,6 +233,10 @@ static struct lazy_pages_info *lpi_init(void) lpi->lpfd.read_event = handle_uffd_event; lpi->xfer_len = DEFAULT_XFER_LEN; lpi->ref_cnt = 1; + + /* Initialize pipeline control - start with aggressive pipelining */ + lpi->pipeline_depth = 0; + lpi->max_pipeline_depth = 256; /* 256 concurrent requests for maximum throughput */ return lpi; } @@ -843,6 +953,27 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsi return 0; } +static int xfer_pages(struct lazy_pages_info *lpi); +/* + * Aggressively refill pipeline to maximum capacity. + * Called immediately when a response arrives to keep pipeline saturated. + */ +static int refill_pipeline(struct lazy_pages_info *lpi) +{ + int ret; + + /* Keep filling until pipeline is full or we run out of data */ + while (!list_empty(&lpi->iovs) && + lpi->pipeline_depth < lpi->max_pipeline_depth) { + ret = xfer_pages(lpi); + if (ret < 0) + return ret; + } + + return 0; +} + + static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) { struct uffdio_copy uffdio_copy; @@ -855,9 +986,16 @@ static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long * uffdio_copy.copy = 0; lp_debug(lpi, "uffd_copy: 0x%llx/%ld\n", uffdio_copy.dst, len); - if (ioctl(lpi->lpfd.fd, UFFDIO_COPY, &uffdio_copy) && - uffd_check_op_error(lpi, "copy", nr_pages, uffdio_copy.copy)) - return -1; + if (ioctl(lpi->lpfd.fd, UFFDIO_COPY, &uffdio_copy)) { + if (errno == EAGAIN) { + lp_err(lpi, "uffd_copy EAGAIN: 0x%llx/%ld\n", uffdio_copy.dst, len); + } + if ( uffd_check_op_error(lpi, "copy", nr_pages, uffdio_copy.copy)) + { + lp_err(lpi, "uffd_copy EAGAIN: 0x%llx/%ld uffdio_copy.copy=%lld\n", uffdio_copy.dst, len, uffdio_copy.copy); + return -1; + } + } lpi->copied_pages += *nr_pages; @@ -916,7 +1054,20 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsign * list and let drop_iovs do the range math, free memory etc. */ iov_list_insert(req, &lpi->iovs); - return drop_iovs(lpi, addr, nr * PAGE_SIZE); + ret = drop_iovs(lpi, addr, nr * PAGE_SIZE); + + /* + * Decrement pipeline depth now that response is processed. + * IMMEDIATELY refill pipeline to keep it saturated - don't wait for main loop! + * This is the key to aggressive pipelining and reducing source EAGAIN. + */ + lpi->pipeline_depth--; + + if (!lpi->exited && !list_empty(&lpi->iovs)) { + refill_pipeline(lpi); + } + + return ret; } static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) @@ -990,6 +1141,8 @@ static struct lazy_iov *pick_next_range(struct lazy_pages_info *lpi) */ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) { + lpi->xfer_len = 8*1024;//MAX_XFER_LEN; + return; //TODO remove if (pf) lpi->xfer_len = DEFAULT_XFER_LEN; else @@ -1005,6 +1158,7 @@ static int xfer_pages(struct lazy_pages_info *lpi) unsigned long nr_pages; unsigned long len; int err; + int bucket; iov = pick_next_range(lpi); if (!iov) @@ -1019,17 +1173,29 @@ static int xfer_pages(struct lazy_pages_info *lpi) nr_pages = (iov->end - iov->start) / PAGE_SIZE; + /* Update statistics */ + uffd_stats.total_bg_reqs++; + uffd_stats.total_pages += nr_pages; + bucket = get_histogram_bucket(nr_pages); + uffd_stats.bg_hist[bucket]++; + update_xfer_len(lpi, false); + /* Increment pipeline depth BEFORE sending request */ + lpi->pipeline_depth++; + err = uffd_handle_pages(lpi, iov->img_start, nr_pages, PR_ASYNC | PR_ASAP); if (err < 0) { lp_err(lpi, "Error during UFFD copy\n"); + lpi->pipeline_depth--; /* Rollback on error */ return -1; } return 0; } + + static int handle_remove(struct lazy_pages_info *lpi, struct uffd_msg *msg) { struct uffdio_range unreg; @@ -1154,6 +1320,8 @@ static int handle_page_fault(struct lazy_pages_info *lpi, struct uffd_msg *msg) struct lazy_iov *iov; __u64 address; int ret; + unsigned long nr_pages; + int bucket; /* Align requested address to the next page boundary */ address = msg->arg.pagefault.address & ~(page_size() - 1); @@ -1172,11 +1340,23 @@ static int handle_page_fault(struct lazy_pages_info *lpi, struct uffd_msg *msg) list_move(&iov->l, &lpi->reqs); + nr_pages = (iov->end - iov->start) / PAGE_SIZE; + + /* Update statistics */ + uffd_stats.total_pf_reqs++; + uffd_stats.total_pages += nr_pages; + bucket = get_histogram_bucket(nr_pages); + uffd_stats.pf_hist[bucket]++; + update_xfer_len(lpi, true); - ret = uffd_handle_pages(lpi, iov->img_start, 1, PR_ASYNC | PR_ASAP); + /* Increment pipeline depth BEFORE sending request (just like background transfers) */ + lpi->pipeline_depth++; + + ret = uffd_handle_pages(lpi, iov->img_start, nr_pages, PR_ASYNC | PR_ASAP); if (ret < 0) { lp_err(lpi, "Error during regular page copy\n"); + lpi->pipeline_depth--; /* Rollback on error */ return -1; } @@ -1248,6 +1428,15 @@ static int handle_requests(int epollfd, struct epoll_event **events, int nr_fds) int ret; for (;;) { + /* Sample pipeline depth for statistics */ + list_for_each_entry_safe(lpi, n, &lpis, l) { + uffd_stats.pipeline_depth_sum += lpi->pipeline_depth; + uffd_stats.pipeline_samples++; + } + + /* Check and print statistics every second */ + check_and_print_uffd_stats(); + ret = epoll_run_rfds(epollfd, *events, nr_fds, poll_timeout); if (ret < 0) goto out; @@ -1265,11 +1454,11 @@ static int handle_requests(int epollfd, struct epoll_event **events, int nr_fds) ret = 0; list_for_each_entry_safe(lpi, n, &lpis, l) { - if (!list_empty(&lpi->iovs) && list_empty(&lpi->reqs)) { - ret = xfer_pages(lpi); + /* Aggressively refill pipeline to keep it saturated at all times */ + if (!list_empty(&lpi->iovs)) { + ret = refill_pipeline(lpi); if (ret < 0) goto out; - break; } if (list_empty(&lpi->reqs)) {