diff --git a/compel/arch/arm/plugins/std/parasite-head.S b/compel/arch/arm/plugins/std/parasite-head.S deleted file mode 100644 index 6e46bed1fc..0000000000 --- a/compel/arch/arm/plugins/std/parasite-head.S +++ /dev/null @@ -1,8 +0,0 @@ -#include "common/asm/linkage.h" - - .section .head.text, "ax" -ENTRY(__export_parasite_head_start) - bl parasite_service - .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux - -END(__export_parasite_head_start) diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index ba6132d2f7..b9b0f5bdd7 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -87,6 +87,7 @@ obj-y += path.o obj-y += autofs.o obj-y += fdstore.o obj-y += uffd.o +obj-y += cow-dump.o obj-y += config.o obj-y += servicefd.o obj-y += pie-util-vdso.o diff --git a/criu/arch/arm/aeabi-helpers.S b/criu/arch/arm/aeabi-helpers.S deleted file mode 100644 index ea8561d48f..0000000000 --- a/criu/arch/arm/aeabi-helpers.S +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Code borrowed from gcc, arm/lib1funcs.S - * and adapted to CRIU macros. - */ - -#if defined(__thumb__) -/* - * We don't support compiling PIEs in Thumb mode, - * see top Makefile for details (ARM CFLAGS_PIE section). 
-*/ -#error Unsupported Thumb mode -#endif - -#include "common/asm/linkage.h" - -#define RET bx lr -#define RETc(x) bx##x lr -#define LSYM(x) .x - -.macro do_it cond, suffix="" -.endm - -.macro ARM_DIV2_ORDER divisor, order - clz \order, \divisor - rsb \order, \order, #31 -.endm - -.macro ARM_DIV_BODY dividend, divisor, result, curbit - clz \curbit, \dividend - clz \result, \divisor - sub \curbit, \result, \curbit - rsbs \curbit, \curbit, #31 - addne \curbit, \curbit, \curbit, lsl #1 - mov \result, #0 - addne pc, pc, \curbit, lsl #2 - nop - .set shift, 32 - .rept 32 - .set shift, shift - 1 - cmp \dividend, \divisor, lsl #shift - adc \result, \result, \result - subcs \dividend, \dividend, \divisor, lsl #shift - .endr -.endm - -/* - * XXX: as an optimization add udiv instruction based version. - * It's possible to check if CPU supports the instruction by - * reading Instruction Set Attribute Register (ID_ISAR0) - * and checking fields "Divide_instrs". - */ -ENTRY(__aeabi_uidiv) - /* Note: if called via udivsi3_skip_div0_test, this will unnecessarily - check for division-by-zero a second time. 
*/ -LSYM(udivsi3_skip_div0_test): - subs r2, r1, #1 - do_it eq - RETc(eq) - bcc LSYM(Ldiv0) - cmp r0, r1 - bls 11f - tst r1, r2 - beq 12f - - ARM_DIV_BODY r0, r1, r2, r3 - - mov r0, r2 - RET - -11: do_it eq, e - moveq r0, #1 - movne r0, #0 - RET - -12: ARM_DIV2_ORDER r1, r2 - - mov r0, r0, lsr r2 - RET - -LSYM(Ldiv0): - .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux - -END(__aeabi_uidiv) -ALIAS(__udivsi3, __aeabi_uidiv) - -ENTRY(__aeabi_uidivmod) - cmp r1, #0 - beq LSYM(Ldiv0) - stmfd sp!, { r0, r1, lr } - bl LSYM(udivsi3_skip_div0_test) - ldmfd sp!, { r1, r2, lr } - mul r3, r2, r0 - sub r1, r1, r3 - RET -END(__aeabi_uidivmod) -ALIAS(__umodsi3, __aeabi_uidiv) diff --git a/criu/arch/arm/bitops.S b/criu/arch/arm/bitops.S deleted file mode 100644 index 51939118b6..0000000000 --- a/criu/arch/arm/bitops.S +++ /dev/null @@ -1,24 +0,0 @@ -#include "common/asm/linkage.h" - -.syntax unified - -ENTRY(test_and_set_bit) - ands ip, r1, #3 - strbne r1, [ip] @ assert word-aligned - mov r2, #1 - and r3, r0, #31 @ Get bit offset - mov r0, r0, lsr #5 - add r1, r1, r0, lsl #2 @ Get word offset - mov r3, r2, lsl r3 @ create mask - dmb ish -1: ldrex r2, [r1] - ands r0, r2, r3 @ save old value of bit - orreq r2, r2, r3 @ toggle bit - strex ip, r2, [r1] - cmp ip, #0 - bne 1b - dmb ish - cmp r0, #0 - movne r0, #1 -2: bx lr -END(test_and_set_bit) diff --git a/criu/config.c b/criu/config.c index d7ef3f8e8b..9d6084ea45 100644 --- a/criu/config.c +++ b/criu/config.c @@ -672,6 +672,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "external", required_argument, 0, 1073 }, { "empty-ns", required_argument, 0, 1074 }, { "lazy-pages", no_argument, 0, 1076 }, + { "cow-dump", no_argument, 0, 1101 }, BOOL_OPT("extra", &opts.check_extra_features), BOOL_OPT("experimental", &opts.check_experimental_features), { "all", no_argument, 0, 1079 }, @@ -942,6 +943,9 @@ int parse_options(int argc, char **argv, bool *usage_error, 
bool *has_exec_cmd, case 1076: opts.lazy_pages = true; break; + case 1101: + opts.cow_dump = true; + break; case 'M': { char *aux; diff --git a/criu/cow-dump.c b/criu/cow-dump.c new file mode 100644 index 0000000000..1c0d814d64 --- /dev/null +++ b/criu/cow-dump.c @@ -0,0 +1,540 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "cr_options.h" +#include "pstree.h" +#include "cow-dump.h" +#include "uffd.h" +#include "page-xfer.h" +#include "page-pipe.h" +#include "parasite-syscall.h" +#include "mem.h" +#include "vma.h" +#include "util.h" +#include "kerndat.h" +#include "criu-log.h" +#include "parasite.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cow-dump: " + +/* COW dump state for a single process */ +struct cow_dump_info { + struct pstree_item *item; + int uffd; /* userfaultfd for write tracking */ + int proc_mem_fd; /* /proc/pid/mem for reading pages */ + unsigned long total_pages; /* Total pages being tracked */ + unsigned long dirty_pages; /* Pages modified in current iteration */ + unsigned long dirty_pages_dumped; /* Pages already written to disk */ + unsigned long iteration; /* Current iteration number */ + struct list_head dirty_list; /* List of dirty page ranges */ + struct page_xfer xfer; /* Page transfer context */ + struct page_pipe *pp; /* Page pipe for batching writes */ + bool xfer_initialized; /* Whether xfer was opened */ +}; + +/* Dirty page range */ +struct dirty_range { + unsigned long start; + unsigned long len; + struct list_head list; +}; + +static struct cow_dump_info *g_cow_info = NULL; +static pthread_t g_monitor_thread; +static volatile bool g_stop_monitoring = false; + +#define COW_MAX_ITERATIONS 10 +#define COW_CONVERGENCE_THRESHOLD 100 /* Stop if < 100 pages dirty per iteration */ +#define COW_FLUSH_THRESHOLD 1000 /* Flush to disk every 1000 pages */ + +bool cow_check_kernel_support(void) +{ + unsigned long features = UFFD_FEATURE_WP_ASYNC | + 
UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_EVENT_FORK | + UFFD_FEATURE_EVENT_REMAP; + int uffd, err = 0; + + uffd = uffd_open(0, &features, &err); + if (uffd < 0) { + if (err == ENOSYS) { + pr_info("userfaultfd not supported by kernel\n"); + } else if (err == EPERM) { + pr_info("userfaultfd requires CAP_SYS_PTRACE or sysctl vm.unprivileged_userfaultfd=1\n"); + } + return false; + } + + if (!(features & UFFD_FEATURE_WP_ASYNC)) { + pr_info("userfaultfd write-protect feature not supported (need kernel 5.7+)\n"); + close(uffd); + return false; + } + + if (!(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { + pr_info("userfaultfd WP pagefault flag not supported (need kernel 5.7+)\n"); + close(uffd); + return false; + } + + close(uffd); + pr_info("COW dump kernel support detected\n"); + return true; +} + +static int open_proc_mem(pid_t pid) +{ + char path[64]; + int fd; + + snprintf(path, sizeof(path), "/proc/%d/mem", pid); + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open %s", path); + return -1; + } + + return fd; +} + +int cow_dump_init(struct pstree_item *item, struct vm_area_list *vma_area_list, struct parasite_ctl *ctl) +{ + struct cow_dump_info *cdi; + struct vma_area *vma; + struct parasite_cow_dump_args *args; + struct parasite_vma_entry *p_vma; + + int ret; + unsigned long args_size; + unsigned int nr_vmas = 0; + + pr_info("Initializing COW dump for pid %d (via parasite)\n", item->pid->real); + + if (!cow_check_kernel_support()) { + pr_err("Kernel doesn't support COW dump\n"); + return -1; + } + + cdi = xzalloc(sizeof(*cdi)); + if (!cdi) + return -1; + + cdi->item = item; + INIT_LIST_HEAD(&cdi->dirty_list); + cdi->uffd = -1; /* Will be received from parasite */ + + /* Open /proc/pid/mem for reading pages */ + cdi->proc_mem_fd = open_proc_mem(item->pid->real); + if (cdi->proc_mem_fd < 0) + goto err_free; + + /* Prepare parasite arguments - count writable VMAs */ + nr_vmas = 0; + list_for_each_entry(vma, &vma_area_list->h, list) { + if 
(vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->prot & PROT_WRITE) + nr_vmas++; + } + + /* Allocate parasite args - includes space for VMAs and failed indices */ + args_size = sizeof(*args) + + nr_vmas * sizeof(struct parasite_vma_entry) + + nr_vmas * sizeof(unsigned int); /* Space for failed indices */ + args = compel_parasite_args_s(ctl, args_size); + if (!args) { + pr_err("Failed to allocate parasite args\n"); + goto err_close_mem; + } + + args->nr_vmas = nr_vmas; + args->total_pages = 0; + args->nr_failed_vmas = 0; + args->ret = -1; + + /* Fill VMA entries */ + p_vma = cow_dump_vmas(args); + nr_vmas = 0; + list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!(vma->e->prot & PROT_WRITE)) + continue; + + p_vma[nr_vmas].start = vma->e->start; + p_vma[nr_vmas].len = vma->e->end - vma->e->start; + p_vma[nr_vmas].prot = vma->e->prot; + nr_vmas++; + } + + pr_info("Calling parasite to register %u VMAs\n", args->nr_vmas); + + /* Call parasite to create uffd and perform registration (async) */ + ret = compel_rpc_call(PARASITE_CMD_COW_DUMP_INIT, ctl); + if (ret < 0) { + pr_err("Failed to initiate COW dump RPC\n"); + goto err_close_mem; + } + + /* Receive userfaultfd from parasite */ + compel_util_recv_fd(ctl, &cdi->uffd); + if (cdi->uffd < 0) { + pr_err("Failed to receive userfaultfd from parasite: %d\n", cdi->uffd); + goto err_close_mem; + } + pr_info("Got fd %d VMAs\n", cdi->uffd); + /* Wait for parasite to complete */ + ret = compel_rpc_sync(PARASITE_CMD_COW_DUMP_INIT, ctl); + if (ret < 0 || args->ret != 0) { + pr_err("Parasite COW dump init failed: %d (ret=%d)\n", ret, args->ret); + close(cdi->uffd); + cdi->uffd = -1; + goto err_close_mem; + } + + cdi->total_pages = args->total_pages; + cdi->dirty_pages_dumped = 0; + cdi->xfer_initialized = false; + + /* Initialize page_xfer for writing pages to disk */ + ret = open_page_xfer(&cdi->xfer, CR_FD_PAGEMAP, vpid(item)); + if (ret < 0) { + pr_err("Failed 
to open page_xfer\n"); + close(cdi->uffd); + goto err_close_mem; + } + cdi->xfer_initialized = true; + + /* Create page_pipe for batching page writes */ + cdi->pp = create_page_pipe(cdi->total_pages, NULL, 0); + if (!cdi->pp) { + pr_err("Failed to create page_pipe\n"); + cdi->xfer.close(&cdi->xfer); + close(cdi->uffd); + goto err_close_mem; + } + + pr_info("COW dump initialized: tracking %lu pages, uffd=%d\n", + cdi->total_pages, cdi->uffd); + + + g_cow_info = cdi; + return 0; + +err_close_mem: + close(cdi->proc_mem_fd); +err_free: + xfree(cdi); + return -1; +} + +void cow_dump_fini(void) +{ + struct dirty_range *dr, *tmp; + + if (!g_cow_info) + return; + + pr_info("Cleaning up COW dump\n"); + + /* Flush any remaining dirty pages before cleanup */ + if (g_cow_info->pp && g_cow_info->xfer_initialized) { + pr_info("Flushing remaining dirty pages: %lu dumped so far\n", + g_cow_info->dirty_pages_dumped); + if (page_xfer_dump_pages(&g_cow_info->xfer, g_cow_info->pp) < 0) + pr_err("Failed to flush remaining pages during cleanup\n"); + } + + list_for_each_entry_safe(dr, tmp, &g_cow_info->dirty_list, list) { + list_del(&dr->list); + xfree(dr); + } + + if (g_cow_info->pp) + destroy_page_pipe(g_cow_info->pp); + + if (g_cow_info->xfer_initialized) + g_cow_info->xfer.close(&g_cow_info->xfer); + + if (g_cow_info->proc_mem_fd >= 0) + close(g_cow_info->proc_mem_fd); + + if (g_cow_info->uffd >= 0) + close(g_cow_info->uffd); + + xfree(g_cow_info); + g_cow_info = NULL; +} +#if 0 +/* Flush accumulated dirty pages to disk */ +static int cow_flush_dirty_pages(struct cow_dump_info *cdi) +{ + int ret; + + if (!cdi->pp || !cdi->xfer_initialized) + return 0; + + /* Check if there are pages to flush */ + if (cdi->pp->nr_pipes == 0) + return 0; + + pr_info("Flushing %lu dirty pages to disk\n", + cdi->dirty_pages_dumped - (cdi->dirty_pages_dumped - cdi->pp->nr_pipes)); + + ret = page_xfer_dump_pages(&cdi->xfer, cdi->pp); + if (ret < 0) { + pr_err("Failed to flush dirty pages to disk\n"); + 
return ret; + } + + /* Reset page_pipe for next batch */ + page_pipe_reinit(cdi->pp); + + return 0; +} + +/* Write a single page to the page_pipe */ +static int cow_write_page_to_pipe(struct cow_dump_info *cdi, unsigned long page_addr) +{ + unsigned char page_buf[PAGE_SIZE]; + ssize_t ret; + int pipe_fd; + + /* Read the page from /proc/pid/mem */ + ret = pread(cdi->proc_mem_fd, page_buf, PAGE_SIZE, page_addr); + if (ret != PAGE_SIZE) { + if (ret < 0) + pr_perror("Failed to read page at 0x%lx from /proc/pid/mem", page_addr); + else + pr_err("Short read from /proc/pid/mem at 0x%lx: %zd\n", page_addr, ret); + return -1; + } + + /* Add page to page_pipe - this creates the iov entry */ + ret = page_pipe_add_page(cdi->pp, page_addr, 0); + if (ret < 0) { + if (ret == -EAGAIN) { + /* Page pipe is full, flush it */ + if (cow_flush_dirty_pages(cdi) < 0) + return -1; + /* Try again after flush */ + ret = page_pipe_add_page(cdi->pp, page_addr, 0); + if (ret < 0) { + pr_err("Failed to add page to pipe even after flush\n"); + return -1; + } + } else { + pr_err("Failed to add page 0x%lx to page_pipe: %d\n", page_addr, (int)ret); + return -1; + } + } + + /* Write page data to the pipe */ + /* The page_pipe has buffers, we need to write to the last buffer's write end */ + if (!list_empty(&cdi->pp->bufs)) { + struct page_pipe_buf *ppb = list_entry(cdi->pp->bufs.prev, struct page_pipe_buf, l); + pipe_fd = ppb->p[1]; /* Write end of pipe */ + + ret = write(pipe_fd, page_buf, PAGE_SIZE); + if (ret != PAGE_SIZE) { + if (ret < 0) + pr_perror("Failed to write page to pipe"); + else + pr_err("Short write to pipe: %zd\n", ret); + return -1; + } + } else { + pr_err("No page_pipe buffers available\n"); + return -1; + } + + cdi->dirty_pages_dumped++; + + /* Check if we should flush */ + if (cdi->dirty_pages_dumped % COW_FLUSH_THRESHOLD == 0) { + pr_debug("Reached flush threshold, flushing pages\n"); + return cow_flush_dirty_pages(cdi); + } + + return 0; +} +#endif +static int 
cow_handle_write_fault(struct cow_dump_info *cdi, unsigned long addr)
+{
+	/*
+	 * Record the write-faulted page in cdi->dirty_list, then drop its uffd
+	 * write protection and wake the faulting thread. 0 on success, -1 on error.
+	 */
+	unsigned long page_addr = addr & ~(PAGE_SIZE - 1);
+	struct uffdio_writeprotect wp;
+	struct uffdio_range range;
+	struct dirty_range *dr;
+
+	pr_debug("Write fault at 0x%lx\n", page_addr);
+	cdi->dirty_pages++;
+
+	/* Track the page by its target address; its contents are not copied here */
+	dr = xmalloc(sizeof(*dr));
+	if (!dr)
+		return -1;
+
+	dr->start = page_addr;
+	dr->len = PAGE_SIZE;
+	INIT_LIST_HEAD(&dr->list);
+	list_add_tail(&dr->list, &cdi->dirty_list);
+
+	/* Unprotect the page so the process can continue */
+	wp.range.start = page_addr;
+	wp.range.len = PAGE_SIZE;
+	wp.mode = 0; /* Clear write-protect */
+
+	if (ioctl(cdi->uffd, UFFDIO_WRITEPROTECT, &wp)) {
+		pr_perror("Failed to unprotect page at 0x%lx", page_addr);
+		return -1;
+	}
+
+	/* Wake up the faulting thread */
+	range.start = page_addr;
+	range.len = PAGE_SIZE;
+
+	if (ioctl(cdi->uffd, UFFDIO_WAKE, &range)) {
+		pr_perror("Failed to wake thread after unprotect");
+		return -1;
+	}
+
+	/* Guard the decrement: the monitor loop only exits at 0, so a wrap would spin forever */
+	if (cdi->total_pages)
+		cdi->total_pages--;
+	return 0;
+}
+
+/* Drain uffd events: 0 when none are pending (only if !blocking), -1 on error */
+static int cow_process_events(struct cow_dump_info *cdi, bool blocking)
+{
+	struct uffd_msg msg;
+	int ret;
+
+	while (1) {
+		ret = read(cdi->uffd, &msg, sizeof(msg));
+		if (ret < 0) {
+			if (errno == EINTR)
+				continue;
+			if (errno == EAGAIN && !blocking)
+				return 0; /* No more events */
+			pr_perror("Failed to read uffd event");
+			return -1;
+		}
+
+		if (ret != sizeof(msg)) {
+			pr_err("Short read from uffd: %d\n", ret);
+			return -1;
+		}
+
+		switch (msg.event) {
+		case UFFD_EVENT_PAGEFAULT:
+			if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+				/* Write fault - track it */
+				if (cow_handle_write_fault(cdi, msg.arg.pagefault.address))
+					return -1;
+			}
+			break;
+
+		case UFFD_EVENT_FORK:
+			pr_warn("Process forked during COW dump (not fully supported)\n");
+			/* The event carries a new uffd for the child (see userfaultfd(2)); it is not tracked, so close it */
+			close(msg.arg.fork.ufd);
+			break;
+
+		case UFFD_EVENT_REMAP:
+			pr_info("Memory remap event\n");
+			break;
+
+		default:
+			pr_err("Unexpected uffd event: %u\n", msg.event);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Background thread that monitors for write faults */
+static void *cow_monitor_thread(void *arg)
+{
+	int iteration_count = 0;
+	struct cow_dump_info *cdi = (struct cow_dump_info *)arg;
+
+	pr_info("COW monitor thread started\n");
+
+	/* Stop when asked to, else cow_stop_monitor_thread() blocks in pthread_join() */
+	while (!g_stop_monitoring && cdi->total_pages != 0) {
+		/* Non-blocking drain; NOTE(review): assumes the uffd was opened O_NONBLOCK */
+		if (cow_process_events(cdi, false) < 0) {
+			pr_err("Error processing COW events in monitor thread\n");
+			break;
+		}
+
+		/* Progress report every 10000 polling rounds */
+		iteration_count++;
+		if (iteration_count >= 10000) {
+			pr_info("COW monitor: %lu pages remaining\n", cdi->total_pages);
+			iteration_count = 0;
+		}
+	}
+
+	pr_info("COW monitor thread stopped\n");
+	return NULL;
+}
+
+int cow_start_monitor_thread(void)
+{
+	int ret;
+
+	if (!g_cow_info) {
+		pr_err("COW dump not initialized\n");
+		return -1;
+	}
+
+	/* Clear any stop request left over from a previous run */
+	g_stop_monitoring = false;
+	ret = pthread_create(&g_monitor_thread, NULL, cow_monitor_thread, g_cow_info);
+	if (ret) {
+		errno = ret; /* pthread_create() returns the error code instead of setting errno */
+		pr_perror("Failed to create COW monitor thread");
+		return -1;
+	}
+
+	pr_info("COW monitor
thread created successfully\n"); + return 0; +} + +int cow_stop_monitor_thread(void) +{ + void *retval; + + if (!g_cow_info) { + return 0; /* Nothing to stop */ + } + + pr_info("Stopping COW monitor thread\n"); + g_stop_monitoring = true; + + /* Wait for thread to finish */ + if (pthread_join(g_monitor_thread, &retval)) { + pr_perror("Failed to join COW monitor thread"); + return -1; + } + + pr_info("COW monitor thread stopped successfully\n"); + return 0; +} diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 60b8e793c9..249432c344 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -65,6 +65,7 @@ #include "stats.h" #include "mem.h" #include "page-pipe.h" +#include "cow-dump.h" #include "posix-timer.h" #include "vdso.h" #include "vma.h" @@ -1710,56 +1711,81 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) mdc.stat = &pps_buf; mdc.parent_ie = parent_ie; - ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); - if (ret) - goto err_cure; + if (!opts.cow_dump) { + /* Normal dump - dump all pages */ + ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + } else { + /* COW dump mode: split VMAs by size */ + ret = cow_dump_init(item, &vmas, parasite_ctl); + if (ret) { + pr_err("Failed to initialize COW dump for large VMAs\n"); + goto err_cure; + } + + /* Start background thread to monitor page faults */ + ret = cow_start_monitor_thread(); + if (ret) { + pr_err("Failed to start COW monitor thread\n"); + goto err_cure; + } + } + + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = parasite_dump_sigacts_seized(parasite_ctl, item); if (ret) { pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid); goto err_cure; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = parasite_dump_itimers_seized(parasite_ctl, item); if (ret) { pr_err("Can't dump itimers (pid: %d)\n", pid); goto err_cure; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = 
parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item); if (ret) { pr_err("Can't dump posix timers (pid: %d)\n", pid); goto err_cure; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset, &misc); if (ret) { pr_err("Dump core (pid: %d) failed with %d\n", pid, ret); goto err_cure; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = dump_task_cgroup(parasite_ctl, item); if (ret) { pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret); goto err_cure; } + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); + + + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = compel_stop_daemon(parasite_ctl); + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); goto err_cure; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = dump_task_threads(parasite_ctl, item); if (ret) { pr_err("Can't dump threads\n"); goto err_cure; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); /* * On failure local map will be cured in cr_dump_finish() * for lazy pages. 
*/ + if (opts.lazy_pages) ret = compel_cure_remote(parasite_ctl); else @@ -1768,19 +1794,20 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) pr_err("Can't cure (pid: %d) from parasite\n", pid); goto err; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset); if (ret) { pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret); goto err; } - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); ret = dump_task_fs(pid, &misc, cr_imgset); if (ret) { pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret); goto err; } - + + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); exit_code = 0; err: close_cr_imgset(&cr_imgset); @@ -2043,7 +2070,7 @@ static int cr_lazy_mem_dump(void) static int cr_dump_finish(int ret) { int post_dump_ret = 0; - + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); if (disconnect_from_page_server()) ret = -1; @@ -2099,8 +2126,8 @@ static int cr_dump_finish(int ret) delete_link_remaps(); clean_cr_time_mounts(); } - - if (!ret && opts.lazy_pages) + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); + if (!ret && opts.lazy_pages) ret = cr_lazy_mem_dump(); if (arch_set_thread_regs(root_item, true) < 0) @@ -2110,6 +2137,18 @@ static int cr_dump_finish(int ret) pstree_switch_state(root_item, (ret || post_dump_ret) ? 
TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); + + if (!ret && opts.cow_dump) { + pr_info("file = %s, line = %d\n", __FILE__, __LINE__); + + /* Stop the monitor thread before final dump */ + if (cow_stop_monitor_thread()) { + pr_err("Failed to stop COW monitor thread\n"); + ret = -1; + } + + } + free_pstree(root_item); seccomp_free_entries(); free_file_locks(); diff --git a/criu/include/cow-dump.h b/criu/include/cow-dump.h new file mode 100644 index 0000000000..cf5ce5ad79 --- /dev/null +++ b/criu/include/cow-dump.h @@ -0,0 +1,62 @@ +#ifndef __CR_COW_DUMP_H_ +#define __CR_COW_DUMP_H_ + +#include "types.h" + +struct pstree_item; +struct vm_area_list; +struct parasite_ctl; + + +/** + * cow_dump_init - Initialize COW dump for a process + * @item: Process tree item to set up COW tracking for + * @vma_area_list: List of VMAs to track + * @ctl: Parasite control structure for RPC + * + * Sets up userfaultfd with write-protection for all writable memory + * regions of the target process. The registration is performed via + * parasite RPC to ensure it runs in the target process's context. + * + * Returns: 0 on success, -1 on error + */ +extern int cow_dump_init(struct pstree_item *item, struct vm_area_list *vma_area_list, struct parasite_ctl *ctl); + +/** + * cow_dump_fini - Clean up COW dump resources + * + * Releases all resources allocated for COW tracking. + */ +extern void cow_dump_fini(void); + +/** + * cow_check_kernel_support - Check if kernel supports COW dump + * + * Verifies that the kernel has necessary userfaultfd write-protect + * features (requires Linux 5.7+). + * + * Returns: true if supported, false otherwise + */ +extern bool cow_check_kernel_support(void); + +/** + * cow_start_monitor_thread - Start background thread to monitor page faults + * + * Creates a pthread that continuously monitors the userfaultfd for + * write faults and handles them immediately, preventing the target + * process from blocking during the dump phase. 
+ * + * Returns: 0 on success, -1 on error + */ +extern int cow_start_monitor_thread(void); + +/** + * cow_stop_monitor_thread - Stop the monitoring thread + * + * Signals the monitor thread to stop and waits for it to complete. + * + * Returns: 0 on success, -1 on error + */ +extern int cow_stop_monitor_thread(void); + +#endif /* __CR_COW_DUMP_H_ */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 8c5707b415..98063b9291 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -193,6 +193,7 @@ struct cr_options { unsigned int empty_ns; int tcp_skip_in_flight; bool lazy_pages; + bool cow_dump; char *work_dir; int network_lock_method; int skip_file_rwx_check; diff --git a/criu/include/linux/userfaultfd.h b/criu/include/linux/userfaultfd.h index cfcf48571d..2eb9f327a8 100644 --- a/criu/include/linux/userfaultfd.h +++ b/criu/include/linux/userfaultfd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * include/linux/userfaultfd.h * @@ -11,6 +12,10 @@ #include +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + /* * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In @@ -18,12 +23,44 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES \ - (UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE | UFFD_FEATURE_EVENT_UNMAP | \ - UFFD_FEATURE_MISSING_HUGETLBFS | UFFD_FEATURE_MISSING_SHMEM) -#define UFFD_API_IOCTLS ((__u64)1 << _UFFDIO_REGISTER | (__u64)1 << _UFFDIO_UNREGISTER | (__u64)1 << _UFFDIO_API) -#define UFFD_API_RANGE_IOCTLS ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY | (__u64)1 << _UFFDIO_ZEROPAGE) -#define UFFD_API_RANGE_IOCTLS_BASIC ((__u64)1 << _UFFDIO_WAKE | (__u64)1 << _UFFDIO_COPY) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) +#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) /* * Valid ioctl command number range with this API is from 0x00 to @@ -33,56 +70,77 @@ * which ioctl the running kernel implements through the ioctl 
command * bitmask written by the UFFDIO_API. */ -#define _UFFDIO_REGISTER (0x00) -#define _UFFDIO_UNREGISTER (0x01) -#define _UFFDIO_WAKE (0x02) -#define _UFFDIO_COPY (0x03) -#define _UFFDIO_ZEROPAGE (0x04) -#define _UFFDIO_API (0x3F) +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) +#define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) +#define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ -#define UFFDIO 0xAA -#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, struct uffdio_api) -#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, struct uffdio_register) -#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, struct uffdio_range) -#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, struct uffdio_range) -#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, struct uffdio_copy) -#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, struct uffdio_zeropage) +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) +#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ + struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) /* read() structure */ struct uffd_msg { - __u8 event; + __u8 event; - __u8 reserved1; - __u16 reserved2; - __u32 reserved3; 
+ __u8 reserved1; + __u16 reserved2; + __u32 reserved3; union { struct { - __u64 flags; - __u64 address; + __u64 flags; + __u64 address; + union { + __u32 ptid; + } feat; } pagefault; struct { - __u32 ufd; + __u32 ufd; } fork; struct { - __u64 from; - __u64 to; - __u64 len; + __u64 from; + __u64 to; + __u64 len; } remap; struct { - __u64 start; - __u64 end; + __u64 start; + __u64 end; } remove; struct { /* unused reserved fields */ - __u64 reserved1; - __u64 reserved2; - __u64 reserved3; + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; } reserved; } arg; } __packed; @@ -90,15 +148,16 @@ struct uffd_msg { /* * Start at 0x12 and not at 0 to be more strict against bugs. */ -#define UFFD_EVENT_PAGEFAULT 0x12 -#define UFFD_EVENT_FORK 0x13 -#define UFFD_EVENT_REMAP 0x14 -#define UFFD_EVENT_REMOVE 0x15 -#define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ -#define UFFD_PAGEFAULT_FLAG_WRITE (1 << 0) /* If this was a write fault */ -#define UFFD_PAGEFAULT_FLAG_WP (1 << 1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -136,14 +195,59 @@ struct uffdio_api { * UFFD_FEATURE_MISSING_SHMEM works the same as * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem * (i.e. tmpfs and other shmem based APIs). + * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. 
+ * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. + * + * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd + * write-protection mode will always apply to unpopulated pages + * (i.e. empty ptes). This will be the default behavior for shmem + * & hugetlbfs, so this flag only affects anonymous memory behavior + * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. 
*/ -#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1 << 0) -#define UFFD_FEATURE_EVENT_FORK (1 << 1) -#define UFFD_FEATURE_EVENT_REMAP (1 << 2) -#define UFFD_FEATURE_EVENT_REMOVE (1 << 3) -#define UFFD_FEATURE_MISSING_HUGETLBFS (1 << 4) -#define UFFD_FEATURE_MISSING_SHMEM (1 << 5) -#define UFFD_FEATURE_EVENT_UNMAP (1 << 6) +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) +#define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) __u64 features; __u64 ioctls; @@ -156,8 +260,9 @@ struct uffdio_range { struct uffdio_register { struct uffdio_range range; -#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1 << 0) -#define UFFDIO_REGISTER_MODE_WP ((__u64)1 << 1) +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) __u64 mode; /* @@ -171,13 +276,14 @@ struct uffdio_copy { __u64 dst; __u64 src; __u64 len; +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) /* - * There will be a wrprotection flag later that allows to map - * pages wrprotected on the fly. And such a flag will be - * available if the wrprotection ioctl are implemented for the - * range according to the uffdio_register.ioctls. + * UFFDIO_COPY_MODE_WP will map the page write protected on + * the fly. 
UFFDIO_COPY_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. */ -#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1 << 0) +#define UFFDIO_COPY_MODE_WP ((__u64)1<<1) __u64 mode; /* @@ -189,7 +295,7 @@ struct uffdio_copy { struct uffdio_zeropage { struct uffdio_range range; -#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1 << 0) +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) __u64 mode; /* @@ -199,4 +305,82 @@ struct uffdio_zeropage { __s64 zeropage; }; -#endif /* _LINUX_USERFAULTFD_H */ +struct uffdio_writeprotect { + struct uffdio_range range; +/* + * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range, + * unset the flag to undo protection of a range which was previously + * write protected. + * + * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up + * any wait thread after the operation succeeds. + * + * NOTE: Write protecting a region (WP=1) is unrelated to page faults, + * therefore DONTWAKE flag is meaningless with WP=1. Removing write + * protection (WP=0) in response to a page fault wakes the faulting + * task unless DONTWAKE is set. + */ +#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) +#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) + __u64 mode; +}; + +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. 
+ */ + __s64 mapped; +}; + +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. + */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + +#endif /* _LINUX_USERFAULTFD_H */ \ No newline at end of file diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 1763577111..76c88ff23c 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -37,6 +37,7 @@ enum { PARASITE_CMD_CHECK_VDSO_MARK, PARASITE_CMD_CHECK_AIOS, PARASITE_CMD_DUMP_CGROUP, + PARASITE_CMD_COW_DUMP_INIT, PARASITE_CMD_MAX, }; @@ -254,6 +255,28 @@ struct parasite_dump_cgroup_args { char thread_cgrp[32]; }; +/* + * COW dump initialization arguments + * VMAs are stored after this structure, similar to parasite_dump_pages_args + * Failed VMA indices stored after VMAs + */ +struct parasite_cow_dump_args { + unsigned int nr_vmas; + unsigned long total_pages; /* Output: total pages registered */ + unsigned int nr_failed_vmas; /* Output: number of VMAs that couldn't be registered */ + int ret; /* Output: return code */ +}; + +static inline struct parasite_vma_entry *cow_dump_vmas(struct parasite_cow_dump_args *a) +{ + return (struct parasite_vma_entry *)(a + 1); +} + +static inline unsigned int 
*cow_dump_failed_indices(struct parasite_cow_dump_args *a) +{ + return (unsigned int *)(cow_dump_vmas(a) + a->nr_vmas); +} + #endif /* !__ASSEMBLY__ */ #endif /* __CR_PARASITE_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 9e8740c070..3418debb10 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -290,13 +290,15 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis struct parasite_dump_pages_args *args; struct parasite_vma_entry *p_vma; struct vma_area *vma; - + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); args = compel_parasite_args_s(ctl, dump_pages_args_size(vma_area_list)); - +pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); p_vma = pargs_vmas(args); + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); args->nr_vmas = 0; list_for_each_entry(vma, &vma_area_list->h, list) { + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); if (!vma_area_is_private(vma, kdat.task_size)) continue; /* @@ -317,10 +319,12 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis p_vma->start = vma->e->start; p_vma->len = vma_area_len(vma); p_vma->prot = vma->e->prot; + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); args->nr_vmas++; p_vma++; } + pr_info("parasite_dump_pages_seized file = %s, line = %d args->nr_vmas=%u\n", __FILE__, __LINE__,args->nr_vmas); return args; } @@ -329,6 +333,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa { struct page_pipe_buf *ppb; int ret = 0; + pr_info("drain_pages file = %s, line = %d\n", __FILE__, __LINE__); debug_show_page_pipe(pp); @@ -338,15 +343,20 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa args->nr_pages = ppb->pages_in; pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); - + 
pr_info("drain_pages file = %s, line = %d\n", __FILE__, __LINE__); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); + pr_info("drain_pages file = %s, line = %d ret=%d\n", __FILE__, __LINE__, ret); if (ret < 0) return -1; + pr_info("drain_pages file = %s, line = %d\n", __FILE__, __LINE__); + ret = compel_util_send_fd(ctl, ppb->p[1]); + pr_info("drain_pages file = %s, line = %d ret=%d\n", __FILE__, __LINE__, ret); if (ret) return -1; ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl); + pr_info("drain_pages file = %s, line = %d ret=%d\n", __FILE__, __LINE__, ret); if (ret < 0) return -1; @@ -543,16 +553,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit timing_start(TIME_MEMDUMP); - pr_debug(" Private vmas %lu/%lu pages\n", vma_area_list->nr_priv_pages_longest, vma_area_list->nr_priv_pages); + pr_info(" Private vmas %lu/%lu pages\n", vma_area_list->nr_priv_pages_longest, vma_area_list->nr_priv_pages); /* * Step 0 -- prepare */ - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); pmc_size = max(vma_area_list->nr_priv_pages_longest, vma_area_list->nr_shared_pages_longest); + pr_info("__parasite_dump_pages_seized file = %s, line = %d pmc_size=%lu\n", __FILE__, __LINE__,pmc_size); + if (pmc_init(&pmc, item->pid->real, &vma_area_list->h, pmc_size * PAGE_SIZE)) return -1; - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); if (!(mdc->pre_dump || mdc->lazy)) /* * Chunk mode pushes pages portion by portion. This mode @@ -563,7 +575,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit pp = create_page_pipe(vma_area_list->nr_priv_pages, mdc->lazy ? 
NULL : pargs_iovs(args), cpp_flags); if (!pp) goto out; - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); if (!mdc->pre_dump) { /* * Regular dump -- create xfer object and send pages to it @@ -583,13 +595,13 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit if (ret) xfer.parent = NULL + 1; } - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); if (xfer.parent) { possible_pid_reuse = detect_pid_reuse(item, mdc->stat, mdc->parent_ie); if (possible_pid_reuse == -1) goto out_xfer; } - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); /* * Step 1 -- generate the pagemap */ @@ -607,9 +619,10 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit if (ret < 0) goto out_xfer; } - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); if (mdc->lazy) memcpy(pargs_iovs(args), pp->iovs, sizeof(struct iovec) * pp->nr_iovs); + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); /* * Faking drain_pages for pre-dump here. 
Actual drain_pages for pre-dump @@ -621,14 +634,14 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit ret = 0; else ret = drain_pages(pp, ctl, args); - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); if (!ret && !mdc->pre_dump) ret = xfer_pages(pp, &xfer); if (ret) goto out_xfer; timing_stop(TIME_MEMDUMP); - + pr_info("__parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); /* * Step 4 -- clean up */ @@ -656,9 +669,9 @@ int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vm { int ret; struct parasite_dump_pages_args *pargs; - + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); pargs = prep_dump_pages_args(ctl, vma_area_list, mdc->pre_dump); - + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); /* * Add PROT_READ protection for all VMAs we're about to * dump if they don't have one. Otherwise we'll not be @@ -698,15 +711,19 @@ int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vm * 9. 
syscall fails to copy * data from M */ - - if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); + + if ((pargs->nr_vmas != 0) &&(!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE)) { pargs->add_prot = PROT_READ; + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); + ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); if (ret) { pr_err("Can't dump unprotect vmas with parasite\n"); return ret; } } + pr_info("parasite_dump_pages_seized file = %s, line = %d\n", __FILE__, __LINE__); if (fault_injected(FI_DUMP_PAGES)) { pr_err("fault: Dump VMA pages failure!\n"); @@ -719,8 +736,7 @@ int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vm /* Parasite will unprotect VMAs after fail in fini() */ return ret; } - - if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + if ((pargs->nr_vmas != 0) &&(!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE)) { pargs->add_prot = 0; if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { pr_err("Can't rollback unprotected vmas with parasite\n"); diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index c966e9e62c..2b45cdfd47 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "linux/rseq.h" @@ -49,6 +50,7 @@ static int mprotect_vmas(struct parasite_dump_pages_args *args) { struct parasite_vma_entry *vmas, *vma; int ret = 0, i; + pr_info("mprotect_vmas file = %s, line = %d\n", __FILE__, __LINE__); vmas = pargs_vmas(args); for (i = 0; i < args->nr_vmas; i++) { @@ -74,18 +76,19 @@ static int dump_pages(struct parasite_dump_pages_args *args) struct iovec *iovs; int off, nr_segs; unsigned long spliced_bytes = 0; - + pr_err("dump_pages file = %s, line = %d\n", __FILE__, __LINE__); tsock = parasite_get_rpc_sock(); p = recv_fd(tsock); if (p < 0) return -1; - + pr_info("dump_pages file = %s, 
line = %d\n", __FILE__, __LINE__);
 	iovs = pargs_iovs(args);
 	off = 0;
 	nr_segs = args->nr_segs;
 	if (nr_segs > UIO_MAXIOV)
 		nr_segs = UIO_MAXIOV;
 	while (1) {
+		pr_info("dump_pages file = %s, line = %d\n", __FILE__, __LINE__);
 		ret = sys_vmsplice(p, &iovs[args->off + off], nr_segs, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
 		if (ret < 0) {
 			sys_close(p);
@@ -854,6 +857,144 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args)
 	return 0;
 }
 
+/*
+ * Set up userfaultfd write-protect tracking for a COW dump.
+ *
+ * Creates a userfaultfd, registers every writable VMA from the array that
+ * follows @args in UFFDIO_REGISTER_MODE_WP mode, write-protects it and
+ * finally sends the descriptor to CRIU over the RPC socket.
+ *
+ * Non-writable VMAs and VMAs that cannot be registered are left untracked;
+ * their indices are stored in the failed-index array for CRIU to dump.
+ *
+ * Outputs in @args: nr_failed_vmas, total_pages (pages write-protected)
+ * and ret (0 on success, -1 on failure).
+ * Returns 0 on success and -1 on failure, in which case the userfaultfd
+ * (if it was created) is closed again.
+ */
+static int parasite_cow_dump_init(struct parasite_cow_dump_args *args)
+{
+	struct parasite_vma_entry *vmas = cow_dump_vmas(args);
+	unsigned int *failed_indices = cow_dump_failed_indices(args);
+	unsigned long threshold_pages = 25000; /* 25K pages ~= 100MB */
+	unsigned long total_pages = 0;
+	struct uffdio_api api;
+	int uffd, ret, i;
+
+	pr_info("COW dump init: registering %d VMAs\n", args->nr_vmas);
+
+	/* Give every output a defined value on all exit paths */
+	args->nr_failed_vmas = 0;
+	args->total_pages = 0;
+	args->ret = -1;
+
+	/* Create userfaultfd in target process context */
+	uffd = sys_userfaultfd(O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0) {
+		pr_err("Failed to create userfaultfd: %d\n", uffd);
+		return -1;
+	}
+
+	/* API handshake; no optional features are requested */
+	memset(&api, 0, sizeof(api));
+	api.api = UFFD_API;
+	ret = sys_ioctl(uffd, UFFDIO_API, (unsigned long)&api);
+	if (ret < 0) {
+		/* sys_ioctl() returns -errno, log the positive code */
+		pr_err("Failed to initialize userfaultfd API: %d uffd=%d\n", -ret, uffd);
+		goto err;
+	}
+
+	pr_info("UFFD created with features: 0x%llx\n", (unsigned long long)api.features);
+
+	/* Register each VMA with write-protection */
+	for (i = 0; i < args->nr_vmas; i++) {
+		struct parasite_vma_entry *vma = vmas + i;
+		unsigned long addr = vma->start;
+		unsigned long len = vma->len;
+		struct uffdio_writeprotect wp;
+		struct uffdio_register reg;
+
+		pr_info("Registering VMA %d: %lx-%lx prot=%x len=%lu\n",
+			i, addr, addr + len, vma->prot, len);
+
+		/*
+		 * NOTE(review): this used to log "Skipping small VMA" without
+		 * skipping anything. Small VMAs are still tracked; confirm the
+		 * intended policy before turning this into a real skip.
+		 */
+		if ((len / PAGE_SIZE) < threshold_pages)
+			pr_info("Small VMA (still tracked): %lx-%lx len=%lu\n", addr, addr + len, len);
+
+		/* Skip non-writable VMAs */
+		if (!(vma->prot & PROT_WRITE)) {
+			pr_info("Skipping non-writable VMA: %lx-%lx len=%lu\n", addr, addr + len, len);
+
+			/* Mark for later dump by CRIU */
+			failed_indices[args->nr_failed_vmas++] = i;
+			continue;
+		}
+
+		/* Register VMA for write-protect tracking */
+		reg.range.start = addr;
+		reg.range.len = len;
+		reg.mode = UFFDIO_REGISTER_MODE_WP;
+		ret = sys_ioctl(uffd, UFFDIO_REGISTER, (unsigned long)&reg);
+		if (ret) {
+			/* Errors come back as -errno, so test against -EINVAL */
+			if (ret == -EINVAL)
+				pr_warn("Cannot WP-register VMA %lx-%lx len=%lu (unsupported), marking for later dump\n",
+					addr, addr + len, len);
+			else
+				pr_err("Failed to register VMA %lx-%lx: ret=%d len=%lu\n",
+				       addr, addr + len, ret, len);
+
+			/* Either way the VMA stays untracked and CRIU dumps it */
+			failed_indices[args->nr_failed_vmas++] = i;
+			pr_info("Marked VMA index %d for later dump (%u failed VMAs total)\n",
+				i, args->nr_failed_vmas);
+			continue;
+		}
+
+		/* Apply write-protection */
+		wp.range.start = addr;
+		wp.range.len = len;
+		wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+		ret = sys_ioctl(uffd, UFFDIO_WRITEPROTECT, (unsigned long)&wp);
+		if (ret) {
+			pr_err("Failed to write-protect VMA %lx-%lx: ret=%d\n",
+			       addr, addr + len, ret);
+			goto err;
+		}
+
+		total_pages += len / PAGE_SIZE;
+		pr_info("Successfully registered and WP'd VMA: %lx-%lx (%lu pages)\n",
+			addr, addr + len, len / PAGE_SIZE);
+	}
+
+	pr_info("COW dump init complete: %lu total pages\n", total_pages);
+
+	/* Send userfaultfd back to CRIU before setting return status */
+	ret = send_fd(parasite_get_rpc_sock(), NULL, 0, uffd);
+	if (ret) {
+		pr_err("Failed to send userfaultfd back to CRIU: %d\n", ret);
+		goto err;
+	}
+
+	pr_info("Sent uffd=%d back to CRIU\n", uffd);
+
+	/* Set success status after fd is sent */
+	args->total_pages = total_pages;
+	args->ret = 0;
+
+	/* Don't close uffd - it will remain open for the process */
+	return 0;
+
+err:
+	sys_close(uffd);
+	return -1;
+}
+
 void parasite_cleanup(void)
 {
 	if (mprotect_args) {
@@ -906,6 +1047,9 @@ int parasite_daemon_cmd(int cmd, void *args)
 	case PARASITE_CMD_DUMP_CGROUP:
 		ret = parasite_dump_cgroup(args);
 		break;
+	case PARASITE_CMD_COW_DUMP_INIT:
+		ret = parasite_cow_dump_init(args);
+		break;
 	default:
 		pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd);
 		ret = -1;