SKELETONKEY/tools/refresh-kernel-ranges.py

#!/usr/bin/env python3
"""
tools/refresh-kernel-ranges.py — Detect drift between each module's
kernel_patched_from table and Debian's security-tracker data.

The repo's no-fabrication rule (CVES.md) means every kernel_range
threshold has to come from a real, citeable source. Debian's
security tracker is the most reliable per-CVE backport list — it's
machine-readable and updated continuously by the Debian security
team. This script:

  1. Fetches https://security-tracker.debian.org/tracker/data/json
     (cached at /tmp/skeletonkey-debian-tracker.json, 12h TTL).
  2. Scans every modules/*/skeletonkey_modules.c for
     `kernel_patched_from <name>[] = { {M, m, p}, ... };` arrays and
     their corresponding `.cve = "CVE-..."` entry.
  3. For each module, compares the table against Debian's tracked
     fixed-versions for that CVE.
  4. Reports:
       missing branch       — Debian has a fix at X.Y.Z; our table
                              has no X.Y entry. The module's detect()
                              would say VULNERABLE on a Debian host
                              that's actually patched.
       too-tight threshold  — Our X.Y.Z is HIGHER than Debian's fix
                              version; our module would call a
                              fixed host vulnerable. False-positive.
       info (more permissive) — Our threshold is LOWER than
                              Debian's; we accept earlier kernels
                              as patched. Could be intentional or
                              could mean we have stale data.

Usage:
    tools/refresh-kernel-ranges.py            # human report
    tools/refresh-kernel-ranges.py --json     # machine-readable
    tools/refresh-kernel-ranges.py --patch    # propose C-source edits
    tools/refresh-kernel-ranges.py --refresh  # force re-fetch
"""

from __future__ import annotations

import json
import os
import re
import sys
import time
import urllib.request

# Local file where the downloaded tracker JSON is kept between runs.
CACHE = "/tmp/skeletonkey-debian-tracker.json"
# Endpoint that serves the Debian security-tracker dataset as JSON.
TRACKER_URL = "https://security-tracker.debian.org/tracker/data/json"
# Maximum age of the cache file, in seconds (12 hours), before it is
# considered stale and re-downloaded.
CACHE_TTL_SEC = 12 * 3600


# ── tracker fetch ────────────────────────────────────────────────────

def fetch_tracker(force_refresh: bool = False) -> dict:
    """Return the parsed Debian security-tracker JSON.

    A cache file at CACHE that is younger than CACHE_TTL_SEC is reused
    unless `force_refresh` is true. A cache file that cannot be read or
    parsed is treated as a miss (with a warning on stderr) rather than
    aborting, so a truncated download cannot wedge the tool until the
    TTL expires.

    On a miss the data is downloaded from TRACKER_URL, validated as
    JSON, and only then written to the cache. Failing to write the
    cache is reported on stderr but is not fatal.

    Raises:
        OSError (incl. urllib.error.URLError): the download failed.
        ValueError (incl. json.JSONDecodeError): the server response
            is not valid JSON.
    """
    if not force_refresh:
        try:
            age = time.time() - os.stat(CACHE).st_mtime
        except OSError:
            age = None  # no cache file yet, or it cannot be stat'ed
        if age is not None and age < CACHE_TTL_SEC:
            try:
                # Binary mode: json detects the UTF encoding itself, so the
                # result does not depend on the process locale.
                with open(CACHE, "rb") as f:
                    cached = json.load(f)
            except (OSError, ValueError) as exc:
                print(f"[!] ignoring unusable cache ({CACHE}): {exc}",
                      file=sys.stderr)
            else:
                print(f"[*] using cached tracker ({CACHE}, age {int(age)}s)",
                      file=sys.stderr)
                return cached

    print(f"[*] fetching {TRACKER_URL} ...", file=sys.stderr)
    req = urllib.request.Request(
        TRACKER_URL,
        headers={"User-Agent": "skeletonkey/refresh-kernel-ranges"},
    )
    with urllib.request.urlopen(req, timeout=120) as r:
        data = r.read()

    # Parse before caching so an invalid response is never persisted.
    tracker = json.loads(data)

    # Write to a sibling temp file and rename it into place, so readers
    # never observe a half-written cache.
    tmp = f"{CACHE}.{os.getpid()}.tmp"
    try:
        os.makedirs(os.path.dirname(CACHE), exist_ok=True)
        # O_EXCL refuses to write through a pre-existing file or symlink
        # at the temp path.
        fd = os.open(tmp, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o666)
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        os.replace(tmp, CACHE)
    except OSError as exc:
        print(f"[!] could not update cache ({CACHE}): {exc}", file=sys.stderr)
        try:
            os.unlink(tmp)
        except OSError:
            pass  # best-effort cleanup; the temp file may never have existed
    else:
        print(f"[*] tracker cached: {len(data) // 1024} KB", file=sys.stderr)
    return tracker


# ── module source parsing ────────────────────────────────────────────

# Some modules have multiple .cve entries (e.g. dirty_frag_esp +
# dirty_frag_esp6 share the same CVE). Pull the first one.
# Group 1 is the CVE identifier inside the quotes.
RE_CVE = re.compile(r'\.cve\s*=\s*"(CVE-\d{4}-\d{4,7})"')
# Matches a `kernel_patched_from <name>[] = { ... };` initializer.
# Group 1 is the array name; group 2 is the initializer text between
# the opening brace and the closing brace that is followed by `;`
# (inner `{...}` entries are included in group 2).
RE_TABLE = re.compile(
    r'kernel_patched_from\s+(\w+)\s*\[\]\s*=\s*\{([^}]+(?:\}[^}]*)*?)\}\s*;',
    re.MULTILINE,
)
# Matches one `{major, minor, patch}` entry of decimal integers;
# groups 1-3 are the three numbers.
RE_ENTRY = re.compile(r'\{\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\}')


def find_modules(repo_root: str):
    """Yield one dict per module that declares both a CVE and a table.

    Scans `<repo_root>/modules/*/skeletonkey_modules.c` in sorted
    directory order. A module is skipped when that file does not exist,
    or when no `.cve = "CVE-..."` entry, no `kernel_patched_from` array,
    or no `{M, m, p}` entry can be found in it. Only the first CVE and
    the first table in each file are used.

    Yielded keys:
        name        module directory name
        src         path of the C source file
        cve         CVE identifier string
        table       list of (major, minor, patch) int tuples, in
                    source order
        table_name  identifier of the C array
        table_span  (start, end) offsets of the array body within the
                    decoded source text, for --patch mode that wants to
                    edit the source. These are character offsets; they
                    equal byte offsets only for pure-ASCII files.

    Raises:
        OSError: `<repo_root>/modules` cannot be listed, or a source
            file cannot be read.
    """
    mods_dir = os.path.join(repo_root, "modules")
    for d in sorted(os.listdir(mods_dir)):
        src = os.path.join(mods_dir, d, "skeletonkey_modules.c")
        # isfile (not exists): a directory with that name must not reach open().
        if not os.path.isfile(src):
            continue
        # Decode explicitly instead of using the locale default, and
        # replace undecodable bytes so one stray byte in a comment
        # cannot abort the whole scan. The patterns only match ASCII.
        with open(src, encoding="utf-8", errors="replace") as f:
            text = f.read()
        cve_m = RE_CVE.search(text)
        if not cve_m:
            continue
        tab_m = RE_TABLE.search(text)
        if not tab_m:
            continue
        entries = [
            tuple(int(x) for x in e)
            for e in RE_ENTRY.findall(tab_m.group(2))
        ]
        if not entries:
            continue
        yield {
            "name": d,
            "src": src,
            "cve": cve_m.group(1),
            "table": entries,
            "table_name": tab_m.group(1),
            "table_span": (tab_m.start(2), tab_m.end(2)),
        }


# ── Debian tracker lookup ────────────────────────────────────────────

# Debian release names we care about (in age order, oldest first).
# The tracker has more (e.g. ELTS) but those are usually too old to
# inform mainline-or-near-mainline backport thresholds.
# Tracker entries for any release not named here are ignored.
DEBIAN_RELEASES = ["bullseye", "bookworm", "trixie", "forky", "sid"]


def parse_upstream_version(deb_ver: str) -> tuple[int, int, int] | None:
    """Extract the upstream kernel version from a Debian package version.

    '5.10.218-1' maps to (5, 10, 218). A two-component version such as
    '6.1' (or '6.1~rc3-1') gets a patch level of 0. Components beyond
    the third are ignored.

    Returns None when `deb_ver` is empty, has fewer than two dotted
    components, or contains a non-integer component.
    """
    if not deb_ver:
        return None

    # Keep only the text before the first '-', '+' or '~': that drops the
    # Debian revision, backport suffix and pre-release marker.
    upstream = re.split(r'[-+~]', deb_ver, maxsplit=1)[0]
    fields = upstream.split(".")

    if len(fields) == 2:
        # X.Y with no patch level: treat the patch level as 0.
        fields = fields + ["0"]
    elif len(fields) < 2:
        return None

    try:
        major, minor, patch = (int(x) for x in fields[:3])
    except ValueError:
        return None
    return (major, minor, patch)


def debian_fixed_for(tracker: dict, cve: str) -> dict[str, tuple[int, int, int]]:
    """Collect Debian's fixed upstream version per release for one CVE.

    Looks the CVE up under the "linux" and then the "linux-grsec"
    package. A release contributes an entry only when it is listed in
    DEBIAN_RELEASES, its status is "resolved", and its fixed_version
    parses to an upstream version. When both packages provide a value
    for the same release, the later package's value replaces the
    earlier one.

    Returns a {release_name: (major, minor, patch)} dict, empty when
    the tracker has nothing usable for this CVE.
    """
    fixed_by_release: dict[str, tuple[int, int, int]] = {}
    for package in ("linux", "linux-grsec"):
        per_cve = tracker.get(package, {})
        if cve in per_cve:
            releases = per_cve[cve].get("releases", {})
            for release, info in releases.items():
                if release in DEBIAN_RELEASES and info.get("status") == "resolved":
                    version = parse_upstream_version(info.get("fixed_version"))
                    if version:
                        fixed_by_release[release] = version
    return fixed_by_release


# ── compare + report ─────────────────────────────────────────────────

def branch_of(v: tuple[int, int, int]) -> tuple[int, int]:
    """Return the (major, minor) branch key of a version triple."""
    major, minor = v[0], v[1]
    return major, minor


def compare(table: list[tuple[int, int, int]],
            debian: dict[str, tuple[int, int, int]]) -> list[dict]:
    """Diff a module's threshold table against Debian's fixed versions.

    Both sides are keyed by (major, minor) branch. For each branch on
    which Debian has a fix, Debian's lowest fixed version is compared
    with our entry for that branch:

      MISSING    we have no entry for the branch (carries `suggest_add`)
      TOO_TIGHT  our patch level is higher than Debian's
                 (carries `suggest_replace`)
      INFO       our patch level is lower than Debian's

    Equal patch levels produce no finding. Branches that only appear in
    our table are not reported. If the table has several entries for
    one branch, the last one is used.

    Returns a list of finding dicts with `severity` and `message` keys.
    """
    def dotted(ver) -> str:
        return f"{ver[0]}.{ver[1]}.{ver[2]}"

    ours: dict[tuple[int, int], tuple[int, int, int]] = {}
    for entry in table:
        ours[branch_of(entry)] = entry

    # Several Debian releases may ship kernels from the same branch.
    grouped: dict[tuple[int, int], list[tuple[str, tuple[int, int, int]]]] = {}
    for release, fixed in debian.items():
        grouped.setdefault(branch_of(fixed), []).append((release, fixed))

    results: list[dict] = []
    for branch, fixes in grouped.items():
        # Lowest fixed version first: it is the most permissive threshold.
        fixes.sort(key=lambda pair: pair[1])
        first_rel, first_fix = fixes[0]
        series = f"{branch[0]}.{branch[1]}"
        everything = ", ".join(f"{rel}: {dotted(ver)}" for rel, ver in fixes)

        if branch not in ours:
            results.append({
                "severity": "MISSING",
                "message": (
                    f"Debian has fix on the {series} branch "
                    f"(earliest: {dotted(first_fix)}, "
                    f"all: {everything}), but our table has no {series} entry"
                ),
                "suggest_add": list(first_fix),
            })
            continue

        current = ours[branch]
        # Same branch on both sides, so only the patch level can differ.
        if current[2] > first_fix[2]:
            results.append({
                "severity": "TOO_TIGHT",
                "message": (
                    f"Our {dotted(current)} threshold is later than "
                    f"Debian's earliest fix on the {series} branch "
                    f"({dotted(first_fix)}, from "
                    f"{first_rel}). Hosts at {series}.{first_fix[2]} "
                    "are patched per Debian but our detect() would report "
                    "VULNERABLE."
                ),
                "suggest_replace": list(first_fix),
            })
        elif current[2] < first_fix[2]:
            # We accept earlier kernels as patched than Debian does.
            # Often fine, but worth a human look.
            results.append({
                "severity": "INFO",
                "message": (
                    f"Our {dotted(current)} threshold is earlier "
                    f"than Debian's {dotted(first_fix)} "
                    f"({first_rel}). We're more permissive — verify this "
                    "is intentional (e.g. we tracked a different distro's "
                    "earlier backport)."
                ),
            })

    return results


# ── main ─────────────────────────────────────────────────────────────

def render_text(reports: list[dict]) -> None:
    """Print the human-readable drift report.

    Per-module lines go to stdout; the closing summary lines go to
    stderr. Each module is shown in one of three states:

      [+]  no findings, and Debian has resolved entries for the CVE:
           the table is current.
      [?]  no findings because Debian has no resolved entry for the
           CVE: nothing was compared, so the table is NOT verified.
      [!]  one or more findings, listed under the module header.

    Each report dict must provide the keys `name`, `cve`, `table`,
    `debian` and `findings`.
    """
    tags = {"MISSING": "+", "TOO_TIGHT": "✗", "INFO": "i"}
    drifted = 0
    unverified = 0
    for r in reports:
        if not r["findings"]:
            if r["debian"]:
                print(f"[+] {r['name']:32s} ({r['cve']}) — table is current "
                      f"({len(r['table'])} entries)")
            else:
                # Without Debian data there was nothing to compare against;
                # calling the table "current" would be a false assurance.
                unverified += 1
                print(f"[?] {r['name']:32s} ({r['cve']}) — no resolved Debian "
                      f"entries for this CVE; table not verified "
                      f"({len(r['table'])} entries)")
            continue
        drifted += 1
        print(f"[!] {r['name']} ({r['cve']})")
        print("    table: " + ", ".join(
            f"{M}.{m}.{p}" for (M, m, p) in r["table"]))
        if r["debian"]:
            print("    debian: " + ", ".join(
                f"{rel}={M}.{m}.{p}"
                for rel, (M, m, p) in sorted(r["debian"].items())))
        else:
            print("    debian: (no resolved entries for this CVE)")
        for f in r["findings"]:
            # Unknown severities are still shown rather than raising.
            tag = tags.get(f["severity"], "?")
            print(f"      [{tag}] {f['message']}")
        print()
    total = len(reports)
    print(f"=== {drifted}/{total} module(s) drifted ===", file=sys.stderr)
    if unverified:
        print(f"=== {unverified}/{total} module(s) could not be verified "
              "(no Debian data) ===", file=sys.stderr)


def render_json(reports: list[dict]) -> None:
    """Print all module reports to stdout as one indented JSON document.

    The reports are wrapped in a top-level {"modules": [...]} object.
    """
    payload = {"modules": reports}
    # Anything json cannot encode natively is converted with list().
    text = json.dumps(payload, indent=2, default=list)
    print(text)


def render_patch(reports: list[dict]) -> None:
    """Print a diff-like sketch of proposed table edits to stdout.

    Only modules with MISSING or TOO_TIGHT findings are shown. For each,
    the proposed table (one entry per branch, sorted) is listed with
    new entries marked `+`, followed by the current entries it would
    replace marked `-`. Nothing is written to the source files; the
    output is for an operator to review.
    """
    def row(entry) -> str:
        return f"{{{entry[0]:>2}, {entry[1]:>2}, {entry[2]:>3}}},"

    for report in reports:
        actionable = [
            finding for finding in report["findings"]
            if finding["severity"] in ("MISSING", "TOO_TIGHT")
        ]
        if not actionable:
            continue

        print(f"--- {report['src']}")
        print(f"+++ {report['src']} (proposed)")
        print(f"@@ kernel_patched_from {report['table_name']}[] @@")

        # Start from the current table, then overlay each suggestion on
        # its branch (adding the branch if it was absent).
        proposed = {branch_of(cur): list(cur) for cur in report["table"]}
        for finding in actionable:
            for key in ("suggest_add", "suggest_replace"):
                if key in finding:
                    version = list(finding[key])
                    proposed[branch_of(version)] = version
                    break

        existing = {tuple(cur) for cur in report["table"]}
        for entry in sorted(proposed.values()):
            marker = "     " if tuple(entry) in existing else " +   "
            print(marker + row(entry))

        # Current entries that did not survive unchanged.
        for cur in report["table"]:
            if proposed.get(branch_of(cur)) != list(cur):
                print(" -   " + row(cur))
        print()


def main() -> int:
    """Run the drift check and return the process exit code.

    Flags are read straight from sys.argv: `--json` and `--patch`
    select the output format (`--json` wins if both are given; the
    default is the text report) and `--refresh` forces a re-download
    of the tracker data.

    Returns 1 when the tracker JSON has no "linux" package or when any
    module has a MISSING or TOO_TIGHT finding; otherwise 0. INFO
    findings do not affect the exit code.
    """
    argv = sys.argv
    want_json = "--json" in argv
    want_patch = "--patch" in argv
    force = "--refresh" in argv

    # This script lives in <repo>/tools/, so the repo root is two levels up.
    here = os.path.abspath(__file__)
    repo_root = os.path.dirname(os.path.dirname(here))
    tracker = fetch_tracker(force_refresh=force)

    if "linux" not in tracker:
        print("[-] tracker JSON has no 'linux' package — schema changed?",
              file=sys.stderr)
        return 1

    reports: list[dict] = []
    for mod in find_modules(repo_root):
        debian = debian_fixed_for(tracker, mod["cve"])
        # Tuples become lists so every renderer sees plain JSON-style data.
        reports.append({
            "name": mod["name"],
            "src": mod["src"],
            "cve": mod["cve"],
            "table_name": mod["table_name"],
            "table": [list(entry) for entry in mod["table"]],
            "debian": {rel: list(ver) for rel, ver in debian.items()},
            "findings": compare(mod["table"], debian),
        })

    if want_json:
        renderer = render_json
    elif want_patch:
        renderer = render_patch
    else:
        renderer = render_text
    renderer(reports)

    # Only MISSING / TOO_TIGHT findings make the run fail.
    needs_action = any(
        finding["severity"] in ("MISSING", "TOO_TIGHT")
        for report in reports
        for finding in report["findings"]
    )
    return 1 if needs_action else 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())