SKELETONKEY/tools/refresh-verifications.py

#!/usr/bin/env python3
"""
tools/refresh-verifications.py — read docs/VERIFICATIONS.jsonl,
generate core/verifications.c with a deduped, sorted lookup table.

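Dedup key: (module, vm_box, host_kernel). expect_detect is deliberately
NOT part of the key, so re-verifying a target with a corrected
expectation replaces the earlier record instead of adding a second one.
On collision, the LATEST verified_at wins (so re-runs update rather
than accumulate). Records are then sorted by module name, then vm_box,
then host_kernel, so the output is stable and review-friendly.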

Records with no module name are dropped silently. Records with
status != "match" are kept so MISMATCH histories stay visible in
--module-info (but don't earn the ✓ verified badge).

Usage:
    tools/refresh-verifications.py            # regenerate core/verifications.c
    tools/refresh-verifications.py --check    # exit 1 if regenerating would change anything
"""

import argparse
import json
import sys
from pathlib import Path

# Directory holding this script; the repository root is its parent.
_SCRIPT_DIR = Path(__file__).resolve().parent
REPO = _SCRIPT_DIR.parent
# Input: verification records, one JSON object per line.
JSONL = REPO.joinpath("docs", "VERIFICATIONS.jsonl")
# Output: the generated C lookup table.
OUT_C = REPO.joinpath("core", "verifications.c")


def load_records(path=None):
    """Read verification records from a JSON Lines file.

    Blank lines and lines starting with ``#`` are ignored. A line that is
    not valid JSON, or that parses to something other than a JSON object,
    is reported on stderr and skipped. Objects whose ``module`` value is
    missing or empty are dropped silently.

    Args:
        path: File to read. Defaults to the module-level ``JSONL`` path.

    Returns:
        A list of dicts in file order; empty if the file does not exist.
    """
    source = Path(JSONL if path is None else path)
    if not source.exists():
        return []
    records = []
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    for raw in source.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[!] skipping bad JSONL line: {e}", file=sys.stderr)
            continue
        # Valid JSON that is not an object (list, string, number) has no
        # fields to read; skip it rather than crash on .get().
        if not isinstance(record, dict):
            print(f"[!] skipping non-object JSONL line: {line[:60]}",
                  file=sys.stderr)
            continue
        if record.get("module"):
            records.append(record)
    return records


def dedup_latest(records):
    """Keep only the latest record per (module, vm_box, host_kernel).

    NB: expect_detect is intentionally NOT part of the dedup key. If we
    re-verify the same target with a corrected expectation, the new
    record supersedes the old one entirely (the old MISMATCH was a stale
    target-yaml entry, not a separate test scenario).

    "Latest" is decided by comparing verified_at values as strings; on a
    tie the record seen first is kept. A field that is absent or JSON null
    counts as the empty string when comparing and sorting, so incomplete
    records sort first instead of raising TypeError/KeyError.

    Args:
        records: Iterable of record dicts.

    Returns:
        A new list with one record per key, sorted by
        (module, vm_box, host_kernel).
    """
    def text(record, field):
        # .get(field, "") is not enough: an explicit null yields None,
        # which cannot be ordered against str.
        value = record.get(field)
        return "" if value is None else value

    by_key = {}
    for r in records:
        k = (r.get("module"), r.get("vm_box"), r.get("host_kernel"))
        prev = by_key.get(k)
        if prev is None or text(r, "verified_at") > text(prev, "verified_at"):
            by_key[k] = r
    return sorted(by_key.values(),
                  key=lambda r: (text(r, "module"), text(r, "vm_box"),
                                 text(r, "host_kernel")))


def date_only(iso_ts: str) -> str:
    """Return the part of a timestamp before its first "T".

    Example: 2026-05-23T19:26:02Z -> 2026-05-23. A falsy argument (empty
    string or None) gives the empty string, and a value containing no "T"
    comes back unchanged.
    """
    date_part, _sep, _time_part = (iso_ts or "").partition("T")
    return date_part


# Characters that cannot appear verbatim inside a C string literal, mapped
# to their escape sequences. Control characters without a short escape use
# three-digit octal: an octal escape ends after at most three digits, so a
# digit that follows in the data cannot be absorbed into it (unlike \x).
_C_ESCAPES = {
    **{code: f"\\{code:03o}" for code in (*range(0x20), 0x7F)},
    ord("\n"): "\\n",
    ord("\r"): "\\r",
    ord("\t"): "\\t",
    ord("\\"): "\\\\",
    ord('"'): '\\"',
}


def cstr(s):
    """Return *s* as a double-quoted C string literal.

    Backslash, double quote, and ASCII control characters are escaped so a
    value containing e.g. a newline still yields compilable C. All other
    characters, including non-ASCII ones, are emitted unchanged.

    Args:
        s: The text to quote, or None.

    Returns:
        The quoted literal; ``""`` (an empty C string) for None or "".
    """
    if s is None or s == "":
        return '""'
    return '"' + s.translate(_C_ESCAPES) + '"'


def render_c(records) -> str:
    """Render the full text of the generated C source file.

    The output is a header comment, the includes, one designated
    initializer per record in the ``verifications[]`` table, and two
    lookup helpers. Records are emitted in the order given.

    The generated ``verifications_module_has_match`` reads ``n``
    consecutive entries starting at the first one whose module matches, so
    all records of a module must be adjacent in *records* (sorting by
    module name guarantees this).

    Args:
        records: Iterable of record dicts; missing fields render as ``""``.

    Returns:
        The complete file contents, ending with a newline.
    """
    # Struct members in emission order; names are padded to the longest
    # one (13 characters) so the "=" signs line up.
    fields = (
        "module",
        "verified_at",
        "host_kernel",
        "host_distro",
        "vm_box",
        "expect_detect",
        "actual_detect",
        "status",
    )
    lines = [
        "/*",
        " * SKELETONKEY — verification records table",
        " *",
        " * AUTO-GENERATED by tools/refresh-verifications.py from",
        " * docs/VERIFICATIONS.jsonl. Do not hand-edit; rerun the script.",
        " *",
        " * Source: tools/verify-vm/verify.sh appends one JSON record per",
        " * run; this generator dedupes to (module, vm_box, host_kernel)",
        " * and keeps the latest by verified_at.",
        " */",
        "",
        '#include "verifications.h"',
        "",
        "#include <stddef.h>",
        "#include <string.h>",
        "#include <stdbool.h>",
        "",
        "const struct verification_record verifications[] = {",
    ]
    for r in records:
        lines.append("    {")
        for name in fields:
            value = r.get(name)
            if name == "verified_at":
                # Only the date is stored in the table, not the time of day.
                value = date_only(value)
            lines.append(f"        .{name:<13} = {cstr(value)},")
        lines.append("    },")
    lines += [
        "};",
        "",
        "const size_t verifications_count =",
        "    sizeof(verifications) / sizeof(verifications[0]);",
        "",
        "const struct verification_record *",
        "verifications_for_module(const char *module, size_t *count_out)",
        "{",
        "    if (count_out) *count_out = 0;",
        "    if (!module) return NULL;",
        "    const struct verification_record *first = NULL;",
        "    size_t n = 0;",
        "    for (size_t i = 0; i < verifications_count; i++) {",
        "        if (strcmp(verifications[i].module, module) == 0) {",
        "            if (first == NULL) first = &verifications[i];",
        "            n++;",
        "        }",
        "    }",
        "    if (count_out) *count_out = n;",
        "    return first;",
        "}",
        "",
        "bool verifications_module_has_match(const char *module)",
        "{",
        "    size_t n = 0;",
        "    const struct verification_record *r = verifications_for_module(module, &n);",
        "    for (size_t i = 0; i < n; i++)",
        "        if (r[i].status && strcmp(r[i].status, \"match\") == 0)",
        "            return true;",
        "    return false;",
        "}",
        "",
    ]
    return "\n".join(lines)


def main() -> int:
    """Regenerate core/verifications.c, or with --check verify it is current.

    Returns:
        The process exit status: 0 when the file was written, or when
        --check found it up to date; 1 when --check found that the file on
        disk differs from what would be generated.
    """
    # Line 1 of the module docstring is its first line of text (the
    # docstring opens with a newline). __doc__ is None under `python -OO`,
    # so fall back to no description instead of crashing.
    doc_lines = (__doc__ or "").splitlines()
    ap = argparse.ArgumentParser(
        description=doc_lines[1] if len(doc_lines) > 1 else None)
    ap.add_argument("--check", action="store_true",
                    help="diff against committed core/verifications.c; exit 1 on drift")
    args = ap.parse_args()

    records = dedup_latest(load_records())
    text = render_c(records)

    # The generated text contains non-ASCII characters, so the encoding is
    # pinned to UTF-8: with the locale default, the bytes written (and the
    # --check verdict) would differ from one machine to the next.
    if args.check:
        existing = OUT_C.read_text(encoding="utf-8") if OUT_C.exists() else ""
        if existing == text:
            print(f"[+] core/verifications.c is current ({len(records)} record(s))",
                  file=sys.stderr)
            return 0
        print("[!] core/verifications.c drifted — rerun "
              "tools/refresh-verifications.py", file=sys.stderr)
        return 1

    OUT_C.write_text(text, encoding="utf-8")
    print(f"[+] wrote {OUT_C.relative_to(REPO)} ({len(records)} record(s))",
          file=sys.stderr)
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())