feat(benchmark): add hook mode with baseline cache for pre-push checks

- Fix lint errors and LSP issues in benchmark.py
- Add --hook mode to compare against a saved baseline
- Add --baseline flag to save results as the baseline
- Add --threshold to configure the degradation threshold (default 20%)
- Add benchmark step to the pre-push hook in hk.pkl
- Update AGENTS.md with hk documentation links and benchmark runner docs
2026-03-15 22:41:13 -07:00
parent 829c4ab63d
commit 7eb3fca935
3 changed files with 348 additions and 74 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -60,9 +60,52 @@ hk init --mise
 mise run pre-commit
 ```

+**IMPORTANT**: Always review the hk documentation before modifying `hk.pkl`:
+- [hk Configuration Guide](https://hk.jdx.dev/configuration.html)
+- [hk Hooks Reference](https://hk.jdx.dev/hooks.html)
+- [hk Builtins](https://hk.jdx.dev/builtins.html)
+
 The project uses hk configured in `hk.pkl`:
 - **pre-commit**: runs ruff-format and ruff (with auto-fix)
- **pre-push**: runs ruff check
+- **pre-push**: runs ruff check + benchmark hook
+
+## Benchmark Runner
+
+Run performance benchmarks:
+
+```bash
+mise run benchmark           # Run all benchmarks (text output)
+mise run benchmark-json     # Run benchmarks (JSON output)
+mise run benchmark-report   # Run benchmarks (Markdown report)
+```
+
+### Benchmark Commands
+
+```bash
+# Run benchmarks
+uv run python -m engine.benchmark
+
+# Run with specific displays/effects
+uv run python -m engine.benchmark --displays null,terminal --effects fade,glitch
+
+# Save baseline for hook comparisons
+uv run python -m engine.benchmark --baseline
+
+# Run in hook mode (compares against baseline)
+uv run python -m engine.benchmark --hook
+
+# Hook mode with custom threshold (default: 20% degradation)
+uv run python -m engine.benchmark --hook --threshold 0.3
+
+# Custom baseline location
+uv run python -m engine.benchmark --hook --cache /path/to/cache.json
+```
+
+### Hook Mode
+
+The `--hook` mode compares current benchmarks against a saved baseline. If the FPS of any benchmark drops by more than the threshold (default 0.2, i.e. 20%), or if no baseline has been saved yet, it exits with code 1. This is useful for preventing performance regressions in feature branches.
+
+The pre-push hook runs the benchmark in hook mode to catch performance regressions before pushing; save a baseline first with `--baseline`, otherwise the hook fails.

 ## Workflow Rules

--- a/engine/benchmark.py
+++ b/engine/benchmark.py
@@ -6,6 +6,9 @@ Usage:
    python -m engine.benchmark
    python -m engine.benchmark --output report.md
    python -m engine.benchmark --displays terminal,websocket --effects glitch,fade
+    python -m engine.benchmark --format json --output benchmark.json
+
+Headless mode (default): terminal output is suppressed during benchmarks; pass --verbose to show progress.
 """

 import argparse
@@ -13,6 +16,9 @@ import json
 import sys
 import time
 from dataclasses import dataclass, field
+from datetime import datetime
+from io import StringIO
+from pathlib import Path
 from typing import Any

 import numpy as np
@@ -57,21 +63,34 @@ def get_sample_buffer(width: int = 80, height: int = 24) -> list[str]:

 def benchmark_display(
    display_class, buffer: list[str], iterations: int = 100
-) -> BenchmarkResult:
+) -> BenchmarkResult | None:
    """Benchmark a single display."""
-    display = display_class()
-    display.init(80, 24)
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr

-    times = []
-    chars = sum(len(line) for line in buffer)
+    try:
+        sys.stdout = StringIO()
+        sys.stderr = StringIO()

-    for _ in range(iterations):
-        t0 = time.perf_counter()
-        display.show(buffer)
-        elapsed = (time.perf_counter() - t0) * 1000
-        times.append(elapsed)
+        display = display_class()
+        display.init(80, 24)

-    display.cleanup()
+        times = []
+        chars = sum(len(line) for line in buffer)
+
+        for _ in range(iterations):
+            t0 = time.perf_counter()
+            display.show(buffer)
+            elapsed = (time.perf_counter() - t0) * 1000
+            times.append(elapsed)
+
+        display.cleanup()
+
+    except Exception:
+        return None
+    finally:
+        sys.stdout = old_stdout
+        sys.stderr = old_stderr

    times_arr = np.array(times)

@@ -81,36 +100,62 @@ def benchmark_display(
        effect=None,
        iterations=iterations,
        total_time_ms=sum(times),
-        avg_time_ms=np.mean(times_arr),
-        std_dev_ms=np.std(times_arr),
-        min_ms=np.min(times_arr),
-        max_ms=np.max(times_arr),
-        fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
+        avg_time_ms=float(np.mean(times_arr)),
+        std_dev_ms=float(np.std(times_arr)),
+        min_ms=float(np.min(times_arr)),
+        max_ms=float(np.max(times_arr)),
+        fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
        chars_processed=chars * iterations,
-        chars_per_sec=(chars * iterations) / (sum(times) / 1000)
+        chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
        if sum(times) > 0
-        else 0,
+        else 0.0,
    )


 def benchmark_effect_with_display(
    effect_class, display, buffer: list[str], iterations: int = 100
-) -> BenchmarkResult:
+) -> BenchmarkResult | None:
    """Benchmark an effect with a display."""
-    effect = effect_class()
-    effect.configure(enabled=True, intensity=1.0)
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr

-    times = []
-    chars = sum(len(line) for line in buffer)
+    try:
+        from engine.effects.types import EffectConfig, EffectContext

-    for _ in range(iterations):
-        processed = effect.process(buffer)
-        t0 = time.perf_counter()
-        display.show(processed)
-        elapsed = (time.perf_counter() - t0) * 1000
-        times.append(elapsed)
+        sys.stdout = StringIO()
+        sys.stderr = StringIO()

-    display.cleanup()
+        effect = effect_class()
+        effect.configure(EffectConfig(enabled=True, intensity=1.0))
+
+        ctx = EffectContext(
+            terminal_width=80,
+            terminal_height=24,
+            scroll_cam=0,
+            ticker_height=0,
+            mic_excess=0.0,
+            grad_offset=0.0,
+            frame_number=0,
+            has_message=False,
+        )
+
+        times = []
+        chars = sum(len(line) for line in buffer)
+
+        for _ in range(iterations):
+            processed = effect.process(buffer, ctx)
+            t0 = time.perf_counter()
+            display.show(processed)
+            elapsed = (time.perf_counter() - t0) * 1000
+            times.append(elapsed)
+
+        display.cleanup()
+
+    except Exception:
+        return None
+    finally:
+        sys.stdout = old_stdout
+        sys.stderr = old_stderr

    times_arr = np.array(times)

@@ -120,15 +165,15 @@ def benchmark_effect_with_display(
        effect=effect_class.__name__,
        iterations=iterations,
        total_time_ms=sum(times),
-        avg_time_ms=np.mean(times_arr),
-        std_dev_ms=np.std(times_arr),
-        min_ms=np.min(times_arr),
-        max_ms=np.max(times_arr),
-        fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
+        avg_time_ms=float(np.mean(times_arr)),
+        std_dev_ms=float(np.std(times_arr)),
+        min_ms=float(np.min(times_arr)),
+        max_ms=float(np.max(times_arr)),
+        fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
        chars_processed=chars * iterations,
-        chars_per_sec=(chars * iterations) / (sum(times) / 1000)
+        chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
        if sum(times) > 0
-        else 0,
+        else 0.0,
    )


@@ -139,7 +184,6 @@ def get_available_displays():
        NullDisplay,
        TerminalDisplay,
    )
-    from engine.display.backends.sixel import SixelDisplay

    DisplayRegistry.initialize()

@@ -156,6 +200,8 @@ def get_available_displays():
        pass

    try:
+        from engine.display.backends.sixel import SixelDisplay
+
        displays.append(("sixel", SixelDisplay))
    except Exception:
        pass
@@ -166,17 +212,24 @@ def get_available_displays():
 def get_available_effects():
    """Get available effect classes."""
    try:
-        from engine.effects.registry import get_effect_registry
+        from engine.effects import get_registry
+
+        try:
+            from effects_plugins import discover_plugins
+
+            discover_plugins()
+        except Exception:
+            pass
    except Exception:
        return []

    effects = []
-    registry = get_effect_registry()
+    registry = get_registry()

-    for name in registry.list_effects():
-        effect = registry.get(name)
+    for name, effect in registry.list_all().items():
        if effect:
-            effects.append((name, effect))
+            effect_cls = type(effect)
+            effects.append((name, effect_cls))

    return effects

@@ -185,7 +238,7 @@ def run_benchmarks(
    displays: list[tuple[str, Any]] | None = None,
    effects: list[tuple[str, Any]] | None = None,
    iterations: int = 100,
-    output_format: str = "text",
+    verbose: bool = False,
 ) -> BenchmarkReport:
    """Run all benchmarks and return report."""
    from datetime import datetime
@@ -199,35 +252,38 @@ def run_benchmarks(
    buffer = get_sample_buffer(80, 24)
    results = []

-    print(f"Running benchmarks ({iterations} iterations each)...")
-    print()
+    if verbose:
+        print(f"Running benchmarks ({iterations} iterations each)...")

    for name, display_class in displays:
-        print(f"Benchmarking display: {name}")
-        try:
-            result = benchmark_display(display_class, buffer, iterations)
-            results.append(result)
-            print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
-        except Exception as e:
-            print(f"  Error: {e}")
+        if verbose:
+            print(f"Benchmarking display: {name}")

-    print()
+        result = benchmark_display(display_class, buffer, iterations)
+        if result:
+            results.append(result)
+            if verbose:
+                print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
+
+    if verbose:
+        print()

    for effect_name, effect_class in effects:
        for display_name, display_class in displays:
            if display_name == "websocket":
                continue
-            print(f"Benchmarking effect: {effect_name} with {display_name}")
-            try:
-                display = display_class()
-                display.init(80, 24)
-                result = benchmark_effect_with_display(
-                    effect_class, display, buffer, iterations
-                )
+            if verbose:
+                print(f"Benchmarking effect: {effect_name} with {display_name}")
+
+            display = display_class()
+            display.init(80, 24)
+            result = benchmark_effect_with_display(
+                effect_class, display, buffer, iterations
+            )
+            if result:
                results.append(result)
-                print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
-            except Exception as e:
-                print(f"  Error: {e}")
+                if verbose:
+                    print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")

    summary = generate_summary(results)

@@ -267,24 +323,132 @@ def generate_summary(results: list[BenchmarkResult]) -> dict[str, Any]:
    for display, res in by_display.items():
        fps_values = [r.fps for r in res]
        summary["by_display"][display] = {
-            "avg_fps": np.mean(fps_values),
-            "min_fps": np.min(fps_values),
-            "max_fps": np.max(fps_values),
+            "avg_fps": float(np.mean(fps_values)),
+            "min_fps": float(np.min(fps_values)),
+            "max_fps": float(np.max(fps_values)),
            "tests": len(res),
        }

    for effect, res in by_effect.items():
        fps_values = [r.fps for r in res]
        summary["by_effect"][effect] = {
-            "avg_fps": np.mean(fps_values),
-            "min_fps": np.min(fps_values),
-            "max_fps": np.max(fps_values),
+            "avg_fps": float(np.mean(fps_values)),
+            "min_fps": float(np.min(fps_values)),
+            "max_fps": float(np.max(fps_values)),
            "tests": len(res),
        }

    return summary


+DEFAULT_CACHE_PATH = Path.home() / ".mainline_benchmark_cache.json"
+
+
+def load_baseline(cache_path: Path | None = None) -> dict[str, Any] | None:
+    """Load baseline benchmark results from cache."""
+    path = cache_path or DEFAULT_CACHE_PATH
+    if not path.exists():
+        return None
+    try:
+        with open(path) as f:
+            return json.load(f)
+    except Exception:
+        return None
+
+
+def save_baseline(
+    results: list[BenchmarkResult],
+    cache_path: Path | None = None,
+) -> None:
+    """Save benchmark results as baseline to cache."""
+    path = cache_path or DEFAULT_CACHE_PATH
+    baseline = {
+        "timestamp": datetime.now().isoformat(),
+        "results": {
+            r.name: {
+                "fps": r.fps,
+                "avg_time_ms": r.avg_time_ms,
+                "chars_per_sec": r.chars_per_sec,
+            }
+            for r in results
+        },
+    }
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w") as f:
+        json.dump(baseline, f, indent=2)
+
+
+def compare_with_baseline(
+    results: list[BenchmarkResult],
+    baseline: dict[str, Any],
+    threshold: float = 0.2,
+    verbose: bool = True,
+) -> tuple[bool, list[str]]:
+    """Compare current results with baseline. Returns (pass, messages)."""
+    baseline_results = baseline.get("results", {})
+    failures = []
+    warnings = []
+
+    for r in results:
+        if r.name not in baseline_results:
+            warnings.append(f"New test: {r.name} (no baseline)")
+            continue
+
+        b = baseline_results[r.name]
+        if b["fps"] == 0:
+            continue
+
+        degradation = (b["fps"] - r.fps) / b["fps"]
+        if degradation > threshold:
+            failures.append(
+                f"{r.name}: FPS degraded {degradation * 100:.1f}% "
+                f"(baseline: {b['fps']:.1f}, current: {r.fps:.1f})"
+            )
+        elif verbose:
+            print(f"  {r.name}: {r.fps:.1f} FPS (baseline: {b['fps']:.1f})")
+
+    passed = len(failures) == 0
+    messages = []
+    if failures:
+        messages.extend(failures)
+    if warnings:
+        messages.extend(warnings)
+
+    return passed, messages
+
+
+def run_hook_mode(
+    displays: list[tuple[str, Any]] | None = None,
+    effects: list[tuple[str, Any]] | None = None,
+    iterations: int = 20,
+    threshold: float = 0.2,
+    cache_path: Path | None = None,
+    verbose: bool = False,
+) -> int:
+    """Run in hook mode: compare against baseline, exit 0 on pass, 1 on fail."""
+    baseline = load_baseline(cache_path)
+
+    if baseline is None:
+        print("No baseline found. Run with --baseline to create one.")
+        return 1
+
+    report = run_benchmarks(displays, effects, iterations, verbose)
+
+    passed, messages = compare_with_baseline(
+        report.results, baseline, threshold, verbose
+    )
+
+    print("\n=== Benchmark Hook Results ===")
+    if passed:
+        print("PASSED - No significant performance degradation")
+        return 0
+    else:
+        print("FAILED - Performance degradation detected:")
+        for msg in messages:
+            print(f"  - {msg}")
+        return 1
+
+
 def format_report_text(report: BenchmarkReport) -> str:
    """Format report as human-readable text."""
    lines = [
@@ -391,9 +555,67 @@ def main():
        default="text",
        help="Output format (default: text)",
    )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Show progress during benchmarking",
+    )
+    parser.add_argument(
+        "--hook",
+        action="store_true",
+        help="Run in hook mode: compare against baseline, exit 0 pass, 1 fail",
+    )
+    parser.add_argument(
+        "--baseline",
+        action="store_true",
+        help="Save current results as baseline for future hook comparisons",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=0.2,
+        help="Performance degradation threshold for hook mode (default: 0.2 = 20%%)",
+    )
+    parser.add_argument(
+        "--cache",
+        type=str,
+        default=None,
+        help="Path to baseline cache file (default: ~/.mainline_benchmark_cache.json)",
+    )

    args = parser.parse_args()

+    cache_path = Path(args.cache) if args.cache else DEFAULT_CACHE_PATH
+
+    if args.hook:
+        displays = None
+        if args.displays:
+            display_map = dict(get_available_displays())
+            displays = [
+                (name, display_map[name])
+                for name in args.displays.split(",")
+                if name in display_map
+            ]
+
+        effects = None
+        if args.effects:
+            effect_map = dict(get_available_effects())
+            effects = [
+                (name, effect_map[name])
+                for name in args.effects.split(",")
+                if name in effect_map
+            ]
+
+        return run_hook_mode(
+            displays,
+            effects,
+            iterations=args.iterations,
+            threshold=args.threshold,
+            cache_path=cache_path,
+            verbose=args.verbose,
+        )
+
    displays = None
    if args.displays:
        display_map = dict(get_available_displays())
@@ -412,7 +634,12 @@ def main():
            if name in effect_map
        ]

-    report = run_benchmarks(displays, effects, args.iterations, args.format)
+    report = run_benchmarks(displays, effects, args.iterations, args.verbose)
+
+    if args.baseline:
+        save_baseline(report.results, cache_path)
+        print(f"Baseline saved to {cache_path}")
+        return 0

    if args.format == "json":
        output = format_report_json(report)
@@ -422,10 +649,11 @@ def main():
    if args.output:
        with open(args.output, "w") as f:
            f.write(output)
-        print(f"Report written to {args.output}")
    else:
        print(output)

+    return 0
+

 if __name__ == "__main__":
-    main()
+    sys.exit(main())
--- a/hk.pkl
+++ b/hk.pkl
@@ -22,6 +22,9 @@ hooks {
                prefix = "uv run"
                check = "ruff check engine/ tests/"
            }
+            ["benchmark"] {
+                check = "uv run python -m engine.benchmark --hook --displays null --iterations 20"
+            }
        }
    }
 }