feat(benchmark): add hook mode with baseline cache for pre-push checks

- Fix lint errors and LSP issues in benchmark.py
- Add --hook mode to compare against saved baseline
- Add --baseline flag to save results as baseline
- Add --threshold to configure degradation threshold (default 20%)
- Add benchmark step to pre-push hook in hk.pkl
- Update AGENTS.md with hk documentation links and benchmark runner docs
2026-03-15 22:54:51 -07:00
3 changed files with 348 additions and 74 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -60,9 +60,52 @@ hk init --mise
 mise run pre-commit
 ```
 **IMPORTANT**: Always review the hk documentation before modifying `hk.pkl`:
 - [hk Configuration Guide](https://hk.jdx.dev/configuration.html)
 - [hk Hooks Reference](https://hk.jdx.dev/hooks.html)
 - [hk Builtins](https://hk.jdx.dev/builtins.html)
 The project uses hk configured in `hk.pkl`:
 - **pre-commit**: runs ruff-format and ruff (with auto-fix)
- **pre-push**: runs ruff check
+- **pre-push**: runs ruff check + benchmark hook
 ## Benchmark Runner
 Run performance benchmarks:
 ```bash
 mise run benchmark           # Run all benchmarks (text output)
 mise run benchmark-json     # Run benchmarks (JSON output)
 mise run benchmark-report   # Run benchmarks (Markdown report)
 ```
 ### Benchmark Commands
 ```bash
 # Run benchmarks
 uv run python -m engine.benchmark
 # Run with specific displays/effects
 uv run python -m engine.benchmark --displays null,terminal --effects fade,glitch
 # Save baseline for hook comparisons
 uv run python -m engine.benchmark --baseline
 # Run in hook mode (compares against baseline)
 uv run python -m engine.benchmark --hook
 # Hook mode with custom threshold (default: 20% degradation)
 uv run python -m engine.benchmark --hook --threshold 0.3
 # Custom baseline location
 uv run python -m engine.benchmark --hook --cache /path/to/cache.json
 ```
 ### Hook Mode
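 The `--hook` mode compares current benchmark results against a saved baseline. It exits with code 1 if FPS degrades by more than the threshold (default 0.2, i.e. 20%) for any benchmark, or if no baseline exists yet (create one with `--baseline` first); otherwise it exits with code 0. This is useful for preventing performance regressions in feature branches.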
 The pre-push hook runs benchmark in hook mode to catch performance regressions before pushing.
 ## Workflow Rules
--- a/engine/benchmark.py
+++ b/engine/benchmark.py
@@ -6,6 +6,9 @@ Usage:
    python -m engine.benchmark
    python -m engine.benchmark --output report.md
    python -m engine.benchmark --displays terminal,websocket --effects glitch,fade
    python -m engine.benchmark --format json --output benchmark.json
 Headless mode (default): suppress all terminal output during benchmarks.
 """
 import argparse
@@ -13,6 +16,9 @@ import json
 import sys
 import time
 from dataclasses import dataclass, field
 from datetime import datetime
 from io import StringIO
 from pathlib import Path
 from typing import Any
 import numpy as np
@@ -57,21 +63,34 @@ def get_sample_buffer(width: int = 80, height: int = 24) -> list[str]:
 def benchmark_display(
    display_class, buffer: list[str], iterations: int = 100
-) -> BenchmarkResult:
+) -> BenchmarkResult | None:
    """Benchmark a single display."""
-    display = display_class()
+    old_stdout = sys.stdout
-    display.init(80, 24)
+    old_stderr = sys.stderr
-    times = []
+    try:
-    chars = sum(len(line) for line in buffer)
+        sys.stdout = StringIO()
        sys.stderr = StringIO()
-    for _ in range(iterations):
+        display = display_class()
-        t0 = time.perf_counter()
+        display.init(80, 24)
        display.show(buffer)
        elapsed = (time.perf_counter() - t0) * 1000
        times.append(elapsed)
-    display.cleanup()
+        times = []
        chars = sum(len(line) for line in buffer)
        for _ in range(iterations):
            t0 = time.perf_counter()
            display.show(buffer)
            elapsed = (time.perf_counter() - t0) * 1000
            times.append(elapsed)
        display.cleanup()
    except Exception:
        return None
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
    times_arr = np.array(times)
@@ -81,36 +100,62 @@ def benchmark_display(
        effect=None,
        iterations=iterations,
        total_time_ms=sum(times),
-        avg_time_ms=np.mean(times_arr),
+        avg_time_ms=float(np.mean(times_arr)),
-        std_dev_ms=np.std(times_arr),
+        std_dev_ms=float(np.std(times_arr)),
-        min_ms=np.min(times_arr),
+        min_ms=float(np.min(times_arr)),
-        max_ms=np.max(times_arr),
+        max_ms=float(np.max(times_arr)),
-        fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
+        fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
        chars_processed=chars * iterations,
-        chars_per_sec=(chars * iterations) / (sum(times) / 1000)
+        chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
        if sum(times) > 0
-        else 0,
+        else 0.0,
    )
 def benchmark_effect_with_display(
    effect_class, display, buffer: list[str], iterations: int = 100
-) -> BenchmarkResult:
+) -> BenchmarkResult | None:
    """Benchmark an effect with a display."""
-    effect = effect_class()
+    old_stdout = sys.stdout
-    effect.configure(enabled=True, intensity=1.0)
+    old_stderr = sys.stderr
-    times = []
+    try:
-    chars = sum(len(line) for line in buffer)
+        from engine.effects.types import EffectConfig, EffectContext
-    for _ in range(iterations):
+        sys.stdout = StringIO()
-        processed = effect.process(buffer)
+        sys.stderr = StringIO()
        t0 = time.perf_counter()
        display.show(processed)
        elapsed = (time.perf_counter() - t0) * 1000
        times.append(elapsed)
-    display.cleanup()
+        effect = effect_class()
        effect.configure(EffectConfig(enabled=True, intensity=1.0))
        ctx = EffectContext(
            terminal_width=80,
            terminal_height=24,
            scroll_cam=0,
            ticker_height=0,
            mic_excess=0.0,
            grad_offset=0.0,
            frame_number=0,
            has_message=False,
        )
        times = []
        chars = sum(len(line) for line in buffer)
        for _ in range(iterations):
            processed = effect.process(buffer, ctx)
            t0 = time.perf_counter()
            display.show(processed)
            elapsed = (time.perf_counter() - t0) * 1000
            times.append(elapsed)
        display.cleanup()
    except Exception:
        return None
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
    times_arr = np.array(times)
@@ -120,15 +165,15 @@ def benchmark_effect_with_display(
        effect=effect_class.__name__,
        iterations=iterations,
        total_time_ms=sum(times),
-        avg_time_ms=np.mean(times_arr),
+        avg_time_ms=float(np.mean(times_arr)),
-        std_dev_ms=np.std(times_arr),
+        std_dev_ms=float(np.std(times_arr)),
-        min_ms=np.min(times_arr),
+        min_ms=float(np.min(times_arr)),
-        max_ms=np.max(times_arr),
+        max_ms=float(np.max(times_arr)),
-        fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
+        fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
        chars_processed=chars * iterations,
-        chars_per_sec=(chars * iterations) / (sum(times) / 1000)
+        chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
        if sum(times) > 0
-        else 0,
+        else 0.0,
    )
@@ -139,7 +184,6 @@ def get_available_displays():
        NullDisplay,
        TerminalDisplay,
    )
    from engine.display.backends.sixel import SixelDisplay
    DisplayRegistry.initialize()
@@ -156,6 +200,8 @@ def get_available_displays():
        pass
    try:
        from engine.display.backends.sixel import SixelDisplay
        displays.append(("sixel", SixelDisplay))
    except Exception:
        pass
@@ -166,17 +212,24 @@ def get_available_displays():
 def get_available_effects():
    """Get available effect classes."""
    try:
-        from engine.effects.registry import get_effect_registry
+        from engine.effects import get_registry
        try:
            from effects_plugins import discover_plugins
            discover_plugins()
        except Exception:
            pass
    except Exception:
        return []
    effects = []
-    registry = get_effect_registry()
+    registry = get_registry()
-    for name in registry.list_effects():
+    for name, effect in registry.list_all().items():
        effect = registry.get(name)
        if effect:
-            effects.append((name, effect))
+            effect_cls = type(effect)
            effects.append((name, effect_cls))
    return effects
@@ -185,7 +238,7 @@ def run_benchmarks(
    displays: list[tuple[str, Any]] | None = None,
    effects: list[tuple[str, Any]] | None = None,
    iterations: int = 100,
-    output_format: str = "text",
+    verbose: bool = False,
 ) -> BenchmarkReport:
    """Run all benchmarks and return report."""
    from datetime import datetime
@@ -199,35 +252,38 @@ def run_benchmarks(
    buffer = get_sample_buffer(80, 24)
    results = []
-    print(f"Running benchmarks ({iterations} iterations each)...")
+    if verbose:
-    print()
+        print(f"Running benchmarks ({iterations} iterations each)...")
    for name, display_class in displays:
-        print(f"Benchmarking display: {name}")
+        if verbose:
-        try:
+            print(f"Benchmarking display: {name}")
            result = benchmark_display(display_class, buffer, iterations)
            results.append(result)
            print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
        except Exception as e:
            print(f"  Error: {e}")
-    print()
+        result = benchmark_display(display_class, buffer, iterations)
        if result:
            results.append(result)
            if verbose:
                print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
    if verbose:
        print()
    for effect_name, effect_class in effects:
        for display_name, display_class in displays:
            if display_name == "websocket":
                continue
-            print(f"Benchmarking effect: {effect_name} with {display_name}")
+            if verbose:
-            try:
+                print(f"Benchmarking effect: {effect_name} with {display_name}")
-                display = display_class()
+
-                display.init(80, 24)
+            display = display_class()
-                result = benchmark_effect_with_display(
+            display.init(80, 24)
-                    effect_class, display, buffer, iterations
+            result = benchmark_effect_with_display(
-                )
+                effect_class, display, buffer, iterations
            )
            if result:
                results.append(result)
-                print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
+                if verbose:
-            except Exception as e:
+                    print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
                print(f"  Error: {e}")
    summary = generate_summary(results)
@@ -267,24 +323,132 @@ def generate_summary(results: list[BenchmarkResult]) -> dict[str, Any]:
    for display, res in by_display.items():
        fps_values = [r.fps for r in res]
        summary["by_display"][display] = {
-            "avg_fps": np.mean(fps_values),
+            "avg_fps": float(np.mean(fps_values)),
-            "min_fps": np.min(fps_values),
+            "min_fps": float(np.min(fps_values)),
-            "max_fps": np.max(fps_values),
+            "max_fps": float(np.max(fps_values)),
            "tests": len(res),
        }
    for effect, res in by_effect.items():
        fps_values = [r.fps for r in res]
        summary["by_effect"][effect] = {
-            "avg_fps": np.mean(fps_values),
+            "avg_fps": float(np.mean(fps_values)),
-            "min_fps": np.min(fps_values),
+            "min_fps": float(np.min(fps_values)),
-            "max_fps": np.max(fps_values),
+            "max_fps": float(np.max(fps_values)),
            "tests": len(res),
        }
    return summary
 DEFAULT_CACHE_PATH = Path.home() / ".mainline_benchmark_cache.json"
 def load_baseline(cache_path: Path | None = None) -> dict[str, Any] | None:
    """Load baseline benchmark results from cache."""
    path = cache_path or DEFAULT_CACHE_PATH
    if not path.exists():
        return None
    try:
        with open(path) as f:
            return json.load(f)
    except Exception:
        return None
 def save_baseline(
    results: list[BenchmarkResult],
    cache_path: Path | None = None,
 ) -> None:
    """Write the headline metrics of each result to the baseline cache.

    The file is a JSON object with a ``timestamp`` (ISO-formatted local
    time of the save) and a ``results`` mapping keyed by result name, each
    entry holding ``fps``, ``avg_time_ms`` and ``chars_per_sec``.

    Args:
        results: Benchmark results to record.
        cache_path: Destination file. Falls back to ``DEFAULT_CACHE_PATH``
            when omitted. Missing parent directories are created.
    """
    target = cache_path or DEFAULT_CACHE_PATH
    saved_at = datetime.now().isoformat()

    per_test = {}
    for res in results:
        per_test[res.name] = {
            "fps": res.fps,
            "avg_time_ms": res.avg_time_ms,
            "chars_per_sec": res.chars_per_sec,
        }

    payload = {"timestamp": saved_at, "results": per_test}

    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w") as out:
        json.dump(payload, out, indent=2)
 def compare_with_baseline(
    results: list[BenchmarkResult],
    baseline: dict[str, Any],
    threshold: float = 0.2,
    verbose: bool = True,
 ) -> tuple[bool, list[str]]:
    """Compare current results with a baseline.

    Args:
        results: Current benchmark results; each is matched to a baseline
            entry by its ``name``.
        baseline: Baseline mapping whose ``"results"`` key maps result names
            to entries carrying an ``"fps"`` value.
        threshold: Maximum tolerated fractional FPS drop (0.2 = 20%).
        verbose: When true, print a line for every result that stays within
            the threshold.

    Returns:
        ``(passed, messages)``. ``passed`` is true when no result degraded
        beyond ``threshold``. ``messages`` lists failures first, then
        warnings (results without a usable baseline entry).
    """
    baseline_results = baseline.get("results")
    if not isinstance(baseline_results, dict):
        # A hand-edited or truncated cache may lack a usable "results"
        # mapping; treat it as empty so every test is reported as new.
        baseline_results = {}

    failures: list[str] = []
    warnings: list[str] = []

    for r in results:
        if r.name not in baseline_results:
            warnings.append(f"New test: {r.name} (no baseline)")
            continue

        entry = baseline_results[r.name]
        base_fps = entry.get("fps") if isinstance(entry, dict) else None
        if isinstance(base_fps, bool) or not isinstance(base_fps, (int, float)):
            # Report a malformed entry rather than crash the hook.
            warnings.append(f"Invalid baseline entry: {r.name} (skipped)")
            continue

        if base_fps == 0:
            # A relative drop is undefined against a zero baseline.
            continue

        degradation = (base_fps - r.fps) / base_fps
        if degradation > threshold:
            failures.append(
                f"{r.name}: FPS degraded {degradation * 100:.1f}% "
                f"(baseline: {base_fps:.1f}, current: {r.fps:.1f})"
            )
        elif verbose:
            print(f"  {r.name}: {r.fps:.1f} FPS (baseline: {base_fps:.1f})")

    return not failures, failures + warnings
 def run_hook_mode(
    displays: list[tuple[str, Any]] | None = None,
    effects: list[tuple[str, Any]] | None = None,
    iterations: int = 20,
    threshold: float = 0.2,
    cache_path: Path | None = None,
    verbose: bool = False,
 ) -> int:
    """Run in hook mode: compare against the baseline and report.

    Args:
        displays: ``(name, class)`` pairs passed through to ``run_benchmarks``.
        effects: ``(name, class)`` pairs passed through to ``run_benchmarks``.
        iterations: Iterations per benchmark.
        threshold: Maximum tolerated fractional FPS drop (0.2 = 20%).
        cache_path: Baseline file location, passed to ``load_baseline``.
        verbose: Forwarded to the benchmark run and the comparison.

    Returns:
        Process exit code: 0 when the comparison passes, 1 when it fails or
        when no baseline could be loaded.
    """
    baseline = load_baseline(cache_path)
    if baseline is None:
        print("No baseline found. Run with --baseline to create one.")
        return 1

    report = run_benchmarks(displays, effects, iterations, verbose)

    passed, messages = compare_with_baseline(
        report.results, baseline, threshold, verbose
    )

    print("\n=== Benchmark Hook Results ===")
    if not passed:
        print("FAILED - Performance degradation detected:")
        for msg in messages:
            print(f"  - {msg}")
        return 1

    print("PASSED - No significant performance degradation")
    # On a passing run the only messages are warnings (e.g. tests with no
    # baseline entry); print them so they are not lost.
    for msg in messages:
        print(f"  - {msg}")
    return 0
 def format_report_text(report: BenchmarkReport) -> str:
    """Format report as human-readable text."""
    lines = [
@@ -391,9 +555,67 @@ def main():
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show progress during benchmarking",
    )
    parser.add_argument(
        "--hook",
        action="store_true",
        help="Run in hook mode: compare against baseline, exit 0 pass, 1 fail",
    )
    parser.add_argument(
        "--baseline",
        action="store_true",
        help="Save current results as baseline for future hook comparisons",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.2,
        help="Performance degradation threshold for hook mode (default: 0.2 = 20%%)",
    )
    parser.add_argument(
        "--cache",
        type=str,
        default=None,
        help="Path to baseline cache file (default: ~/.mainline_benchmark_cache.json)",
    )
    args = parser.parse_args()
    cache_path = Path(args.cache) if args.cache else DEFAULT_CACHE_PATH
    if args.hook:
        displays = None
        if args.displays:
            display_map = dict(get_available_displays())
            displays = [
                (name, display_map[name])
                for name in args.displays.split(",")
                if name in display_map
            ]
        effects = None
        if args.effects:
            effect_map = dict(get_available_effects())
            effects = [
                (name, effect_map[name])
                for name in args.effects.split(",")
                if name in effect_map
            ]
        return run_hook_mode(
            displays,
            effects,
            iterations=args.iterations,
            threshold=args.threshold,
            cache_path=cache_path,
            verbose=args.verbose,
        )
    displays = None
    if args.displays:
        display_map = dict(get_available_displays())
@@ -412,7 +634,12 @@ def main():
            if name in effect_map
        ]
-    report = run_benchmarks(displays, effects, args.iterations, args.format)
+    report = run_benchmarks(displays, effects, args.iterations, args.verbose)
    if args.baseline:
        save_baseline(report.results, cache_path)
        print(f"Baseline saved to {cache_path}")
        return 0
    if args.format == "json":
        output = format_report_json(report)
@@ -422,10 +649,11 @@ def main():
    if args.output:
        with open(args.output, "w") as f:
            f.write(output)
        print(f"Report written to {args.output}")
    else:
        print(output)
    return 0
 if __name__ == "__main__":
-    main()
+    sys.exit(main())
--- a/hk.pkl
+++ b/hk.pkl
@@ -22,6 +22,9 @@ hooks {
                prefix = "uv run"
                check = "ruff check engine/ tests/"
            }
            ["benchmark"] {
                check = "uv run python -m engine.benchmark --hook --displays null --iterations 20"
            }
        }
    }
 }