diff --git a/AGENTS.md b/AGENTS.md index 92fc922..c567a3f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -60,9 +60,52 @@ hk init --mise mise run pre-commit ``` +**IMPORTANT**: Always review the hk documentation before modifying `hk.pkl`: +- [hk Configuration Guide](https://hk.jdx.dev/configuration.html) +- [hk Hooks Reference](https://hk.jdx.dev/hooks.html) +- [hk Builtins](https://hk.jdx.dev/builtins.html) + The project uses hk configured in `hk.pkl`: - **pre-commit**: runs ruff-format and ruff (with auto-fix) -- **pre-push**: runs ruff check +- **pre-push**: runs ruff check + benchmark hook + +## Benchmark Runner + +Run performance benchmarks: + +```bash +mise run benchmark # Run all benchmarks (text output) +mise run benchmark-json # Run benchmarks (JSON output) +mise run benchmark-report # Run benchmarks (Markdown report) +``` + +### Benchmark Commands + +```bash +# Run benchmarks +uv run python -m engine.benchmark + +# Run with specific displays/effects +uv run python -m engine.benchmark --displays null,terminal --effects fade,glitch + +# Save baseline for hook comparisons +uv run python -m engine.benchmark --baseline + +# Run in hook mode (compares against baseline) +uv run python -m engine.benchmark --hook + +# Hook mode with custom threshold (default: 20% degradation) +uv run python -m engine.benchmark --hook --threshold 0.3 + +# Custom baseline location +uv run python -m engine.benchmark --hook --cache /path/to/cache.json +``` + +### Hook Mode + +The `--hook` mode compares current benchmarks against a saved baseline. If performance degrades beyond the threshold (default 20%), it exits with code 1. It also exits with code 1 when no baseline has been saved yet, so create one with `--baseline` before relying on the hook. This is useful for preventing performance regressions in feature branches. + +The pre-push hook runs the benchmark in hook mode to catch performance regressions before pushing. 
## Workflow Rules diff --git a/engine/benchmark.py b/engine/benchmark.py index e4a3882..0ac2481 100644 --- a/engine/benchmark.py +++ b/engine/benchmark.py @@ -6,6 +6,9 @@ Usage: python -m engine.benchmark python -m engine.benchmark --output report.md python -m engine.benchmark --displays terminal,websocket --effects glitch,fade + python -m engine.benchmark --format json --output benchmark.json + +Headless mode (default): suppress all terminal output during benchmarks. """ import argparse @@ -13,6 +16,9 @@ import json import sys import time from dataclasses import dataclass, field +from datetime import datetime +from io import StringIO +from pathlib import Path from typing import Any import numpy as np @@ -57,21 +63,34 @@ def get_sample_buffer(width: int = 80, height: int = 24) -> list[str]: def benchmark_display( display_class, buffer: list[str], iterations: int = 100 -) -> BenchmarkResult: +) -> BenchmarkResult | None: """Benchmark a single display.""" - display = display_class() - display.init(80, 24) + old_stdout = sys.stdout + old_stderr = sys.stderr - times = [] - chars = sum(len(line) for line in buffer) + try: + sys.stdout = StringIO() + sys.stderr = StringIO() - for _ in range(iterations): - t0 = time.perf_counter() - display.show(buffer) - elapsed = (time.perf_counter() - t0) * 1000 - times.append(elapsed) + display = display_class() + display.init(80, 24) - display.cleanup() + times = [] + chars = sum(len(line) for line in buffer) + + for _ in range(iterations): + t0 = time.perf_counter() + display.show(buffer) + elapsed = (time.perf_counter() - t0) * 1000 + times.append(elapsed) + + display.cleanup() + + except Exception: + return None + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr times_arr = np.array(times) @@ -81,36 +100,49 @@ def benchmark_display( effect=None, iterations=iterations, total_time_ms=sum(times), - avg_time_ms=np.mean(times_arr), - std_dev_ms=np.std(times_arr), - min_ms=np.min(times_arr), - max_ms=np.max(times_arr), - 
fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0, + avg_time_ms=float(np.mean(times_arr)), + std_dev_ms=float(np.std(times_arr)), + min_ms=float(np.min(times_arr)), + max_ms=float(np.max(times_arr)), + fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0, chars_processed=chars * iterations, - chars_per_sec=(chars * iterations) / (sum(times) / 1000) + chars_per_sec=float((chars * iterations) / (sum(times) / 1000)) if sum(times) > 0 - else 0, + else 0.0, ) def benchmark_effect_with_display( effect_class, display, buffer: list[str], iterations: int = 100 -) -> BenchmarkResult: +) -> BenchmarkResult | None: """Benchmark an effect with a display.""" - effect = effect_class() - effect.configure(enabled=True, intensity=1.0) + old_stdout = sys.stdout + old_stderr = sys.stderr - times = [] - chars = sum(len(line) for line in buffer) + try: + sys.stdout = StringIO() + sys.stderr = StringIO() - for _ in range(iterations): - processed = effect.process(buffer) - t0 = time.perf_counter() - display.show(processed) - elapsed = (time.perf_counter() - t0) * 1000 - times.append(elapsed) + effect = effect_class() + effect.configure(enabled=True, intensity=1.0) - display.cleanup() + times = [] + chars = sum(len(line) for line in buffer) + + for _ in range(iterations): + processed = effect.process(buffer) + t0 = time.perf_counter() + display.show(processed) + elapsed = (time.perf_counter() - t0) * 1000 + times.append(elapsed) + + display.cleanup() + + except Exception: + return None + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr times_arr = np.array(times) @@ -120,15 +152,15 @@ def benchmark_effect_with_display( effect=effect_class.__name__, iterations=iterations, total_time_ms=sum(times), - avg_time_ms=np.mean(times_arr), - std_dev_ms=np.std(times_arr), - min_ms=np.min(times_arr), - max_ms=np.max(times_arr), - fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0, + avg_time_ms=float(np.mean(times_arr)), + 
std_dev_ms=float(np.std(times_arr)), + min_ms=float(np.min(times_arr)), + max_ms=float(np.max(times_arr)), + fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0, chars_processed=chars * iterations, - chars_per_sec=(chars * iterations) / (sum(times) / 1000) + chars_per_sec=float((chars * iterations) / (sum(times) / 1000)) if sum(times) > 0 - else 0, + else 0.0, ) @@ -139,7 +171,6 @@ def get_available_displays(): NullDisplay, TerminalDisplay, ) - from engine.display.backends.sixel import SixelDisplay DisplayRegistry.initialize() @@ -156,6 +187,8 @@ def get_available_displays(): pass try: + from engine.display.backends.sixel import SixelDisplay + displays.append(("sixel", SixelDisplay)) except Exception: pass @@ -166,15 +199,14 @@ def get_available_displays(): def get_available_effects(): """Get available effect classes.""" try: - from engine.effects.registry import get_effect_registry + from engine.effects import get_registry except Exception: return [] effects = [] - registry = get_effect_registry() + registry = get_registry() - for name in registry.list_effects(): - effect = registry.get(name) + for name, effect in registry.list_all().items(): if effect: effects.append((name, effect)) @@ -185,7 +217,7 @@ def run_benchmarks( displays: list[tuple[str, Any]] | None = None, effects: list[tuple[str, Any]] | None = None, iterations: int = 100, - output_format: str = "text", + verbose: bool = False, ) -> BenchmarkReport: """Run all benchmarks and return report.""" from datetime import datetime @@ -199,35 +231,38 @@ def run_benchmarks( buffer = get_sample_buffer(80, 24) results = [] - print(f"Running benchmarks ({iterations} iterations each)...") - print() + if verbose: + print(f"Running benchmarks ({iterations} iterations each)...") for name, display_class in displays: - print(f"Benchmarking display: {name}") - try: - result = benchmark_display(display_class, buffer, iterations) - results.append(result) - print(f" {result.fps:.1f} FPS, 
{result.avg_time_ms:.2f}ms avg") - except Exception as e: - print(f" Error: {e}") + if verbose: + print(f"Benchmarking display: {name}") - print() + result = benchmark_display(display_class, buffer, iterations) + if result: + results.append(result) + if verbose: + print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg") + + if verbose: + print() for effect_name, effect_class in effects: for display_name, display_class in displays: if display_name == "websocket": continue - print(f"Benchmarking effect: {effect_name} with {display_name}") - try: - display = display_class() - display.init(80, 24) - result = benchmark_effect_with_display( - effect_class, display, buffer, iterations - ) + if verbose: + print(f"Benchmarking effect: {effect_name} with {display_name}") + + display = display_class() + display.init(80, 24) + result = benchmark_effect_with_display( + effect_class, display, buffer, iterations + ) + if result: results.append(result) - print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg") - except Exception as e: - print(f" Error: {e}") + if verbose: + print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg") summary = generate_summary(results) @@ -267,24 +302,132 @@ def generate_summary(results: list[BenchmarkResult]) -> dict[str, Any]: for display, res in by_display.items(): fps_values = [r.fps for r in res] summary["by_display"][display] = { - "avg_fps": np.mean(fps_values), - "min_fps": np.min(fps_values), - "max_fps": np.max(fps_values), + "avg_fps": float(np.mean(fps_values)), + "min_fps": float(np.min(fps_values)), + "max_fps": float(np.max(fps_values)), "tests": len(res), } for effect, res in by_effect.items(): fps_values = [r.fps for r in res] summary["by_effect"][effect] = { - "avg_fps": np.mean(fps_values), - "min_fps": np.min(fps_values), - "max_fps": np.max(fps_values), + "avg_fps": float(np.mean(fps_values)), + "min_fps": float(np.min(fps_values)), + "max_fps": float(np.max(fps_values)), "tests": len(res), } return summary 
+DEFAULT_CACHE_PATH = Path.home() / ".mainline_benchmark_cache.json" + + +def load_baseline(cache_path: Path | None = None) -> dict[str, Any] | None: + """Load baseline benchmark results from cache.""" + path = cache_path or DEFAULT_CACHE_PATH + if not path.exists(): + return None + try: + with open(path) as f: + return json.load(f) + except Exception: + return None + + +def save_baseline( + results: list[BenchmarkResult], + cache_path: Path | None = None, +) -> None: + """Save benchmark results as baseline to cache.""" + path = cache_path or DEFAULT_CACHE_PATH + baseline = { + "timestamp": datetime.now().isoformat(), + "results": { + r.name: { + "fps": r.fps, + "avg_time_ms": r.avg_time_ms, + "chars_per_sec": r.chars_per_sec, + } + for r in results + }, + } + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + json.dump(baseline, f, indent=2) + + +def compare_with_baseline( + results: list[BenchmarkResult], + baseline: dict[str, Any], + threshold: float = 0.2, + verbose: bool = True, +) -> tuple[bool, list[str]]: + """Compare current results with baseline. 
Returns (pass, messages).""" + baseline_results = baseline.get("results", {}) + failures = [] + warnings = [] + + for r in results: + if r.name not in baseline_results: + warnings.append(f"New test: {r.name} (no baseline)") + continue + + b = baseline_results[r.name] + if b["fps"] == 0: + continue + + degradation = (b["fps"] - r.fps) / b["fps"] + if degradation > threshold: + failures.append( + f"{r.name}: FPS degraded {degradation * 100:.1f}% " + f"(baseline: {b['fps']:.1f}, current: {r.fps:.1f})" + ) + elif verbose: + print(f" {r.name}: {r.fps:.1f} FPS (baseline: {b['fps']:.1f})") + + passed = len(failures) == 0 + messages = [] + if failures: + messages.extend(failures) + if warnings: + messages.extend(warnings) + + return passed, messages + + +def run_hook_mode( + displays: list[tuple[str, Any]] | None = None, + effects: list[tuple[str, Any]] | None = None, + iterations: int = 20, + threshold: float = 0.2, + cache_path: Path | None = None, + verbose: bool = False, +) -> int: + """Run in hook mode: compare against baseline, exit 0 on pass, 1 on fail.""" + baseline = load_baseline(cache_path) + + if baseline is None: + print("No baseline found. 
Run with --baseline to create one.") + return 1 + + report = run_benchmarks(displays, effects, iterations, verbose) + + passed, messages = compare_with_baseline( + report.results, baseline, threshold, verbose + ) + + print("\n=== Benchmark Hook Results ===") + if passed: + print("PASSED - No significant performance degradation") + return 0 + else: + print("FAILED - Performance degradation detected:") + for msg in messages: + print(f" - {msg}") + return 1 + + def format_report_text(report: BenchmarkReport) -> str: """Format report as human-readable text.""" lines = [ @@ -391,9 +534,67 @@ def main(): default="text", help="Output format (default: text)", ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Show progress during benchmarking", + ) + parser.add_argument( + "--hook", + action="store_true", + help="Run in hook mode: compare against baseline, exit 0 pass, 1 fail", + ) + parser.add_argument( + "--baseline", + action="store_true", + help="Save current results as baseline for future hook comparisons", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.2, + help="Performance degradation threshold for hook mode (default: 0.2 = 20%%)", + ) + parser.add_argument( + "--cache", + type=str, + default=None, + help="Path to baseline cache file (default: ~/.mainline_benchmark_cache.json)", + ) args = parser.parse_args() + cache_path = Path(args.cache) if args.cache else DEFAULT_CACHE_PATH + + if args.hook: + displays = None + if args.displays: + display_map = dict(get_available_displays()) + displays = [ + (name, display_map[name]) + for name in args.displays.split(",") + if name in display_map + ] + + effects = None + if args.effects: + effect_map = dict(get_available_effects()) + effects = [ + (name, effect_map[name]) + for name in args.effects.split(",") + if name in effect_map + ] + + return run_hook_mode( + displays, + effects, + iterations=args.iterations, + threshold=args.threshold, + cache_path=cache_path, + 
verbose=args.verbose, + ) + displays = None if args.displays: display_map = dict(get_available_displays()) @@ -412,7 +613,12 @@ def main(): if name in effect_map ] - report = run_benchmarks(displays, effects, args.iterations, args.format) + report = run_benchmarks(displays, effects, args.iterations, args.verbose) + + if args.baseline: + save_baseline(report.results, cache_path) + print(f"Baseline saved to {cache_path}") + return 0 if args.format == "json": output = format_report_json(report) @@ -422,10 +628,11 @@ def main(): if args.output: with open(args.output, "w") as f: f.write(output) - print(f"Report written to {args.output}") else: print(output) + return 0 + if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/hk.pkl b/hk.pkl index 155daf6..b8e8a6d 100644 --- a/hk.pkl +++ b/hk.pkl @@ -22,6 +22,9 @@ hooks { prefix = "uv run" check = "ruff check engine/ tests/" } + ["benchmark"] { + check = "uv run python -m engine.benchmark --hook --displays null --iterations 20" + } } } }