1 Commits

Author SHA1 Message Date
7eb3fca935 feat(benchmark): add hook mode with baseline cache for pre-push checks
- Fix lint errors and LSP issues in benchmark.py
- Add --hook mode to compare against saved baseline
- Add --baseline flag to save results as baseline
- Add --threshold to configure degradation threshold (default 20%)
- Add benchmark step to pre-push hook in hk.pkl
- Update AGENTS.md with hk documentation links and benchmark runner docs
2026-03-15 22:54:51 -07:00
3 changed files with 348 additions and 74 deletions

View File

@@ -60,9 +60,52 @@ hk init --mise
mise run pre-commit mise run pre-commit
``` ```
**IMPORTANT**: Always review the hk documentation before modifying `hk.pkl`:
- [hk Configuration Guide](https://hk.jdx.dev/configuration.html)
- [hk Hooks Reference](https://hk.jdx.dev/hooks.html)
- [hk Builtins](https://hk.jdx.dev/builtins.html)
The project uses hk configured in `hk.pkl`: The project uses hk configured in `hk.pkl`:
- **pre-commit**: runs ruff-format and ruff (with auto-fix) - **pre-commit**: runs ruff-format and ruff (with auto-fix)
- **pre-push**: runs ruff check - **pre-push**: runs ruff check + benchmark hook
## Benchmark Runner
Run performance benchmarks:
```bash
mise run benchmark # Run all benchmarks (text output)
mise run benchmark-json # Run benchmarks (JSON output)
mise run benchmark-report # Run benchmarks (Markdown report)
```
### Benchmark Commands
```bash
# Run benchmarks
uv run python -m engine.benchmark
# Run with specific displays/effects
uv run python -m engine.benchmark --displays null,terminal --effects fade,glitch
# Save baseline for hook comparisons
uv run python -m engine.benchmark --baseline
# Run in hook mode (compares against baseline)
uv run python -m engine.benchmark --hook
# Hook mode with custom threshold (default: 20% degradation)
uv run python -m engine.benchmark --hook --threshold 0.3
# Custom baseline location
uv run python -m engine.benchmark --hook --cache /path/to/cache.json
```
### Hook Mode
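The `--hook` mode compares current benchmarks against a saved baseline. If any benchmark's FPS drops by more than the threshold (default 20%) relative to its baseline value, it exits with code 1. It also exits with code 1 when no baseline file exists yet, so run `--baseline` once before relying on the hook. This is useful for preventing performance regressions in feature branches.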
The pre-push hook runs the benchmark in hook mode (null display only, 20 iterations) to catch performance regressions before pushing.
## Workflow Rules ## Workflow Rules

View File

@@ -6,6 +6,9 @@ Usage:
python -m engine.benchmark python -m engine.benchmark
python -m engine.benchmark --output report.md python -m engine.benchmark --output report.md
python -m engine.benchmark --displays terminal,websocket --effects glitch,fade python -m engine.benchmark --displays terminal,websocket --effects glitch,fade
python -m engine.benchmark --format json --output benchmark.json
Headless mode (default): suppress all terminal output during benchmarks.
""" """
import argparse import argparse
@@ -13,6 +16,9 @@ import json
import sys import sys
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Any from typing import Any
import numpy as np import numpy as np
@@ -57,21 +63,34 @@ def get_sample_buffer(width: int = 80, height: int = 24) -> list[str]:
def benchmark_display( def benchmark_display(
display_class, buffer: list[str], iterations: int = 100 display_class, buffer: list[str], iterations: int = 100
) -> BenchmarkResult: ) -> BenchmarkResult | None:
"""Benchmark a single display.""" """Benchmark a single display."""
display = display_class() old_stdout = sys.stdout
display.init(80, 24) old_stderr = sys.stderr
times = [] try:
chars = sum(len(line) for line in buffer) sys.stdout = StringIO()
sys.stderr = StringIO()
for _ in range(iterations): display = display_class()
t0 = time.perf_counter() display.init(80, 24)
display.show(buffer)
elapsed = (time.perf_counter() - t0) * 1000
times.append(elapsed)
display.cleanup() times = []
chars = sum(len(line) for line in buffer)
for _ in range(iterations):
t0 = time.perf_counter()
display.show(buffer)
elapsed = (time.perf_counter() - t0) * 1000
times.append(elapsed)
display.cleanup()
except Exception:
return None
finally:
sys.stdout = old_stdout
sys.stderr = old_stderr
times_arr = np.array(times) times_arr = np.array(times)
@@ -81,36 +100,62 @@ def benchmark_display(
effect=None, effect=None,
iterations=iterations, iterations=iterations,
total_time_ms=sum(times), total_time_ms=sum(times),
avg_time_ms=np.mean(times_arr), avg_time_ms=float(np.mean(times_arr)),
std_dev_ms=np.std(times_arr), std_dev_ms=float(np.std(times_arr)),
min_ms=np.min(times_arr), min_ms=float(np.min(times_arr)),
max_ms=np.max(times_arr), max_ms=float(np.max(times_arr)),
fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0, fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
chars_processed=chars * iterations, chars_processed=chars * iterations,
chars_per_sec=(chars * iterations) / (sum(times) / 1000) chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
if sum(times) > 0 if sum(times) > 0
else 0, else 0.0,
) )
def benchmark_effect_with_display( def benchmark_effect_with_display(
effect_class, display, buffer: list[str], iterations: int = 100 effect_class, display, buffer: list[str], iterations: int = 100
) -> BenchmarkResult: ) -> BenchmarkResult | None:
"""Benchmark an effect with a display.""" """Benchmark an effect with a display."""
effect = effect_class() old_stdout = sys.stdout
effect.configure(enabled=True, intensity=1.0) old_stderr = sys.stderr
times = [] try:
chars = sum(len(line) for line in buffer) from engine.effects.types import EffectConfig, EffectContext
for _ in range(iterations): sys.stdout = StringIO()
processed = effect.process(buffer) sys.stderr = StringIO()
t0 = time.perf_counter()
display.show(processed)
elapsed = (time.perf_counter() - t0) * 1000
times.append(elapsed)
display.cleanup() effect = effect_class()
effect.configure(EffectConfig(enabled=True, intensity=1.0))
ctx = EffectContext(
terminal_width=80,
terminal_height=24,
scroll_cam=0,
ticker_height=0,
mic_excess=0.0,
grad_offset=0.0,
frame_number=0,
has_message=False,
)
times = []
chars = sum(len(line) for line in buffer)
for _ in range(iterations):
processed = effect.process(buffer, ctx)
t0 = time.perf_counter()
display.show(processed)
elapsed = (time.perf_counter() - t0) * 1000
times.append(elapsed)
display.cleanup()
except Exception:
return None
finally:
sys.stdout = old_stdout
sys.stderr = old_stderr
times_arr = np.array(times) times_arr = np.array(times)
@@ -120,15 +165,15 @@ def benchmark_effect_with_display(
effect=effect_class.__name__, effect=effect_class.__name__,
iterations=iterations, iterations=iterations,
total_time_ms=sum(times), total_time_ms=sum(times),
avg_time_ms=np.mean(times_arr), avg_time_ms=float(np.mean(times_arr)),
std_dev_ms=np.std(times_arr), std_dev_ms=float(np.std(times_arr)),
min_ms=np.min(times_arr), min_ms=float(np.min(times_arr)),
max_ms=np.max(times_arr), max_ms=float(np.max(times_arr)),
fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0, fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
chars_processed=chars * iterations, chars_processed=chars * iterations,
chars_per_sec=(chars * iterations) / (sum(times) / 1000) chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
if sum(times) > 0 if sum(times) > 0
else 0, else 0.0,
) )
@@ -139,7 +184,6 @@ def get_available_displays():
NullDisplay, NullDisplay,
TerminalDisplay, TerminalDisplay,
) )
from engine.display.backends.sixel import SixelDisplay
DisplayRegistry.initialize() DisplayRegistry.initialize()
@@ -156,6 +200,8 @@ def get_available_displays():
pass pass
try: try:
from engine.display.backends.sixel import SixelDisplay
displays.append(("sixel", SixelDisplay)) displays.append(("sixel", SixelDisplay))
except Exception: except Exception:
pass pass
@@ -166,17 +212,24 @@ def get_available_displays():
def get_available_effects(): def get_available_effects():
"""Get available effect classes.""" """Get available effect classes."""
try: try:
from engine.effects.registry import get_effect_registry from engine.effects import get_registry
try:
from effects_plugins import discover_plugins
discover_plugins()
except Exception:
pass
except Exception: except Exception:
return [] return []
effects = [] effects = []
registry = get_effect_registry() registry = get_registry()
for name in registry.list_effects(): for name, effect in registry.list_all().items():
effect = registry.get(name)
if effect: if effect:
effects.append((name, effect)) effect_cls = type(effect)
effects.append((name, effect_cls))
return effects return effects
@@ -185,7 +238,7 @@ def run_benchmarks(
displays: list[tuple[str, Any]] | None = None, displays: list[tuple[str, Any]] | None = None,
effects: list[tuple[str, Any]] | None = None, effects: list[tuple[str, Any]] | None = None,
iterations: int = 100, iterations: int = 100,
output_format: str = "text", verbose: bool = False,
) -> BenchmarkReport: ) -> BenchmarkReport:
"""Run all benchmarks and return report.""" """Run all benchmarks and return report."""
from datetime import datetime from datetime import datetime
@@ -199,35 +252,38 @@ def run_benchmarks(
buffer = get_sample_buffer(80, 24) buffer = get_sample_buffer(80, 24)
results = [] results = []
print(f"Running benchmarks ({iterations} iterations each)...") if verbose:
print() print(f"Running benchmarks ({iterations} iterations each)...")
for name, display_class in displays: for name, display_class in displays:
print(f"Benchmarking display: {name}") if verbose:
try: print(f"Benchmarking display: {name}")
result = benchmark_display(display_class, buffer, iterations)
results.append(result)
print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
except Exception as e:
print(f" Error: {e}")
print() result = benchmark_display(display_class, buffer, iterations)
if result:
results.append(result)
if verbose:
print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
if verbose:
print()
for effect_name, effect_class in effects: for effect_name, effect_class in effects:
for display_name, display_class in displays: for display_name, display_class in displays:
if display_name == "websocket": if display_name == "websocket":
continue continue
print(f"Benchmarking effect: {effect_name} with {display_name}") if verbose:
try: print(f"Benchmarking effect: {effect_name} with {display_name}")
display = display_class()
display.init(80, 24) display = display_class()
result = benchmark_effect_with_display( display.init(80, 24)
effect_class, display, buffer, iterations result = benchmark_effect_with_display(
) effect_class, display, buffer, iterations
)
if result:
results.append(result) results.append(result)
print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg") if verbose:
except Exception as e: print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
print(f" Error: {e}")
summary = generate_summary(results) summary = generate_summary(results)
@@ -267,24 +323,132 @@ def generate_summary(results: list[BenchmarkResult]) -> dict[str, Any]:
for display, res in by_display.items(): for display, res in by_display.items():
fps_values = [r.fps for r in res] fps_values = [r.fps for r in res]
summary["by_display"][display] = { summary["by_display"][display] = {
"avg_fps": np.mean(fps_values), "avg_fps": float(np.mean(fps_values)),
"min_fps": np.min(fps_values), "min_fps": float(np.min(fps_values)),
"max_fps": np.max(fps_values), "max_fps": float(np.max(fps_values)),
"tests": len(res), "tests": len(res),
} }
for effect, res in by_effect.items(): for effect, res in by_effect.items():
fps_values = [r.fps for r in res] fps_values = [r.fps for r in res]
summary["by_effect"][effect] = { summary["by_effect"][effect] = {
"avg_fps": np.mean(fps_values), "avg_fps": float(np.mean(fps_values)),
"min_fps": np.min(fps_values), "min_fps": float(np.min(fps_values)),
"max_fps": np.max(fps_values), "max_fps": float(np.max(fps_values)),
"tests": len(res), "tests": len(res),
} }
return summary return summary
DEFAULT_CACHE_PATH = Path.home() / ".mainline_benchmark_cache.json"
def load_baseline(cache_path: Path | None = None) -> dict[str, Any] | None:
"""Load baseline benchmark results from cache."""
path = cache_path or DEFAULT_CACHE_PATH
if not path.exists():
return None
try:
with open(path) as f:
return json.load(f)
except Exception:
return None
def save_baseline(
results: list[BenchmarkResult],
cache_path: Path | None = None,
) -> None:
"""Save benchmark results as baseline to cache."""
path = cache_path or DEFAULT_CACHE_PATH
baseline = {
"timestamp": datetime.now().isoformat(),
"results": {
r.name: {
"fps": r.fps,
"avg_time_ms": r.avg_time_ms,
"chars_per_sec": r.chars_per_sec,
}
for r in results
},
}
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w") as f:
json.dump(baseline, f, indent=2)
def compare_with_baseline(
    results: list[BenchmarkResult],
    baseline: dict[str, Any],
    threshold: float = 0.2,
    verbose: bool = True,
) -> tuple[bool, list[str]]:
    """Compare current results with a saved baseline.

    Args:
        results: Current benchmark results; each is matched to the baseline
            by its ``name``.
        baseline: Mapping as loaded from the cache file; its ``results``
            entry maps test names to ``{"fps": ...}`` records.
        threshold: Maximum tolerated fractional FPS drop (0.2 = 20%).
        verbose: When true, print one line per test that stays within the
            threshold.

    Returns:
        ``(passed, messages)``. ``passed`` is false when at least one test
        degraded beyond ``threshold``. ``messages`` lists failures first,
        then warnings (tests without a usable baseline).
    """
    baseline_results = baseline.get("results", {})
    if not isinstance(baseline_results, dict):
        # A hand-edited or foreign cache file: treat every test as new.
        baseline_results = {}

    failures: list[str] = []
    warnings: list[str] = []

    for r in results:
        if r.name not in baseline_results:
            warnings.append(f"New test: {r.name} (no baseline)")
            continue

        entry = baseline_results[r.name]
        base_fps = entry.get("fps") if isinstance(entry, dict) else None
        if isinstance(base_fps, bool) or not isinstance(base_fps, (int, float)):
            # Malformed entry: report it instead of raising KeyError/TypeError.
            warnings.append(
                f"Unusable baseline for {r.name} (missing or non-numeric fps)"
            )
            continue
        if base_fps == 0:
            # A zero baseline gives no meaningful ratio; skip silently.
            continue

        degradation = (base_fps - r.fps) / base_fps
        if degradation > threshold:
            failures.append(
                f"{r.name}: FPS degraded {degradation * 100:.1f}% "
                f"(baseline: {base_fps:.1f}, current: {r.fps:.1f})"
            )
        elif verbose:
            print(f" {r.name}: {r.fps:.1f} FPS (baseline: {base_fps:.1f})")

    return not failures, failures + warnings
def run_hook_mode(
    displays: list[tuple[str, Any]] | None = None,
    effects: list[tuple[str, Any]] | None = None,
    iterations: int = 20,
    threshold: float = 0.2,
    cache_path: Path | None = None,
    verbose: bool = False,
) -> int:
    """Run in hook mode: compare against the baseline and return an exit code.

    Args:
        displays: ``(name, class)`` pairs to benchmark, passed through to
            ``run_benchmarks``.
        effects: ``(name, class)`` pairs to benchmark, passed through to
            ``run_benchmarks``.
        iterations: Iterations per benchmark.
        threshold: Maximum tolerated fractional FPS drop (0.2 = 20%).
        cache_path: Baseline file location, passed through to
            ``load_baseline``.
        verbose: Show progress and per-test comparison lines.

    Returns:
        0 when no benchmark degraded beyond ``threshold``; 1 when a
        degradation was detected, no baseline could be loaded, or no
        benchmark produced a result.
    """
    baseline = load_baseline(cache_path)
    if baseline is None:
        print("No baseline found. Run with --baseline to create one.")
        return 1

    report = run_benchmarks(displays, effects, iterations, verbose)

    print("\n=== Benchmark Hook Results ===")
    if not report.results:
        # Benchmarks that raise are dropped from the report, so an empty
        # result list means nothing was measured; that must not pass.
        print("FAILED - No benchmark results were produced")
        return 1

    passed, messages = compare_with_baseline(
        report.results, baseline, threshold, verbose
    )

    if passed:
        print("PASSED - No significant performance degradation")
        # On a pass the only messages are warnings (e.g. tests that have no
        # baseline yet); surface them instead of discarding them.
        for msg in messages:
            print(f" - {msg}")
        return 0

    print("FAILED - Performance degradation detected:")
    for msg in messages:
        print(f" - {msg}")
    return 1
def format_report_text(report: BenchmarkReport) -> str: def format_report_text(report: BenchmarkReport) -> str:
"""Format report as human-readable text.""" """Format report as human-readable text."""
lines = [ lines = [
@@ -391,9 +555,67 @@ def main():
default="text", default="text",
help="Output format (default: text)", help="Output format (default: text)",
) )
parser.add_argument(
"--verbose",
"-v",
action="store_true",
help="Show progress during benchmarking",
)
parser.add_argument(
"--hook",
action="store_true",
help="Run in hook mode: compare against baseline, exit 0 pass, 1 fail",
)
parser.add_argument(
"--baseline",
action="store_true",
help="Save current results as baseline for future hook comparisons",
)
parser.add_argument(
"--threshold",
type=float,
default=0.2,
help="Performance degradation threshold for hook mode (default: 0.2 = 20%%)",
)
parser.add_argument(
"--cache",
type=str,
default=None,
help="Path to baseline cache file (default: ~/.mainline_benchmark_cache.json)",
)
args = parser.parse_args() args = parser.parse_args()
cache_path = Path(args.cache) if args.cache else DEFAULT_CACHE_PATH
if args.hook:
displays = None
if args.displays:
display_map = dict(get_available_displays())
displays = [
(name, display_map[name])
for name in args.displays.split(",")
if name in display_map
]
effects = None
if args.effects:
effect_map = dict(get_available_effects())
effects = [
(name, effect_map[name])
for name in args.effects.split(",")
if name in effect_map
]
return run_hook_mode(
displays,
effects,
iterations=args.iterations,
threshold=args.threshold,
cache_path=cache_path,
verbose=args.verbose,
)
displays = None displays = None
if args.displays: if args.displays:
display_map = dict(get_available_displays()) display_map = dict(get_available_displays())
@@ -412,7 +634,12 @@ def main():
if name in effect_map if name in effect_map
] ]
report = run_benchmarks(displays, effects, args.iterations, args.format) report = run_benchmarks(displays, effects, args.iterations, args.verbose)
if args.baseline:
save_baseline(report.results, cache_path)
print(f"Baseline saved to {cache_path}")
return 0
if args.format == "json": if args.format == "json":
output = format_report_json(report) output = format_report_json(report)
@@ -422,10 +649,11 @@ def main():
if args.output: if args.output:
with open(args.output, "w") as f: with open(args.output, "w") as f:
f.write(output) f.write(output)
print(f"Report written to {args.output}")
else: else:
print(output) print(output)
return 0
if __name__ == "__main__": if __name__ == "__main__":
main() sys.exit(main())

3
hk.pkl
View File

@@ -22,6 +22,9 @@ hooks {
prefix = "uv run" prefix = "uv run"
check = "ruff check engine/ tests/" check = "ruff check engine/ tests/"
} }
["benchmark"] {
check = "uv run python -m engine.benchmark --hook --displays null --iterations 20"
}
} }
} }
} }