feat(benchmark): add hook mode with baseline cache for pre-push checks
- Fix lint errors and LSP issues in benchmark.py - Add --hook mode to compare against saved baseline - Add --baseline flag to save results as baseline - Add --threshold to configure degradation threshold (default 20%) - Add benchmark step to pre-push hook in hk.pkl - Update AGENTS.md with hk documentation links and benchmark runner docs
This commit is contained in:
45
AGENTS.md
45
AGENTS.md
@@ -60,9 +60,52 @@ hk init --mise
|
||||
mise run pre-commit
|
||||
```
|
||||
|
||||
**IMPORTANT**: Always review the hk documentation before modifying `hk.pkl`:
|
||||
- [hk Configuration Guide](https://hk.jdx.dev/configuration.html)
|
||||
- [hk Hooks Reference](https://hk.jdx.dev/hooks.html)
|
||||
- [hk Builtins](https://hk.jdx.dev/builtins.html)
|
||||
|
||||
The project uses hk configured in `hk.pkl`:
|
||||
- **pre-commit**: runs ruff-format and ruff (with auto-fix)
|
||||
- **pre-push**: runs ruff check
|
||||
- **pre-push**: runs ruff check + benchmark hook
|
||||
|
||||
## Benchmark Runner
|
||||
|
||||
Run performance benchmarks:
|
||||
|
||||
```bash
|
||||
mise run benchmark # Run all benchmarks (text output)
|
||||
mise run benchmark-json # Run benchmarks (JSON output)
|
||||
mise run benchmark-report # Run benchmarks (Markdown report)
|
||||
```
|
||||
|
||||
### Benchmark Commands
|
||||
|
||||
```bash
|
||||
# Run benchmarks
|
||||
uv run python -m engine.benchmark
|
||||
|
||||
# Run with specific displays/effects
|
||||
uv run python -m engine.benchmark --displays null,terminal --effects fade,glitch
|
||||
|
||||
# Save baseline for hook comparisons
|
||||
uv run python -m engine.benchmark --baseline
|
||||
|
||||
# Run in hook mode (compares against baseline)
|
||||
uv run python -m engine.benchmark --hook
|
||||
|
||||
# Hook mode with custom threshold (default: 20% degradation)
|
||||
uv run python -m engine.benchmark --hook --threshold 0.3
|
||||
|
||||
# Custom baseline location
|
||||
uv run python -m engine.benchmark --hook --cache /path/to/cache.json
|
||||
```
|
||||
|
||||
### Hook Mode
|
||||
|
||||
The `--hook` mode compares current benchmarks against a saved baseline. If performance degrades beyond the threshold (default 20%), it exits with code 1. This is useful for preventing performance regressions in feature branches.
|
||||
|
||||
The pre-push hook runs the benchmark in hook mode to catch performance regressions before pushing.
|
||||
|
||||
## Workflow Rules
|
||||
|
||||
|
||||
@@ -6,6 +6,9 @@ Usage:
|
||||
python -m engine.benchmark
|
||||
python -m engine.benchmark --output report.md
|
||||
python -m engine.benchmark --displays terminal,websocket --effects glitch,fade
|
||||
python -m engine.benchmark --format json --output benchmark.json
|
||||
|
||||
Headless mode (default): suppress all terminal output during benchmarks.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -13,6 +16,9 @@ import json
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -57,21 +63,34 @@ def get_sample_buffer(width: int = 80, height: int = 24) -> list[str]:
|
||||
|
||||
def benchmark_display(
|
||||
display_class, buffer: list[str], iterations: int = 100
|
||||
) -> BenchmarkResult:
|
||||
) -> BenchmarkResult | None:
|
||||
"""Benchmark a single display."""
|
||||
display = display_class()
|
||||
display.init(80, 24)
|
||||
old_stdout = sys.stdout
|
||||
old_stderr = sys.stderr
|
||||
|
||||
times = []
|
||||
chars = sum(len(line) for line in buffer)
|
||||
try:
|
||||
sys.stdout = StringIO()
|
||||
sys.stderr = StringIO()
|
||||
|
||||
for _ in range(iterations):
|
||||
t0 = time.perf_counter()
|
||||
display.show(buffer)
|
||||
elapsed = (time.perf_counter() - t0) * 1000
|
||||
times.append(elapsed)
|
||||
display = display_class()
|
||||
display.init(80, 24)
|
||||
|
||||
display.cleanup()
|
||||
times = []
|
||||
chars = sum(len(line) for line in buffer)
|
||||
|
||||
for _ in range(iterations):
|
||||
t0 = time.perf_counter()
|
||||
display.show(buffer)
|
||||
elapsed = (time.perf_counter() - t0) * 1000
|
||||
times.append(elapsed)
|
||||
|
||||
display.cleanup()
|
||||
|
||||
except Exception:
|
||||
return None
|
||||
finally:
|
||||
sys.stdout = old_stdout
|
||||
sys.stderr = old_stderr
|
||||
|
||||
times_arr = np.array(times)
|
||||
|
||||
@@ -81,36 +100,62 @@ def benchmark_display(
|
||||
effect=None,
|
||||
iterations=iterations,
|
||||
total_time_ms=sum(times),
|
||||
avg_time_ms=np.mean(times_arr),
|
||||
std_dev_ms=np.std(times_arr),
|
||||
min_ms=np.min(times_arr),
|
||||
max_ms=np.max(times_arr),
|
||||
fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
|
||||
avg_time_ms=float(np.mean(times_arr)),
|
||||
std_dev_ms=float(np.std(times_arr)),
|
||||
min_ms=float(np.min(times_arr)),
|
||||
max_ms=float(np.max(times_arr)),
|
||||
fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
|
||||
chars_processed=chars * iterations,
|
||||
chars_per_sec=(chars * iterations) / (sum(times) / 1000)
|
||||
chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
|
||||
if sum(times) > 0
|
||||
else 0,
|
||||
else 0.0,
|
||||
)
|
||||
|
||||
|
||||
def benchmark_effect_with_display(
|
||||
effect_class, display, buffer: list[str], iterations: int = 100
|
||||
) -> BenchmarkResult:
|
||||
) -> BenchmarkResult | None:
|
||||
"""Benchmark an effect with a display."""
|
||||
effect = effect_class()
|
||||
effect.configure(enabled=True, intensity=1.0)
|
||||
old_stdout = sys.stdout
|
||||
old_stderr = sys.stderr
|
||||
|
||||
times = []
|
||||
chars = sum(len(line) for line in buffer)
|
||||
try:
|
||||
from engine.effects.types import EffectConfig, EffectContext
|
||||
|
||||
for _ in range(iterations):
|
||||
processed = effect.process(buffer)
|
||||
t0 = time.perf_counter()
|
||||
display.show(processed)
|
||||
elapsed = (time.perf_counter() - t0) * 1000
|
||||
times.append(elapsed)
|
||||
sys.stdout = StringIO()
|
||||
sys.stderr = StringIO()
|
||||
|
||||
display.cleanup()
|
||||
effect = effect_class()
|
||||
effect.configure(EffectConfig(enabled=True, intensity=1.0))
|
||||
|
||||
ctx = EffectContext(
|
||||
terminal_width=80,
|
||||
terminal_height=24,
|
||||
scroll_cam=0,
|
||||
ticker_height=0,
|
||||
mic_excess=0.0,
|
||||
grad_offset=0.0,
|
||||
frame_number=0,
|
||||
has_message=False,
|
||||
)
|
||||
|
||||
times = []
|
||||
chars = sum(len(line) for line in buffer)
|
||||
|
||||
for _ in range(iterations):
|
||||
processed = effect.process(buffer, ctx)
|
||||
t0 = time.perf_counter()
|
||||
display.show(processed)
|
||||
elapsed = (time.perf_counter() - t0) * 1000
|
||||
times.append(elapsed)
|
||||
|
||||
display.cleanup()
|
||||
|
||||
except Exception:
|
||||
return None
|
||||
finally:
|
||||
sys.stdout = old_stdout
|
||||
sys.stderr = old_stderr
|
||||
|
||||
times_arr = np.array(times)
|
||||
|
||||
@@ -120,15 +165,15 @@ def benchmark_effect_with_display(
|
||||
effect=effect_class.__name__,
|
||||
iterations=iterations,
|
||||
total_time_ms=sum(times),
|
||||
avg_time_ms=np.mean(times_arr),
|
||||
std_dev_ms=np.std(times_arr),
|
||||
min_ms=np.min(times_arr),
|
||||
max_ms=np.max(times_arr),
|
||||
fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
|
||||
avg_time_ms=float(np.mean(times_arr)),
|
||||
std_dev_ms=float(np.std(times_arr)),
|
||||
min_ms=float(np.min(times_arr)),
|
||||
max_ms=float(np.max(times_arr)),
|
||||
fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
|
||||
chars_processed=chars * iterations,
|
||||
chars_per_sec=(chars * iterations) / (sum(times) / 1000)
|
||||
chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
|
||||
if sum(times) > 0
|
||||
else 0,
|
||||
else 0.0,
|
||||
)
|
||||
|
||||
|
||||
@@ -139,7 +184,6 @@ def get_available_displays():
|
||||
NullDisplay,
|
||||
TerminalDisplay,
|
||||
)
|
||||
from engine.display.backends.sixel import SixelDisplay
|
||||
|
||||
DisplayRegistry.initialize()
|
||||
|
||||
@@ -156,6 +200,8 @@ def get_available_displays():
|
||||
pass
|
||||
|
||||
try:
|
||||
from engine.display.backends.sixel import SixelDisplay
|
||||
|
||||
displays.append(("sixel", SixelDisplay))
|
||||
except Exception:
|
||||
pass
|
||||
@@ -166,17 +212,24 @@ def get_available_displays():
|
||||
def get_available_effects():
|
||||
"""Get available effect classes."""
|
||||
try:
|
||||
from engine.effects.registry import get_effect_registry
|
||||
from engine.effects import get_registry
|
||||
|
||||
try:
|
||||
from effects_plugins import discover_plugins
|
||||
|
||||
discover_plugins()
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
effects = []
|
||||
registry = get_effect_registry()
|
||||
registry = get_registry()
|
||||
|
||||
for name in registry.list_effects():
|
||||
effect = registry.get(name)
|
||||
for name, effect in registry.list_all().items():
|
||||
if effect:
|
||||
effects.append((name, effect))
|
||||
effect_cls = type(effect)
|
||||
effects.append((name, effect_cls))
|
||||
|
||||
return effects
|
||||
|
||||
@@ -185,7 +238,7 @@ def run_benchmarks(
|
||||
displays: list[tuple[str, Any]] | None = None,
|
||||
effects: list[tuple[str, Any]] | None = None,
|
||||
iterations: int = 100,
|
||||
output_format: str = "text",
|
||||
verbose: bool = False,
|
||||
) -> BenchmarkReport:
|
||||
"""Run all benchmarks and return report."""
|
||||
from datetime import datetime
|
||||
@@ -199,35 +252,38 @@ def run_benchmarks(
|
||||
buffer = get_sample_buffer(80, 24)
|
||||
results = []
|
||||
|
||||
print(f"Running benchmarks ({iterations} iterations each)...")
|
||||
print()
|
||||
if verbose:
|
||||
print(f"Running benchmarks ({iterations} iterations each)...")
|
||||
|
||||
for name, display_class in displays:
|
||||
print(f"Benchmarking display: {name}")
|
||||
try:
|
||||
result = benchmark_display(display_class, buffer, iterations)
|
||||
results.append(result)
|
||||
print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
if verbose:
|
||||
print(f"Benchmarking display: {name}")
|
||||
|
||||
print()
|
||||
result = benchmark_display(display_class, buffer, iterations)
|
||||
if result:
|
||||
results.append(result)
|
||||
if verbose:
|
||||
print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
|
||||
|
||||
if verbose:
|
||||
print()
|
||||
|
||||
for effect_name, effect_class in effects:
|
||||
for display_name, display_class in displays:
|
||||
if display_name == "websocket":
|
||||
continue
|
||||
print(f"Benchmarking effect: {effect_name} with {display_name}")
|
||||
try:
|
||||
display = display_class()
|
||||
display.init(80, 24)
|
||||
result = benchmark_effect_with_display(
|
||||
effect_class, display, buffer, iterations
|
||||
)
|
||||
if verbose:
|
||||
print(f"Benchmarking effect: {effect_name} with {display_name}")
|
||||
|
||||
display = display_class()
|
||||
display.init(80, 24)
|
||||
result = benchmark_effect_with_display(
|
||||
effect_class, display, buffer, iterations
|
||||
)
|
||||
if result:
|
||||
results.append(result)
|
||||
print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
if verbose:
|
||||
print(f" {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
|
||||
|
||||
summary = generate_summary(results)
|
||||
|
||||
@@ -267,24 +323,132 @@ def generate_summary(results: list[BenchmarkResult]) -> dict[str, Any]:
|
||||
for display, res in by_display.items():
|
||||
fps_values = [r.fps for r in res]
|
||||
summary["by_display"][display] = {
|
||||
"avg_fps": np.mean(fps_values),
|
||||
"min_fps": np.min(fps_values),
|
||||
"max_fps": np.max(fps_values),
|
||||
"avg_fps": float(np.mean(fps_values)),
|
||||
"min_fps": float(np.min(fps_values)),
|
||||
"max_fps": float(np.max(fps_values)),
|
||||
"tests": len(res),
|
||||
}
|
||||
|
||||
for effect, res in by_effect.items():
|
||||
fps_values = [r.fps for r in res]
|
||||
summary["by_effect"][effect] = {
|
||||
"avg_fps": np.mean(fps_values),
|
||||
"min_fps": np.min(fps_values),
|
||||
"max_fps": np.max(fps_values),
|
||||
"avg_fps": float(np.mean(fps_values)),
|
||||
"min_fps": float(np.min(fps_values)),
|
||||
"max_fps": float(np.max(fps_values)),
|
||||
"tests": len(res),
|
||||
}
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
# Default location of the baseline cache file; used by load_baseline() and
# save_baseline() when no explicit cache path is supplied.
DEFAULT_CACHE_PATH = Path.home() / ".mainline_benchmark_cache.json"
|
||||
|
||||
|
||||
def load_baseline(cache_path: Path | None = None) -> dict[str, Any] | None:
|
||||
"""Load baseline benchmark results from cache."""
|
||||
path = cache_path or DEFAULT_CACHE_PATH
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def save_baseline(
    results: list[BenchmarkResult],
    cache_path: Path | None = None,
) -> None:
    """Save benchmark results as the baseline cache file.

    Args:
        results: Benchmark results to persist. Only ``name``, ``fps``,
            ``avg_time_ms`` and ``chars_per_sec`` of each result are stored.
        cache_path: Destination JSON file. Falls back to
            ``DEFAULT_CACHE_PATH`` when omitted. Missing parent directories
            are created.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    path = cache_path or DEFAULT_CACHE_PATH
    baseline = {
        "timestamp": datetime.now().isoformat(),
        "results": {
            r.name: {
                "fps": r.fps,
                "avg_time_ms": r.avg_time_ms,
                "chars_per_sec": r.chars_per_sec,
            }
            for r in results
        },
    }
    path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling temp file and swap it in, so an interrupted write
    # never leaves a truncated baseline behind.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(baseline, f, indent=2)
        tmp_path.replace(path)
    finally:
        # No-op after a successful replace; removes the partial file on error.
        tmp_path.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def compare_with_baseline(
    results: list[BenchmarkResult],
    baseline: dict[str, Any],
    threshold: float = 0.2,
    verbose: bool = True,
) -> tuple[bool, list[str]]:
    """Compare current results with a saved baseline.

    Args:
        results: Current benchmark results; ``name`` and ``fps`` are read.
        baseline: Mapping as produced by ``save_baseline`` (a ``"results"``
            key mapping test name to a dict with an ``"fps"`` value).
        threshold: Maximum tolerated fractional FPS drop (0.2 = 20%).
        verbose: When true, print a line for every test within threshold.

    Returns:
        ``(passed, messages)``. ``passed`` is false when at least one test
        degraded beyond ``threshold``. ``messages`` lists failures first,
        then warnings (new tests, unusable baseline entries).
    """
    baseline_results = baseline.get("results") or {}
    if not isinstance(baseline_results, dict):
        # A hand-edited or corrupt cache; treat every test as new.
        baseline_results = {}

    failures: list[str] = []
    warnings: list[str] = []

    for r in results:
        if r.name not in baseline_results:
            warnings.append(f"New test: {r.name} (no baseline)")
            continue

        entry = baseline_results[r.name]
        base_fps = entry.get("fps") if isinstance(entry, dict) else None
        if not isinstance(base_fps, (int, float)):
            # Report a broken entry instead of crashing with KeyError.
            warnings.append(f"Malformed baseline entry: {r.name} (no usable fps)")
            continue
        if base_fps == 0:
            # A zero baseline gives no meaningful ratio; skip silently.
            continue

        degradation = (base_fps - r.fps) / base_fps
        if degradation > threshold:
            failures.append(
                f"{r.name}: FPS degraded {degradation * 100:.1f}% "
                f"(baseline: {base_fps:.1f}, current: {r.fps:.1f})"
            )
        elif verbose:
            print(f" {r.name}: {r.fps:.1f} FPS (baseline: {base_fps:.1f})")

    return not failures, failures + warnings
|
||||
|
||||
|
||||
def run_hook_mode(
    displays: list[tuple[str, Any]] | None = None,
    effects: list[tuple[str, Any]] | None = None,
    iterations: int = 20,
    threshold: float = 0.2,
    cache_path: Path | None = None,
    verbose: bool = False,
) -> int:
    """Run in hook mode: compare against the baseline and return an exit code.

    Args:
        displays: ``(name, class)`` pairs to benchmark; passed through to
            ``run_benchmarks``.
        effects: ``(name, class)`` pairs to benchmark; passed through to
            ``run_benchmarks``.
        iterations: Iterations per benchmark.
        threshold: Maximum tolerated fractional FPS drop (0.2 = 20%).
        cache_path: Baseline file; ``load_baseline`` picks the default when
            this is ``None``.
        verbose: Print per-benchmark progress and comparison lines.

    Returns:
        0 when no test degraded beyond ``threshold``; 1 when a regression is
        detected or no baseline could be loaded.
    """
    baseline = load_baseline(cache_path)
    if baseline is None:
        print("No baseline found. Run with --baseline to create one.")
        return 1

    # Keyword argument: the fourth positional slot of run_benchmarks has
    # changed meaning before, so do not rely on its position.
    report = run_benchmarks(displays, effects, iterations, verbose=verbose)
    passed, messages = compare_with_baseline(
        report.results, baseline, threshold, verbose
    )

    print("\n=== Benchmark Hook Results ===")
    if passed:
        print("PASSED - No significant performance degradation")
        if messages:
            # On a pass these are warnings only (e.g. tests with no
            # baseline); show them rather than dropping them.
            print("Notes:")
            for msg in messages:
                print(f" - {msg}")
        return 0

    print("FAILED - Performance degradation detected:")
    for msg in messages:
        print(f" - {msg}")
    return 1
|
||||
|
||||
|
||||
def format_report_text(report: BenchmarkReport) -> str:
|
||||
"""Format report as human-readable text."""
|
||||
lines = [
|
||||
@@ -391,9 +555,67 @@ def main():
|
||||
default="text",
|
||||
help="Output format (default: text)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
"-v",
|
||||
action="store_true",
|
||||
help="Show progress during benchmarking",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hook",
|
||||
action="store_true",
|
||||
help="Run in hook mode: compare against baseline, exit 0 pass, 1 fail",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--baseline",
|
||||
action="store_true",
|
||||
help="Save current results as baseline for future hook comparisons",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--threshold",
|
||||
type=float,
|
||||
default=0.2,
|
||||
help="Performance degradation threshold for hook mode (default: 0.2 = 20%%)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to baseline cache file (default: ~/.mainline_benchmark_cache.json)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
cache_path = Path(args.cache) if args.cache else DEFAULT_CACHE_PATH
|
||||
|
||||
if args.hook:
|
||||
displays = None
|
||||
if args.displays:
|
||||
display_map = dict(get_available_displays())
|
||||
displays = [
|
||||
(name, display_map[name])
|
||||
for name in args.displays.split(",")
|
||||
if name in display_map
|
||||
]
|
||||
|
||||
effects = None
|
||||
if args.effects:
|
||||
effect_map = dict(get_available_effects())
|
||||
effects = [
|
||||
(name, effect_map[name])
|
||||
for name in args.effects.split(",")
|
||||
if name in effect_map
|
||||
]
|
||||
|
||||
return run_hook_mode(
|
||||
displays,
|
||||
effects,
|
||||
iterations=args.iterations,
|
||||
threshold=args.threshold,
|
||||
cache_path=cache_path,
|
||||
verbose=args.verbose,
|
||||
)
|
||||
|
||||
displays = None
|
||||
if args.displays:
|
||||
display_map = dict(get_available_displays())
|
||||
@@ -412,7 +634,12 @@ def main():
|
||||
if name in effect_map
|
||||
]
|
||||
|
||||
report = run_benchmarks(displays, effects, args.iterations, args.format)
|
||||
report = run_benchmarks(displays, effects, args.iterations, args.verbose)
|
||||
|
||||
if args.baseline:
|
||||
save_baseline(report.results, cache_path)
|
||||
print(f"Baseline saved to {cache_path}")
|
||||
return 0
|
||||
|
||||
if args.format == "json":
|
||||
output = format_report_json(report)
|
||||
@@ -422,10 +649,11 @@ def main():
|
||||
if args.output:
|
||||
with open(args.output, "w") as f:
|
||||
f.write(output)
|
||||
print(f"Report written to {args.output}")
|
||||
else:
|
||||
print(output)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
sys.exit(main())
|
||||
|
||||
Reference in New Issue
Block a user