diff --git a/AGENTS.md b/AGENTS.md
index 92fc922..c567a3f 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -60,9 +60,52 @@ hk init --mise
 mise run pre-commit
 ```
 
+**IMPORTANT**: Always review the hk documentation before modifying `hk.pkl`:
+- [hk Configuration Guide](https://hk.jdx.dev/configuration.html)
+- [hk Hooks Reference](https://hk.jdx.dev/hooks.html)
+- [hk Builtins](https://hk.jdx.dev/builtins.html)
+
 The project uses hk configured in `hk.pkl`:
 - **pre-commit**: runs ruff-format and ruff (with auto-fix)
-- **pre-push**: runs ruff check
+- **pre-push**: runs ruff check and the benchmark hook (`engine.benchmark --hook`)
+
+## Benchmark Runner
+
+Run performance benchmarks:
+
+```bash
+mise run benchmark           # Run all benchmarks (text output)
+mise run benchmark-json     # Run benchmarks (JSON output)
+mise run benchmark-report   # Run benchmarks (Markdown report)
+```
+
+### Benchmark Commands
+
+```bash
+# Run benchmarks
+uv run python -m engine.benchmark
+
+# Run with specific displays/effects
+uv run python -m engine.benchmark --displays null,terminal --effects fade,glitch
+
+# Save baseline for hook comparisons
+uv run python -m engine.benchmark --baseline
+
+# Run in hook mode (compares against baseline)
+uv run python -m engine.benchmark --hook
+
+# Hook mode with custom threshold (0.3 = fail on a >30% FPS drop; default: 0.2)
+uv run python -m engine.benchmark --hook --threshold 0.3
+
+# Custom baseline location
+uv run python -m engine.benchmark --hook --cache /path/to/cache.json
+```
+
+### Hook Mode
+
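+The `--hook` mode compares current benchmark FPS against a saved baseline. If any benchmark's FPS drops by more than the threshold (default 0.2, i.e. 20%), it exits with code 1. It also exits with code 1 when no baseline exists, so run `--baseline` once before relying on the hook. This is useful for preventing performance regressions in feature branches.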
+
+The pre-push hook runs the benchmark in hook mode (null display only, 20 iterations) to catch performance regressions before pushing.
 
 ## Workflow Rules
 
diff --git a/engine/benchmark.py b/engine/benchmark.py
index e4a3882..0ac2481 100644
--- a/engine/benchmark.py
+++ b/engine/benchmark.py
@@ -6,6 +6,9 @@ Usage:
     python -m engine.benchmark
     python -m engine.benchmark --output report.md
     python -m engine.benchmark --displays terminal,websocket --effects glitch,fade
+    python -m engine.benchmark --format json --output benchmark.json
+
+Headless mode (default): stdout/stderr are suppressed while benchmarks run; pass --verbose for progress.
 """
 
 import argparse
@@ -13,6 +16,9 @@ import json
 import sys
 import time
 from dataclasses import dataclass, field
+from datetime import datetime
+from io import StringIO
+from pathlib import Path
 from typing import Any
 
 import numpy as np
@@ -57,21 +63,34 @@ def get_sample_buffer(width: int = 80, height: int = 24) -> list[str]:
 
 def benchmark_display(
     display_class, buffer: list[str], iterations: int = 100
-) -> BenchmarkResult:
+) -> BenchmarkResult | None:
     """Benchmark a single display."""
-    display = display_class()
-    display.init(80, 24)
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr
 
-    times = []
-    chars = sum(len(line) for line in buffer)
+    try:
+        sys.stdout = StringIO()
+        sys.stderr = StringIO()
 
-    for _ in range(iterations):
-        t0 = time.perf_counter()
-        display.show(buffer)
-        elapsed = (time.perf_counter() - t0) * 1000
-        times.append(elapsed)
+        display = display_class()
+        display.init(80, 24)
 
-    display.cleanup()
+        times = []
+        chars = sum(len(line) for line in buffer)
+
+        for _ in range(iterations):
+            t0 = time.perf_counter()
+            display.show(buffer)
+            elapsed = (time.perf_counter() - t0) * 1000
+            times.append(elapsed)
+
+        display.cleanup()
+
+    except Exception:
+        return None
+    finally:
+        sys.stdout = old_stdout
+        sys.stderr = old_stderr
 
     times_arr = np.array(times)
 
@@ -81,36 +100,49 @@ def benchmark_display(
         effect=None,
         iterations=iterations,
         total_time_ms=sum(times),
-        avg_time_ms=np.mean(times_arr),
-        std_dev_ms=np.std(times_arr),
-        min_ms=np.min(times_arr),
-        max_ms=np.max(times_arr),
-        fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
+        avg_time_ms=float(np.mean(times_arr)),
+        std_dev_ms=float(np.std(times_arr)),
+        min_ms=float(np.min(times_arr)),
+        max_ms=float(np.max(times_arr)),
+        fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
         chars_processed=chars * iterations,
-        chars_per_sec=(chars * iterations) / (sum(times) / 1000)
+        chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
         if sum(times) > 0
-        else 0,
+        else 0.0,
     )
 
 
 def benchmark_effect_with_display(
     effect_class, display, buffer: list[str], iterations: int = 100
-) -> BenchmarkResult:
+) -> BenchmarkResult | None:
     """Benchmark an effect with a display."""
-    effect = effect_class()
-    effect.configure(enabled=True, intensity=1.0)
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr
 
-    times = []
-    chars = sum(len(line) for line in buffer)
+    try:
+        sys.stdout = StringIO()
+        sys.stderr = StringIO()
 
-    for _ in range(iterations):
-        processed = effect.process(buffer)
-        t0 = time.perf_counter()
-        display.show(processed)
-        elapsed = (time.perf_counter() - t0) * 1000
-        times.append(elapsed)
+        effect = effect_class()
+        effect.configure(enabled=True, intensity=1.0)
 
-    display.cleanup()
+        times = []
+        chars = sum(len(line) for line in buffer)
+
+        for _ in range(iterations):
+            processed = effect.process(buffer)
+            t0 = time.perf_counter()
+            display.show(processed)
+            elapsed = (time.perf_counter() - t0) * 1000
+            times.append(elapsed)
+
+        display.cleanup()
+
+    except Exception:
+        return None
+    finally:
+        sys.stdout = old_stdout
+        sys.stderr = old_stderr
 
     times_arr = np.array(times)
 
@@ -120,15 +152,15 @@ def benchmark_effect_with_display(
         effect=effect_class.__name__,
         iterations=iterations,
         total_time_ms=sum(times),
-        avg_time_ms=np.mean(times_arr),
-        std_dev_ms=np.std(times_arr),
-        min_ms=np.min(times_arr),
-        max_ms=np.max(times_arr),
-        fps=1000.0 / np.mean(times_arr) if np.mean(times_arr) > 0 else 0,
+        avg_time_ms=float(np.mean(times_arr)),
+        std_dev_ms=float(np.std(times_arr)),
+        min_ms=float(np.min(times_arr)),
+        max_ms=float(np.max(times_arr)),
+        fps=float(1000.0 / np.mean(times_arr)) if np.mean(times_arr) > 0 else 0.0,
         chars_processed=chars * iterations,
-        chars_per_sec=(chars * iterations) / (sum(times) / 1000)
+        chars_per_sec=float((chars * iterations) / (sum(times) / 1000))
         if sum(times) > 0
-        else 0,
+        else 0.0,
     )
 
 
@@ -139,7 +171,6 @@ def get_available_displays():
         NullDisplay,
         TerminalDisplay,
     )
-    from engine.display.backends.sixel import SixelDisplay
 
     DisplayRegistry.initialize()
 
@@ -156,6 +187,8 @@ def get_available_displays():
         pass
 
     try:
+        from engine.display.backends.sixel import SixelDisplay
+
         displays.append(("sixel", SixelDisplay))
     except Exception:
         pass
@@ -166,15 +199,14 @@ def get_available_displays():
 def get_available_effects():
     """Get available effect classes."""
     try:
-        from engine.effects.registry import get_effect_registry
+        from engine.effects import get_registry
     except Exception:
         return []
 
     effects = []
-    registry = get_effect_registry()
+    registry = get_registry()
 
-    for name in registry.list_effects():
-        effect = registry.get(name)
+    for name, effect in registry.list_all().items():
         if effect:
             effects.append((name, effect))
 
@@ -185,7 +217,7 @@ def run_benchmarks(
     displays: list[tuple[str, Any]] | None = None,
     effects: list[tuple[str, Any]] | None = None,
     iterations: int = 100,
-    output_format: str = "text",
+    verbose: bool = False,
 ) -> BenchmarkReport:
     """Run all benchmarks and return report."""
     from datetime import datetime
@@ -199,35 +231,38 @@ def run_benchmarks(
     buffer = get_sample_buffer(80, 24)
     results = []
 
-    print(f"Running benchmarks ({iterations} iterations each)...")
-    print()
+    if verbose:
+        print(f"Running benchmarks ({iterations} iterations each)...")
 
     for name, display_class in displays:
-        print(f"Benchmarking display: {name}")
-        try:
-            result = benchmark_display(display_class, buffer, iterations)
-            results.append(result)
-            print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
-        except Exception as e:
-            print(f"  Error: {e}")
+        if verbose:
+            print(f"Benchmarking display: {name}")
 
-    print()
+        result = benchmark_display(display_class, buffer, iterations)
+        if result:
+            results.append(result)
+            if verbose:
+                print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
+
+    if verbose:
+        print()
 
     for effect_name, effect_class in effects:
         for display_name, display_class in displays:
             if display_name == "websocket":
                 continue
-            print(f"Benchmarking effect: {effect_name} with {display_name}")
-            try:
-                display = display_class()
-                display.init(80, 24)
-                result = benchmark_effect_with_display(
-                    effect_class, display, buffer, iterations
-                )
+            if verbose:
+                print(f"Benchmarking effect: {effect_name} with {display_name}")
+
+            display = display_class()
+            display.init(80, 24)
+            result = benchmark_effect_with_display(
+                effect_class, display, buffer, iterations
+            )
+            if result:
                 results.append(result)
-                print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
-            except Exception as e:
-                print(f"  Error: {e}")
+                if verbose:
+                    print(f"  {result.fps:.1f} FPS, {result.avg_time_ms:.2f}ms avg")
 
     summary = generate_summary(results)
 
@@ -267,24 +302,132 @@ def generate_summary(results: list[BenchmarkResult]) -> dict[str, Any]:
     for display, res in by_display.items():
         fps_values = [r.fps for r in res]
         summary["by_display"][display] = {
-            "avg_fps": np.mean(fps_values),
-            "min_fps": np.min(fps_values),
-            "max_fps": np.max(fps_values),
+            "avg_fps": float(np.mean(fps_values)),
+            "min_fps": float(np.min(fps_values)),
+            "max_fps": float(np.max(fps_values)),
             "tests": len(res),
         }
 
     for effect, res in by_effect.items():
         fps_values = [r.fps for r in res]
         summary["by_effect"][effect] = {
-            "avg_fps": np.mean(fps_values),
-            "min_fps": np.min(fps_values),
-            "max_fps": np.max(fps_values),
+            "avg_fps": float(np.mean(fps_values)),
+            "min_fps": float(np.min(fps_values)),
+            "max_fps": float(np.max(fps_values)),
             "tests": len(res),
         }
 
     return summary
 
 
+DEFAULT_CACHE_PATH = Path.home() / ".mainline_benchmark_cache.json"
+
+
+def load_baseline(cache_path: Path | None = None) -> dict[str, Any] | None:
+    """Load the baseline dict from the JSON cache; None if missing or unreadable."""
+    path = cache_path or DEFAULT_CACHE_PATH  # fall back to the default cache file
+    if not path.exists():
+        return None
+    try:
+        with open(path) as f:
+            return json.load(f)
+    except Exception:  # any read/parse failure is treated as "no baseline"
+        return None
+
+
+def save_baseline(
+    results: list[BenchmarkResult],
+    cache_path: Path | None = None,
+) -> None:
+    """Write each result's fps, avg_time_ms and chars_per_sec to the cache as JSON."""
+    path = cache_path or DEFAULT_CACHE_PATH  # fall back to the default cache file
+    baseline = {
+        "timestamp": datetime.now().isoformat(),  # naive local time (no tz given)
+        "results": {
+            r.name: {  # keyed by result name; a repeated name overwrites the earlier one
+                "fps": r.fps,
+                "avg_time_ms": r.avg_time_ms,
+                "chars_per_sec": r.chars_per_sec,
+            }
+            for r in results
+        },
+    }
+    path.parent.mkdir(parents=True, exist_ok=True)  # create missing parent dirs
+    with open(path, "w") as f:
+        json.dump(baseline, f, indent=2)
+
+
+def compare_with_baseline(
+    results: list[BenchmarkResult],
+    baseline: dict[str, Any],
+    threshold: float = 0.2,
+    verbose: bool = True,
+) -> tuple[bool, list[str]]:
+    """Compare FPS against the baseline; return (passed, failures then warnings)."""
+    baseline_results = baseline.get("results", {})
+    failures = []
+    warnings = []
+
+    for r in results:  # baseline entries with no current result are not checked
+        if r.name not in baseline_results:
+            warnings.append(f"New test: {r.name} (no baseline)")
+            continue
+
+        b = baseline_results[r.name]
+        if b["fps"] == 0:  # skip: the relative change below would divide by zero
+            continue
+
+        degradation = (b["fps"] - r.fps) / b["fps"]  # FPS loss fraction; <0 = faster
+        if degradation > threshold:
+            failures.append(
+                f"{r.name}: FPS degraded {degradation * 100:.1f}% "
+                f"(baseline: {b['fps']:.1f}, current: {r.fps:.1f})"
+            )
+        elif verbose:  # within threshold: printed only when verbose
+            print(f"  {r.name}: {r.fps:.1f} FPS (baseline: {b['fps']:.1f})")
+
+    passed = len(failures) == 0  # warnings alone never fail the comparison
+    messages = []
+    if failures:
+        messages.extend(failures)
+    if warnings:
+        messages.extend(warnings)
+
+    return passed, messages
+
+
+def run_hook_mode(
+    displays: list[tuple[str, Any]] | None = None,
+    effects: list[tuple[str, Any]] | None = None,
+    iterations: int = 20,
+    threshold: float = 0.2,
+    cache_path: Path | None = None,
+    verbose: bool = False,
+) -> int:
+    """Compare a fresh run to the baseline; return 0 on pass, 1 otherwise."""
+    baseline = load_baseline(cache_path)
+
+    if baseline is None:  # load_baseline gives None for a missing or unreadable file
+        print("No baseline found. Run with --baseline to create one.")
+        return 1
+
+    report = run_benchmarks(displays, effects, iterations, verbose)
+
+    passed, messages = compare_with_baseline(
+        report.results, baseline, threshold, verbose
+    )
+
+    print("\n=== Benchmark Hook Results ===")
+    if passed:  # NOTE(review): warnings in messages are not printed on this path
+        print("PASSED - No significant performance degradation")
+        return 0
+    else:
+        print("FAILED - Performance degradation detected:")
+        for msg in messages:  # failures first, then "New test" warnings
+            print(f"  - {msg}")
+        return 1
+
+
 def format_report_text(report: BenchmarkReport) -> str:
     """Format report as human-readable text."""
     lines = [
@@ -391,9 +534,67 @@ def main():
         default="text",
         help="Output format (default: text)",
     )
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Show progress during benchmarking",
+    )
+    parser.add_argument(
+        "--hook",
+        action="store_true",
+        help="Run in hook mode: compare against baseline, exit 0 pass, 1 fail",
+    )
+    parser.add_argument(
+        "--baseline",
+        action="store_true",
+        help="Save current results as baseline for future hook comparisons",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=0.2,
+        help="Performance degradation threshold for hook mode (default: 0.2 = 20%%)",
+    )
+    parser.add_argument(
+        "--cache",
+        type=str,
+        default=None,
+        help="Path to baseline cache file (default: ~/.mainline_benchmark_cache.json)",
+    )
 
     args = parser.parse_args()
 
+    cache_path = Path(args.cache) if args.cache else DEFAULT_CACHE_PATH
+
+    if args.hook:
+        displays = None
+        if args.displays:
+            display_map = dict(get_available_displays())
+            displays = [
+                (name, display_map[name])
+                for name in args.displays.split(",")
+                if name in display_map
+            ]
+
+        effects = None
+        if args.effects:
+            effect_map = dict(get_available_effects())
+            effects = [
+                (name, effect_map[name])
+                for name in args.effects.split(",")
+                if name in effect_map
+            ]
+
+        return run_hook_mode(
+            displays,
+            effects,
+            iterations=args.iterations,
+            threshold=args.threshold,
+            cache_path=cache_path,
+            verbose=args.verbose,
+        )
+
     displays = None
     if args.displays:
         display_map = dict(get_available_displays())
@@ -412,7 +613,12 @@ def main():
             if name in effect_map
         ]
 
-    report = run_benchmarks(displays, effects, args.iterations, args.format)
+    report = run_benchmarks(displays, effects, args.iterations, args.verbose)
+
+    if args.baseline:
+        save_baseline(report.results, cache_path)
+        print(f"Baseline saved to {cache_path}")
+        return 0
 
     if args.format == "json":
         output = format_report_json(report)
@@ -422,10 +628,11 @@ def main():
     if args.output:
         with open(args.output, "w") as f:
             f.write(output)
-        print(f"Report written to {args.output}")
     else:
         print(output)
 
+    return 0
+
 
 if __name__ == "__main__":
-    main()
+    sys.exit(main())
diff --git a/hk.pkl b/hk.pkl
index 155daf6..b8e8a6d 100644
--- a/hk.pkl
+++ b/hk.pkl
@@ -22,6 +22,9 @@ hooks {
                 prefix = "uv run"
                 check = "ruff check engine/ tests/"
             }
+            ["benchmark"] {
+                check = "uv run python -m engine.benchmark --hook --displays null --iterations 20"
+            }
         }
     }
 }