You are a senior code reviewer preparing to review code changes.

## Code Changes

```diff
diff --git a/expert_build/cli.py b/expert_build/cli.py
index 2cb73b9..5fab97b 100644
--- a/expert_build/cli.py
+++ b/expert_build/cli.py
@@ -182,6 +182,11 @@ def main():
     except Exception as e:
         print(f"error: {e}", file=sys.stderr)
         sys.exit(1)
+    finally:
+        from .llm import format_cost_summary
+        cost = format_cost_summary()
+        if cost:
+            print(f"\n{cost}", file=sys.stderr)
 
 
 if __name__ == "__main__":
diff --git a/expert_build/llm.py b/expert_build/llm.py
index 1ca48b2..8d7ecbc 100644
--- a/expert_build/llm.py
+++ b/expert_build/llm.py
@@ -1,4 +1,8 @@
-"""Model invocation for expert agent builder."""
+"""Model invocation for expert agent builder.
+
+Cost tracking: CLI models use --output-format json to capture token
+counts and costs. Use get_cost_summary() to retrieve accumulated stats.
+"""
 
 import asyncio
 import json
@@ -6,12 +10,101 @@
 import shutil
 
 MODEL_COMMANDS: dict[str, list[str]] = {
-    "claude": ["claude", "-p"],
-    "gemini": ["gemini", "-p", ""],
+    "claude": ["claude", "-p", "--output-format", "json"],
+    "gemini": ["gemini", "--skip-trust", "-o", "json", "-p", ""],
 }
 
 DEFAULT_TIMEOUT = 300
 
+_cost_tracker = {
+    "calls": 0,
+    "input_tokens": 0,
+    "output_tokens": 0,
+    "total_cost_usd": 0.0,
+    "by_model": {},
+}
+
+
+def reset_cost_tracker():
+    """Reset accumulated cost/token stats."""
+    _cost_tracker["calls"] = 0
+    _cost_tracker["input_tokens"] = 0
+    _cost_tracker["output_tokens"] = 0
+    _cost_tracker["total_cost_usd"] = 0.0
+    _cost_tracker["by_model"] = {}
+
+
+def get_cost_summary() -> dict:
+    """Return accumulated cost/token stats across all LLM calls."""
+    return dict(_cost_tracker)
+
+
+def format_cost_summary() -> str:
+    """Format cost summary as a human-readable string."""
+    s = _cost_tracker
+    if s["calls"] == 0:
+        return ""
+    parts = []
+    if s["total_cost_usd"] > 0:
+        parts.append(f"${s['total_cost_usd']:.4f}")
+    parts.append(f"{s['input_tokens']:,} input + {s['output_tokens']:,} output tokens")
+    parts.append(f"{s['calls']} call(s)")
+    return "Cost: " + " | ".join(parts)
+
+
+def _record_cost(model: str, input_tokens: int, output_tokens: int, cost_usd: float):
+    """Record token/cost stats from one LLM call."""
+    _cost_tracker["calls"] += 1
+    _cost_tracker["input_tokens"] += input_tokens
+    _cost_tracker["output_tokens"] += output_tokens
+    _cost_tracker["total_cost_usd"] += cost_usd
+
+    if model not in _cost_tracker["by_model"]:
+        _cost_tracker["by_model"][model] = {
+            "calls": 0, "input_tokens": 0, "output_tokens": 0, "total_cost_usd": 0.0,
+        }
+    m = _cost_tracker["by_model"][model]
+    m["calls"] += 1
+    m["input_tokens"] += input_tokens
+    m["output_tokens"] += output_tokens
+    m["total_cost_usd"] += cost_usd
+
+
+def _parse_cli_json(output: str, model: str) -> str:
+    """Parse JSON output from CLI, extract response text and record costs.
+
+    Falls back to returning raw output if JSON parsing fails.
+    """
+    try:
+        data = json.loads(output)
+    except (json.JSONDecodeError, ValueError):
+        return output
+
+    if not isinstance(data, dict):
+        return output
+
+    if model.startswith("gemini"):
+        text = data.get("response") or output
+        stats = data.get("stats", {})
+        input_tokens = 0
+        output_tokens = 0
+        for model_stats in stats.get("models", {}).values():
+            tokens = model_stats.get("tokens", {})
+            input_tokens += tokens.get("input", 0)
+            output_tokens += tokens.get("candidates", 0)
+        _record_cost(model, input_tokens, output_tokens, 0.0)
+        return text
+
+    text = data.get("result") or output
+    usage = data.get("usage", {})
+    input_tokens = (usage.get("input_tokens", 0)
+                    + usage.get("cache_creation_input_tokens", 0)
+                    + usage.get("cache_read_input_tokens", 0))
+    output_tokens = usage.get("output_tokens", 0)
+    cost_usd = data.get("total_cost_usd", 0.0)
+    _record_cost(model, input_tokens, output_tokens, cost_usd)
+    return text
+
 
 def check_model_available(model: str) -> bool:
     """Check if a model's CLI is available."""
@@ -22,7 +115,11 @@ def check_model_available(model: str) -> bool:
 
 
 async def invoke(prompt: str, model: str = "claude", timeout: int = DEFAULT_TIMEOUT) -> str:
-    """Invoke model via CLI, piping prompt through stdin."""
+    """Invoke model via CLI, piping prompt through stdin.
+
+    Uses --output-format json to capture token/cost data.
+    Accumulated stats available via get_cost_summary().
+    """
     if model not in MODEL_COMMANDS:
         raise ValueError(f"Unknown model: {model}. Available: {list(MODEL_COMMANDS.keys())}")
 
@@ -51,7 +148,7 @@ async def invoke(prompt: str, model: str = "claude", timeout: int = DEFAULT_TIME
     if proc.returncode != 0:
         raise RuntimeError(f"Model {model} failed: {stderr.decode()}")
 
-    return stdout.decode()
+    return _parse_cli_json(stdout.decode(), model)
 
 
 def invoke_sync(prompt: str, model: str = "claude", timeout: int = DEFAULT_TIMEOUT) -> str:
diff --git a/tests/test_llm.py b/tests/test_llm.py
new file mode 100644
index 0000000..3f41900
--- /dev/null
+++ b/tests/test_llm.py
@@ -0,0 +1,161 @@
+"""Tests for expert_build.llm — cost tracking and JSON output parsing."""
+
+import json
+
+import pytest
+
+from expert_build.llm import (
+    _parse_cli_json,
+    _record_cost,
+    reset_cost_tracker,
+    get_cost_summary,
+    format_cost_summary,
+)
+
+
+@pytest.fixture(autouse=True)
+def clean_tracker():
+    """Reset cost tracker before each test."""
+    reset_cost_tracker()
+    yield
+    reset_cost_tracker()
+
+
+# --- _parse_cli_json ---
+
+def test_parse_claude_json():
+    data = {
+        "result": "The answer is 4.",
+        "total_cost_usd": 0.21,
+        "usage": {
+            "input_tokens": 100,
+            "output_tokens": 10,
+            "cache_creation_input_tokens": 500,
+            "cache_read_input_tokens": 0,
+        },
+    }
+    text = _parse_cli_json(json.dumps(data), "claude")
+    assert text == "The answer is 4."
+    s = get_cost_summary()
+    assert s["calls"] == 1
+    assert s["input_tokens"] == 600
+    assert s["output_tokens"] == 10
+    assert s["total_cost_usd"] == 0.21
+
+
+def test_parse_gemini_json():
+    data = {
+        "response": "4",
+        "stats": {
+            "models": {
+                "gemini-2.5-flash": {
+                    "tokens": {"input": 200, "candidates": 5, "total": 205},
+                },
+            },
+        },
+    }
+    text = _parse_cli_json(json.dumps(data), "gemini")
+    assert text == "4"
+    s = get_cost_summary()
+    assert s["calls"] == 1
+    assert s["input_tokens"] == 200
+    assert s["output_tokens"] == 5
+    assert s["total_cost_usd"] == 0.0
+
+
+def test_parse_non_json_falls_back():
+    text = _parse_cli_json("Just plain text", "claude")
+    assert text == "Just plain text"
+    s = get_cost_summary()
+    assert s["calls"] == 0
+
+
+def test_parse_claude_null_result_falls_back():
+    data = {"result": None, "total_cost_usd": 0.01, "usage": {"input_tokens": 10, "output_tokens": 0}}
+    raw = json.dumps(data)
+    text = _parse_cli_json(raw, "claude")
+    assert text == raw
+    s = get_cost_summary()
+    assert s["calls"] == 1
+
+
+def test_parse_non_dict_json_falls_back():
+    text = _parse_cli_json("[1, 2, 3]", "claude")
+    assert text == "[1, 2, 3]"
+    s = get_cost_summary()
+    assert s["calls"] == 0
+
+
+def test_parse_gemini_multi_model():
+    data = {
+        "response": "answer",
+        "stats": {
+            "models": {
+                "gemini-2.5-flash-lite": {
+                    "tokens": {"input": 100, "candidates": 10},
+                },
+                "gemini-3-flash-preview": {
+                    "tokens": {"input": 500, "candidates": 20},
+                },
+            },
+        },
+    }
+    text = _parse_cli_json(json.dumps(data), "gemini")
+    assert text == "answer"
+    s = get_cost_summary()
+    assert s["input_tokens"] == 600
+    assert s["output_tokens"] == 30
+
+
+# --- cost accumulation ---
+
+def test_accumulates_across_calls():
+    _record_cost("claude", 100, 10, 0.10)
+    _record_cost("claude", 200, 20, 0.20)
+    s = get_cost_summary()
+    assert s["calls"] == 2
+    assert s["input_tokens"] == 300
+    assert s["output_tokens"] == 30
+    assert abs(s["total_cost_usd"] - 0.30) < 0.001
+
+
+def test_tracks_by_model():
+    _record_cost("claude", 100, 10, 0.10)
+    _record_cost("gemini", 200, 20, 0.0)
+    s = get_cost_summary()
+    assert s["by_model"]["claude"]["calls"] == 1
+    assert s["by_model"]["gemini"]["calls"] == 1
+    assert s["by_model"]["claude"]["total_cost_usd"] == 0.10
+
+
+def test_reset_clears_all():
+    _record_cost("claude", 100, 10, 0.10)
+    reset_cost_tracker()
+    s = get_cost_summary()
+    assert s["calls"] == 0
+    assert s["input_tokens"] == 0
+    assert s["by_model"] == {}
+
+
+# --- format_cost_summary ---
+
+def test_format_no_calls():
+    assert format_cost_summary() == ""
+
+
+def test_format_with_cost():
+    _record_cost("claude", 1000, 50, 0.1234)
+    result = format_cost_summary()
+    assert "Cost:" in result
+    assert "$0.1234" in result
+    assert "1,000 input" in result
+    assert "50 output" in result
+    assert "1 call(s)" in result
+
+
+def test_format_without_cost():
+    _record_cost("gemini", 500, 25, 0.0)
+    result = format_cost_summary()
+    assert "Cost:" in result
+    assert "$" not in result
+    assert "500 input" in result

```

## Your Task

Analyze the diff and identify what additional information you need to render confident verdicts.
Do NOT render verdicts yet. Only request observations.

## Available Observation Tools

| Tool | Purpose | When to use |
|------|---------|-------------|
| `exception_hierarchy` | Show exception MRO and subclasses | Retry logic, exception handling |
| `raises_analysis` | What exceptions a function raises | New function calls, error paths |
| `call_graph` | What a function calls | Impact analysis |
| `find_usages` | Where a symbol is used (with prod/test split) | Quick integration lookup |
| `find_callers` | Caller analysis with prod/test split and calling context | Method signature changes, return type changes, constructor modifications, integration verification |
| `test_coverage` | Find tests for a file (uses coverage-map if available) | Test coverage claims |
| `coverage_map_tests` | Find tests covering a file (from coverage-map.json) | Precise test coverage from actual execution |
| `coverage_map_files` | Find files covered by tests matching a pattern | Impact analysis for test changes |
| `function_body` | Full source of a function/method | Need complete function context beyond diff hunks |
| `file_imports` | Extract imports from a file | Verify import changes, check dependencies |
| `project_dependencies` | Get pyproject.toml/requirements.txt | Verify new imports have dependencies |
| `related_test_files` | Find test files for a source file | Discover tests by naming, imports, and coverage map |
| `class_hierarchy` | Show base classes and their `__init__` signatures | Class changes its parent, modifies `__init__`, or uses `super()` |
| `symbol_migration` | Check if a rename is complete across the repo | Symbol renamed in diff — verify old name is fully removed |
| `generator_info` | Report whether a function uses `yield` | Function might be a generator — affects return value semantics |

## What to Look For

1. **Exception handling**: Any `retry_if_exception_type`, `except`, or exception class references
2. **New dependencies**: Calls to external libraries where you don't know the error behavior
3. **Behavioral changes**: Modified logic where you need to verify callers/callees
4. **Test claims**: References to tests you can't see in the diff
5. **Inheritance changes**: Class definition changes, new base classes, `super()` calls
6. **Renames**: Symbols that appear to have been renamed in the diff
7. **Factory methods**: Calls to `@classmethod` / `@staticmethod` constructors (e.g. `Result.error(...)`) — request `function_body` to see their implementation

## Output Format

Output a JSON array of observation requests:

```json
[
  {"name": "descriptive_name", "tool": "tool_name", "params": {"param": "value"}},
  ...
]
```

If you don't need any observations (for example, the changes are simple and all the context you need is already in the diff), output:

```json
[]
```

## Examples

For a diff containing `retry_if_exception_type((OSError, httpx.TransportError))`:
```json
[
  {"name": "oserror_subclasses", "tool": "exception_hierarchy", "params": {"class_name": "builtins.OSError"}},
  {"name": "transport_errors", "tool": "exception_hierarchy", "params": {"class_name": "httpx.TransportError"}}
]
```

For a diff adding a new function that calls `oauth_client.get_access_token()`:
```json
[
  {"name": "oauth_exceptions", "tool": "raises_analysis", "params": {"file_path": "src/auth/oauth.py", "function_name": "get_access_token"}}
]
```

For a diff modifying a method but you need the full function to verify:
```json
[
  {"name": "full_getattr", "tool": "function_body", "params": {"file_path": "src/proxy.py", "function_name": "__getattr__"}}
]
```

For a diff changing a method signature or return type (verify all callers):
```json
[
  {"name": "handle_request_callers", "tool": "find_callers", "params": {"symbol": "handle_request"}}
]
```

For a diff adding new imports (e.g., `import httpx`):
```json
[
  {"name": "file_imports", "tool": "file_imports", "params": {"file_path": "src/client.py"}},
  {"name": "project_deps", "tool": "project_dependencies", "params": {}}
]
```

For a diff calling a factory method like `ModuleResult.error_result(msg)`:
```json
[
  {"name": "error_result_body", "tool": "function_body", "params": {"file_path": "src/models.py", "function_name": "error_result"}}
]
```

For a diff where a class changes its parent class:
```json
[
  {"name": "client_hierarchy", "tool": "class_hierarchy", "params": {"class_name": "MyClient", "file_path": "src/client.py"}}
]
```

For a diff that renames a symbol (e.g., `OldClient` to `NewClient`):
```json
[
  {"name": "client_rename", "tool": "symbol_migration", "params": {"old_name": "OldClient", "new_name": "NewClient"}}
]
```

For a diff modifying a function that might be a generator:
```json
[
  {"name": "process_gen", "tool": "generator_info", "params": {"file_path": "src/pipeline.py", "function_name": "process_items"}}
]
```

Now analyze the diff above and output your observation requests as JSON:
