You are a senior code reviewer preparing to review code changes.

## Code Changes

```diff
diff --git a/expert_build/exam.py b/expert_build/exam.py
index 74340a1..554bdb6 100644
--- a/expert_build/exam.py
+++ b/expert_build/exam.py
@@ -1,5 +1,6 @@
 """Practice exam runner for nogood discovery."""
 
+import json
 import re
 import sys
 from pathlib import Path
@@ -102,25 +103,50 @@ def load_beliefs_for_context(db_path: str = REASONS_DB) -> str:
     return "\n".join(beliefs)
 
 
-def extract_answer(response: str) -> str:
-    """Extract the answer from LLM response."""
-    # Look for ANSWER: line
-    match = re.search(r"ANSWER:\s*(.+)", response, re.IGNORECASE)
-    if match:
-        ans = match.group(1).strip()
-        # If it's a letter choice, extract just the letter
-        letter_match = re.match(r"([a-d])[.):\s]", ans, re.IGNORECASE)
-        if letter_match:
-            return letter_match.group(1).lower()
-        return ans
-
-    # Fallback: look for a single letter on its own
-    lines = response.strip().split("\n")
-    for line in lines:
-        line = line.strip()
-        if re.match(r"^[a-d]$", line, re.IGNORECASE):
-            return line.lower()
+RETRY_JSON = "Your response was not valid JSON. Respond with ONLY the JSON object, no other text."
 
+
+def _extract_json(response: str) -> dict | None:
+    """Extract a JSON object from an LLM response."""
+    text = response.strip()
+    if text.startswith("```"):
+        lines = text.split("\n")
+        lines = [l for l in lines if not l.strip().startswith("```")]
+        text = "\n".join(lines).strip()
+    try:
+        return json.loads(text)
+    except (json.JSONDecodeError, ValueError):
+        pass
+    start = text.find("{")
+    end = text.rfind("}")
+    if start != -1 and end > start:
+        try:
+            return json.loads(text[start:end + 1])
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return None
+
+
+def extract_answer(response: str, model: str = None, prompt: str = None) -> str:
+    """Extract answer from JSON LLM response, retrying on parse failure."""
+    data = _extract_json(response)
+    if data and "answer" in data:
+        return str(data["answer"]).strip()
+
+    if model and prompt:
+        print("    WARN: response not valid JSON, retrying...", file=sys.stderr)
+        try:
+            retry_response = invoke_sync(
+                prompt + "\n\n" + response + "\n\n" + RETRY_JSON,
+                model=model, timeout=60,
+            )
+            data = _extract_json(retry_response)
+            if data and "answer" in data:
+                return str(data["answer"]).strip()
+        except Exception:
+            pass
+
+    print("    WARN: could not parse answer JSON", file=sys.stderr)
     return response.strip()[:100]
 
 
@@ -132,16 +158,26 @@ def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bo
     except Exception:
         return False, "judge error"
 
-    verdict_match = re.search(r"VERDICT:\s*(CORRECT|WRONG)", response, re.IGNORECASE)
-    if not verdict_match:
-        return False, "no verdict"
+    data = _extract_json(response)
+    if data and "verdict" in data:
+        is_correct = data["verdict"].strip().upper() == "CORRECT"
+        return is_correct, data.get("explanation", "")
+
+    print("    WARN: verdict not valid JSON, retrying...", file=sys.stderr)
+    try:
+        retry_response = invoke_sync(
+            prompt + "\n\n" + response + "\n\n" + RETRY_JSON,
+            model=model, timeout=60,
+        )
+        data = _extract_json(retry_response)
+        if data and "verdict" in data:
+            is_correct = data["verdict"].strip().upper() == "CORRECT"
+            return is_correct, data.get("explanation", "")
+    except Exception:
+        pass
 
-    is_correct = verdict_match.group(1).upper() == "CORRECT"
-    explanation = ""
-    exp_match = re.search(r"EXPLANATION:\s*(.+)", response, re.IGNORECASE)
-    if exp_match:
-        explanation = exp_match.group(1).strip()
-    return is_correct, explanation
+    print("    WARN: could not parse verdict JSON", file=sys.stderr)
+    return False, "no verdict"
 
 
 def cmd_exam(args):
@@ -194,7 +230,7 @@ def cmd_exam(args):
             results.append({"question": q, "status": "ERROR", "error": str(e)})
             continue
 
-        answer = extract_answer(response)
+        answer = extract_answer(response, model=args.model, prompt=prompt)
         expected = q["correct"].strip().lower()
         use_judge = not getattr(args, 'no_judge', False)
 
diff --git a/expert_build/prompts.py b/expert_build/prompts.py
index 9110d81..d3c6d30 100644
--- a/expert_build/prompts.py
+++ b/expert_build/prompts.py
@@ -99,9 +99,8 @@
 Question: {question}
 {choices}
 
-Provide your answer and brief explanation. Format:
-ANSWER: <letter or text>
-EXPLANATION: <one paragraph>
+Respond with ONLY this JSON (no other text):
+{{"answer": "<letter or short answer>", "explanation": "<one paragraph>"}}
 """
 
 EXAM_JUDGE = """\
@@ -120,9 +119,8 @@
 different but valid approaches should count as correct. Missing key facts or \
 fundamentally wrong reasoning should count as wrong.
 
-Format your response as:
-VERDICT: CORRECT or WRONG
-EXPLANATION: <one sentence>
+Respond with ONLY this JSON (no other text):
+{{"verdict": "CORRECT or WRONG", "explanation": "<one sentence>"}}
 """
 
 CERT_MATCH = """\
diff --git a/tests/test_exam.py b/tests/test_exam.py
new file mode 100644
index 0000000..a8d9240
--- /dev/null
+++ b/tests/test_exam.py
@@ -0,0 +1,159 @@
+"""Tests for expert_build.exam — JSON answer/verdict parsing and retry."""
+
+from unittest.mock import patch
+
+from expert_build.exam import extract_answer, judge_answer, _extract_json
+
+
+# --- _extract_json ---
+
+def test_extract_json_plain():
+    assert _extract_json('{"answer": "b", "explanation": "because"}') == {
+        "answer": "b", "explanation": "because"
+    }
+
+
+def test_extract_json_with_code_fence():
+    response = '```json\n{"answer": "c", "explanation": "reason"}\n```'
+    assert _extract_json(response) == {"answer": "c", "explanation": "reason"}
+
+
+def test_extract_json_embedded_in_text():
+    response = 'Here is my answer:\n{"answer": "a", "explanation": "yes"}\nDone.'
+    result = _extract_json(response)
+    assert result["answer"] == "a"
+
+
+def test_extract_json_braces_in_value():
+    response = 'Sure: {"answer": "b", "explanation": "use {braces} here"}'
+    result = _extract_json(response)
+    assert result["answer"] == "b"
+    assert "{braces}" in result["explanation"]
+
+
+def test_extract_json_invalid():
+    assert _extract_json("No JSON here at all") is None
+
+
+def test_extract_json_truncated():
+    assert _extract_json('{"answer": "b", "explan') is None
+
+
+# --- extract_answer ---
+
+def test_extract_answer_from_json():
+    response = '{"answer": "b", "explanation": "because"}'
+    assert extract_answer(response) == "b"
+
+
+def test_extract_answer_strips_whitespace():
+    response = '{"answer": "  d  ", "explanation": "reason"}'
+    assert extract_answer(response) == "d"
+
+
+def test_extract_answer_retries_on_bad_json():
+    bad_response = "I think the answer is B because of reasons"
+
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"answer": "b", "explanation": "reasons"}') as mock_llm:
+        result = extract_answer(bad_response, model="test", prompt="original prompt")
+
+    assert result == "b"
+    assert mock_llm.called
+
+
+def test_extract_answer_fallback_after_failed_retry():
+    bad_response = "No format at all"
+
+    with patch("expert_build.exam.invoke_sync", return_value="Still no format"):
+        result = extract_answer(bad_response, model="test", prompt="original prompt")
+
+    assert result == "No format at all"
+
+
+def test_extract_answer_no_retry_without_model():
+    result = extract_answer("No JSON here")
+    assert result == "No JSON here"
+
+
+def test_extract_answer_code_fence():
+    response = '```json\n{"answer": "a", "explanation": "yes"}\n```'
+    assert extract_answer(response) == "a"
+
+
+# --- judge_answer ---
+
+def test_judge_correct():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "CORRECT", "explanation": "matches"}'):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True
+    assert explanation == "matches"
+
+
+def test_judge_wrong():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "WRONG", "explanation": "missed key point"}'):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "missed key point"
+
+
+def test_judge_retries_on_bad_json():
+    call_count = 0
+    def side_effect(prompt, model=None, timeout=None):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:
+            return "I think this is correct because it matches"
+        return '{"verdict": "CORRECT", "explanation": "matches expected"}'
+
+    with patch("expert_build.exam.invoke_sync", side_effect=side_effect):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True
+    assert call_count == 2
+
+
+def test_judge_fallback_after_failed_retry():
+    with patch("expert_build.exam.invoke_sync", return_value="No JSON at all"):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "no verdict"
+
+
+def test_judge_handles_llm_error():
+    with patch("expert_build.exam.invoke_sync",
+               side_effect=RuntimeError("timeout")):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "judge error"
+
+
+def test_judge_retry_itself_raises():
+    call_count = 0
+    def side_effect(prompt, model=None, timeout=None):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:
+            return "Not JSON"
+        raise RuntimeError("retry timeout")
+
+    with patch("expert_build.exam.invoke_sync", side_effect=side_effect):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "no verdict"
+    assert call_count == 2
+
+
+def test_judge_case_insensitive_verdict():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "correct", "explanation": "ok"}'):
+        is_correct, _ = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True

```

## Your Task

Analyze the diff and identify what additional information you need to render confident verdicts.
Do NOT render verdicts yet. Only request observations.

## Available Observation Tools

| Tool | Purpose | When to use |
|------|---------|-------------|
| `exception_hierarchy` | Show exception MRO and subclasses | Retry logic, exception handling |
| `raises_analysis` | What exceptions a function raises | New function calls, error paths |
| `call_graph` | What a function calls | Impact analysis |
| `find_usages` | Where a symbol is used (with prod/test split) | Quick integration lookup |
| `find_callers` | Caller analysis with prod/test split and calling context | Method signature changes, return type changes, constructor modifications, integration verification |
| `test_coverage` | Find tests for a file (uses coverage-map if available) | Test coverage claims |
| `coverage_map_tests` | Find tests covering a file (from coverage-map.json) | Precise test coverage from actual execution |
| `coverage_map_files` | Find files covered by tests matching a pattern | Impact analysis for test changes |
| `function_body` | Full source of a function/method | Need complete function context beyond diff hunks |
| `file_imports` | Extract imports from a file | Verify import changes, check dependencies |
| `project_dependencies` | Get pyproject.toml/requirements.txt | Verify new imports have dependencies |
| `related_test_files` | Find test files for a source file | Discover tests by naming, imports, and coverage map |
| `class_hierarchy` | Show base classes and their `__init__` signatures | Class changes its parent, modifies `__init__`, or uses `super()` |
| `symbol_migration` | Check if a rename is complete across the repo | Symbol renamed in diff — verify old name is fully removed |
| `generator_info` | Report whether a function uses `yield` | Function might be a generator — affects return value semantics |

## What to Look For

1. **Exception handling**: Any `retry_if_exception_type`, `except`, or exception class references
2. **New dependencies**: Calls to external libraries where you don't know the error behavior
3. **Behavioral changes**: Modified logic where you need to verify callers/callees
4. **Test claims**: References to tests you can't see in the diff
5. **Inheritance changes**: Class definition changes, new base classes, `super()` calls
6. **Renames**: Symbols that appear to have been renamed in the diff
7. **Factory methods**: Calls to `@classmethod` / `@staticmethod` constructors (e.g. `Result.error(...)`) — request `function_body` to see their implementation

## Output Format

Output a JSON array of observation requests. Each request is an object with a descriptive `name`, the `tool` to run, and that tool's `params`:

```json
[
  {"name": "descriptive_name", "tool": "tool_name", "params": {"param": "value"}},
  ...
]
```

If you don't need any observations (for example, the changes are simple and all the context you need is already in the diff), output an empty array:

```json
[]
```

## Examples

For a diff containing `retry_if_exception_type((OSError, httpx.TransportError))`:
```json
[
  {"name": "oserror_subclasses", "tool": "exception_hierarchy", "params": {"class_name": "builtins.OSError"}},
  {"name": "transport_errors", "tool": "exception_hierarchy", "params": {"class_name": "httpx.TransportError"}}
]
```

For a diff adding a new function that calls `oauth_client.get_access_token()`:
```json
[
  {"name": "oauth_exceptions", "tool": "raises_analysis", "params": {"file_path": "src/auth/oauth.py", "function_name": "get_access_token"}}
]
```

For a diff modifying a method but you need the full function to verify:
```json
[
  {"name": "full_getattr", "tool": "function_body", "params": {"file_path": "src/proxy.py", "function_name": "__getattr__"}}
]
```

For a diff changing a method signature or return type (verify all callers):
```json
[
  {"name": "handle_request_callers", "tool": "find_callers", "params": {"symbol": "handle_request"}}
]
```

For a diff adding new imports (e.g., `import httpx`):
```json
[
  {"name": "file_imports", "tool": "file_imports", "params": {"file_path": "src/client.py"}},
  {"name": "project_deps", "tool": "project_dependencies", "params": {}}
]
```

For a diff calling a factory method like `ModuleResult.error_result(msg)`:
```json
[
  {"name": "error_result_body", "tool": "function_body", "params": {"file_path": "src/models.py", "function_name": "error_result"}}
]
```

For a diff where a class changes its parent class:
```json
[
  {"name": "client_hierarchy", "tool": "class_hierarchy", "params": {"class_name": "MyClient", "file_path": "src/client.py"}}
]
```

For a diff that renames a symbol (e.g., `OldClient` to `NewClient`):
```json
[
  {"name": "client_rename", "tool": "symbol_migration", "params": {"old_name": "OldClient", "new_name": "NewClient"}}
]
```

For a diff modifying a function that might be a generator:
```json
[
  {"name": "process_gen", "tool": "generator_info", "params": {"file_path": "src/pipeline.py", "function_name": "process_items"}}
]
```

Now analyze the diff above and output your observation requests as JSON:
