You are a senior code reviewer. Review the following code changes.

## Specification

No specification provided. Focus on correctness, tests, and integration.





## Code Changes

```diff
diff --git a/expert_build/exam.py b/expert_build/exam.py
index 74340a1..968cd74 100644
--- a/expert_build/exam.py
+++ b/expert_build/exam.py
@@ -1,5 +1,6 @@
 """Practice exam runner for nogood discovery."""
 
+import json
 import re
 import sys
 from pathlib import Path
@@ -102,25 +103,49 @@ def load_beliefs_for_context(db_path: str = REASONS_DB) -> str:
     return "\n".join(beliefs)
 
 
-def extract_answer(response: str) -> str:
-    """Extract the answer from LLM response."""
-    # Look for ANSWER: line
-    match = re.search(r"ANSWER:\s*(.+)", response, re.IGNORECASE)
+RETRY_JSON = "Your response was not valid JSON. Respond with ONLY the JSON object, no other text."
+
+
+def _extract_json(response: str) -> dict | None:
+    """Extract a JSON object from an LLM response."""
+    text = response.strip()
+    if text.startswith("```"):
+        lines = text.split("\n")
+        lines = [l for l in lines if not l.strip().startswith("```")]
+        text = "\n".join(lines).strip()
+    try:
+        return json.loads(text)
+    except (json.JSONDecodeError, ValueError):
+        pass
+    match = re.search(r"\{[^{}]+\}", text)
     if match:
-        ans = match.group(1).strip()
-        # If it's a letter choice, extract just the letter
-        letter_match = re.match(r"([a-d])[.):\s]", ans, re.IGNORECASE)
-        if letter_match:
-            return letter_match.group(1).lower()
-        return ans
-
-    # Fallback: look for a single letter on its own
-    lines = response.strip().split("\n")
-    for line in lines:
-        line = line.strip()
-        if re.match(r"^[a-d]$", line, re.IGNORECASE):
-            return line.lower()
+        try:
+            return json.loads(match.group())
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return None
 
+
+def extract_answer(response: str, model: str = None, prompt: str = None) -> str:
+    """Extract answer from JSON LLM response, retrying on parse failure."""
+    data = _extract_json(response)
+    if data and "answer" in data:
+        return data["answer"].strip()
+
+    if model and prompt:
+        print("    WARN: response not valid JSON, retrying...", file=sys.stderr)
+        try:
+            retry_response = invoke_sync(
+                prompt + "\n\n" + response + "\n\n" + RETRY_JSON,
+                model=model, timeout=60,
+            )
+            data = _extract_json(retry_response)
+            if data and "answer" in data:
+                return data["answer"].strip()
+        except Exception:
+            pass
+
+    print("    WARN: could not parse answer JSON", file=sys.stderr)
     return response.strip()[:100]
 
 
@@ -132,16 +157,26 @@ def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bo
     except Exception:
         return False, "judge error"
 
-    verdict_match = re.search(r"VERDICT:\s*(CORRECT|WRONG)", response, re.IGNORECASE)
-    if not verdict_match:
-        return False, "no verdict"
+    data = _extract_json(response)
+    if data and "verdict" in data:
+        is_correct = data["verdict"].strip().upper() == "CORRECT"
+        return is_correct, data.get("explanation", "")
+
+    print("    WARN: verdict not valid JSON, retrying...", file=sys.stderr)
+    try:
+        retry_response = invoke_sync(
+            prompt + "\n\n" + response + "\n\n" + RETRY_JSON,
+            model=model, timeout=60,
+        )
+        data = _extract_json(retry_response)
+        if data and "verdict" in data:
+            is_correct = data["verdict"].strip().upper() == "CORRECT"
+            return is_correct, data.get("explanation", "")
+    except Exception:
+        pass
 
-    is_correct = verdict_match.group(1).upper() == "CORRECT"
-    explanation = ""
-    exp_match = re.search(r"EXPLANATION:\s*(.+)", response, re.IGNORECASE)
-    if exp_match:
-        explanation = exp_match.group(1).strip()
-    return is_correct, explanation
+    print("    WARN: could not parse verdict JSON", file=sys.stderr)
+    return False, "no verdict"
 
 
 def cmd_exam(args):
@@ -194,7 +229,7 @@ def cmd_exam(args):
             results.append({"question": q, "status": "ERROR", "error": str(e)})
             continue
 
-        answer = extract_answer(response)
+        answer = extract_answer(response, model=args.model, prompt=prompt)
         expected = q["correct"].strip().lower()
         use_judge = not getattr(args, 'no_judge', False)
 
diff --git a/expert_build/prompts.py b/expert_build/prompts.py
index 9110d81..d3c6d30 100644
--- a/expert_build/prompts.py
+++ b/expert_build/prompts.py
@@ -99,9 +99,8 @@
 Question: {question}
 {choices}
 
-Provide your answer and brief explanation. Format:
-ANSWER: <letter or text>
-EXPLANATION: <one paragraph>
+Respond with ONLY this JSON (no other text):
+{{"answer": "<letter or short answer>", "explanation": "<one paragraph>"}}
 """
 
 EXAM_JUDGE = """\
@@ -120,9 +119,8 @@
 different but valid approaches should count as correct. Missing key facts or \
 fundamentally wrong reasoning should count as wrong.
 
-Format your response as:
-VERDICT: CORRECT or WRONG
-EXPLANATION: <one sentence>
+Respond with ONLY this JSON (no other text):
+{{"verdict": "CORRECT or WRONG", "explanation": "<one sentence>"}}
 """
 
 CERT_MATCH = """\
diff --git a/tests/test_exam.py b/tests/test_exam.py
new file mode 100644
index 0000000..6d06293
--- /dev/null
+++ b/tests/test_exam.py
@@ -0,0 +1,137 @@
+"""Tests for expert_build.exam — JSON answer/verdict parsing and retry."""
+
+from unittest.mock import patch
+
+import pytest
+
+from expert_build.exam import extract_answer, judge_answer, _extract_json
+
+
+# --- _extract_json ---
+
+def test_extract_json_plain():
+    assert _extract_json('{"answer": "b", "explanation": "because"}') == {
+        "answer": "b", "explanation": "because"
+    }
+
+
+def test_extract_json_with_code_fence():
+    response = '```json\n{"answer": "c", "explanation": "reason"}\n```'
+    assert _extract_json(response) == {"answer": "c", "explanation": "reason"}
+
+
+def test_extract_json_embedded_in_text():
+    response = 'Here is my answer:\n{"answer": "a", "explanation": "yes"}\nDone.'
+    result = _extract_json(response)
+    assert result["answer"] == "a"
+
+
+def test_extract_json_invalid():
+    assert _extract_json("No JSON here at all") is None
+
+
+def test_extract_json_truncated():
+    assert _extract_json('{"answer": "b", "explan') is None
+
+
+# --- extract_answer ---
+
+def test_extract_answer_from_json():
+    response = '{"answer": "b", "explanation": "because"}'
+    assert extract_answer(response) == "b"
+
+
+def test_extract_answer_strips_whitespace():
+    response = '{"answer": "  d  ", "explanation": "reason"}'
+    assert extract_answer(response) == "d"
+
+
+def test_extract_answer_retries_on_bad_json():
+    bad_response = "I think the answer is B because of reasons"
+
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"answer": "b", "explanation": "reasons"}') as mock_llm:
+        result = extract_answer(bad_response, model="test", prompt="original prompt")
+
+    assert result == "b"
+    assert mock_llm.called
+
+
+def test_extract_answer_fallback_after_failed_retry():
+    bad_response = "No format at all"
+
+    with patch("expert_build.exam.invoke_sync", return_value="Still no format"):
+        result = extract_answer(bad_response, model="test", prompt="original prompt")
+
+    assert result == "No format at all"
+
+
+def test_extract_answer_no_retry_without_model():
+    result = extract_answer("No JSON here")
+    assert result == "No JSON here"
+
+
+def test_extract_answer_code_fence():
+    response = '```json\n{"answer": "a", "explanation": "yes"}\n```'
+    assert extract_answer(response) == "a"
+
+
+# --- judge_answer ---
+
+def test_judge_correct():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "CORRECT", "explanation": "matches"}'):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True
+    assert explanation == "matches"
+
+
+def test_judge_wrong():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "WRONG", "explanation": "missed key point"}'):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "missed key point"
+
+
+def test_judge_retries_on_bad_json():
+    call_count = 0
+    def side_effect(prompt, model=None, timeout=None):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:
+            return "I think this is correct because it matches"
+        return '{"verdict": "CORRECT", "explanation": "matches expected"}'
+
+    with patch("expert_build.exam.invoke_sync", side_effect=side_effect):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True
+    assert call_count == 2
+
+
+def test_judge_fallback_after_failed_retry():
+    with patch("expert_build.exam.invoke_sync", return_value="No JSON at all"):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "no verdict"
+
+
+def test_judge_handles_llm_error():
+    with patch("expert_build.exam.invoke_sync",
+               side_effect=RuntimeError("timeout")):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "judge error"
+
+
+def test_judge_case_insensitive_verdict():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "correct", "explanation": "ok"}'):
+        is_correct, _ = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True

```

## Observation Results

You previously requested observations. Here are the results:

```json
{
  "judge_answer_full": {
    "function": "judge_answer",
    "file": "expert_build/exam.py",
    "start_line": 152,
    "end_line": 179,
    "source": "def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bool, str]:\n    \"\"\"Use LLM to judge if an open-ended answer is semantically correct.\"\"\"\n    prompt = EXAM_JUDGE.format(question=question, expected=expected, got=got)\n    try:\n        response = invoke_sync(prompt, model=model, timeout=60)\n    except Exception:\n        return False, \"judge error\"\n\n    data = _extract_json(response)\n    if data and \"verdict\" in data:\n        is_correct = data[\"verdict\"].strip().upper() == \"CORRECT\"\n        return is_correct, data.get(\"explanation\", \"\")\n\n    print(\"    WARN: verdict not valid JSON, retrying...\", file=sys.stderr)\n    try:\n        retry_response = invoke_sync(\n            prompt + \"\\n\\n\" + response + \"\\n\\n\" + RETRY_JSON,\n            model=model, timeout=60,\n        )\n        data = _extract_json(retry_response)\n        if data and \"verdict\" in data:\n            is_correct = data[\"verdict\"].strip().upper() == \"CORRECT\"\n            return is_correct, data.get(\"explanation\", \"\")\n    except Exception:\n        pass\n\n    print(\"    WARN: could not parse verdict JSON\", file=sys.stderr)\n    return False, \"no verdict\""
  },
  "invoke_sync_body": {
    "error": "No function found at 'invoke_sync'",
    "file": "expert_build/exam.py"
  },
  "invoke_sync_raises": {
    "function": "invoke_sync",
    "file": "expert_build/exam.py",
    "explicit_raises": [],
    "calls": [],
    "error": "Function 'invoke_sync' not found"
  },
  "extract_answer_callers": {
    "symbol": "extract_answer",
    "production_callers": [
      {
        "file": "expert_build/exam.py",
        "line": 129,
        "text": "def extract_answer(response: str, model: str = None, prompt: str = None) -> str:",
        "context_function": "_extract_json",
        "context_snippet": "   126:     return None\n   127: \n   128: \n>> 129: def extract_answer(response: str, model: str = None, prompt: str = None) -> str:\n   130:     \"\"\"Extract answer from JSON LLM response, retrying on parse failure.\"\"\"\n   131:     data = _extract_json(response)\n   132:     if data and \"answer\" in data:"
      },
      {
        "file": "expert_build/exam.py",
        "line": 232,
        "text": "answer = extract_answer(response, model=args.model, prompt=prompt)",
        "context_function": "cmd_exam",
        "context_snippet": "   229:             results.append({\"question\": q, \"status\": \"ERROR\", \"error\": str(e)})\n   230:             continue\n   231: \n>> 232:         answer = extract_answer(response, model=args.model, prompt=prompt)\n   233:         expected = q[\"correct\"].strip().lower()\n   234:         use_judge = not getattr(args, 'no_judge', False)\n   235: "
      }
    ],
    "test_callers": [
      {
        "file": "tests/test_exam.py",
        "line": 7,
        "text": "from expert_build.exam import extract_answer, judge_answer, _extract_json",
        "context_function": null,
        "context_snippet": "   4: \n   5: import pytest\n   6: \n>> 7: from expert_build.exam import extract_answer, judge_answer, _extract_json\n   8: \n   9: \n   10: # --- _extract_json ---"
      },
      {
        "file": "tests/test_exam.py",
        "line": 37,
        "text": "# --- extract_answer ---",
        "context_function": "test_extract_json_truncated",
        "context_snippet": "   34:     assert _extract_json('{\"answer\": \"b\", \"explan') is None\n   35: \n   36: \n>> 37: # --- extract_answer ---\n   38: \n   39: def test_extract_answer_from_json():\n   40:     response = '{\"answer\": \"b\", \"explanation\": \"because\"}'"
      },
      {
        "file": "tests/test_exam.py",
        "line": 39,
        "text": "def test_extract_answer_from_json():",
        "context_function": "test_extract_json_truncated",
        "context_snippet": "   36: \n   37: # --- extract_answer ---\n   38: \n>> 39: def test_extract_answer_from_json():\n   40:     response = '{\"answer\": \"b\", \"explanation\": \"because\"}'\n   41:     assert extract_answer(response) == \"b\"\n   42: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 41,
        "text": "assert extract_answer(response) == \"b\"",
        "context_function": "test_extract_answer_from_json",
        "context_snippet": "   38: \n   39: def test_extract_answer_from_json():\n   40:     response = '{\"answer\": \"b\", \"explanation\": \"because\"}'\n>> 41:     assert extract_answer(response) == \"b\"\n   42: \n   43: \n   44: def test_extract_answer_strips_whitespace():"
      },
      {
        "file": "tests/test_exam.py",
        "line": 44,
        "text": "def test_extract_answer_strips_whitespace():",
        "context_function": "test_extract_answer_from_json",
        "context_snippet": "   41:     assert extract_answer(response) == \"b\"\n   42: \n   43: \n>> 44: def test_extract_answer_strips_whitespace():\n   45:     response = '{\"answer\": \"  d  \", \"explanation\": \"reason\"}'\n   46:     assert extract_answer(response) == \"d\"\n   47: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 46,
        "text": "assert extract_answer(response) == \"d\"",
        "context_function": "test_extract_answer_strips_whitespace",
        "context_snippet": "   43: \n   44: def test_extract_answer_strips_whitespace():\n   45:     response = '{\"answer\": \"  d  \", \"explanation\": \"reason\"}'\n>> 46:     assert extract_answer(response) == \"d\"\n   47: \n   48: \n   49: def test_extract_answer_retries_on_bad_json():"
      },
      {
        "file": "tests/test_exam.py",
        "line": 49,
        "text": "def test_extract_answer_retries_on_bad_json():",
        "context_function": "test_extract_answer_strips_whitespace",
        "context_snippet": "   46:     assert extract_answer(response) == \"d\"\n   47: \n   48: \n>> 49: def test_extract_answer_retries_on_bad_json():\n   50:     bad_response = \"I think the answer is B because of reasons\"\n   51: \n   52:     with patch(\"expert_build.exam.invoke_sync\","
      },
      {
        "file": "tests/test_exam.py",
        "line": 54,
        "text": "result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")",
        "context_function": "test_extract_answer_retries_on_bad_json",
        "context_snippet": "   51: \n   52:     with patch(\"expert_build.exam.invoke_sync\",\n   53:                return_value='{\"answer\": \"b\", \"explanation\": \"reasons\"}') as mock_llm:\n>> 54:         result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")\n   55: \n   56:     assert result == \"b\"\n   57:     assert mock_llm.called"
      },
      {
        "file": "tests/test_exam.py",
        "line": 60,
        "text": "def test_extract_answer_fallback_after_failed_retry():",
        "context_function": "test_extract_answer_retries_on_bad_json",
        "context_snippet": "   57:     assert mock_llm.called\n   58: \n   59: \n>> 60: def test_extract_answer_fallback_after_failed_retry():\n   61:     bad_response = \"No format at all\"\n   62: \n   63:     with patch(\"expert_build.exam.invoke_sync\", return_value=\"Still no format\"):"
      },
      {
        "file": "tests/test_exam.py",
        "line": 64,
        "text": "result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")",
        "context_function": "test_extract_answer_fallback_after_failed_retry",
        "context_snippet": "   61:     bad_response = \"No format at all\"\n   62: \n   63:     with patch(\"expert_build.exam.invoke_sync\", return_value=\"Still no format\"):\n>> 64:         result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")\n   65: \n   66:     assert result == \"No format at all\"\n   67: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 69,
        "text": "def test_extract_answer_no_retry_without_model():",
        "context_function": "test_extract_answer_fallback_after_failed_retry",
        "context_snippet": "   66:     assert result == \"No format at all\"\n   67: \n   68: \n>> 69: def test_extract_answer_no_retry_without_model():\n   70:     result = extract_answer(\"No JSON here\")\n   71:     assert result == \"No JSON here\"\n   72: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 70,
        "text": "result = extract_answer(\"No JSON here\")",
        "context_function": "test_extract_answer_no_retry_without_model",
        "context_snippet": "   67: \n   68: \n   69: def test_extract_answer_no_retry_without_model():\n>> 70:     result = extract_answer(\"No JSON here\")\n   71:     assert result == \"No JSON here\"\n   72: \n   73: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 74,
        "text": "def test_extract_answer_code_fence():",
        "context_function": "test_extract_answer_no_retry_without_model",
        "context_snippet": "   71:     assert result == \"No JSON here\"\n   72: \n   73: \n>> 74: def test_extract_answer_code_fence():\n   75:     response = '```json\\n{\"answer\": \"a\", \"explanation\": \"yes\"}\\n```'\n   76:     assert extract_answer(response) == \"a\"\n   77: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 76,
        "text": "assert extract_answer(response) == \"a\"",
        "context_function": "test_extract_answer_code_fence",
        "context_snippet": "   73: \n   74: def test_extract_answer_code_fence():\n   75:     response = '```json\\n{\"answer\": \"a\", \"explanation\": \"yes\"}\\n```'\n>> 76:     assert extract_answer(response) == \"a\"\n   77: \n   78: \n   79: # --- judge_answer ---"
      }
    ],
    "production_count": 2,
    "test_count": 14,
    "total_count": 16
  },
  "judge_answer_callers": {
    "symbol": "judge_answer",
    "production_callers": [
      {
        "file": "expert_build/exam.py",
        "line": 152,
        "text": "def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bool, str]:",
        "context_function": "extract_answer",
        "context_snippet": "   149:     return response.strip()[:100]\n   150: \n   151: \n>> 152: def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bool, str]:\n   153:     \"\"\"Use LLM to judge if an open-ended answer is semantically correct.\"\"\"\n   154:     prompt = EXAM_JUDGE.format(question=question, expected=expected, got=got)\n   155:     try:"
      },
      {
        "file": "expert_build/exam.py",
        "line": 241,
        "text": "is_correct, judge_note = judge_answer(",
        "context_function": "cmd_exam",
        "context_snippet": "   238:             is_correct = answer.lower() == expected\n   239:             judge_note = \"\"\n   240:         elif use_judge:\n>> 241:             is_correct, judge_note = judge_answer(\n   242:                 q[\"text\"], q[\"correct\"], response, args.model,\n   243:             )\n   244:         else:"
      }
    ],
    "test_callers": [
      {
        "file": "tests/test_exam.py",
        "line": 7,
        "text": "from expert_build.exam import extract_answer, judge_answer, _extract_json",
        "context_function": null,
        "context_snippet": "   4: \n   5: import pytest\n   6: \n>> 7: from expert_build.exam import extract_answer, judge_answer, _extract_json\n   8: \n   9: \n   10: # --- _extract_json ---"
      },
      {
        "file": "tests/test_exam.py",
        "line": 79,
        "text": "# --- judge_answer ---",
        "context_function": "test_extract_answer_code_fence",
        "context_snippet": "   76:     assert extract_answer(response) == \"a\"\n   77: \n   78: \n>> 79: # --- judge_answer ---\n   80: \n   81: def test_judge_correct():\n   82:     with patch(\"expert_build.exam.invoke_sync\","
      },
      {
        "file": "tests/test_exam.py",
        "line": 84,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_correct",
        "context_snippet": "   81: def test_judge_correct():\n   82:     with patch(\"expert_build.exam.invoke_sync\",\n   83:                return_value='{\"verdict\": \"CORRECT\", \"explanation\": \"matches\"}'):\n>> 84:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   85: \n   86:     assert is_correct is True\n   87:     assert explanation == \"matches\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 93,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_wrong",
        "context_snippet": "   90: def test_judge_wrong():\n   91:     with patch(\"expert_build.exam.invoke_sync\",\n   92:                return_value='{\"verdict\": \"WRONG\", \"explanation\": \"missed key point\"}'):\n>> 93:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   94: \n   95:     assert is_correct is False\n   96:     assert explanation == \"missed key point\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 109,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "side_effect",
        "context_snippet": "   106:         return '{\"verdict\": \"CORRECT\", \"explanation\": \"matches expected\"}'\n   107: \n   108:     with patch(\"expert_build.exam.invoke_sync\", side_effect=side_effect):\n>> 109:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   110: \n   111:     assert is_correct is True\n   112:     assert call_count == 2"
      },
      {
        "file": "tests/test_exam.py",
        "line": 117,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_fallback_after_failed_retry",
        "context_snippet": "   114: \n   115: def test_judge_fallback_after_failed_retry():\n   116:     with patch(\"expert_build.exam.invoke_sync\", return_value=\"No JSON at all\"):\n>> 117:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   118: \n   119:     assert is_correct is False\n   120:     assert explanation == \"no verdict\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 126,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_handles_llm_error",
        "context_snippet": "   123: def test_judge_handles_llm_error():\n   124:     with patch(\"expert_build.exam.invoke_sync\",\n   125:                side_effect=RuntimeError(\"timeout\")):\n>> 126:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   127: \n   128:     assert is_correct is False\n   129:     assert explanation == \"judge error\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 135,
        "text": "is_correct, _ = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_case_insensitive_verdict",
        "context_snippet": "   132: def test_judge_case_insensitive_verdict():\n   133:     with patch(\"expert_build.exam.invoke_sync\",\n   134:                return_value='{\"verdict\": \"correct\", \"explanation\": \"ok\"}'):\n>> 135:         is_correct, _ = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   136: \n   137:     assert is_correct is True"
      }
    ],
    "production_count": 2,
    "test_count": 8,
    "total_count": 10
  },
  "exam_imports": {
    "file": "expert_build/exam.py",
    "imports": [
      "json",
      "re",
      "sys"
    ],
    "from_imports": [
      {
        "module": "pathlib",
        "names": [
          "Path"
        ]
      },
      {
        "module": "reasons_lib.api",
        "names": [
          "add_node",
          "add_nogood",
          "list_nodes"
        ]
      },
      {
        "module": "llm",
        "names": [
          "check_model_available",
          "invoke_sync"
        ]
      },
      {
        "module": "prompts",
        "names": [
          "EXAM_ANSWER",
          "EXAM_JUDGE"
        ]
      }
    ],
    "import_section": "\"\"\"Practice exam runner for nogood discovery.\"\"\"\n\nimport json\nimport re\nimport sys\nfrom pathlib import Path\n\nfrom reasons_lib.api import add_node, add_nogood, list_nodes\n\nfrom .llm import check_model_available, invoke_sync\nfrom .prompts import EXAM_ANSWER, EXAM_JUDGE\n\nREASONS_DB = \"reasons.db\"\n\n\ndef parse_questions(filepath: Path) -> list[dict]:\n    \"\"\"Parse practice questions from markdown.\n"
  },
  "cmd_exam_body": {
    "function": "cmd_exam",
    "file": "expert_build/exam.py",
    "start_line": 182,
    "end_line": 319,
    "source": "def cmd_exam(args):\n    \"\"\"Run practice questions through LLM, discover nogoods.\"\"\"\n    q_path = Path(args.questions_file)\n    if not q_path.exists():\n        print(f\"Questions file not found: {q_path}\")\n        sys.exit(1)\n\n    if not check_model_available(args.model):\n        print(f\"Model not available: {args.model}\")\n        sys.exit(1)\n\n    questions = parse_questions(q_path)\n    if not questions:\n        print(f\"No questions found in {q_path}\")\n        print(\"Expected format: ## Q1: Question text\\\\nAnswer: correct answer\")\n        return\n\n    if args.limit:\n        questions = questions[:args.limit]\n\n    db_path = str(args.beliefs_file)\n    beliefs_context = load_beliefs_for_context(db_path=db_path)\n\n    print(f\"=== Exam: {q_path.name} ===\")\n    print(f\"Questions: {len(questions)}\")\n    print(f\"Model: {args.model}\\n\")\n\n    correct = 0\n    wrong = []\n    results = []  # Per-question results for output file\n\n    for q in questions:\n        # Format choices\n        choices_text = \"\"\n        if q[\"choices\"]:\n            choices_text = \"\\n\".join(f\"  {k}) {v}\" for k, v in sorted(q[\"choices\"].items()))\n\n        prompt = EXAM_ANSWER.format(\n            beliefs=beliefs_context,\n            question=q[\"text\"],\n            choices=choices_text,\n        )\n\n        try:\n            response = invoke_sync(prompt, model=args.model, timeout=120)\n        except Exception as e:\n            print(f\"  {q['id']}: ERROR - {e}\")\n            results.append({\"question\": q, \"status\": \"ERROR\", \"error\": str(e)})\n            continue\n\n        answer = extract_answer(response, model=args.model, prompt=prompt)\n        expected = q[\"correct\"].strip().lower()\n        use_judge = not getattr(args, 'no_judge', False)\n\n        # Score: MC uses exact match, open-ended uses LLM judge\n        if q[\"choices\"] or len(expected) == 1:\n            is_correct = answer.lower() == 
expected\n            judge_note = \"\"\n        elif use_judge:\n            is_correct, judge_note = judge_answer(\n                q[\"text\"], q[\"correct\"], response, args.model,\n            )\n        else:\n            is_correct = expected in answer.lower() or answer.lower() in expected\n            judge_note = \"\"\n\n        if is_correct:\n            correct += 1\n            print(f\"  {q['id']}: CORRECT\")\n            results.append({\"question\": q, \"status\": \"CORRECT\", \"got\": answer, \"response\": response, \"judge\": judge_note})\n        else:\n            wrong.append({\n                \"question\": q,\n                \"got\": answer,\n                \"expected\": q[\"correct\"],\n                \"response\": response,\n                \"judge\": judge_note,\n            })\n            results.append({\"question\": q, \"status\": \"WRONG\", \"got\": answer, \"expected\": q[\"correct\"], \"response\": response, \"judge\": judge_note})\n            print(f\"  {q['id']}: WRONG (expected: {q['correct']}, got: {answer})\")\n            if judge_note:\n                print(f\"    Judge: {judge_note}\")\n\n    # Summary\n    total = len(questions)\n    pct = 100 * correct // total if total else 0\n    print(f\"\\n=== Results ===\")\n    print(f\"Score: {correct}/{total} ({pct}%)\")\n\n    # Gaps by objective\n    obj_scores: dict[str, dict] = {}\n    for q in questions:\n        obj = q.get(\"objective\", \"general\")\n        if obj not in obj_scores:\n            obj_scores[obj] = {\"correct\": 0, \"total\": 0}\n        obj_scores[obj][\"total\"] += 1\n\n    for q in questions:\n        obj = q.get(\"objective\", \"general\")\n        if not any(w[\"question\"][\"id\"] == q[\"id\"] for w in wrong):\n            obj_scores[obj][\"correct\"] += 1\n\n    if wrong:\n        print(f\"\\nWRONG ANSWERS ({len(wrong)}):\\n\")\n        for w in wrong:\n            q = w[\"question\"]\n            print(f\"  {q['id']}: {q['text']}\")\n            
print(f\"    Expected: {w['expected']}\")\n            print(f\"    Got: {w['got']}\")\n            if q[\"objective\"]:\n                print(f\"    Objective: {q['objective']}\")\n\n            # Record exam failure as a node for tracking\n            nogood_id = f\"exam-fail-{q['id'].lower()}\"\n            description = f\"Exam {q['id']}: expected '{w['expected']}' but agent answered '{w['got']}' for: {q['text']}\"\n            resolution = f\"Review and update beliefs about: {q['objective'] or q['text']}\"\n            try:\n                add_node(\n                    node_id=nogood_id,\n                    text=f\"{description} \u2014 {resolution}\",\n                    source=str(q_path),\n                    db_path=db_path,\n                )\n                print(f\"    -> Recorded as nogood\")\n            except Exception:\n                print(f\"    -> WARN: could not record nogood\")\n\n        print(f\"\\nBY OBJECTIVE:\")\n        for obj, scores in sorted(obj_scores.items(), key=lambda x: x[1][\"correct\"] / max(x[1][\"total\"], 1)):\n            pct = 100 * scores[\"correct\"] // scores[\"total\"] if scores[\"total\"] else 0\n            weak = \" *** WEAK AREA\" if pct < 50 else \"\"\n            print(f\"  {obj}: {scores['correct']}/{scores['total']} ({pct}%){weak}\")\n\n    # Write output file if requested\n    output_path = getattr(args, \"output\", None)\n    if output_path:\n        _write_results(output_path, q_path, args.model, questions, results, wrong, obj_scores, correct, total)\n        print(f\"\\nResults saved to {output_path}\")"
  }
}
```

Use these results to inform your review. Do not request the same observations again.


## Instructions

For each significant change (new file, modified function, etc.), provide a structured verdict.

Use this exact format for each change:

### <file_path or file_path:function_name>
VERDICT: PASS | CONCERN | BLOCK
CORRECTNESS: VALID | QUESTIONABLE | BROKEN
SPEC_COMPLIANCE: MEETS | PARTIAL | VIOLATES | N/A
ISSUE_COMPLIANCE: ADDRESSES | PARTIAL | UNRELATED | N/A
BELIEF_COMPLIANCE: CONSISTENT | VIOLATES | N/A
TEST_COVERAGE: COVERED | PARTIAL | UNTESTED
INTEGRATION: WIRED | PARTIAL | MISSING
REASONING: <brief explanation of your assessment>
---

## Review Criteria

1. **CORRECTNESS**: Does the code do what it claims? Is the logic sound?
   - VALID: Logic is correct, no bugs apparent
   - QUESTIONABLE: Logic may have edge cases or unclear behavior
   - BROKEN: Clear bugs or incorrect behavior

2. **SPEC_COMPLIANCE**: Does it meet MUST requirements from the spec?
   - MEETS: All relevant spec requirements satisfied
   - PARTIAL: Some requirements met, others missing or incomplete
   - VIOLATES: Contradicts spec requirements
   - N/A: No spec provided or not applicable

3. **ISSUE_COMPLIANCE** (only when an issue is provided): Do the changes address the problem or feature described in the issue?
   - ADDRESSES: Changes directly solve the issue's stated problem or implement the requested feature
   - PARTIAL: Changes partially address the issue but leave some aspects unresolved
   - UNRELATED: Changes do not appear related to the issue
   - N/A: No issue provided

4. **TEST_COVERAGE**: Are there tests for the new/changed code?
   - COVERED: Tests exist and cover the changes
   - PARTIAL: Some tests exist but coverage is incomplete
   - UNTESTED: No tests for the changes

5. **INTEGRATION**: Are callers updated? Is the feature usable end-to-end?
   - WIRED: Feature is fully integrated and usable
   - PARTIAL: Interface exists but callers not updated, or integration incomplete
   - MISSING: No integration with existing code

6. **BELIEF_COMPLIANCE** (only when beliefs are provided): Do the changes respect known architectural invariants, contracts, and rules?
   - CONSISTENT: Changes align with or reinforce known beliefs
   - VIOLATES: Changes contradict a specific belief — cite the belief ID
   - N/A: No beliefs provided or no relevant beliefs apply

## Verdict Guidelines

- **BLOCK**: Security issues, broken functionality, spec violations, or missing critical integration
- **CONCERN**: Missing tests, partial integration, questionable patterns, or unclear logic
- **PASS**: Correct, tested, well-integrated code

## Important

- Full function bodies for modified functions may be available in the observations section — use them to verify the complete logic, not just the diff hunks
- Related test files (prefixed with ``related_test:``) may be included in observations — check whether existing test assertions still match modified return types, signatures, or behavior. Flag any test that would break due to the changes
- If duplicate test coverage is detected (multiple test files covering the same source), note it in your review
- Focus on actual issues, not style preferences
- If a method signature is added but its callers aren't updated, rate INTEGRATION as PARTIAL
- Be specific in your reasoning: reference line numbers or function names
- When in doubt, use CONCERN rather than PASS

## Self-Review

After completing your review, add a brief self-assessment:

### SELF_REVIEW
LIMITATIONS: <what context were you missing that affected review quality?>
---

Examples of limitations:
- "Could not see full class to verify no other methods access the modified field"
- "Test file not included in diff - cannot verify coverage claims"
- "Spec file referenced but not provided"


## Feature Requests

If this review tool could be improved to help you do a better job, suggest features:

### FEATURE_REQUESTS
- <suggestion 1>
- <suggestion 2>
---

Examples:
- "Include full file context for modified functions, not just diff hunks"
- "Show callers of modified methods to verify integration"
- "Include test file alongside implementation changes"

Include this section only if you have specific suggestions; otherwise omit it entirely.
