You are a senior code reviewer. Review the following code changes.

## Specification

No specification provided. Focus on correctness, tests, and integration.





## Code Changes

```diff
diff --git a/expert_build/exam.py b/expert_build/exam.py
index 74340a1..554bdb6 100644
--- a/expert_build/exam.py
+++ b/expert_build/exam.py
@@ -1,5 +1,6 @@
 """Practice exam runner for nogood discovery."""
 
+import json
 import re
 import sys
 from pathlib import Path
@@ -102,25 +103,50 @@ def load_beliefs_for_context(db_path: str = REASONS_DB) -> str:
     return "\n".join(beliefs)
 
 
-def extract_answer(response: str) -> str:
-    """Extract the answer from LLM response."""
-    # Look for ANSWER: line
-    match = re.search(r"ANSWER:\s*(.+)", response, re.IGNORECASE)
-    if match:
-        ans = match.group(1).strip()
-        # If it's a letter choice, extract just the letter
-        letter_match = re.match(r"([a-d])[.):\s]", ans, re.IGNORECASE)
-        if letter_match:
-            return letter_match.group(1).lower()
-        return ans
-
-    # Fallback: look for a single letter on its own
-    lines = response.strip().split("\n")
-    for line in lines:
-        line = line.strip()
-        if re.match(r"^[a-d]$", line, re.IGNORECASE):
-            return line.lower()
+RETRY_JSON = "Your response was not valid JSON. Respond with ONLY the JSON object, no other text."
 
+
+def _extract_json(response: str) -> dict | None:
+    """Extract a JSON object from an LLM response."""
+    text = response.strip()
+    if text.startswith("```"):
+        lines = text.split("\n")
+        lines = [l for l in lines if not l.strip().startswith("```")]
+        text = "\n".join(lines).strip()
+    try:
+        return json.loads(text)
+    except (json.JSONDecodeError, ValueError):
+        pass
+    start = text.find("{")
+    end = text.rfind("}")
+    if start != -1 and end > start:
+        try:
+            return json.loads(text[start:end + 1])
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return None
+
+
+def extract_answer(response: str, model: str = None, prompt: str = None) -> str:
+    """Extract answer from JSON LLM response, retrying on parse failure."""
+    data = _extract_json(response)
+    if data and "answer" in data:
+        return str(data["answer"]).strip()
+
+    if model and prompt:
+        print("    WARN: response not valid JSON, retrying...", file=sys.stderr)
+        try:
+            retry_response = invoke_sync(
+                prompt + "\n\n" + response + "\n\n" + RETRY_JSON,
+                model=model, timeout=60,
+            )
+            data = _extract_json(retry_response)
+            if data and "answer" in data:
+                return str(data["answer"]).strip()
+        except Exception:
+            pass
+
+    print("    WARN: could not parse answer JSON", file=sys.stderr)
     return response.strip()[:100]
 
 
@@ -132,16 +158,26 @@ def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bo
     except Exception:
         return False, "judge error"
 
-    verdict_match = re.search(r"VERDICT:\s*(CORRECT|WRONG)", response, re.IGNORECASE)
-    if not verdict_match:
-        return False, "no verdict"
+    data = _extract_json(response)
+    if data and "verdict" in data:
+        is_correct = data["verdict"].strip().upper() == "CORRECT"
+        return is_correct, data.get("explanation", "")
+
+    print("    WARN: verdict not valid JSON, retrying...", file=sys.stderr)
+    try:
+        retry_response = invoke_sync(
+            prompt + "\n\n" + response + "\n\n" + RETRY_JSON,
+            model=model, timeout=60,
+        )
+        data = _extract_json(retry_response)
+        if data and "verdict" in data:
+            is_correct = data["verdict"].strip().upper() == "CORRECT"
+            return is_correct, data.get("explanation", "")
+    except Exception:
+        pass
 
-    is_correct = verdict_match.group(1).upper() == "CORRECT"
-    explanation = ""
-    exp_match = re.search(r"EXPLANATION:\s*(.+)", response, re.IGNORECASE)
-    if exp_match:
-        explanation = exp_match.group(1).strip()
-    return is_correct, explanation
+    print("    WARN: could not parse verdict JSON", file=sys.stderr)
+    return False, "no verdict"
 
 
 def cmd_exam(args):
@@ -194,7 +230,7 @@ def cmd_exam(args):
             results.append({"question": q, "status": "ERROR", "error": str(e)})
             continue
 
-        answer = extract_answer(response)
+        answer = extract_answer(response, model=args.model, prompt=prompt)
         expected = q["correct"].strip().lower()
         use_judge = not getattr(args, 'no_judge', False)
 
diff --git a/expert_build/prompts.py b/expert_build/prompts.py
index 9110d81..d3c6d30 100644
--- a/expert_build/prompts.py
+++ b/expert_build/prompts.py
@@ -99,9 +99,8 @@
 Question: {question}
 {choices}
 
-Provide your answer and brief explanation. Format:
-ANSWER: <letter or text>
-EXPLANATION: <one paragraph>
+Respond with ONLY this JSON (no other text):
+{{"answer": "<letter or short answer>", "explanation": "<one paragraph>"}}
 """
 
 EXAM_JUDGE = """\
@@ -120,9 +119,8 @@
 different but valid approaches should count as correct. Missing key facts or \
 fundamentally wrong reasoning should count as wrong.
 
-Format your response as:
-VERDICT: CORRECT or WRONG
-EXPLANATION: <one sentence>
+Respond with ONLY this JSON (no other text):
+{{"verdict": "CORRECT or WRONG", "explanation": "<one sentence>"}}
 """
 
 CERT_MATCH = """\
diff --git a/tests/test_exam.py b/tests/test_exam.py
new file mode 100644
index 0000000..a8d9240
--- /dev/null
+++ b/tests/test_exam.py
@@ -0,0 +1,159 @@
+"""Tests for expert_build.exam — JSON answer/verdict parsing and retry."""
+
+from unittest.mock import patch
+
+from expert_build.exam import extract_answer, judge_answer, _extract_json
+
+
+# --- _extract_json ---
+
+def test_extract_json_plain():
+    assert _extract_json('{"answer": "b", "explanation": "because"}') == {
+        "answer": "b", "explanation": "because"
+    }
+
+
+def test_extract_json_with_code_fence():
+    response = '```json\n{"answer": "c", "explanation": "reason"}\n```'
+    assert _extract_json(response) == {"answer": "c", "explanation": "reason"}
+
+
+def test_extract_json_embedded_in_text():
+    response = 'Here is my answer:\n{"answer": "a", "explanation": "yes"}\nDone.'
+    result = _extract_json(response)
+    assert result["answer"] == "a"
+
+
+def test_extract_json_braces_in_value():
+    response = 'Sure: {"answer": "b", "explanation": "use {braces} here"}'
+    result = _extract_json(response)
+    assert result["answer"] == "b"
+    assert "{braces}" in result["explanation"]
+
+
+def test_extract_json_invalid():
+    assert _extract_json("No JSON here at all") is None
+
+
+def test_extract_json_truncated():
+    assert _extract_json('{"answer": "b", "explan') is None
+
+
+# --- extract_answer ---
+
+def test_extract_answer_from_json():
+    response = '{"answer": "b", "explanation": "because"}'
+    assert extract_answer(response) == "b"
+
+
+def test_extract_answer_strips_whitespace():
+    response = '{"answer": "  d  ", "explanation": "reason"}'
+    assert extract_answer(response) == "d"
+
+
+def test_extract_answer_retries_on_bad_json():
+    bad_response = "I think the answer is B because of reasons"
+
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"answer": "b", "explanation": "reasons"}') as mock_llm:
+        result = extract_answer(bad_response, model="test", prompt="original prompt")
+
+    assert result == "b"
+    assert mock_llm.called
+
+
+def test_extract_answer_fallback_after_failed_retry():
+    bad_response = "No format at all"
+
+    with patch("expert_build.exam.invoke_sync", return_value="Still no format"):
+        result = extract_answer(bad_response, model="test", prompt="original prompt")
+
+    assert result == "No format at all"
+
+
+def test_extract_answer_no_retry_without_model():
+    result = extract_answer("No JSON here")
+    assert result == "No JSON here"
+
+
+def test_extract_answer_code_fence():
+    response = '```json\n{"answer": "a", "explanation": "yes"}\n```'
+    assert extract_answer(response) == "a"
+
+
+# --- judge_answer ---
+
+def test_judge_correct():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "CORRECT", "explanation": "matches"}'):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True
+    assert explanation == "matches"
+
+
+def test_judge_wrong():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "WRONG", "explanation": "missed key point"}'):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "missed key point"
+
+
+def test_judge_retries_on_bad_json():
+    call_count = 0
+    def side_effect(prompt, model=None, timeout=None):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:
+            return "I think this is correct because it matches"
+        return '{"verdict": "CORRECT", "explanation": "matches expected"}'
+
+    with patch("expert_build.exam.invoke_sync", side_effect=side_effect):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True
+    assert call_count == 2
+
+
+def test_judge_fallback_after_failed_retry():
+    with patch("expert_build.exam.invoke_sync", return_value="No JSON at all"):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "no verdict"
+
+
+def test_judge_handles_llm_error():
+    with patch("expert_build.exam.invoke_sync",
+               side_effect=RuntimeError("timeout")):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "judge error"
+
+
+def test_judge_retry_itself_raises():
+    call_count = 0
+    def side_effect(prompt, model=None, timeout=None):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:
+            return "Not JSON"
+        raise RuntimeError("retry timeout")
+
+    with patch("expert_build.exam.invoke_sync", side_effect=side_effect):
+        is_correct, explanation = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is False
+    assert explanation == "no verdict"
+    assert call_count == 2
+
+
+def test_judge_case_insensitive_verdict():
+    with patch("expert_build.exam.invoke_sync",
+               return_value='{"verdict": "correct", "explanation": "ok"}'):
+        is_correct, _ = judge_answer("q", "expected", "got", "test")
+
+    assert is_correct is True

```

## Observation Results

You previously requested observations. Here are the results:

```json
{
  "judge_answer_full": {
    "function": "judge_answer",
    "file": "expert_build/exam.py",
    "start_line": 153,
    "end_line": 180,
    "source": "def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bool, str]:\n    \"\"\"Use LLM to judge if an open-ended answer is semantically correct.\"\"\"\n    prompt = EXAM_JUDGE.format(question=question, expected=expected, got=got)\n    try:\n        response = invoke_sync(prompt, model=model, timeout=60)\n    except Exception:\n        return False, \"judge error\"\n\n    data = _extract_json(response)\n    if data and \"verdict\" in data:\n        is_correct = data[\"verdict\"].strip().upper() == \"CORRECT\"\n        return is_correct, data.get(\"explanation\", \"\")\n\n    print(\"    WARN: verdict not valid JSON, retrying...\", file=sys.stderr)\n    try:\n        retry_response = invoke_sync(\n            prompt + \"\\n\\n\" + response + \"\\n\\n\" + RETRY_JSON,\n            model=model, timeout=60,\n        )\n        data = _extract_json(retry_response)\n        if data and \"verdict\" in data:\n            is_correct = data[\"verdict\"].strip().upper() == \"CORRECT\"\n            return is_correct, data.get(\"explanation\", \"\")\n    except Exception:\n        pass\n\n    print(\"    WARN: could not parse verdict JSON\", file=sys.stderr)\n    return False, \"no verdict\""
  },
  "invoke_sync_body": {
    "error": "No function found at 'invoke_sync'",
    "file": "expert_build/exam.py"
  },
  "invoke_sync_raises": {
    "function": "invoke_sync",
    "file": "expert_build/exam.py",
    "explicit_raises": [],
    "calls": [],
    "error": "Function 'invoke_sync' not found"
  },
  "extract_answer_callers": {
    "symbol": "extract_answer",
    "production_callers": [
      {
        "file": "expert_build/exam.py",
        "line": 130,
        "text": "def extract_answer(response: str, model: str = None, prompt: str = None) -> str:",
        "context_function": "_extract_json",
        "context_snippet": "   127:     return None\n   128: \n   129: \n>> 130: def extract_answer(response: str, model: str = None, prompt: str = None) -> str:\n   131:     \"\"\"Extract answer from JSON LLM response, retrying on parse failure.\"\"\"\n   132:     data = _extract_json(response)\n   133:     if data and \"answer\" in data:"
      },
      {
        "file": "expert_build/exam.py",
        "line": 233,
        "text": "answer = extract_answer(response, model=args.model, prompt=prompt)",
        "context_function": "cmd_exam",
        "context_snippet": "   230:             results.append({\"question\": q, \"status\": \"ERROR\", \"error\": str(e)})\n   231:             continue\n   232: \n>> 233:         answer = extract_answer(response, model=args.model, prompt=prompt)\n   234:         expected = q[\"correct\"].strip().lower()\n   235:         use_judge = not getattr(args, 'no_judge', False)\n   236: "
      }
    ],
    "test_callers": [
      {
        "file": "tests/test_exam.py",
        "line": 5,
        "text": "from expert_build.exam import extract_answer, judge_answer, _extract_json",
        "context_function": null,
        "context_snippet": "   2: \n   3: from unittest.mock import patch\n   4: \n>> 5: from expert_build.exam import extract_answer, judge_answer, _extract_json\n   6: \n   7: \n   8: # --- _extract_json ---"
      },
      {
        "file": "tests/test_exam.py",
        "line": 42,
        "text": "# --- extract_answer ---",
        "context_function": "test_extract_json_truncated",
        "context_snippet": "   39:     assert _extract_json('{\"answer\": \"b\", \"explan') is None\n   40: \n   41: \n>> 42: # --- extract_answer ---\n   43: \n   44: def test_extract_answer_from_json():\n   45:     response = '{\"answer\": \"b\", \"explanation\": \"because\"}'"
      },
      {
        "file": "tests/test_exam.py",
        "line": 44,
        "text": "def test_extract_answer_from_json():",
        "context_function": "test_extract_json_truncated",
        "context_snippet": "   41: \n   42: # --- extract_answer ---\n   43: \n>> 44: def test_extract_answer_from_json():\n   45:     response = '{\"answer\": \"b\", \"explanation\": \"because\"}'\n   46:     assert extract_answer(response) == \"b\"\n   47: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 46,
        "text": "assert extract_answer(response) == \"b\"",
        "context_function": "test_extract_answer_from_json",
        "context_snippet": "   43: \n   44: def test_extract_answer_from_json():\n   45:     response = '{\"answer\": \"b\", \"explanation\": \"because\"}'\n>> 46:     assert extract_answer(response) == \"b\"\n   47: \n   48: \n   49: def test_extract_answer_strips_whitespace():"
      },
      {
        "file": "tests/test_exam.py",
        "line": 49,
        "text": "def test_extract_answer_strips_whitespace():",
        "context_function": "test_extract_answer_from_json",
        "context_snippet": "   46:     assert extract_answer(response) == \"b\"\n   47: \n   48: \n>> 49: def test_extract_answer_strips_whitespace():\n   50:     response = '{\"answer\": \"  d  \", \"explanation\": \"reason\"}'\n   51:     assert extract_answer(response) == \"d\"\n   52: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 51,
        "text": "assert extract_answer(response) == \"d\"",
        "context_function": "test_extract_answer_strips_whitespace",
        "context_snippet": "   48: \n   49: def test_extract_answer_strips_whitespace():\n   50:     response = '{\"answer\": \"  d  \", \"explanation\": \"reason\"}'\n>> 51:     assert extract_answer(response) == \"d\"\n   52: \n   53: \n   54: def test_extract_answer_retries_on_bad_json():"
      },
      {
        "file": "tests/test_exam.py",
        "line": 54,
        "text": "def test_extract_answer_retries_on_bad_json():",
        "context_function": "test_extract_answer_strips_whitespace",
        "context_snippet": "   51:     assert extract_answer(response) == \"d\"\n   52: \n   53: \n>> 54: def test_extract_answer_retries_on_bad_json():\n   55:     bad_response = \"I think the answer is B because of reasons\"\n   56: \n   57:     with patch(\"expert_build.exam.invoke_sync\","
      },
      {
        "file": "tests/test_exam.py",
        "line": 59,
        "text": "result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")",
        "context_function": "test_extract_answer_retries_on_bad_json",
        "context_snippet": "   56: \n   57:     with patch(\"expert_build.exam.invoke_sync\",\n   58:                return_value='{\"answer\": \"b\", \"explanation\": \"reasons\"}') as mock_llm:\n>> 59:         result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")\n   60: \n   61:     assert result == \"b\"\n   62:     assert mock_llm.called"
      },
      {
        "file": "tests/test_exam.py",
        "line": 65,
        "text": "def test_extract_answer_fallback_after_failed_retry():",
        "context_function": "test_extract_answer_retries_on_bad_json",
        "context_snippet": "   62:     assert mock_llm.called\n   63: \n   64: \n>> 65: def test_extract_answer_fallback_after_failed_retry():\n   66:     bad_response = \"No format at all\"\n   67: \n   68:     with patch(\"expert_build.exam.invoke_sync\", return_value=\"Still no format\"):"
      },
      {
        "file": "tests/test_exam.py",
        "line": 69,
        "text": "result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")",
        "context_function": "test_extract_answer_fallback_after_failed_retry",
        "context_snippet": "   66:     bad_response = \"No format at all\"\n   67: \n   68:     with patch(\"expert_build.exam.invoke_sync\", return_value=\"Still no format\"):\n>> 69:         result = extract_answer(bad_response, model=\"test\", prompt=\"original prompt\")\n   70: \n   71:     assert result == \"No format at all\"\n   72: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 74,
        "text": "def test_extract_answer_no_retry_without_model():",
        "context_function": "test_extract_answer_fallback_after_failed_retry",
        "context_snippet": "   71:     assert result == \"No format at all\"\n   72: \n   73: \n>> 74: def test_extract_answer_no_retry_without_model():\n   75:     result = extract_answer(\"No JSON here\")\n   76:     assert result == \"No JSON here\"\n   77: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 75,
        "text": "result = extract_answer(\"No JSON here\")",
        "context_function": "test_extract_answer_no_retry_without_model",
        "context_snippet": "   72: \n   73: \n   74: def test_extract_answer_no_retry_without_model():\n>> 75:     result = extract_answer(\"No JSON here\")\n   76:     assert result == \"No JSON here\"\n   77: \n   78: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 79,
        "text": "def test_extract_answer_code_fence():",
        "context_function": "test_extract_answer_no_retry_without_model",
        "context_snippet": "   76:     assert result == \"No JSON here\"\n   77: \n   78: \n>> 79: def test_extract_answer_code_fence():\n   80:     response = '```json\\n{\"answer\": \"a\", \"explanation\": \"yes\"}\\n```'\n   81:     assert extract_answer(response) == \"a\"\n   82: "
      },
      {
        "file": "tests/test_exam.py",
        "line": 81,
        "text": "assert extract_answer(response) == \"a\"",
        "context_function": "test_extract_answer_code_fence",
        "context_snippet": "   78: \n   79: def test_extract_answer_code_fence():\n   80:     response = '```json\\n{\"answer\": \"a\", \"explanation\": \"yes\"}\\n```'\n>> 81:     assert extract_answer(response) == \"a\"\n   82: \n   83: \n   84: # --- judge_answer ---"
      }
    ],
    "production_count": 2,
    "test_count": 14,
    "total_count": 16
  },
  "judge_answer_callers": {
    "symbol": "judge_answer",
    "production_callers": [
      {
        "file": "expert_build/exam.py",
        "line": 153,
        "text": "def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bool, str]:",
        "context_function": "extract_answer",
        "context_snippet": "   150:     return response.strip()[:100]\n   151: \n   152: \n>> 153: def judge_answer(question: str, expected: str, got: str, model: str) -> tuple[bool, str]:\n   154:     \"\"\"Use LLM to judge if an open-ended answer is semantically correct.\"\"\"\n   155:     prompt = EXAM_JUDGE.format(question=question, expected=expected, got=got)\n   156:     try:"
      },
      {
        "file": "expert_build/exam.py",
        "line": 242,
        "text": "is_correct, judge_note = judge_answer(",
        "context_function": "cmd_exam",
        "context_snippet": "   239:             is_correct = answer.lower() == expected\n   240:             judge_note = \"\"\n   241:         elif use_judge:\n>> 242:             is_correct, judge_note = judge_answer(\n   243:                 q[\"text\"], q[\"correct\"], response, args.model,\n   244:             )\n   245:         else:"
      }
    ],
    "test_callers": [
      {
        "file": "tests/test_exam.py",
        "line": 5,
        "text": "from expert_build.exam import extract_answer, judge_answer, _extract_json",
        "context_function": null,
        "context_snippet": "   2: \n   3: from unittest.mock import patch\n   4: \n>> 5: from expert_build.exam import extract_answer, judge_answer, _extract_json\n   6: \n   7: \n   8: # --- _extract_json ---"
      },
      {
        "file": "tests/test_exam.py",
        "line": 84,
        "text": "# --- judge_answer ---",
        "context_function": "test_extract_answer_code_fence",
        "context_snippet": "   81:     assert extract_answer(response) == \"a\"\n   82: \n   83: \n>> 84: # --- judge_answer ---\n   85: \n   86: def test_judge_correct():\n   87:     with patch(\"expert_build.exam.invoke_sync\","
      },
      {
        "file": "tests/test_exam.py",
        "line": 89,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_correct",
        "context_snippet": "   86: def test_judge_correct():\n   87:     with patch(\"expert_build.exam.invoke_sync\",\n   88:                return_value='{\"verdict\": \"CORRECT\", \"explanation\": \"matches\"}'):\n>> 89:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   90: \n   91:     assert is_correct is True\n   92:     assert explanation == \"matches\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 98,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_wrong",
        "context_snippet": "   95: def test_judge_wrong():\n   96:     with patch(\"expert_build.exam.invoke_sync\",\n   97:                return_value='{\"verdict\": \"WRONG\", \"explanation\": \"missed key point\"}'):\n>> 98:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   99: \n   100:     assert is_correct is False\n   101:     assert explanation == \"missed key point\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 114,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "side_effect",
        "context_snippet": "   111:         return '{\"verdict\": \"CORRECT\", \"explanation\": \"matches expected\"}'\n   112: \n   113:     with patch(\"expert_build.exam.invoke_sync\", side_effect=side_effect):\n>> 114:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   115: \n   116:     assert is_correct is True\n   117:     assert call_count == 2"
      },
      {
        "file": "tests/test_exam.py",
        "line": 122,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_fallback_after_failed_retry",
        "context_snippet": "   119: \n   120: def test_judge_fallback_after_failed_retry():\n   121:     with patch(\"expert_build.exam.invoke_sync\", return_value=\"No JSON at all\"):\n>> 122:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   123: \n   124:     assert is_correct is False\n   125:     assert explanation == \"no verdict\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 131,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_handles_llm_error",
        "context_snippet": "   128: def test_judge_handles_llm_error():\n   129:     with patch(\"expert_build.exam.invoke_sync\",\n   130:                side_effect=RuntimeError(\"timeout\")):\n>> 131:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   132: \n   133:     assert is_correct is False\n   134:     assert explanation == \"judge error\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 147,
        "text": "is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "side_effect",
        "context_snippet": "   144:         raise RuntimeError(\"retry timeout\")\n   145: \n   146:     with patch(\"expert_build.exam.invoke_sync\", side_effect=side_effect):\n>> 147:         is_correct, explanation = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   148: \n   149:     assert is_correct is False\n   150:     assert explanation == \"no verdict\""
      },
      {
        "file": "tests/test_exam.py",
        "line": 157,
        "text": "is_correct, _ = judge_answer(\"q\", \"expected\", \"got\", \"test\")",
        "context_function": "test_judge_case_insensitive_verdict",
        "context_snippet": "   154: def test_judge_case_insensitive_verdict():\n   155:     with patch(\"expert_build.exam.invoke_sync\",\n   156:                return_value='{\"verdict\": \"correct\", \"explanation\": \"ok\"}'):\n>> 157:         is_correct, _ = judge_answer(\"q\", \"expected\", \"got\", \"test\")\n   158: \n   159:     assert is_correct is True"
      }
    ],
    "production_count": 2,
    "test_count": 9,
    "total_count": 11
  },
  "exam_imports": {
    "file": "expert_build/exam.py",
    "imports": [
      "json",
      "re",
      "sys"
    ],
    "from_imports": [
      {
        "module": "pathlib",
        "names": [
          "Path"
        ]
      },
      {
        "module": "reasons_lib.api",
        "names": [
          "add_node",
          "add_nogood",
          "list_nodes"
        ]
      },
      {
        "module": "llm",
        "names": [
          "check_model_available",
          "invoke_sync"
        ]
      },
      {
        "module": "prompts",
        "names": [
          "EXAM_ANSWER",
          "EXAM_JUDGE"
        ]
      }
    ],
    "import_section": "\"\"\"Practice exam runner for nogood discovery.\"\"\"\n\nimport json\nimport re\nimport sys\nfrom pathlib import Path\n\nfrom reasons_lib.api import add_node, add_nogood, list_nodes\n\nfrom .llm import check_model_available, invoke_sync\nfrom .prompts import EXAM_ANSWER, EXAM_JUDGE\n\nREASONS_DB = \"reasons.db\"\n\n\ndef parse_questions(filepath: Path) -> list[dict]:\n    \"\"\"Parse practice questions from markdown.\n"
  }
}
```

Use these results to inform your review. Do not request the same observations again.


## Instructions

For each significant change (new file, modified function, etc.), provide a structured verdict.

Use this exact format for each change:

### <file_path or file_path:function_name>
VERDICT: PASS | CONCERN | BLOCK
CORRECTNESS: VALID | QUESTIONABLE | BROKEN
SPEC_COMPLIANCE: MEETS | PARTIAL | VIOLATES | N/A
ISSUE_COMPLIANCE: ADDRESSES | PARTIAL | UNRELATED | N/A
BELIEF_COMPLIANCE: CONSISTENT | VIOLATES | N/A
TEST_COVERAGE: COVERED | PARTIAL | UNTESTED
INTEGRATION: WIRED | PARTIAL | MISSING
REASONING: <brief explanation of your assessment>
---

## Review Criteria

1. **CORRECTNESS**: Does the code do what it claims? Is the logic sound?
   - VALID: Logic is correct, no bugs apparent
   - QUESTIONABLE: Logic may have edge cases or unclear behavior
   - BROKEN: Clear bugs or incorrect behavior

2. **SPEC_COMPLIANCE**: Does it meet MUST requirements from the spec?
   - MEETS: All relevant spec requirements satisfied
   - PARTIAL: Some requirements met, others missing or incomplete
   - VIOLATES: Contradicts spec requirements
   - N/A: No spec provided or not applicable

3. **ISSUE_COMPLIANCE** (only when an issue is provided): Do the changes address the problem or feature described in the issue?
   - ADDRESSES: Changes directly solve the issue's stated problem or implement the requested feature
   - PARTIAL: Changes partially address the issue but leave some aspects unresolved
   - UNRELATED: Changes do not appear related to the issue
   - N/A: No issue provided

4. **TEST_COVERAGE**: Are there tests for the new/changed code?
   - COVERED: Tests exist and cover the changes
   - PARTIAL: Some tests exist but coverage is incomplete
   - UNTESTED: No tests for the changes

5. **INTEGRATION**: Are callers updated? Is the feature usable end-to-end?
   - WIRED: Feature is fully integrated and usable
   - PARTIAL: Interface exists but callers not updated, or integration incomplete
   - MISSING: No integration with existing code

6. **BELIEF_COMPLIANCE** (only when beliefs are provided): Do the changes respect known architectural invariants, contracts, and rules?
   - CONSISTENT: Changes align with or reinforce known beliefs
   - VIOLATES: Changes contradict a specific belief — cite the belief ID
   - N/A: No beliefs provided or no relevant beliefs apply

## Verdict Guidelines

- **BLOCK**: Security issues, broken functionality, spec violations, or missing critical integration
- **CONCERN**: Missing tests, partial integration, questionable patterns, or unclear logic
- **PASS**: Correct, tested, well-integrated code

## Important

- Full function bodies for modified functions may be available in the "Observation Results" section above — use them to verify the complete logic, not just the diff hunks
- Related test files (prefixed with `related_test:`) may be included in observations — check whether existing test assertions still match modified return types, signatures, or behavior. Flag any test that would break due to the changes
- If duplicate test coverage is detected (multiple test files covering the same source), note it in your review
- Focus on actual issues, not style preferences
- If a method signature is added but callers aren't updated, that's PARTIAL integration
- Be specific in your reasoning — reference line numbers or function names
- When in doubt, use CONCERN rather than PASS

## Self-Review

After completing your review, add a brief self-assessment:

### SELF_REVIEW
LIMITATIONS: <what context were you missing that affected review quality?>
---

Examples of limitations:
- "Could not see full class to verify no other methods access the modified field"
- "Test file not included in diff - cannot verify coverage claims"
- "Spec file referenced but not provided"


## Feature Requests

If this review tool could be improved to help you do a better job, suggest features:

### FEATURE_REQUESTS
- <suggestion 1>
- <suggestion 2>
---

Examples:
- "Include full file context for modified functions, not just diff hunks"
- "Show callers of modified methods to verify integration"
- "Include test file alongside implementation changes"

Only include the FEATURE_REQUESTS section if you have specific suggestions; omit it entirely otherwise.
