You are a senior code reviewer. Review the following code changes.

## Specification

No specification provided. Focus on correctness, tests, and integration.





## Code Changes

```diff
diff --git a/expert_build/chunk_docs.py b/expert_build/chunk_docs.py
new file mode 100644
index 0000000..7d4368a
--- /dev/null
+++ b/expert_build/chunk_docs.py
@@ -0,0 +1,191 @@
+"""Chunk large documents into entry-sized pieces."""
+
+import re
+import sys
+from datetime import date
+from pathlib import Path
+
+
+def chunk_markdown(text, max_chars=25000):
+    """Split markdown by heading boundaries, merging small sections."""
+    parts = re.split(r"(?=^#{1,2} )", text, flags=re.MULTILINE)
+    parts = [p for p in parts if p.strip()]
+
+    if len(parts) <= 1:
+        return chunk_fixed(text, max_chars)
+
+    chunks = []
+    current = ""
+    for part in parts:
+        if len(current) + len(part) > max_chars and current:
+            chunks.append(current)
+            current = part
+        else:
+            current += part
+    if current:
+        chunks.append(current)
+
+    return chunks
+
+
+def chunk_python(text, max_chars=25000):
+    """Split Python by top-level class/def boundaries, keeping imports."""
+    lines = text.split("\n")
+
+    preamble_end = 0
+    for i, line in enumerate(lines):
+        if re.match(r"^(class |def )", line) or (line.startswith("@") and i + 1 < len(lines) and re.match(r"^(class |def |@)", lines[i + 1])):
+            preamble_end = i
+            break
+    else:
+        return chunk_fixed(text, max_chars)
+
+    preamble = "\n".join(lines[:preamble_end]).rstrip() + "\n\n"
+
+    boundaries = []
+    for i, line in enumerate(lines[preamble_end:], start=preamble_end):
+        if re.match(r"^(class |def )", line):
+            # Include preceding decorator lines
+            start = i
+            while start > preamble_end and lines[start - 1].startswith("@"):
+                start -= 1
+            if not boundaries or boundaries[-1] != start:
+                boundaries.append(start)
+
+    if not boundaries:
+        return chunk_fixed(text, max_chars)
+
+    sections = []
+    for j, start in enumerate(boundaries):
+        end = boundaries[j + 1] if j + 1 < len(boundaries) else len(lines)
+        section = "\n".join(lines[start:end])
+        sections.append(section)
+
+    chunks = []
+    current = preamble
+    for section in sections:
+        if len(current) + len(section) > max_chars and current.strip() != preamble.strip():
+            chunks.append(current)
+            current = preamble + section
+        else:
+            current += section + "\n"
+    if current.strip():
+        chunks.append(current)
+
+    return chunks
+
+
+def chunk_fixed(text, max_chars=25000, overlap=500):
+    """Split text into fixed-size windows with overlap."""
+    if len(text) <= max_chars:
+        return [text]
+
+    overlap = min(overlap, max_chars // 4)
+    step = max(max_chars - overlap, 1)
+
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + max_chars
+        chunks.append(text[start:end])
+        start += step
+    return chunks
+
+
+def _strip_frontmatter(text):
+    """Strip YAML frontmatter and return (frontmatter_dict, content)."""
+    meta = {}
+    content = text
+    if text.startswith("---"):
+        end = text.find("---", 3)
+        if end != -1:
+            for line in text[3:end].splitlines():
+                if ":" in line:
+                    key, _, val = line.partition(":")
+                    meta[key.strip()] = val.strip()
+            content = text[end + 3:].strip()
+    return meta, content
+
+
+def cmd_chunk_docs(args):
+    """Chunk large documents into entry-sized pieces."""
+    input_dir = Path(args.input_dir)
+    if not input_dir.exists():
+        print(f"Source directory not found: {input_dir}")
+        sys.exit(1)
+
+    threshold = args.threshold
+
+    sources = sorted(
+        [*input_dir.glob("*.md"), *input_dir.glob("*.py")],
+        key=lambda p: p.name,
+    )
+    if not sources:
+        print(f"No .md or .py files in {input_dir}")
+        return
+
+    manifest = Path(".chunked-docs")
+    done = set()
+    if manifest.exists():
+        done = set(manifest.read_text().strip().split("\n"))
+
+    total_chunked = 0
+    total_skipped = 0
+
+    for source_path in sources:
+        if str(source_path) in done:
+            total_skipped += 1
+            continue
+
+        raw = source_path.read_text()
+        meta, content = _strip_frontmatter(raw)
+
+        if len(content) <= threshold:
+            continue
+
+        print(f"Chunking: {source_path.name} ({len(content)} chars)")
+
+        max_chars = threshold
+        if source_path.suffix == ".py":
+            chunks = chunk_python(content, max_chars=max_chars)
+        elif source_path.suffix == ".md":
+            chunks = chunk_markdown(content, max_chars=max_chars)
+        else:
+            chunks = chunk_fixed(content, max_chars=max_chars)
+
+        if args.dry_run:
+            for i, chunk in enumerate(chunks, 1):
+                print(f"  chunk {i}: {len(chunk)} chars")
+            continue
+
+        today = date.today()
+        entry_dir = Path("entries") / str(today.year) / f"{today.month:02d}" / f"{today.day:02d}"
+        entry_dir.mkdir(parents=True, exist_ok=True)
+
+        source_url = meta.get("source_url") or meta.get("source", "")
+        if source_url and not source_url.startswith(("http://", "https://")):
+            source_url = ""
+        source_id = meta.get("source_id", "")
+
+        for i, chunk in enumerate(chunks, 1):
+            entry_name = f"{source_path.stem}-chunk-{i}.md"
+            entry_path = entry_dir / entry_name
+
+            fm_lines = [f"source: {source_path}"]
+            if source_url:
+                fm_lines.append(f"source_url: {source_url}")
+            if source_id:
+                fm_lines.append(f"source_id: {source_id}")
+            fm_lines.append(f"chunk: {i}/{len(chunks)}")
+            frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
+
+            entry_path.write_text(frontmatter + chunk + "\n")
+            print(f"  -> {entry_path}")
+
+        total_chunked += 1
+
+        with manifest.open("a") as f:
+            f.write(f"{source_path}\n")
+        done.add(str(source_path))
+
+    print(f"\nChunked {total_chunked} files ({total_skipped} already done)")
diff --git a/expert_build/cli.py b/expert_build/cli.py
index 0f99b53..2cb73b9 100644
--- a/expert_build/cli.py
+++ b/expert_build/cli.py
@@ -48,6 +48,13 @@ def main():
     chunk_p.add_argument("--source-label", help="Citation label for Source line")
     chunk_p.add_argument("--dry-run", action="store_true", help="Show sections without creating entries")
 
+    # -- chunk-docs --
+    chunkd_p = sub.add_parser("chunk-docs", help="Chunk large documents into entry-sized pieces")
+    chunkd_p.add_argument("--input-dir", default="sources", help="Source directory (default: sources)")
+    chunkd_p.add_argument("--threshold", type=int, default=30000,
+                          help="Only chunk files larger than this (default: 30000)")
+    chunkd_p.add_argument("--dry-run", action="store_true", help="Show chunks without creating entries")
+
     # -- summarize --
     sum_p = sub.add_parser("summarize", help="Generate entries from source documents")
     sum_p.add_argument("--input-dir", default="sources", help="Source directory (default: sources)")
@@ -140,6 +147,7 @@ def main():
     commands = {
         "init": lambda a: _lazy("init_cmd", "cmd_init")(a),
         "chunk-pdf": lambda a: _lazy("chunk_pdf", "cmd_chunk_pdf")(a),
+        "chunk-docs": lambda a: _lazy("chunk_docs", "cmd_chunk_docs")(a),
         "fetch-docs": lambda a: _lazy("fetch", "cmd_fetch_docs")(a),
         "summarize": lambda a: _lazy("summarize", "cmd_summarize")(a),
         "propose-beliefs": lambda a: _lazy("propose", "cmd_propose_beliefs")(a),
diff --git a/expert_build/summarize.py b/expert_build/summarize.py
index 840d7de..29d8bcb 100644
--- a/expert_build/summarize.py
+++ b/expert_build/summarize.py
@@ -83,7 +83,7 @@ def cmd_summarize(args):
                       f"Consider: expert-build chunk-pdf {source_path}")
             else:
                 print(f"  WARN: truncated from {original_len} to 30000 chars. "
-                      f"Large documents may lose tail content.")
+                      f"Consider: expert-build chunk-docs")
 
         template = SUMMARIZE_CODE if source_path.suffix == ".py" else SUMMARIZE
         prompt = template.format(content=content)
diff --git a/tests/test_chunk_docs.py b/tests/test_chunk_docs.py
new file mode 100644
index 0000000..571e52e
--- /dev/null
+++ b/tests/test_chunk_docs.py
@@ -0,0 +1,237 @@
+"""Tests for expert_build.chunk_docs — document chunking."""
+
+import types
+from pathlib import Path
+
+import pytest
+
+from expert_build.chunk_docs import (
+    chunk_markdown,
+    chunk_python,
+    chunk_fixed,
+    cmd_chunk_docs,
+)
+
+
+# --- chunk_markdown ---
+
+def test_chunk_markdown_by_headings():
+    text = "# Section 1\nContent one.\n\n# Section 2\nContent two.\n"
+    chunks = chunk_markdown(text, max_chars=50)
+    assert len(chunks) == 2
+    assert "Section 1" in chunks[0]
+    assert "Section 2" in chunks[1]
+
+
+def test_chunk_markdown_merges_small_sections():
+    text = "# A\nShort.\n\n# B\nAlso short.\n\n# C\nStill short.\n"
+    chunks = chunk_markdown(text, max_chars=1000)
+    assert len(chunks) == 1
+    assert "A" in chunks[0]
+    assert "C" in chunks[0]
+
+
+def test_chunk_markdown_h2_headings():
+    text = "## First\nContent.\n\n## Second\nMore content.\n"
+    chunks = chunk_markdown(text, max_chars=30)
+    assert len(chunks) == 2
+
+
+def test_chunk_markdown_no_headings_falls_back():
+    text = "Just a plain text document with no headings at all. " * 100
+    chunks = chunk_markdown(text, max_chars=200)
+    assert len(chunks) > 1
+
+
+# --- chunk_python ---
+
+def test_chunk_python_by_definitions():
+    text = (
+        "import os\n\n"
+        "def foo():\n    pass\n\n"
+        "def bar():\n    pass\n"
+    )
+    chunks = chunk_python(text, max_chars=40)
+    assert len(chunks) == 2
+    assert "def foo" in chunks[0]
+    assert "def bar" in chunks[1]
+
+
+def test_chunk_python_keeps_imports():
+    text = (
+        "import os\nimport sys\n\n"
+        "def first():\n    pass\n\n"
+        "def second():\n    pass\n"
+    )
+    chunks = chunk_python(text, max_chars=60)
+    assert len(chunks) == 2
+    for chunk in chunks:
+        assert "import os" in chunk
+
+
+def test_chunk_python_class_boundary():
+    text = (
+        "import x\n\n"
+        "class Foo:\n    pass\n\n"
+        "class Bar:\n    pass\n"
+    )
+    chunks = chunk_python(text, max_chars=40)
+    assert len(chunks) == 2
+    assert "class Foo" in chunks[0]
+    assert "class Bar" in chunks[1]
+
+
+def test_chunk_python_decorator_stays_with_function():
+    text = (
+        "import os\n\n"
+        "@decorator\n"
+        "def foo():\n    pass\n\n"
+        "def bar():\n    pass\n"
+    )
+    chunks = chunk_python(text, max_chars=60)
+    assert len(chunks) == 2
+    assert "@decorator" in chunks[0]
+    assert "def foo" in chunks[0]
+
+
+def test_chunk_python_no_defs_falls_back():
+    text = "x = 1\ny = 2\nz = 3\n" * 100
+    chunks = chunk_python(text, max_chars=100)
+    assert len(chunks) > 1
+
+
+# --- chunk_fixed ---
+
+def test_chunk_fixed_small_text_single_chunk():
+    text = "Short text."
+    chunks = chunk_fixed(text, max_chars=100)
+    assert len(chunks) == 1
+    assert chunks[0] == text
+
+
+def test_chunk_fixed_with_overlap():
+    text = "a" * 1000
+    chunks = chunk_fixed(text, max_chars=400, overlap=100)
+    assert len(chunks) >= 3
+    assert chunks[0][-100:] == chunks[1][:100]
+
+
+# --- cmd_chunk_docs ---
+
+@pytest.fixture
+def source_dir(tmp_path):
+    d = tmp_path / "sources"
+    d.mkdir()
+    return d
+
+
+@pytest.fixture
+def work_dir(tmp_path, monkeypatch):
+    wd = tmp_path / "work"
+    wd.mkdir()
+    monkeypatch.chdir(wd)
+    return wd
+
+
+def make_args(input_dir, threshold=100, dry_run=False):
+    return types.SimpleNamespace(
+        input_dir=str(input_dir),
+        threshold=threshold,
+        dry_run=dry_run,
+    )
+
+
+def test_skips_small_files(source_dir, work_dir, capsys):
+    (source_dir / "small.md").write_text("# Short\nContent")
+    args = make_args(source_dir, threshold=30000)
+    cmd_chunk_docs(args)
+    captured = capsys.readouterr()
+    assert "Chunked 0 files" in captured.out
+    entries = list((work_dir / "entries").rglob("*.md")) if (work_dir / "entries").exists() else []
+    assert len(entries) == 0
+
+
+def test_chunks_large_markdown(source_dir, work_dir):
+    text = "# Section 1\n" + "x" * 200 + "\n\n# Section 2\n" + "y" * 200
+    (source_dir / "big.md").write_text(text)
+    args = make_args(source_dir, threshold=100)
+    cmd_chunk_docs(args)
+    entries = list((work_dir / "entries").rglob("*.md"))
+    assert len(entries) == 2
+    contents = [e.read_text() for e in sorted(entries)]
+    assert "Section 1" in contents[0]
+    assert "Section 2" in contents[1]
+
+
+def test_chunks_large_python(source_dir, work_dir):
+    text = "import os\n\n" + "def foo():\n    " + "x = 1\n    " * 50 + "\n\ndef bar():\n    " + "y = 2\n    " * 50
+    (source_dir / "big.py").write_text(text)
+    args = make_args(source_dir, threshold=100)
+    cmd_chunk_docs(args)
+    entries = list((work_dir / "entries").rglob("*.md"))
+    assert len(entries) == 2
+
+
+def test_dry_run_no_files_created(source_dir, work_dir, capsys):
+    text = "# A\n" + "x" * 200 + "\n\n# B\n" + "y" * 200
+    (source_dir / "big.md").write_text(text)
+    args = make_args(source_dir, threshold=100, dry_run=True)
+    cmd_chunk_docs(args)
+    entries_dir = work_dir / "entries"
+    assert not entries_dir.exists() or len(list(entries_dir.rglob("*.md"))) == 0
+    captured = capsys.readouterr()
+    assert "chunk 1" in captured.out
+
+    manifest = work_dir / ".chunked-docs"
+    assert not manifest.exists()
+
+
+def test_dry_run_does_not_poison_manifest(source_dir, work_dir):
+    """Dry-run should not prevent subsequent real runs."""
+    text = "# A\n" + "x" * 200 + "\n\n# B\n" + "y" * 200
+    (source_dir / "big.md").write_text(text)
+
+    dry_args = make_args(source_dir, threshold=100, dry_run=True)
+    cmd_chunk_docs(dry_args)
+
+    real_args = make_args(source_dir, threshold=100, dry_run=False)
+    cmd_chunk_docs(real_args)
+
+    entries = list((work_dir / "entries").rglob("*.md"))
+    assert len(entries) == 2
+
+
+def test_provenance_frontmatter(source_dir, work_dir):
+    fm = "---\nsource_url: https://example.com/doc\nsource_id: abc\n---\n\n"
+    text = fm + "# Part 1\n" + "x" * 200 + "\n\n# Part 2\n" + "y" * 200
+    (source_dir / "doc.md").write_text(text)
+    args = make_args(source_dir, threshold=100)
+    cmd_chunk_docs(args)
+    entries = sorted((work_dir / "entries").rglob("*.md"))
+    content = entries[0].read_text()
+    assert "source_url: https://example.com/doc" in content
+    assert "source_id: abc" in content
+    assert "chunk: 1/" in content
+
+
+def test_manifest_tracking(source_dir, work_dir):
+    text = "# A\n" + "x" * 200 + "\n\n# B\n" + "y" * 200
+    (source_dir / "big.md").write_text(text)
+    args = make_args(source_dir, threshold=100)
+    cmd_chunk_docs(args)
+    entries_count_1 = len(list((work_dir / "entries").rglob("*.md")))
+
+    cmd_chunk_docs(args)
+    entries_count_2 = len(list((work_dir / "entries").rglob("*.md")))
+    assert entries_count_1 == entries_count_2
+
+
+def test_chunk_names_include_stem(source_dir, work_dir):
+    text = "# A\n" + "x" * 200 + "\n\n# B\n" + "y" * 200
+    (source_dir / "my-doc.md").write_text(text)
+    args = make_args(source_dir, threshold=100)
+    cmd_chunk_docs(args)
+    entries = list((work_dir / "entries").rglob("*.md"))
+    names = [e.name for e in entries]
+    assert any("my-doc-chunk-1" in n for n in names)
+    assert any("my-doc-chunk-2" in n for n in names)
diff --git a/tests/test_summarize.py b/tests/test_summarize.py
index ff3457d..7f000be 100644
--- a/tests/test_summarize.py
+++ b/tests/test_summarize.py
@@ -130,7 +130,7 @@ def test_truncation_warning_for_large_file(source_dir, work_dir, capsys):
 
     captured = capsys.readouterr()
     assert "WARN: truncated from 50000 to 30000 chars" in captured.out
-    assert "Large documents may lose tail content" in captured.out
+    assert "Consider: expert-build chunk-docs" in captured.out
 
 
 def test_truncation_content_is_capped(source_dir, work_dir):

```

## Observation Results

You previously requested observations. Here are the results:

```json
{
  "lazy_helper_body": {
    "function": "_lazy",
    "file": "expert_build/cli.py",
    "start_line": 11,
    "end_line": 14,
    "source": "def _lazy(module_name, func_name):\n    \"\"\"Lazy import to keep startup fast.\"\"\"\n    mod = importlib.import_module(f\".{module_name}\", package=\"expert_build\")\n    return getattr(mod, func_name)"
  },
  "cmd_chunk_pdf_body": {
    "function": "cmd_chunk_pdf",
    "file": "expert_build/chunk_pdf.py",
    "start_line": 151,
    "end_line": 257,
    "source": "def cmd_chunk_pdf(args):\n    \"\"\"Chunk a PDF paper into section-by-section entries.\"\"\"\n    pdf_path = Path(args.pdf).resolve()\n    if not pdf_path.exists():\n        print(f\"PDF not found: {pdf_path}\")\n        sys.exit(1)\n\n    if pdf_path.suffix.lower() != \".pdf\":\n        print(f\"Not a PDF file: {pdf_path}\")\n        sys.exit(1)\n\n    prefix = args.prefix or slugify(pdf_path.stem)\n    source_label = args.source_label or pdf_path.stem\n\n    print(f\"Reading PDF: {pdf_path}\")\n    pages = extract_text_by_page(pdf_path)\n    print(f\"  {len(pages)} pages extracted\")\n\n    if not pages:\n        print(\"ERROR: No pages found in PDF.\")\n        sys.exit(1)\n\n    if not check_text_quality(pages):\n        print(\"ERROR: PDF appears to be scanned with no text layer.\")\n        print(\"OCR the PDF first (e.g., with ocrmypdf) and try again.\")\n        sys.exit(1)\n\n    print(\"Identifying sections...\")\n    sections = identify_sections(pages)\n\n    if not sections:\n        print(\"  No sections found. Falling back to one entry per page.\")\n        sections = [\n            {\"number\": str(i + 1), \"title\": f\"Page {i + 1}\", \"start_page\": i + 1, \"end_page\": i + 1}\n            for i in range(len(pages))\n        ]\n\n    print(f\"  Found {len(sections)} sections:\")\n    for s in sections:\n        print(f\"    {s['number']}. {s['title']} (pp. {s['start_page']}-{s['end_page']})\")\n\n    if args.dry_run:\n        print(\"\\n(dry run \u2014 no entries created)\")\n        return\n\n    manifest = Path(f\".chunked-{prefix}\")\n    done = set()\n    if manifest.exists():\n        done = set(manifest.read_text().strip().split(\"\\n\"))\n\n    created = 0\n    skipped = 0\n\n    for section in sections:\n        filename = make_entry_filename(prefix, section)\n\n        if filename in done:\n            print(f\"  SKIP (already chunked): {filename}\")\n            skipped += 1\n            continue\n\n        print(f\"  Creating: Section {section['number']} \u2014 {section['title']}...\")\n\n        content = format_section_content(pages, section, source_label)\n        title = f\"Section {section['number']}: {section['title']}\"\n\n        try:\n            with tempfile.NamedTemporaryFile(\n                mode=\"w\", suffix=\".md\", delete=False,\n            ) as tmp:\n                tmp.write(content)\n                tmp_path = tmp.name\n\n            result = subprocess.run(\n                [\"entry\", \"create\", filename, title, \"--content-file\", tmp_path],\n                capture_output=True,\n                text=True,\n            )\n\n            Path(tmp_path).unlink(missing_ok=True)\n\n            if result.returncode == 0:\n                print(f\"    -> {result.stdout.strip()}\")\n            else:\n                result = subprocess.run(\n                    [\"entry\", \"create\", filename, title, \"--content\", content],\n                    capture_output=True,\n                    text=True,\n                )\n                if result.returncode == 0:\n                    print(f\"    -> {result.stdout.strip()}\")\n                else:\n                    print(f\"    WARN: entry create failed: {result.stderr.strip()}\")\n                    continue\n        except FileNotFoundError:\n            print(\"  ERROR: entry CLI not found. Install with: uv tool install entry\")\n            sys.exit(1)\n\n        with manifest.open(\"a\") as f:\n            f.write(f\"{filename}\\n\")\n        done.add(filename)\n\n        created += 1\n\n    print(f\"\\nChunked {created} sections ({skipped} already done)\")\n    if created:\n        print(\"Next: expert-build propose-beliefs\")"
  },
  "cmd_summarize_body": {
    "function": "cmd_summarize",
    "file": "expert_build/summarize.py",
    "start_line": 11,
    "end_line": 122,
    "source": "def cmd_summarize(args):\n    \"\"\"Generate entries from source documents.\"\"\"\n    from .caffeinate import hold as _caffeinate\n    _caffeinate()\n    input_dir = Path(args.input_dir)\n    if not input_dir.exists():\n        print(f\"Source directory not found: {input_dir}\")\n        print(\"Run: expert-build fetch-docs <url>\")\n        sys.exit(1)\n\n    if not check_model_available(args.model):\n        print(f\"Model not available: {args.model}\")\n        print(\"Install claude CLI or specify --model\")\n        sys.exit(1)\n\n    sources = sorted(\n        [*input_dir.glob(\"*.md\"), *input_dir.glob(\"*.py\")],\n        key=lambda p: p.name,\n    )\n    if not sources:\n        print(f\"No .md or .py files in {input_dir}\")\n        return\n\n    if args.limit:\n        sources = sources[:args.limit]\n\n    # Track what's been summarized\n    manifest = Path(\".summarized\")\n    done = set()\n    if manifest.exists():\n        done = set(manifest.read_text().strip().split(\"\\n\"))\n\n    processed = 0\n    skipped = 0\n\n    for source_path in sources:\n        if str(source_path) in done:\n            skipped += 1\n            continue\n\n        print(f\"Summarizing: {source_path.name}\")\n\n        content = source_path.read_text()\n\n        # Extract and strip frontmatter\n        source_url = None\n        source_id = None\n        if content.startswith(\"---\"):\n            end = content.find(\"---\", 3)\n            if end != -1:\n                frontmatter = content[3:end]\n                for line in frontmatter.splitlines():\n                    if line.startswith(\"source_url:\"):\n                        source_url = line.split(\":\", 1)[1].strip()\n                    elif line.startswith(\"source:\"):\n                        val = line.split(\":\", 1)[1].strip()\n                        if not source_url and val.startswith((\"http://\", \"https://\")):\n                            source_url = val\n                    elif line.startswith(\"source_id:\"):\n                        source_id = line.split(\":\", 1)[1].strip()\n                content = content[end + 3:].strip()\n\n        if not content.strip():\n            print(f\"  SKIP (empty)\")\n            continue\n\n        # Truncate very long documents\n        if len(content) > 30000:\n            original_len = len(content)\n            content = content[:30000] + \"\\n\\n[Truncated \u2014 original was longer]\"\n            if source_path.suffix == \".pdf\":\n                print(f\"  WARN: truncated from {original_len} to 30000 chars. \"\n                      f\"Consider: expert-build chunk-pdf {source_path}\")\n            else:\n                print(f\"  WARN: truncated from {original_len} to 30000 chars. \"\n                      f\"Consider: expert-build chunk-docs\")\n\n        template = SUMMARIZE_CODE if source_path.suffix == \".py\" else SUMMARIZE\n        prompt = template.format(content=content)\n\n        try:\n            summary = invoke_sync(prompt, model=args.model)\n        except Exception as e:\n            print(f\"  ERROR: {e}\")\n            continue\n\n        topic = source_path.stem\n\n        # Write entry directly with provenance frontmatter\n        today = date.today()\n        entry_dir = Path(\"entries\") / str(today.year) / f\"{today.month:02d}\" / f\"{today.day:02d}\"\n        entry_dir.mkdir(parents=True, exist_ok=True)\n        entry_path = entry_dir / f\"{topic}.md\"\n\n        fm_lines = [f\"source: {source_path}\"]\n        if source_url:\n            fm_lines.append(f\"source_url: {source_url}\")\n        if source_id:\n            fm_lines.append(f\"source_id: {source_id}\")\n        frontmatter = \"---\\n\" + \"\\n\".join(fm_lines) + \"\\n---\\n\\n\"\n\n        entry_path.write_text(frontmatter + summary + \"\\n\")\n        print(f\"  -> Created {entry_path}\")\n\n        # Record as done\n        with manifest.open(\"a\") as f:\n            f.write(f\"{source_path}\\n\")\n        done.add(str(source_path))\n\n        processed += 1\n\n    print(f\"\\nSummarized {processed} sources ({skipped} already done)\")"
  },
  "chunk_docs_callers": {
    "symbol": "cmd_chunk_docs",
    "production_callers": [
      {
        "file": "expert_build/cli.py",
        "line": 150,
        "text": "\"chunk-docs\": lambda a: _lazy(\"chunk_docs\", \"cmd_chunk_docs\")(a),",
        "context_function": "main",
        "context_snippet": "   147:     commands = {\n   148:         \"init\": lambda a: _lazy(\"init_cmd\", \"cmd_init\")(a),\n   149:         \"chunk-pdf\": lambda a: _lazy(\"chunk_pdf\", \"cmd_chunk_pdf\")(a),\n>> 150:         \"chunk-docs\": lambda a: _lazy(\"chunk_docs\", \"cmd_chunk_docs\")(a),\n   151:         \"fetch-docs\": lambda a: _lazy(\"fetch\", \"cmd_fetch_docs\")(a),\n   152:         \"summarize\": lambda a: _lazy(\"summarize\", \"cmd_summarize\")(a),\n   153:         \"propose-beliefs\": lambda a: _lazy(\"propose\", \"cmd_propose_beliefs\")(a),"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 110,
        "text": "def cmd_chunk_docs(args):",
        "context_function": "_strip_frontmatter",
        "context_snippet": "   107:     return meta, content\n   108: \n   109: \n>> 110: def cmd_chunk_docs(args):\n   111:     \"\"\"Chunk large documents into entry-sized pieces.\"\"\"\n   112:     input_dir = Path(args.input_dir)\n   113:     if not input_dir.exists():"
      }
    ],
    "test_callers": [
      {
        "file": "tests/test_chunk_docs.py",
        "line": 12,
        "text": "cmd_chunk_docs,",
        "context_function": null,
        "context_snippet": "   9:     chunk_markdown,\n   10:     chunk_python,\n   11:     chunk_fixed,\n>> 12:     cmd_chunk_docs,\n   13: )\n   14: \n   15: "
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 119,
        "text": "# --- cmd_chunk_docs ---",
        "context_function": "test_chunk_fixed_with_overlap",
        "context_snippet": "   116:     assert chunks[0][-100:] == chunks[1][:100]\n   117: \n   118: \n>> 119: # --- cmd_chunk_docs ---\n   120: \n   121: @pytest.fixture\n   122: def source_dir(tmp_path):"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 147,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_skips_small_files",
        "context_snippet": "   144: def test_skips_small_files(source_dir, work_dir, capsys):\n   145:     (source_dir / \"small.md\").write_text(\"# Short\\nContent\")\n   146:     args = make_args(source_dir, threshold=30000)\n>> 147:     cmd_chunk_docs(args)\n   148:     captured = capsys.readouterr()\n   149:     assert \"Chunked 0 files\" in captured.out\n   150:     entries = list((work_dir / \"entries\").rglob(\"*.md\")) if (work_dir / \"entries\").exists() else []"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 158,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_chunks_large_markdown",
        "context_snippet": "   155:     text = \"# Section 1\\n\" + \"x\" * 200 + \"\\n\\n# Section 2\\n\" + \"y\" * 200\n   156:     (source_dir / \"big.md\").write_text(text)\n   157:     args = make_args(source_dir, threshold=100)\n>> 158:     cmd_chunk_docs(args)\n   159:     entries = list((work_dir / \"entries\").rglob(\"*.md\"))\n   160:     assert len(entries) == 2\n   161:     contents = [e.read_text() for e in sorted(entries)]"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 170,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_chunks_large_python",
        "context_snippet": "   167:     text = \"import os\\n\\n\" + \"def foo():\\n    \" + \"x = 1\\n    \" * 50 + \"\\n\\ndef bar():\\n    \" + \"y = 2\\n    \" * 50\n   168:     (source_dir / \"big.py\").write_text(text)\n   169:     args = make_args(source_dir, threshold=100)\n>> 170:     cmd_chunk_docs(args)\n   171:     entries = list((work_dir / \"entries\").rglob(\"*.md\"))\n   172:     assert len(entries) == 2\n   173: "
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 179,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_dry_run_no_files_created",
        "context_snippet": "   176:     text = \"# A\\n\" + \"x\" * 200 + \"\\n\\n# B\\n\" + \"y\" * 200\n   177:     (source_dir / \"big.md\").write_text(text)\n   178:     args = make_args(source_dir, threshold=100, dry_run=True)\n>> 179:     cmd_chunk_docs(args)\n   180:     entries_dir = work_dir / \"entries\"\n   181:     assert not entries_dir.exists() or len(list(entries_dir.rglob(\"*.md\"))) == 0\n   182:     captured = capsys.readouterr()"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 195,
        "text": "cmd_chunk_docs(dry_args)",
        "context_function": "test_dry_run_does_not_poison_manifest",
        "context_snippet": "   192:     (source_dir / \"big.md\").write_text(text)\n   193: \n   194:     dry_args = make_args(source_dir, threshold=100, dry_run=True)\n>> 195:     cmd_chunk_docs(dry_args)\n   196: \n   197:     real_args = make_args(source_dir, threshold=100, dry_run=False)\n   198:     cmd_chunk_docs(real_args)"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 198,
        "text": "cmd_chunk_docs(real_args)",
        "context_function": "test_dry_run_does_not_poison_manifest",
        "context_snippet": "   195:     cmd_chunk_docs(dry_args)\n   196: \n   197:     real_args = make_args(source_dir, threshold=100, dry_run=False)\n>> 198:     cmd_chunk_docs(real_args)\n   199: \n   200:     entries = list((work_dir / \"entries\").rglob(\"*.md\"))\n   201:     assert len(entries) == 2"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 209,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_provenance_frontmatter",
        "context_snippet": "   206:     text = fm + \"# Part 1\\n\" + \"x\" * 200 + \"\\n\\n# Part 2\\n\" + \"y\" * 200\n   207:     (source_dir / \"doc.md\").write_text(text)\n   208:     args = make_args(source_dir, threshold=100)\n>> 209:     cmd_chunk_docs(args)\n   210:     entries = sorted((work_dir / \"entries\").rglob(\"*.md\"))\n   211:     content = entries[0].read_text()\n   212:     assert \"source_url: https://example.com/doc\" in content"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 221,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_manifest_tracking",
        "context_snippet": "   218:     text = \"# A\\n\" + \"x\" * 200 + \"\\n\\n# B\\n\" + \"y\" * 200\n   219:     (source_dir / \"big.md\").write_text(text)\n   220:     args = make_args(source_dir, threshold=100)\n>> 221:     cmd_chunk_docs(args)\n   222:     entries_count_1 = len(list((work_dir / \"entries\").rglob(\"*.md\")))\n   223: \n   224:     cmd_chunk_docs(args)"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 224,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_manifest_tracking",
        "context_snippet": "   221:     cmd_chunk_docs(args)\n   222:     entries_count_1 = len(list((work_dir / \"entries\").rglob(\"*.md\")))\n   223: \n>> 224:     cmd_chunk_docs(args)\n   225:     entries_count_2 = len(list((work_dir / \"entries\").rglob(\"*.md\")))\n   226:     assert entries_count_1 == entries_count_2\n   227: "
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 233,
        "text": "cmd_chunk_docs(args)",
        "context_function": "test_chunk_names_include_stem",
        "context_snippet": "   230:     text = \"# A\\n\" + \"x\" * 200 + \"\\n\\n# B\\n\" + \"y\" * 200\n   231:     (source_dir / \"my-doc.md\").write_text(text)\n   232:     args = make_args(source_dir, threshold=100)\n>> 233:     cmd_chunk_docs(args)\n   234:     entries = list((work_dir / \"entries\").rglob(\"*.md\"))\n   235:     names = [e.name for e in entries]\n   236:     assert any(\"my-doc-chunk-1\" in n for n in names)"
      }
    ],
    "production_count": 2,
    "test_count": 12,
    "total_count": 14
  },
  "chunk_fixed_usages": {
    "symbol": "chunk_fixed",
    "usages": [
      {
        "file": "tests/test_chunk_docs.py",
        "line": 11,
        "text": "chunk_fixed,"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 103,
        "text": "# --- chunk_fixed ---"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 105,
        "text": "def test_chunk_fixed_small_text_single_chunk():"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 107,
        "text": "chunks = chunk_fixed(text, max_chars=100)"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 112,
        "text": "def test_chunk_fixed_with_overlap():"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 114,
        "text": "chunks = chunk_fixed(text, max_chars=400, overlap=100)"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 15,
        "text": "return chunk_fixed(text, max_chars)"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 41,
        "text": "return chunk_fixed(text, max_chars)"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 56,
        "text": "return chunk_fixed(text, max_chars)"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 78,
        "text": "def chunk_fixed(text, max_chars=25000, overlap=500):"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 154,
        "text": "chunks = chunk_fixed(content, max_chars=max_chars)"
      }
    ],
    "production_usages": [
      {
        "file": "expert_build/chunk_docs.py",
        "line": 15,
        "text": "return chunk_fixed(text, max_chars)"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 41,
        "text": "return chunk_fixed(text, max_chars)"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 56,
        "text": "return chunk_fixed(text, max_chars)"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 78,
        "text": "def chunk_fixed(text, max_chars=25000, overlap=500):"
      },
      {
        "file": "expert_build/chunk_docs.py",
        "line": 154,
        "text": "chunks = chunk_fixed(content, max_chars=max_chars)"
      }
    ],
    "test_usages": [
      {
        "file": "tests/test_chunk_docs.py",
        "line": 11,
        "text": "chunk_fixed,"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 103,
        "text": "# --- chunk_fixed ---"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 105,
        "text": "def test_chunk_fixed_small_text_single_chunk():"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 107,
        "text": "chunks = chunk_fixed(text, max_chars=100)"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 112,
        "text": "def test_chunk_fixed_with_overlap():"
      },
      {
        "file": "tests/test_chunk_docs.py",
        "line": 114,
        "text": "chunks = chunk_fixed(text, max_chars=400, overlap=100)"
      }
    ],
    "production_count": 5,
    "test_count": 6,
    "total_count": 11
  },
  "chunk_docs_tests": {
    "source_file": "expert_build/chunk_docs.py",
    "test_files": [
      {
        "path": "tests/test_chunk_docs.py",
        "exists": true,
        "line_count": 237
      },
      {
        "path": "tests/test_chunk_docspy",
        "exists": false
      }
    ],
    "test_count": 2
  },
  "cli_imports": {
    "file": "expert_build/cli.py",
    "imports": [
      "argparse",
      "importlib",
      "sys"
    ],
    "from_imports": [
      {
        "module": "pathlib",
        "names": [
          "Path"
        ]
      },
      {
        "module": "",
        "names": [
          "__version__"
        ]
      }
    ],
    "import_section": "\"\"\"Expert agent builder CLI.\"\"\"\n\nimport argparse\nimport importlib\nimport sys\nfrom pathlib import Path\n\nfrom . import __version__\n\n\ndef _lazy(module_name, func_name):\n    \"\"\"Lazy import to keep startup fast.\"\"\""
  }
}
```

Use these results to inform your review. Do not request the same observations again.


## Instructions

For each significant change (new file, modified function, etc.), provide a structured verdict.

Use this exact format for each change:

### <file_path or file_path:function_name>
VERDICT: PASS | CONCERN | BLOCK
CORRECTNESS: VALID | QUESTIONABLE | BROKEN
SPEC_COMPLIANCE: MEETS | PARTIAL | VIOLATES | N/A
ISSUE_COMPLIANCE: ADDRESSES | PARTIAL | UNRELATED | N/A
BELIEF_COMPLIANCE: CONSISTENT | VIOLATES | N/A
TEST_COVERAGE: COVERED | PARTIAL | UNTESTED
INTEGRATION: WIRED | PARTIAL | MISSING
REASONING: <brief explanation of your assessment>
---

## Review Criteria

1. **CORRECTNESS**: Does the code do what it claims? Is the logic sound?
   - VALID: Logic is correct, no bugs apparent
   - QUESTIONABLE: Logic may have edge cases or unclear behavior
   - BROKEN: Clear bugs or incorrect behavior

2. **SPEC_COMPLIANCE**: Does it meet MUST requirements from the spec?
   - MEETS: All relevant spec requirements satisfied
   - PARTIAL: Some requirements met, others missing or incomplete
   - VIOLATES: Contradicts spec requirements
   - N/A: No spec provided or not applicable

3. **ISSUE_COMPLIANCE** (only when an issue is provided): Do the changes address the problem or feature described in the issue?
   - ADDRESSES: Changes directly solve the issue's stated problem or implement the requested feature
   - PARTIAL: Changes partially address the issue but leave some aspects unresolved
   - UNRELATED: Changes do not appear related to the issue
   - N/A: No issue provided

4. **TEST_COVERAGE**: Are there tests for the new/changed code?
   - COVERED: Tests exist and cover the changes
   - PARTIAL: Some tests exist but coverage is incomplete
   - UNTESTED: No tests for the changes

5. **INTEGRATION**: Are callers updated? Is the feature usable end-to-end?
   - WIRED: Feature is fully integrated and usable
   - PARTIAL: Interface exists but callers not updated, or integration incomplete
   - MISSING: No integration with existing code

6. **BELIEF_COMPLIANCE** (only when beliefs are provided): Do the changes respect known architectural invariants, contracts, and rules?
   - CONSISTENT: Changes align with or reinforce known beliefs
   - VIOLATES: Changes contradict a specific belief — cite the belief ID
   - N/A: No beliefs provided or no relevant beliefs apply

## Verdict Guidelines

- **BLOCK**: Security issues, broken functionality, spec violations, or missing critical integration
- **CONCERN**: Missing tests, partial integration, questionable patterns, or unclear logic
- **PASS**: Correct, tested, well-integrated code

## Important

- Full function bodies for modified functions may be available in the observations section — use them to verify the complete logic, not just the diff hunks
- Related test files (prefixed with ``related_test:``) may be included in observations — check whether existing test assertions still match modified return types, signatures, or behavior. Flag any test that would break due to the changes
- If duplicate test coverage is detected (multiple test files covering the same source), note it in your review
- Focus on actual issues, not style preferences
- If a method signature is added but callers aren't updated, that's PARTIAL integration
- Be specific in reasoning - reference line numbers or function names
- When in doubt, use CONCERN rather than PASS

## Self-Review

After completing your review, add a brief self-assessment:

### SELF_REVIEW
LIMITATIONS: <what context were you missing that affected review quality?>
---

Examples of limitations:
- "Could not see full class to verify no other methods access the modified field"
- "Test file not included in diff - cannot verify coverage claims"
- "Spec file referenced but not provided"


## Feature Requests

If this review tool could be improved to help you do a better job, suggest features:

### FEATURE_REQUESTS
- <suggestion 1>
- <suggestion 2>
---

Examples:
- "Include full file context for modified functions, not just diff hunks"
- "Show callers of modified methods to verify integration"
- "Include test file alongside implementation changes"

Only include this section if you have specific suggestions. Skip if none.
