Coverage for src / documint_mcp / mint.py: 80%
615 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-30 22:30 -0400
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-30 22:30 -0400
1"""
2.mint — AI-native documentation format for Documint v1.0.
4A .mint file is a TOML document with structured sections:
5 [mint] — metadata header (includes drift_status)
6 [symbols] — LSIF-compact symbol index (functions, types, etc.)
7 [narrative] — human-readable markdown prose
8 [references] — source file -> line numbers mapping
9 [api_schema] — API endpoint/function signatures
10 [ai_context] — instructions for AI agents consuming this doc
11 [links] — related files and URLs
13Compression: sections > 2KB are gzip+base64 encoded automatically.
14Exports: to_claude_md(), to_llms_txt(), to_llms_full_txt(), to_agents_md(),
15 to_api_reference()
17LSIF-compact symbol format (symbols_lsif):
18 Each entry is a dict with keys:
19 n — name (str, required)
20 k — kind (str: "function"|"class"|"type"|"const"|"method")
21 s — signature (str, full type signature)
22 d — docstring (str, first line only)
23 f — file (str, relative path)
24 l — line (int, 1-based)
26 Griffe-enriched entries may additionally include:
27 params — list of {name, type, default} dicts
28 returns — return type annotation (str)
29 decorators — list of decorator names (str)
30 bases — list of base class names (str, classes only)
31"""
32from __future__ import annotations
import base64
import gzip
import hashlib
import json
import struct
import tomllib
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
from xml.sax.saxutils import escape, quoteattr
# Optional dependency: tomli-w is only needed for *writing* TOML .mint files;
# reading uses the stdlib tomllib. to_file() raises RuntimeError when absent.
try:
    import tomli_w
    _HAS_TOMLI_W = True
except ImportError:
    _HAS_TOMLI_W = False


# Sections whose UTF-8 payload exceeds this many bytes are stored
# gzip+base64 encoded instead of raw (see _maybe_compress).
_COMPRESS_THRESHOLD_BYTES = 2048
55def _compress(text: str) -> str:
56 return base64.b64encode(gzip.compress(text.encode())).decode()
59def _decompress(data: str) -> str:
60 return gzip.decompress(base64.b64decode(data)).decode()
def _maybe_compress(text: str) -> tuple[str, str]:
    """Return an ``(encoding, data)`` pair for one section payload.

    Small payloads pass through unchanged as ``("raw", text)``; payloads
    whose UTF-8 size exceeds the threshold come back gzip+base64 encoded.
    """
    if len(text.encode()) <= _COMPRESS_THRESHOLD_BYTES:
        return "raw", text
    return "gzip+base64", _compress(text)
69def _decode_section(encoding: str, data: str) -> str:
70 if encoding == "gzip+base64":
71 return _decompress(data)
72 return data
75def _try_griffe_extraction(source_files: list[str]) -> list[dict]:
76 """Attempt to extract rich symbols from Python files using Griffe.
78 Returns an LSIF-compatible symbol list enriched with params, returns,
79 decorators, and base classes when Griffe is available. Returns an empty
80 list on failure so the caller can fall back to its existing symbols.
81 """
82 try:
83 from documint_mcp.griffe_extractor import extract_symbols_from_file, is_available
84 except ImportError:
85 return []
87 if not is_available():
88 return []
90 all_symbols: list[dict] = []
91 for fpath in source_files:
92 if not fpath.endswith(".py"):
93 continue
94 try:
95 syms = extract_symbols_from_file(fpath)
96 if syms:
97 all_symbols.extend(syms)
98 except Exception:
99 continue
100 return all_symbols
103def _format_param(p: dict) -> str:
104 """Format a single parameter dict into a signature fragment."""
105 s = p.get("name", "")
106 t = p.get("type")
107 d = p.get("default")
108 if t:
109 s += f": {t}"
110 if d:
111 s += f" = {d}"
112 return s
115def _build_rich_signature(sym: dict) -> str:
116 """Build a full signature string from enriched symbol data.
118 Prefers the Griffe-enriched 'params' / 'returns' fields; falls back to
119 the compact 's' signature when those are absent.
120 """
121 params = sym.get("params")
122 if params is None:
123 return sym.get("s", "")
125 name = sym.get("n", "")
126 kind = sym.get("k", "")
128 if kind == "class":
129 return sym.get("s", f"class {name}")
131 parts = [_format_param(p) for p in params]
132 sig = f"{name}({', '.join(parts)})"
133 ret = sym.get("returns")
134 if ret:
135 sig += f" -> {ret}"
136 return sig
@dataclass
class MintDocument:
    """In-memory representation of a .mint documentation artifact.

    Supports the TOML (v1) and binary (v2) serializations, freshness/drift
    checks, and export to several agent-facing markdown formats
    (CLAUDE.md, AGENTS.md, llms.txt, llms-full.txt, API reference).
    """

    # Exports raise ValueError when the document is older than this many days.
    MINT_MAX_AGE_DAYS: int = 7

    version: str = "1.0"          # .mint format version ("1.0" or "2.0")
    language: str = "unknown"     # primary programming language of the sources
    codebase_hash: str = ""       # e.g. "sha256:<16 hex chars>" over the file list
    generated_at: str = ""        # ISO 8601 timestamp of generation
    source_files: list[str] = field(default_factory=list)
    # symbols is either a list[dict] (LSIF-compact) or a legacy dict
    symbols: Any = field(default_factory=dict)
    narrative: str = ""           # human-readable markdown prose
    references: dict[str, list[int]] = field(default_factory=dict)
    api_schema: str = ""          # OpenAPI fragment or similar schema text
    ai_context: dict[str, Any] = field(default_factory=dict)
    links: dict[str, str] = field(default_factory=dict)
    # Either a "CLEAN"/"STALE" string or a dict with any_stale/stale_sections.
    drift_status: str | dict = "CLEAN"
    drift_history: list[dict] = field(default_factory=list)

    # ------------------------------------------------------------------
    # Constructors
    # ------------------------------------------------------------------
162 @classmethod
163 def from_file(cls, path: str | Path) -> "MintDocument":
164 """Load a MintDocument from a file, auto-detecting binary (.mint) vs TOML format."""
165 path = Path(path)
166 with open(path, "rb") as f:
167 raw = f.read()
169 # Auto-detect binary .mint format by magic bytes
170 if raw[:4] == b"MINT":
171 return cls.from_binary(raw)
173 # Fall back to TOML parsing
174 return cls._from_toml_bytes(raw)
176 @classmethod
177 def _from_toml_bytes(cls, raw: bytes) -> "MintDocument":
178 """Parse a MintDocument from raw TOML bytes (extracted for reuse)."""
179 data = tomllib.loads(raw.decode("utf-8"))
180 mint_hdr = data.get("mint", {})
181 symbols_raw = data.get("symbols", {})
182 symbols_text = _decode_section(
183 symbols_raw.get("encoding", "raw"),
184 symbols_raw.get("data", "[]"),
185 )
186 try:
187 symbols = json.loads(symbols_text)
188 except json.JSONDecodeError:
189 symbols = []
190 narrative_raw = data.get("narrative", {})
191 narrative = _decode_section(
192 narrative_raw.get("encoding", "raw"),
193 narrative_raw.get("content", ""),
194 )
195 api_raw = data.get("api_schema", {})
196 api_schema = _decode_section(
197 api_raw.get("encoding", "raw"),
198 api_raw.get("content", ""),
199 )
200 return cls(
201 version=mint_hdr.get("version", "1.0"),
202 language=mint_hdr.get("language", "unknown"),
203 codebase_hash=mint_hdr.get("codebase_hash", ""),
204 generated_at=mint_hdr.get("generated_at", ""),
205 source_files=mint_hdr.get("source_files", []),
206 symbols=symbols,
207 narrative=narrative,
208 references=data.get("references", {}),
209 api_schema=api_schema,
210 ai_context=data.get("ai_context", {}),
211 links=data.get("links", {}),
212 drift_status=mint_hdr.get("drift_status", "CLEAN"),
213 )
    @classmethod
    def from_artifact_trace(
        cls,
        *,
        artifact_key: str,
        artifact_type: str,
        title: str,
        source_files: list[str],
        narrative_md: str,
        symbols_lsif: list[dict] | None = None,
        api_schema: str = "",
        language: str = "unknown",
        agent_instructions: str = "",
        links: dict[str, str] | None = None,
        priority: str = "high",
        role: str = "",
        drift_status: str = "CLEAN",
    ) -> "MintDocument":
        """
        Create a MintDocument from an artifact trace.

        Args:
            artifact_key: Unique key for this artifact (e.g. "cilow-api")
            artifact_type: One of ArtifactType values (e.g. "api_reference")
            title: Human-readable title
            source_files: List of source file paths covered by this artifact
            narrative_md: Markdown prose (supplementary, lower authority than symbols)
            symbols_lsif: LSIF-compact symbol list from drift_engine. Each entry:
                {"n": name, "k": kind, "s": signature, "d": docstring,
                 "f": file, "l": line}
            api_schema: HTTP/function endpoint definitions (OpenAPI fragment, etc.)
            language: Primary programming language
            agent_instructions: Instructions for AI agents consuming this doc
            links: Related URLs {"readme": "...", "changelog": "..."}
            priority: Agent priority hint ("high"|"medium"|"low")
            role: Short description of this artifact's role for agents
            drift_status: "CLEAN" or "STALE"
        """
        # Hash the *sorted* file list so the digest is order-independent.
        codebase_hash = hashlib.sha256(
            "\n".join(sorted(source_files)).encode()
        ).hexdigest()[:16]

        # Build LSIF symbols list; include artifact metadata as the first entry
        # using a sentinel kind so loaders can reconstruct the artifact identity.
        lsif: list[dict] = []
        meta_entry: dict[str, Any] = {
            "n": title,
            "k": "__artifact__",
            "artifact_key": artifact_key,
            "artifact_type": artifact_type,
        }
        lsif.append(meta_entry)

        # Try Griffe enrichment for Python files when no symbols provided
        if not symbols_lsif and language == "python" and source_files:
            symbols_lsif = _try_griffe_extraction(source_files)

        if symbols_lsif:
            lsif.extend(symbols_lsif)

        default_role = role or f"Covers {title} ({artifact_type})"

        # Preamble teaching agents the authority ordering of the sections.
        ground_truth_preamble = (
            "Ground truth priority (highest to lowest):\n"
            "1. [SYM] — always current, extracted from live AST\n"
            "2. [REF] — exact line numbers in source files\n"
            "3. [API] — API schema\n"
            "4. [NAR] — narrative docs (MAY BE STALE — check drift.stale_sections)\n"
            "\n"
            "If drift.any_stale = true: do not cite stale [NAR] sections as fact. "
            "Cite [SYM] signatures directly."
        )

        # Caller-supplied instructions are appended after the preamble;
        # otherwise a generic usage blurb is synthesized.
        if agent_instructions:
            resolved_instructions = ground_truth_preamble + "\n\n" + agent_instructions
        else:
            resolved_instructions = (
                ground_truth_preamble + "\n\n"
                f"This document covers {title} ({artifact_type}). "
                "Use [symbols] for type signatures, [references] for code locations, "
                "[narrative] as supplementary prose."
            )

        return cls(
            language=language,
            codebase_hash=f"sha256:{codebase_hash}",
            generated_at=datetime.now(timezone.utc).isoformat(),
            source_files=source_files,
            symbols=lsif,
            narrative=narrative_md,
            api_schema=api_schema,
            ai_context={
                "model_hint": "claude-sonnet-4-6",
                "priority": priority,
                "role": default_role,
                "instructions": resolved_instructions,
            },
            links=links or {},
            drift_status=drift_status,
        )
316 # ------------------------------------------------------------------
317 # Persistence
318 # ------------------------------------------------------------------
320 def to_file(self, path: Path) -> None:
321 if not _HAS_TOMLI_W:
322 raise RuntimeError(
323 "tomli-w is required to write .mint files. Install with: pip install tomli-w"
324 )
325 path.parent.mkdir(parents=True, exist_ok=True)
326 symbols_json = json.dumps(self.symbols, separators=(",", ":"))
327 sym_enc, sym_data = _maybe_compress(symbols_json)
328 narr_enc, narr_data = _maybe_compress(self.narrative)
329 api_enc, api_data = _maybe_compress(self.api_schema)
330 doc: dict[str, Any] = {
331 "mint": {
332 "version": self.version,
333 "language": self.language,
334 "codebase_hash": self.codebase_hash,
335 "generated_at": self.generated_at,
336 "source_files": self.source_files,
337 "drift_status": self.drift_status,
338 },
339 "symbols": {"encoding": sym_enc, "data": sym_data},
340 "narrative": {"encoding": narr_enc, "content": narr_data},
341 "references": self.references,
342 "api_schema": {"encoding": api_enc, "content": api_data},
343 "ai_context": self.ai_context,
344 "links": self.links,
345 }
346 with open(path, "wb") as f:
347 tomli_w.dump(doc, f)
349 # ------------------------------------------------------------------
350 # Internal helpers
351 # ------------------------------------------------------------------
353 def _artifact_meta(self) -> dict[str, str]:
354 """Extract artifact metadata from the LSIF sentinel entry or legacy dict."""
355 if isinstance(self.symbols, list):
356 for entry in self.symbols:
357 if isinstance(entry, dict) and entry.get("k") == "__artifact__":
358 return entry
359 if isinstance(self.symbols, dict):
360 return self.symbols
361 return {}
363 def _title(self) -> str:
364 meta = self._artifact_meta()
365 return meta.get("n", meta.get("title", "Documentation"))
367 def _artifact_key(self) -> str:
368 meta = self._artifact_meta()
369 return meta.get("artifact_key", "")
371 def _artifact_type(self) -> str:
372 meta = self._artifact_meta()
373 return meta.get("artifact_type", "unknown")
375 def _export_symbols(self) -> list[dict]:
376 """Return non-sentinel LSIF entries."""
377 if isinstance(self.symbols, list):
378 return [s for s in self.symbols if isinstance(s, dict) and s.get("k") != "__artifact__"]
379 return []
381 def _agent_context_xml(self) -> str:
382 """Render the [agent_context] section as Claude XML."""
383 key = self._artifact_key()
384 artifact_type = self._artifact_type()
385 title = self._title()
386 priority = self.ai_context.get("priority", "high")
387 role = self.ai_context.get("role", f"Covers {title}")
388 last_verified = self.generated_at or datetime.now(timezone.utc).isoformat()
389 source_hash = self.codebase_hash
391 lines = [
392 "<documint-artifact>",
393 f' <artifact key="{key}" type="{artifact_type}" priority="{priority}">',
394 f" <role>{role}</role>",
395 " <sections>",
396 " <symbols>Exact type signatures — ground truth for code generation</symbols>",
397 " <api_schema>HTTP/function endpoint definitions</api_schema>",
398 " <narrative>Supplementary prose — lower authority than symbols</narrative>",
399 " </sections>",
400 " <freshness>",
401 f" <drift-status>{self.drift_status}</drift-status>",
402 f" <last-verified>{last_verified}</last-verified>",
403 f" <source-hash>{source_hash}</source-hash>",
404 " </freshness>",
405 " </artifact>",
406 "</documint-artifact>",
407 ]
408 return "\n".join(lines)
410 def _check_freshness(self) -> None:
411 """Raise ValueError if the .mint file is older than MINT_MAX_AGE_DAYS."""
412 if not self.generated_at:
413 return
414 try:
415 generated = datetime.fromisoformat(
416 self.generated_at.replace("Z", "+00:00")
417 )
418 except (ValueError, TypeError, AttributeError):
419 return # if parsing fails, allow export
420 age = datetime.now(timezone.utc) - generated
421 if age.days > self.MINT_MAX_AGE_DAYS:
422 raise ValueError(
423 f".mint file is {age.days} days old. "
424 "Run 'documint publish' to refresh before exporting."
425 )
427 # ------------------------------------------------------------------
428 # Export methods
429 # ------------------------------------------------------------------
    def to_claude_md(self) -> str:
        """Emit CLAUDE.md format with agent_context XML block embedded.

        Sections in order: title, agent-context XML, symbols, API reference,
        narrative, source files, agent instructions, and (when drift data is
        structured) a stale-sections warning.

        Raises:
            ValueError: via _check_freshness() when the doc is too old.
        """
        self._check_freshness()
        title = self._title()
        artifact_type = self._artifact_type()  # NOTE(review): currently unused here
        export_syms = self._export_symbols()

        lines: list[str] = [
            f"# {title}",
            "",
            "<!-- agent_context: generated by documint.xyz -->",
            self._agent_context_xml(),
            "",
        ]

        if export_syms:
            lines += ["## Symbols", ""]
            for sym in export_syms:
                name = sym.get("n", "")
                kind = sym.get("k", "")  # NOTE(review): unused in this renderer
                sig = _build_rich_signature(sym)
                doc = sym.get("d", "")
                f = sym.get("f", "")
                l = sym.get("l", "")
                decorators = sym.get("decorators", [])
                bases = sym.get("bases", [])

                # Decorator annotations
                if decorators:
                    for dec in decorators:
                        lines.append(f"- `@{dec}`")

                # One markdown bullet per symbol; extra facts are appended
                # as hard-wrapped continuation lines.
                entry = f"- `{name}`"
                if sig:
                    entry += f" -- `{sig}`"
                if bases:
                    entry += f" \n Bases: {', '.join(f'`{b}`' for b in bases)}"
                if doc:
                    # Only the first docstring line is shown, italicized.
                    first_line = doc.strip().split("\n")[0]
                    entry += f" \n _{first_line}_"
                if f:
                    loc = f"`{f}`"
                    if l:
                        loc += f":{l}"
                    entry += f" \n {loc}"
                lines.append(entry)
            lines.append("")

        if self.api_schema:
            lines += ["## API Reference", "", "```", self.api_schema.rstrip(), "```", ""]

        if self.narrative:
            lines += ["## Documentation", "", self.narrative, ""]

        if self.source_files:
            lines += ["## Source Files", ""]
            for sf in self.source_files:
                lines.append(f"- `{sf}`")
            lines.append("")

        if self.ai_context.get("instructions"):
            lines += ["## Agent Instructions", "", self.ai_context["instructions"], ""]

        # Stale-sections warning from drift_status (dict form only).
        stale_sections: list[str] = []
        if isinstance(self.drift_status, dict):
            stale_sections = self.drift_status.get("stale_sections", [])
        if stale_sections:
            lines.append("")
            lines.append("## Stale Sections")
            lines.append("The following narrative sections may not match the current code.")
            lines.append("Trust [SYM] signatures over these sections:")
            for s in stale_sections:
                lines.append(f"- {s}")
            lines.append("")

        return "\n".join(lines)
    def to_agents_md(self) -> str:
        """Emit AGENTS.md format — simpler, less prescriptive than CLAUDE.md.

        Sections in order: header with type/status, instructions, exports
        (with tool-style parameter bullets), narrative, source files.

        Raises:
            ValueError: via _check_freshness() when the doc is too old.
        """
        self._check_freshness()
        title = self._title()
        artifact_type = self._artifact_type()
        export_syms = self._export_symbols()

        lines: list[str] = [
            f"# Agent Context: {title}",
            "",
            f"Type: `{artifact_type}` · Status: `{self.drift_status}`",
            "",
        ]

        if self.ai_context.get("instructions"):
            lines += ["## Instructions", "", self.ai_context["instructions"], ""]

        if export_syms:
            lines += ["## Exports", ""]
            for sym in export_syms:
                name = sym.get("n", "")
                kind = sym.get("k", "")
                sig = _build_rich_signature(sym)
                doc = sym.get("d", "")
                params = sym.get("params", [])
                returns = sym.get("returns")
                bases = sym.get("bases", [])

                entry = f"- `{name}`"
                if kind:
                    entry += f" ({kind})"
                if sig:
                    entry += f": `{sig}`"
                lines.append(entry)

                # Tool-style parameter descriptions
                if params:
                    for p in params:
                        pname = p.get("name", "")
                        ptype = p.get("type", "")
                        pdef = p.get("default", "")
                        pdesc = f" - `{pname}`"
                        if ptype:
                            pdesc += f" ({ptype})"
                        if pdef:
                            pdesc += f" [default: {pdef}]"
                        lines.append(pdesc)
                if returns:
                    lines.append(f" - Returns: `{returns}`")
                if bases:
                    lines.append(f" - Inherits: {', '.join(f'`{b}`' for b in bases)}")
                if doc:
                    # Only the first docstring line is surfaced.
                    first_line = doc.strip().split("\n")[0]
                    lines.append(f" - {first_line}")
            lines.append("")

        if self.narrative:
            lines += ["## Documentation", "", self.narrative, ""]

        if self.source_files:
            lines += ["## Source Files", ""]
            for sf in self.source_files:
                lines.append(f"- `{sf}`")
            lines.append("")

        return "\n".join(lines)
    def to_llms_txt(self) -> str:
        """
        Per llmstxt.org spec: title line + blank line + summary + links section.
        This is the compact index version (not full content).
        Enriched with a structured symbol index when Griffe data is present.

        Raises:
            ValueError: via _check_freshness() when the doc is too old.
        """
        self._check_freshness()
        title = self._title()
        artifact_type = self._artifact_type()
        export_syms = self._export_symbols()

        lines: list[str] = [
            f"# {title}",
            "",
        ]

        # Summary: first paragraph of narrative or a generated one
        summary = ""
        if self.narrative:
            first_para = self.narrative.strip().split("\n\n")[0].strip()
            # Strip any markdown headings from the first paragraph
            summary_lines = [ln for ln in first_para.splitlines() if not ln.startswith("#")]
            summary = " ".join(summary_lines).strip()
        if not summary:
            summary = f"{title} -- {artifact_type} documentation."
        lines += [f"> {summary}", ""]

        # Structured symbol index for LLM consumption
        if export_syms:
            lines += ["## API Index", ""]
            for sym in export_syms:
                name = sym.get("n", "")
                sig = _build_rich_signature(sym)
                doc = sym.get("d", "")
                # Prefer the full signature as the bullet; fall back to the name.
                if sig:
                    lines.append(f"- `{sig}`")
                else:
                    lines.append(f"- `{name}`")
                if doc:
                    first_line = doc.strip().split("\n")[0]
                    lines.append(f" {first_line}")
            lines.append("")

        if self.links:
            lines.append("## Links")
            lines.append("")
            for name, url in self.links.items():
                lines.append(f"- [{name}]({url})")
            lines.append("")

        return "\n".join(lines)
    def to_llms_full_txt(self) -> str:
        """
        Full content version of llms.txt — all narrative + api_schema included.
        Used for /llms-full.txt endpoints.

        Each exported symbol gets its own "### " subsection with decorators,
        bases, signature block, parameter bullets, return type and docstring.

        Raises:
            ValueError: via _check_freshness() when the doc is too old.
        """
        self._check_freshness()
        title = self._title()
        artifact_type = self._artifact_type()
        export_syms = self._export_symbols()

        lines: list[str] = [
            f"# {title}",
            "",
            f"Type: {artifact_type} · Language: {self.language} · Status: {self.drift_status}",
            "",
        ]

        if export_syms:
            lines += ["## Exports", ""]
            for sym in export_syms:
                name = sym.get("n", "")
                kind = sym.get("k", "")
                sig = _build_rich_signature(sym)
                doc = sym.get("d", "")
                decorators = sym.get("decorators", [])
                bases = sym.get("bases", [])
                params = sym.get("params", [])
                returns = sym.get("returns")

                entry = f"### `{name}`"
                if kind:
                    entry += f" ({kind})"
                lines.append(entry)

                if decorators:
                    lines.append("Decorators: " + ", ".join(f"`@{d}`" for d in decorators))
                if bases:
                    lines.append("Inherits: " + ", ".join(f"`{b}`" for b in bases))
                if sig:
                    # Fenced block is emitted as a single embedded-newline string.
                    lines += [f"```\n{sig}\n```"]
                if params:
                    lines.append("Parameters:")
                    for p in params:
                        pname = p.get("name", "")
                        ptype = p.get("type", "")
                        pdef = p.get("default", "")
                        pdesc = f"- `{pname}`"
                        if ptype:
                            pdesc += f": {ptype}"
                        if pdef:
                            pdesc += f" (default: {pdef})"
                        lines.append(pdesc)
                if returns:
                    lines.append(f"Returns: `{returns}`")
                if doc:
                    # Full docstring, separated by a blank line.
                    lines.append("")
                    lines.append(doc)
                lines.append("")

        if self.api_schema:
            lines += ["## API Schema", "", "```", self.api_schema.rstrip(), "```", ""]

        if self.narrative:
            lines += ["## Documentation", "", self.narrative, ""]

        if self.links:
            lines += ["## Links", ""]
            for lname, url in self.links.items():
                lines.append(f"- [{lname}]({url})")
            lines.append("")

        return "\n".join(lines)
    def to_api_reference(self) -> str:
        """Generate a complete API reference document from .mint data.

        Produces publication-quality markdown organized by module/file, with
        full signatures, parameter tables, return types, inheritance, and
        docstrings. Designed to be directly publishable as developer docs.

        Raises:
            ValueError: via _check_freshness() when the doc is too old.
        """
        self._check_freshness()
        title = self._title()
        export_syms = self._export_symbols()

        lines: list[str] = [
            f"# {title} -- API Reference",
            "",
            f"*Generated by [Documint](https://documint.xyz) "
            f"on {self.generated_at or 'unknown date'}*",
            "",
        ]

        # Lead with the first narrative paragraph (headings stripped) as a summary.
        if self.narrative:
            first_para = self.narrative.strip().split("\n\n")[0].strip()
            summary_lines = [ln for ln in first_para.splitlines() if not ln.startswith("#")]
            summary = " ".join(summary_lines).strip()
            if summary:
                lines += [f"> {summary}", ""]

        if not export_syms:
            lines.append("No exported symbols found.")
            return "\n".join(lines)

        # Group symbols by file
        by_file: dict[str, list[dict]] = {}
        for sym in export_syms:
            f = sym.get("f", "(unknown)")
            by_file.setdefault(f, []).append(sym)

        # Table of contents
        lines += ["## Table of Contents", ""]
        for fpath in sorted(by_file):
            # Approximate GitHub-style heading anchors for "## `path`".
            anchor = fpath.replace("/", "").replace(".", "").replace(" ", "-").lower()
            lines.append(f"- [{fpath}](#{anchor})")
        lines.append("")

        # Sections per file
        for fpath in sorted(by_file):
            lines += [f"---", "", f"## `{fpath}`", ""]
            syms = by_file[fpath]

            # Separate classes and top-level functions
            classes = [s for s in syms if s.get("k") == "class"]
            functions = [s for s in syms if s.get("k") == "function"]
            methods = [s for s in syms if s.get("k") == "method"]
            other = [s for s in syms if s.get("k") not in ("class", "function", "method")]

            if classes:
                for cls_sym in classes:
                    cls_name = cls_sym.get("n", "")
                    bases = cls_sym.get("bases", [])
                    doc = cls_sym.get("d", "")
                    decorators = cls_sym.get("decorators", [])
                    lineno = cls_sym.get("l", "")

                    lines.append(f"### class `{cls_name}`")
                    if decorators:
                        for dec in decorators:
                            lines.append(f"`@{dec}` ")
                    if bases:
                        lines.append(f"**Bases:** {', '.join(f'`{b}`' for b in bases)}")
                    if lineno:
                        lines.append(f"*Defined at line {lineno}*")
                    lines.append("")
                    if doc:
                        lines += [doc, ""]

                    # Find methods belonging to this class
                    # (by the "ClassName." prefix in the method's full name).
                    cls_methods = [m for m in methods if m.get("n", "").startswith(f"{cls_name}.")]
                    if cls_methods:
                        lines.append("#### Methods")
                        lines.append("")
                        for m in cls_methods:
                            self._render_callable(lines, m, heading_level=5)

            if functions:
                if classes:
                    lines += ["### Module-level Functions", ""]
                # Demote function headings one level when classes are present.
                for fn in functions:
                    self._render_callable(lines, fn, heading_level=3 if not classes else 4)

            if other:
                for sym in other:
                    name = sym.get("n", "")
                    kind = sym.get("k", "")
                    sig = _build_rich_signature(sym)
                    doc = sym.get("d", "")
                    lines.append(f"### `{name}` ({kind})")
                    if sig:
                        lines += [f"```python", sig, "```", ""]
                    if doc:
                        lines += [doc, ""]

        # Source files reference
        if self.source_files:
            lines += ["---", "", "## Source Files", ""]
            for sf in self.source_files:
                lines.append(f"- `{sf}`")
            lines.append("")

        return "\n".join(lines)
    def _render_callable(self, lines: list[str], sym: dict, heading_level: int = 4) -> None:
        """Render a function or method symbol into the API reference lines list.

        Appends (in order): heading, decorators, fenced signature, line
        number, a parameter table (self/cls filtered out), return type, and
        the docstring. Mutates *lines* in place; returns nothing.
        """
        prefix = "#" * heading_level
        name = sym.get("n", "")
        sig = _build_rich_signature(sym)
        doc = sym.get("d", "")
        params = sym.get("params", [])
        returns = sym.get("returns")
        decorators = sym.get("decorators", [])
        lineno = sym.get("l", "")

        # Short name for methods: "Class.method" -> "method"
        display_name = name.split(".")[-1] if "." in name else name

        lines.append(f"{prefix} `{display_name}`")
        if decorators:
            for dec in decorators:
                lines.append(f"`@{dec}` ")
        if sig:
            lines += ["", f"```python", sig, "```", ""]
        if lineno:
            lines.append(f"*Line {lineno}*")
            lines.append("")

        if params:
            # Filter out 'self' and 'cls' for cleaner docs
            doc_params = [p for p in params if p.get("name") not in ("self", "cls")]
            if doc_params:
                lines.append("| Parameter | Type | Default |")
                lines.append("|-----------|------|---------|")
                for p in doc_params:
                    pname = p.get("name", "")
                    # "--" placeholders keep the table cells non-empty.
                    ptype = p.get("type") or "--"
                    pdef = p.get("default") or "--"
                    lines.append(f"| `{pname}` | `{ptype}` | `{pdef}` |")
                lines.append("")

        if returns:
            lines.append(f"**Returns:** `{returns}`")
            lines.append("")

        if doc:
            lines += [doc, ""]
854 # ------------------------------------------------------------------
855 # Metrics
856 # ------------------------------------------------------------------
858 def compression_ratio(self) -> float:
859 """
860 Ratio of .mint serialized size vs equivalent markdown size.
861 A ratio < 1.0 means .mint is smaller than plain markdown.
862 """
863 # Equivalent markdown = narrative + api_schema + symbols rendered as text
864 md_equivalent = self.to_llms_full_txt()
865 md_size = len(md_equivalent.encode())
867 # .mint size: approximate as JSON of all raw fields
868 mint_payload = json.dumps(
869 {
870 "symbols": self.symbols,
871 "narrative": self.narrative,
872 "api_schema": self.api_schema,
873 },
874 separators=(",", ":"),
875 )
876 mint_size = len(mint_payload.encode())
878 if md_size == 0:
879 return 1.0
880 return mint_size / md_size
882 # ------------------------------------------------------------------
883 # Binary serialization (.mint v2 binary format)
884 # ------------------------------------------------------------------
    # Binary section type tags (one byte each) for the .mint v2 container.
    _SECTION_HDR = 0x01  # metadata header (JSON)
    _SECTION_SYM = 0x02  # LSIF-compact symbols (gzip-compressed JSON)
    _SECTION_NAR = 0x03  # narrative markdown (raw UTF-8)
    _SECTION_REF = 0x04  # source references (JSON)
    _SECTION_AIC = 0x05  # ai_context (JSON)
    _SECTION_API = 0x06  # api_schema text (raw UTF-8)
    _SECTION_DIF = 0x07  # drift history (JSON)
    _FOOTER_MARKER = 0xFF  # byte preceding the 32-byte SHA-256 footer checksum
895 def _build_section(self, section_type: int, payload: bytes) -> bytes:
896 """Encode a single binary section: type (1B) + length (4B BE) + payload."""
897 return bytes([section_type]) + struct.pack(">I", len(payload)) + payload
    def to_binary(self) -> bytes:
        """
        Serialize this MintDocument to the .mint v2 binary format.

        Layout:
            HEADER (6B): b"MINT" + version(1B) + flags(1B)
            SECTIONS: type(1B) + length(4B BE) + payload(NB) (repeating)
            FOOTER (33B): 0xFF + SHA-256(all preceding bytes)

        Empty sections are skipped entirely; only the HDR section is
        always present.
        """
        buf = bytearray()

        # --- Header ---
        buf += b"MINT"
        buf += bytes([0x02, 0x00])  # version 2, flags 0

        # --- Section 0x01: HDR (metadata JSON) ---
        # Normalize drift_status into a {any_stale, stale_sections} object.
        drift_obj: dict[str, Any]
        if isinstance(self.drift_status, dict):
            drift_obj = {
                "any_stale": self.drift_status.get("any_stale", False),
                "stale_sections": self.drift_status.get("stale_sections", []),
            }
        else:
            drift_obj = {
                "any_stale": self.drift_status == "STALE",
                "stale_sections": [],
            }

        hdr_payload = json.dumps(
            {
                "version": self.version,
                "language": self.language,
                "codebase_hash": self.codebase_hash,
                "source_files": self.source_files,
                "generated_at": self.generated_at,
                "drift": drift_obj,
            },
            separators=(",", ":"),
        ).encode("utf-8")
        buf += self._build_section(self._SECTION_HDR, hdr_payload)

        # --- Section 0x02: SYM (gzip-compressed LSIF-compact JSON) ---
        # Legacy dict-shaped symbols are not serialized in the binary format.
        sym_list = self.symbols if isinstance(self.symbols, list) else []
        if sym_list:
            sym_json = json.dumps(sym_list, separators=(",", ":")).encode("utf-8")
            sym_compressed = gzip.compress(sym_json)
            buf += self._build_section(self._SECTION_SYM, sym_compressed)

        # --- Section 0x03: NAR (raw markdown UTF-8) ---
        if self.narrative:
            buf += self._build_section(self._SECTION_NAR, self.narrative.encode("utf-8"))

        # --- Section 0x04: REF (JSON) ---
        if self.references:
            ref_payload = json.dumps(self.references, separators=(",", ":")).encode("utf-8")
            buf += self._build_section(self._SECTION_REF, ref_payload)

        # --- Section 0x05: AIC (JSON) ---
        if self.ai_context:
            aic_payload = json.dumps(self.ai_context, separators=(",", ":")).encode("utf-8")
            buf += self._build_section(self._SECTION_AIC, aic_payload)

        # --- Section 0x06: API (raw text UTF-8) ---
        if self.api_schema:
            buf += self._build_section(self._SECTION_API, self.api_schema.encode("utf-8"))

        # --- Section 0x07: DIF (JSON drift history) ---
        if self.drift_history:
            dif_payload = json.dumps(self.drift_history, separators=(",", ":")).encode("utf-8")
            buf += self._build_section(self._SECTION_DIF, dif_payload)

        # --- Footer ---
        # Checksum covers everything written so far (header + sections).
        checksum = hashlib.sha256(bytes(buf)).digest()
        buf += bytes([self._FOOTER_MARKER])
        buf += checksum

        return bytes(buf)
    @classmethod
    def from_binary(cls, data: bytes) -> "MintDocument":
        """
        Parse a .mint v2 binary file and return a MintDocument.

        Validates the magic bytes and SHA-256 checksum. Raises ValueError
        on format errors or checksum mismatch. An unexpected version byte
        only warns; parsing still proceeds. If the same section type occurs
        twice, the later occurrence wins.
        """
        if len(data) < 6 + 33:  # header + minimal footer
            raise ValueError("File too small to be a valid .mint file")

        # --- Validate magic ---
        if data[:4] != b"MINT":
            raise ValueError("Not a .mint file (bad magic bytes)")

        version_byte = data[4]
        if version_byte != 0x02:
            import warnings
            warnings.warn(
                f".mint file version {version_byte} differs from expected v2; "
                "parsing will proceed but may produce unexpected results.",
                stacklevel=2,
            )

        # --- Validate footer checksum ---
        # Footer = 1 marker byte + 32-byte SHA-256 over all preceding bytes.
        if data[-33] != cls._FOOTER_MARKER:
            raise ValueError("Missing footer marker (0xFF)")
        stored_checksum = data[-32:]
        computed_checksum = hashlib.sha256(data[:-33]).digest()
        if stored_checksum != computed_checksum:
            raise ValueError(
                "Checksum mismatch: file may be corrupted or truncated"
            )

        # --- Parse sections ---
        sections: dict[int, bytes] = {}
        offset = 6  # after header
        end = len(data) - 33  # before footer

        while offset < end:
            # Each section needs at least a 5-byte header (type + length).
            if offset + 5 > end:
                raise ValueError(
                    f"Truncated section header at offset {offset}"
                )
            section_type = data[offset]
            payload_len = struct.unpack(">I", data[offset + 1 : offset + 5])[0]
            offset += 5
            if offset + payload_len > end:
                raise ValueError(
                    f"Section 0x{section_type:02X} payload extends past footer "
                    f"(need {payload_len} bytes, only {end - offset} available)"
                )
            sections[section_type] = data[offset : offset + payload_len]
            offset += payload_len

        # --- Reconstruct MintDocument ---
        doc = cls()

        # HDR (0x01)
        if cls._SECTION_HDR in sections:
            hdr = json.loads(sections[cls._SECTION_HDR].decode("utf-8"))
            doc.version = hdr.get("version", "2.0")
            doc.language = hdr.get("language", "unknown")
            doc.codebase_hash = hdr.get("codebase_hash", "")
            doc.source_files = hdr.get("source_files", [])
            doc.generated_at = hdr.get("generated_at", "")
            drift = hdr.get("drift", {})
            if isinstance(drift, dict):
                doc.drift_status = drift
            else:
                doc.drift_status = "CLEAN"

        # SYM (0x02) — gunzip then parse JSON; absent section means no symbols.
        if cls._SECTION_SYM in sections:
            sym_json = gzip.decompress(sections[cls._SECTION_SYM])
            doc.symbols = json.loads(sym_json.decode("utf-8"))
        else:
            doc.symbols = []

        # NAR (0x03)
        if cls._SECTION_NAR in sections:
            doc.narrative = sections[cls._SECTION_NAR].decode("utf-8")

        # REF (0x04)
        if cls._SECTION_REF in sections:
            doc.references = json.loads(sections[cls._SECTION_REF].decode("utf-8"))

        # AIC (0x05)
        if cls._SECTION_AIC in sections:
            doc.ai_context = json.loads(sections[cls._SECTION_AIC].decode("utf-8"))

        # API (0x06)
        if cls._SECTION_API in sections:
            doc.api_schema = sections[cls._SECTION_API].decode("utf-8")

        # DIF (0x07)
        if cls._SECTION_DIF in sections:
            doc.drift_history = json.loads(sections[cls._SECTION_DIF].decode("utf-8"))

        return doc
1078 # ------------------------------------------------------------------
1079 # Validation
1080 # ------------------------------------------------------------------
1082 def validate(self) -> list[str]:
1083 """
1084 Return a list of validation error strings. An empty list means valid.
1086 Checks:
1087 - codebase_hash is a non-empty string
1088 - generated_at is a parseable ISO 8601 datetime
1089 - version is "1.0" or "2.0"
1090 - language is non-empty
1091 - If symbols is non-empty, each entry has at least {"n": str, "k": str}
1092 """
1093 errors: list[str] = []
1095 # codebase_hash
1096 if not self.codebase_hash or not isinstance(self.codebase_hash, str):
1097 errors.append("codebase_hash must be a non-empty string")
1099 # generated_at
1100 if not self.generated_at:
1101 errors.append("generated_at is missing")
1102 else:
1103 try:
1104 datetime.fromisoformat(self.generated_at.replace("Z", "+00:00"))
1105 except (ValueError, AttributeError):
1106 errors.append(
1107 f"generated_at is not a valid ISO datetime: {self.generated_at!r}"
1108 )
1110 # version
1111 if self.version not in ("1.0", "2.0"):
1112 errors.append(
1113 f"version must be '1.0' or '2.0', got {self.version!r}"
1114 )
1116 # language
1117 if not self.language or not isinstance(self.language, str):
1118 errors.append("language must be a non-empty string")
1120 # symbols
1121 if isinstance(self.symbols, list) and self.symbols:
1122 for i, sym in enumerate(self.symbols):
1123 if not isinstance(sym, dict):
1124 errors.append(f"symbols[{i}] is not a dict")
1125 continue
1126 if "n" not in sym or not isinstance(sym.get("n"), str):
1127 errors.append(f"symbols[{i}] missing required 'n' (name) string")
1128 if "k" not in sym or not isinstance(sym.get("k"), str):
1129 errors.append(f"symbols[{i}] missing required 'k' (kind) string")
1131 return errors