Coverage for src / documint_mcp / mint.py: 80%

615 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-30 22:30 -0400

1""" 

2.mint — AI-native documentation format for Documint v1.0. 

3 

4A .mint file is a TOML document with structured sections: 

5 [mint] — metadata header (includes drift_status) 

6 [symbols] — LSIF-compact symbol index (functions, types, etc.) 

7 [narrative] — human-readable markdown prose 

8 [references] — source file -> line numbers mapping 

9 [api_schema] — API endpoint/function signatures 

10 [ai_context] — instructions for AI agents consuming this doc 

11 [links] — related files and URLs 

12 

13Compression: sections > 2KB are gzip+base64 encoded automatically. 

14Exports: to_claude_md(), to_llms_txt(), to_llms_full_txt(), to_agents_md(), 

15 to_api_reference() 

16 

17LSIF-compact symbol format (symbols_lsif): 

18 Each entry is a dict with keys: 

19 n — name (str, required) 

20 k — kind (str: "function"|"class"|"type"|"const"|"method") 

21 s — signature (str, full type signature) 

22 d — docstring (str, first line only) 

23 f — file (str, relative path) 

24 l — line (int, 1-based) 

25 

26 Griffe-enriched entries may additionally include: 

27 params — list of {name, type, default} dicts 

28 returns — return type annotation (str) 

29 decorators — list of decorator names (str) 

30 bases — list of base class names (str, classes only) 

31""" 

32from __future__ import annotations 

33 

34import base64 

35import gzip 

36import hashlib 

37import json 

38import struct 

39import tomllib 

40from dataclasses import dataclass, field 

41from datetime import datetime, timedelta, timezone 

42from pathlib import Path 

43from typing import Any 

44 

45try: 

46 import tomli_w 

47 _HAS_TOMLI_W = True 

48except ImportError: 

49 _HAS_TOMLI_W = False 

50 

51 

# Sections whose UTF-8 encoding exceeds this many bytes are stored
# gzip+base64 encoded on write (see _maybe_compress).
_COMPRESS_THRESHOLD_BYTES = 2048

53 

54 

55def _compress(text: str) -> str: 

56 return base64.b64encode(gzip.compress(text.encode())).decode() 

57 

58 

59def _decompress(data: str) -> str: 

60 return gzip.decompress(base64.b64decode(data)).decode() 

61 

62 

def _maybe_compress(text: str) -> tuple[str, str]:
    """Return an (encoding, payload) pair for a section body.

    Text at or below _COMPRESS_THRESHOLD_BYTES (UTF-8) is stored as-is with
    encoding "raw"; larger text is stored gzip+base64 encoded.
    """
    if len(text.encode()) <= _COMPRESS_THRESHOLD_BYTES:
        return "raw", text
    return "gzip+base64", _compress(text)

67 

68 

69def _decode_section(encoding: str, data: str) -> str: 

70 if encoding == "gzip+base64": 

71 return _decompress(data) 

72 return data 

73 

74 

75def _try_griffe_extraction(source_files: list[str]) -> list[dict]: 

76 """Attempt to extract rich symbols from Python files using Griffe. 

77 

78 Returns an LSIF-compatible symbol list enriched with params, returns, 

79 decorators, and base classes when Griffe is available. Returns an empty 

80 list on failure so the caller can fall back to its existing symbols. 

81 """ 

82 try: 

83 from documint_mcp.griffe_extractor import extract_symbols_from_file, is_available 

84 except ImportError: 

85 return [] 

86 

87 if not is_available(): 

88 return [] 

89 

90 all_symbols: list[dict] = [] 

91 for fpath in source_files: 

92 if not fpath.endswith(".py"): 

93 continue 

94 try: 

95 syms = extract_symbols_from_file(fpath) 

96 if syms: 

97 all_symbols.extend(syms) 

98 except Exception: 

99 continue 

100 return all_symbols 

101 

102 

103def _format_param(p: dict) -> str: 

104 """Format a single parameter dict into a signature fragment.""" 

105 s = p.get("name", "") 

106 t = p.get("type") 

107 d = p.get("default") 

108 if t: 

109 s += f": {t}" 

110 if d: 

111 s += f" = {d}" 

112 return s 

113 

114 

115def _build_rich_signature(sym: dict) -> str: 

116 """Build a full signature string from enriched symbol data. 

117 

118 Prefers the Griffe-enriched 'params' / 'returns' fields; falls back to 

119 the compact 's' signature when those are absent. 

120 """ 

121 params = sym.get("params") 

122 if params is None: 

123 return sym.get("s", "") 

124 

125 name = sym.get("n", "") 

126 kind = sym.get("k", "") 

127 

128 if kind == "class": 

129 return sym.get("s", f"class {name}") 

130 

131 parts = [_format_param(p) for p in params] 

132 sig = f"{name}({', '.join(parts)})" 

133 ret = sym.get("returns") 

134 if ret: 

135 sig += f" -> {ret}" 

136 return sig 

137 

138 

@dataclass
class MintDocument:
    """In-memory representation of a .mint document.

    Loadable from TOML or the v2 binary container, and exportable to
    CLAUDE.md, AGENTS.md, llms.txt, llms-full.txt, and API-reference markdown.
    """

    # Exports raise ValueError when the document is older than this many days
    # (enforced by _check_freshness).
    MINT_MAX_AGE_DAYS: int = 7

    # .mint format version string
    version: str = "1.0"
    # primary programming language of the documented code
    language: str = "unknown"
    # "sha256:" + first 16 hex chars of the hash of sorted source paths
    codebase_hash: str = ""
    # ISO-8601 timestamp of generation (UTC when produced by from_artifact_trace)
    generated_at: str = ""
    source_files: list[str] = field(default_factory=list)
    # symbols is either a list[dict] (LSIF-compact) or a legacy dict
    symbols: Any = field(default_factory=dict)
    # human-readable markdown prose
    narrative: str = ""
    # source file -> line numbers mapping
    references: dict[str, list[int]] = field(default_factory=dict)
    # API endpoint/function signatures (OpenAPI fragment or similar text)
    api_schema: str = ""
    # instructions/hints for AI agents consuming this doc
    ai_context: dict[str, Any] = field(default_factory=dict)
    # related files and URLs
    links: dict[str, str] = field(default_factory=dict)
    # "CLEAN"/"STALE" string, or a structured dict with any_stale/stale_sections
    drift_status: str | dict = "CLEAN"
    # past drift events; serialized as the binary DIF section
    drift_history: list[dict] = field(default_factory=list)

157 

158 # ------------------------------------------------------------------ 

159 # Constructors 

160 # ------------------------------------------------------------------ 

161 

162 @classmethod 

163 def from_file(cls, path: str | Path) -> "MintDocument": 

164 """Load a MintDocument from a file, auto-detecting binary (.mint) vs TOML format.""" 

165 path = Path(path) 

166 with open(path, "rb") as f: 

167 raw = f.read() 

168 

169 # Auto-detect binary .mint format by magic bytes 

170 if raw[:4] == b"MINT": 

171 return cls.from_binary(raw) 

172 

173 # Fall back to TOML parsing 

174 return cls._from_toml_bytes(raw) 

175 

176 @classmethod 

177 def _from_toml_bytes(cls, raw: bytes) -> "MintDocument": 

178 """Parse a MintDocument from raw TOML bytes (extracted for reuse).""" 

179 data = tomllib.loads(raw.decode("utf-8")) 

180 mint_hdr = data.get("mint", {}) 

181 symbols_raw = data.get("symbols", {}) 

182 symbols_text = _decode_section( 

183 symbols_raw.get("encoding", "raw"), 

184 symbols_raw.get("data", "[]"), 

185 ) 

186 try: 

187 symbols = json.loads(symbols_text) 

188 except json.JSONDecodeError: 

189 symbols = [] 

190 narrative_raw = data.get("narrative", {}) 

191 narrative = _decode_section( 

192 narrative_raw.get("encoding", "raw"), 

193 narrative_raw.get("content", ""), 

194 ) 

195 api_raw = data.get("api_schema", {}) 

196 api_schema = _decode_section( 

197 api_raw.get("encoding", "raw"), 

198 api_raw.get("content", ""), 

199 ) 

200 return cls( 

201 version=mint_hdr.get("version", "1.0"), 

202 language=mint_hdr.get("language", "unknown"), 

203 codebase_hash=mint_hdr.get("codebase_hash", ""), 

204 generated_at=mint_hdr.get("generated_at", ""), 

205 source_files=mint_hdr.get("source_files", []), 

206 symbols=symbols, 

207 narrative=narrative, 

208 references=data.get("references", {}), 

209 api_schema=api_schema, 

210 ai_context=data.get("ai_context", {}), 

211 links=data.get("links", {}), 

212 drift_status=mint_hdr.get("drift_status", "CLEAN"), 

213 ) 

214 

215 @classmethod 

216 def from_artifact_trace( 

217 cls, 

218 *, 

219 artifact_key: str, 

220 artifact_type: str, 

221 title: str, 

222 source_files: list[str], 

223 narrative_md: str, 

224 symbols_lsif: list[dict] | None = None, 

225 api_schema: str = "", 

226 language: str = "unknown", 

227 agent_instructions: str = "", 

228 links: dict[str, str] | None = None, 

229 priority: str = "high", 

230 role: str = "", 

231 drift_status: str = "CLEAN", 

232 ) -> "MintDocument": 

233 """ 

234 Create a MintDocument from an artifact trace. 

235 

236 Args: 

237 artifact_key: Unique key for this artifact (e.g. "cilow-api") 

238 artifact_type: One of ArtifactType values (e.g. "api_reference") 

239 title: Human-readable title 

240 source_files: List of source file paths covered by this artifact 

241 narrative_md: Markdown prose (supplementary, lower authority than symbols) 

242 symbols_lsif: LSIF-compact symbol list from drift_engine. Each entry: 

243 {"n": name, "k": kind, "s": signature, "d": docstring, 

244 "f": file, "l": line} 

245 api_schema: HTTP/function endpoint definitions (OpenAPI fragment, etc.) 

246 language: Primary programming language 

247 agent_instructions: Instructions for AI agents consuming this doc 

248 links: Related URLs {"readme": "...", "changelog": "..."} 

249 priority: Agent priority hint ("high"|"medium"|"low") 

250 role: Short description of this artifact's role for agents 

251 drift_status: "CLEAN" or "STALE" 

252 """ 

253 codebase_hash = hashlib.sha256( 

254 "\n".join(sorted(source_files)).encode() 

255 ).hexdigest()[:16] 

256 

257 # Build LSIF symbols list; include artifact metadata as the first entry 

258 # using a sentinel kind so loaders can reconstruct the artifact identity. 

259 lsif: list[dict] = [] 

260 meta_entry: dict[str, Any] = { 

261 "n": title, 

262 "k": "__artifact__", 

263 "artifact_key": artifact_key, 

264 "artifact_type": artifact_type, 

265 } 

266 lsif.append(meta_entry) 

267 

268 # Try Griffe enrichment for Python files when no symbols provided 

269 if not symbols_lsif and language == "python" and source_files: 

270 symbols_lsif = _try_griffe_extraction(source_files) 

271 

272 if symbols_lsif: 

273 lsif.extend(symbols_lsif) 

274 

275 default_role = role or f"Covers {title} ({artifact_type})" 

276 

277 ground_truth_preamble = ( 

278 "Ground truth priority (highest to lowest):\n" 

279 "1. [SYM] — always current, extracted from live AST\n" 

280 "2. [REF] — exact line numbers in source files\n" 

281 "3. [API] — API schema\n" 

282 "4. [NAR] — narrative docs (MAY BE STALE — check drift.stale_sections)\n" 

283 "\n" 

284 "If drift.any_stale = true: do not cite stale [NAR] sections as fact. " 

285 "Cite [SYM] signatures directly." 

286 ) 

287 

288 if agent_instructions: 

289 resolved_instructions = ground_truth_preamble + "\n\n" + agent_instructions 

290 else: 

291 resolved_instructions = ( 

292 ground_truth_preamble + "\n\n" 

293 f"This document covers {title} ({artifact_type}). " 

294 "Use [symbols] for type signatures, [references] for code locations, " 

295 "[narrative] as supplementary prose." 

296 ) 

297 

298 return cls( 

299 language=language, 

300 codebase_hash=f"sha256:{codebase_hash}", 

301 generated_at=datetime.now(timezone.utc).isoformat(), 

302 source_files=source_files, 

303 symbols=lsif, 

304 narrative=narrative_md, 

305 api_schema=api_schema, 

306 ai_context={ 

307 "model_hint": "claude-sonnet-4-6", 

308 "priority": priority, 

309 "role": default_role, 

310 "instructions": resolved_instructions, 

311 }, 

312 links=links or {}, 

313 drift_status=drift_status, 

314 ) 

315 

316 # ------------------------------------------------------------------ 

317 # Persistence 

318 # ------------------------------------------------------------------ 

319 

320 def to_file(self, path: Path) -> None: 

321 if not _HAS_TOMLI_W: 

322 raise RuntimeError( 

323 "tomli-w is required to write .mint files. Install with: pip install tomli-w" 

324 ) 

325 path.parent.mkdir(parents=True, exist_ok=True) 

326 symbols_json = json.dumps(self.symbols, separators=(",", ":")) 

327 sym_enc, sym_data = _maybe_compress(symbols_json) 

328 narr_enc, narr_data = _maybe_compress(self.narrative) 

329 api_enc, api_data = _maybe_compress(self.api_schema) 

330 doc: dict[str, Any] = { 

331 "mint": { 

332 "version": self.version, 

333 "language": self.language, 

334 "codebase_hash": self.codebase_hash, 

335 "generated_at": self.generated_at, 

336 "source_files": self.source_files, 

337 "drift_status": self.drift_status, 

338 }, 

339 "symbols": {"encoding": sym_enc, "data": sym_data}, 

340 "narrative": {"encoding": narr_enc, "content": narr_data}, 

341 "references": self.references, 

342 "api_schema": {"encoding": api_enc, "content": api_data}, 

343 "ai_context": self.ai_context, 

344 "links": self.links, 

345 } 

346 with open(path, "wb") as f: 

347 tomli_w.dump(doc, f) 

348 

349 # ------------------------------------------------------------------ 

350 # Internal helpers 

351 # ------------------------------------------------------------------ 

352 

353 def _artifact_meta(self) -> dict[str, str]: 

354 """Extract artifact metadata from the LSIF sentinel entry or legacy dict.""" 

355 if isinstance(self.symbols, list): 

356 for entry in self.symbols: 

357 if isinstance(entry, dict) and entry.get("k") == "__artifact__": 

358 return entry 

359 if isinstance(self.symbols, dict): 

360 return self.symbols 

361 return {} 

362 

363 def _title(self) -> str: 

364 meta = self._artifact_meta() 

365 return meta.get("n", meta.get("title", "Documentation")) 

366 

367 def _artifact_key(self) -> str: 

368 meta = self._artifact_meta() 

369 return meta.get("artifact_key", "") 

370 

371 def _artifact_type(self) -> str: 

372 meta = self._artifact_meta() 

373 return meta.get("artifact_type", "unknown") 

374 

375 def _export_symbols(self) -> list[dict]: 

376 """Return non-sentinel LSIF entries.""" 

377 if isinstance(self.symbols, list): 

378 return [s for s in self.symbols if isinstance(s, dict) and s.get("k") != "__artifact__"] 

379 return [] 

380 

381 def _agent_context_xml(self) -> str: 

382 """Render the [agent_context] section as Claude XML.""" 

383 key = self._artifact_key() 

384 artifact_type = self._artifact_type() 

385 title = self._title() 

386 priority = self.ai_context.get("priority", "high") 

387 role = self.ai_context.get("role", f"Covers {title}") 

388 last_verified = self.generated_at or datetime.now(timezone.utc).isoformat() 

389 source_hash = self.codebase_hash 

390 

391 lines = [ 

392 "<documint-artifact>", 

393 f' <artifact key="{key}" type="{artifact_type}" priority="{priority}">', 

394 f" <role>{role}</role>", 

395 " <sections>", 

396 " <symbols>Exact type signatures — ground truth for code generation</symbols>", 

397 " <api_schema>HTTP/function endpoint definitions</api_schema>", 

398 " <narrative>Supplementary prose — lower authority than symbols</narrative>", 

399 " </sections>", 

400 " <freshness>", 

401 f" <drift-status>{self.drift_status}</drift-status>", 

402 f" <last-verified>{last_verified}</last-verified>", 

403 f" <source-hash>{source_hash}</source-hash>", 

404 " </freshness>", 

405 " </artifact>", 

406 "</documint-artifact>", 

407 ] 

408 return "\n".join(lines) 

409 

410 def _check_freshness(self) -> None: 

411 """Raise ValueError if the .mint file is older than MINT_MAX_AGE_DAYS.""" 

412 if not self.generated_at: 

413 return 

414 try: 

415 generated = datetime.fromisoformat( 

416 self.generated_at.replace("Z", "+00:00") 

417 ) 

418 except (ValueError, TypeError, AttributeError): 

419 return # if parsing fails, allow export 

420 age = datetime.now(timezone.utc) - generated 

421 if age.days > self.MINT_MAX_AGE_DAYS: 

422 raise ValueError( 

423 f".mint file is {age.days} days old. " 

424 "Run 'documint publish' to refresh before exporting." 

425 ) 

426 

427 # ------------------------------------------------------------------ 

428 # Export methods 

429 # ------------------------------------------------------------------ 

430 

431 def to_claude_md(self) -> str: 

432 """Emit CLAUDE.md format with agent_context XML block embedded.""" 

433 self._check_freshness() 

434 title = self._title() 

435 artifact_type = self._artifact_type() 

436 export_syms = self._export_symbols() 

437 

438 lines: list[str] = [ 

439 f"# {title}", 

440 "", 

441 "<!-- agent_context: generated by documint.xyz -->", 

442 self._agent_context_xml(), 

443 "", 

444 ] 

445 

446 if export_syms: 

447 lines += ["## Symbols", ""] 

448 for sym in export_syms: 

449 name = sym.get("n", "") 

450 kind = sym.get("k", "") 

451 sig = _build_rich_signature(sym) 

452 doc = sym.get("d", "") 

453 f = sym.get("f", "") 

454 l = sym.get("l", "") 

455 decorators = sym.get("decorators", []) 

456 bases = sym.get("bases", []) 

457 

458 # Decorator annotations 

459 if decorators: 

460 for dec in decorators: 

461 lines.append(f"- `@{dec}`") 

462 

463 entry = f"- `{name}`" 

464 if sig: 

465 entry += f" -- `{sig}`" 

466 if bases: 

467 entry += f" \n Bases: {', '.join(f'`{b}`' for b in bases)}" 

468 if doc: 

469 first_line = doc.strip().split("\n")[0] 

470 entry += f" \n _{first_line}_" 

471 if f: 

472 loc = f"`{f}`" 

473 if l: 

474 loc += f":{l}" 

475 entry += f" \n {loc}" 

476 lines.append(entry) 

477 lines.append("") 

478 

479 if self.api_schema: 

480 lines += ["## API Reference", "", "```", self.api_schema.rstrip(), "```", ""] 

481 

482 if self.narrative: 

483 lines += ["## Documentation", "", self.narrative, ""] 

484 

485 if self.source_files: 

486 lines += ["## Source Files", ""] 

487 for sf in self.source_files: 

488 lines.append(f"- `{sf}`") 

489 lines.append("") 

490 

491 if self.ai_context.get("instructions"): 

492 lines += ["## Agent Instructions", "", self.ai_context["instructions"], ""] 

493 

494 # Stale-sections warning from drift_status 

495 stale_sections: list[str] = [] 

496 if isinstance(self.drift_status, dict): 

497 stale_sections = self.drift_status.get("stale_sections", []) 

498 if stale_sections: 

499 lines.append("") 

500 lines.append("## Stale Sections") 

501 lines.append("The following narrative sections may not match the current code.") 

502 lines.append("Trust [SYM] signatures over these sections:") 

503 for s in stale_sections: 

504 lines.append(f"- {s}") 

505 lines.append("") 

506 

507 return "\n".join(lines) 

508 

509 def to_agents_md(self) -> str: 

510 """Emit AGENTS.md format — simpler, less prescriptive than CLAUDE.md.""" 

511 self._check_freshness() 

512 title = self._title() 

513 artifact_type = self._artifact_type() 

514 export_syms = self._export_symbols() 

515 

516 lines: list[str] = [ 

517 f"# Agent Context: {title}", 

518 "", 

519 f"Type: `{artifact_type}` · Status: `{self.drift_status}`", 

520 "", 

521 ] 

522 

523 if self.ai_context.get("instructions"): 

524 lines += ["## Instructions", "", self.ai_context["instructions"], ""] 

525 

526 if export_syms: 

527 lines += ["## Exports", ""] 

528 for sym in export_syms: 

529 name = sym.get("n", "") 

530 kind = sym.get("k", "") 

531 sig = _build_rich_signature(sym) 

532 doc = sym.get("d", "") 

533 params = sym.get("params", []) 

534 returns = sym.get("returns") 

535 bases = sym.get("bases", []) 

536 

537 entry = f"- `{name}`" 

538 if kind: 

539 entry += f" ({kind})" 

540 if sig: 

541 entry += f": `{sig}`" 

542 lines.append(entry) 

543 

544 # Tool-style parameter descriptions 

545 if params: 

546 for p in params: 

547 pname = p.get("name", "") 

548 ptype = p.get("type", "") 

549 pdef = p.get("default", "") 

550 pdesc = f" - `{pname}`" 

551 if ptype: 

552 pdesc += f" ({ptype})" 

553 if pdef: 

554 pdesc += f" [default: {pdef}]" 

555 lines.append(pdesc) 

556 if returns: 

557 lines.append(f" - Returns: `{returns}`") 

558 if bases: 

559 lines.append(f" - Inherits: {', '.join(f'`{b}`' for b in bases)}") 

560 if doc: 

561 first_line = doc.strip().split("\n")[0] 

562 lines.append(f" - {first_line}") 

563 lines.append("") 

564 

565 if self.narrative: 

566 lines += ["## Documentation", "", self.narrative, ""] 

567 

568 if self.source_files: 

569 lines += ["## Source Files", ""] 

570 for sf in self.source_files: 

571 lines.append(f"- `{sf}`") 

572 lines.append("") 

573 

574 return "\n".join(lines) 

575 

576 def to_llms_txt(self) -> str: 

577 """ 

578 Per llmstxt.org spec: title line + blank line + summary + links section. 

579 This is the compact index version (not full content). 

580 Enriched with a structured symbol index when Griffe data is present. 

581 """ 

582 self._check_freshness() 

583 title = self._title() 

584 artifact_type = self._artifact_type() 

585 export_syms = self._export_symbols() 

586 

587 lines: list[str] = [ 

588 f"# {title}", 

589 "", 

590 ] 

591 

592 # Summary: first paragraph of narrative or a generated one 

593 summary = "" 

594 if self.narrative: 

595 first_para = self.narrative.strip().split("\n\n")[0].strip() 

596 # Strip any markdown headings from the first paragraph 

597 summary_lines = [ln for ln in first_para.splitlines() if not ln.startswith("#")] 

598 summary = " ".join(summary_lines).strip() 

599 if not summary: 

600 summary = f"{title} -- {artifact_type} documentation." 

601 lines += [f"> {summary}", ""] 

602 

603 # Structured symbol index for LLM consumption 

604 if export_syms: 

605 lines += ["## API Index", ""] 

606 for sym in export_syms: 

607 name = sym.get("n", "") 

608 sig = _build_rich_signature(sym) 

609 doc = sym.get("d", "") 

610 if sig: 

611 lines.append(f"- `{sig}`") 

612 else: 

613 lines.append(f"- `{name}`") 

614 if doc: 

615 first_line = doc.strip().split("\n")[0] 

616 lines.append(f" {first_line}") 

617 lines.append("") 

618 

619 if self.links: 

620 lines.append("## Links") 

621 lines.append("") 

622 for name, url in self.links.items(): 

623 lines.append(f"- [{name}]({url})") 

624 lines.append("") 

625 

626 return "\n".join(lines) 

627 

    def to_llms_full_txt(self) -> str:
        """
        Full content version of llms.txt — all narrative + api_schema included.
        Used for /llms-full.txt endpoints.

        Raises ValueError (via _check_freshness) when the document is older
        than MINT_MAX_AGE_DAYS.
        """
        self._check_freshness()
        title = self._title()
        artifact_type = self._artifact_type()
        export_syms = self._export_symbols()

        # Header: title plus a one-line metadata summary.
        lines: list[str] = [
            f"# {title}",
            "",
            f"Type: {artifact_type} · Language: {self.language} · Status: {self.drift_status}",
            "",
        ]

        # One "### `name`" subsection per exported symbol, with any
        # Griffe-enriched fields (decorators, bases, params, returns) shown.
        if export_syms:
            lines += ["## Exports", ""]
            for sym in export_syms:
                name = sym.get("n", "")
                kind = sym.get("k", "")
                sig = _build_rich_signature(sym)
                doc = sym.get("d", "")
                decorators = sym.get("decorators", [])
                bases = sym.get("bases", [])
                params = sym.get("params", [])
                returns = sym.get("returns")

                entry = f"### `{name}`"
                if kind:
                    entry += f" ({kind})"
                lines.append(entry)

                if decorators:
                    lines.append("Decorators: " + ", ".join(f"`@{d}`" for d in decorators))
                if bases:
                    lines.append("Inherits: " + ", ".join(f"`{b}`" for b in bases))
                if sig:
                    # Fenced code block as a single embedded-newline element.
                    lines += [f"```\n{sig}\n```"]
                if params:
                    lines.append("Parameters:")
                    for p in params:
                        pname = p.get("name", "")
                        ptype = p.get("type", "")
                        pdef = p.get("default", "")
                        pdesc = f"- `{pname}`"
                        if ptype:
                            pdesc += f": {ptype}"
                        if pdef:
                            pdesc += f" (default: {pdef})"
                        lines.append(pdesc)
                if returns:
                    lines.append(f"Returns: `{returns}`")
                if doc:
                    # Full docstring, not just the first line.
                    lines.append("")
                    lines.append(doc)
                lines.append("")

        if self.api_schema:
            lines += ["## API Schema", "", "```", self.api_schema.rstrip(), "```", ""]

        if self.narrative:
            lines += ["## Documentation", "", self.narrative, ""]

        if self.links:
            lines += ["## Links", ""]
            for lname, url in self.links.items():
                lines.append(f"- [{lname}]({url})")
            lines.append("")

        return "\n".join(lines)

700 

    def to_api_reference(self) -> str:
        """Generate a complete API reference document from .mint data.

        Produces publication-quality markdown organized by module/file, with
        full signatures, parameter tables, return types, inheritance, and
        docstrings. Designed to be directly publishable as developer docs.

        Raises ValueError (via _check_freshness) when the document is older
        than MINT_MAX_AGE_DAYS.
        """
        self._check_freshness()
        title = self._title()
        export_syms = self._export_symbols()

        lines: list[str] = [
            f"# {title} -- API Reference",
            "",
            f"*Generated by [Documint](https://documint.xyz) "
            f"on {self.generated_at or 'unknown date'}*",
            "",
        ]

        # Lead summary: first narrative paragraph with markdown headings removed.
        if self.narrative:
            first_para = self.narrative.strip().split("\n\n")[0].strip()
            summary_lines = [ln for ln in first_para.splitlines() if not ln.startswith("#")]
            summary = " ".join(summary_lines).strip()
            if summary:
                lines += [f"> {summary}", ""]

        if not export_syms:
            lines.append("No exported symbols found.")
            return "\n".join(lines)

        # Group symbols by file
        by_file: dict[str, list[dict]] = {}
        for sym in export_syms:
            f = sym.get("f", "(unknown)")
            by_file.setdefault(f, []).append(sym)

        # Table of contents
        lines += ["## Table of Contents", ""]
        for fpath in sorted(by_file):
            # NOTE(review): anchor strips "/" and "." entirely rather than
            # hyphenating — confirm this matches the target markdown renderer.
            anchor = fpath.replace("/", "").replace(".", "").replace(" ", "-").lower()
            lines.append(f"- [{fpath}](#{anchor})")
        lines.append("")

        # Sections per file
        for fpath in sorted(by_file):
            lines += [f"---", "", f"## `{fpath}`", ""]
            syms = by_file[fpath]

            # Separate classes and top-level functions
            classes = [s for s in syms if s.get("k") == "class"]
            functions = [s for s in syms if s.get("k") == "function"]
            methods = [s for s in syms if s.get("k") == "method"]
            other = [s for s in syms if s.get("k") not in ("class", "function", "method")]

            if classes:
                for cls_sym in classes:
                    cls_name = cls_sym.get("n", "")
                    bases = cls_sym.get("bases", [])
                    doc = cls_sym.get("d", "")
                    decorators = cls_sym.get("decorators", [])
                    lineno = cls_sym.get("l", "")

                    lines.append(f"### class `{cls_name}`")
                    if decorators:
                        for dec in decorators:
                            lines.append(f"`@{dec}` ")
                    if bases:
                        lines.append(f"**Bases:** {', '.join(f'`{b}`' for b in bases)}")
                    if lineno:
                        lines.append(f"*Defined at line {lineno}*")
                    lines.append("")
                    if doc:
                        lines += [doc, ""]

                    # Find methods belonging to this class: method entries are
                    # named "Class.method" (see _render_callable).
                    cls_methods = [m for m in methods if m.get("n", "").startswith(f"{cls_name}.")]
                    if cls_methods:
                        lines.append("#### Methods")
                        lines.append("")
                        for m in cls_methods:
                            self._render_callable(lines, m, heading_level=5)

            if functions:
                # Demote function headings one level when classes are present.
                if classes:
                    lines += ["### Module-level Functions", ""]
                for fn in functions:
                    self._render_callable(lines, fn, heading_level=3 if not classes else 4)

            # Remaining kinds (types, consts, ...) get a compact rendering.
            if other:
                for sym in other:
                    name = sym.get("n", "")
                    kind = sym.get("k", "")
                    sig = _build_rich_signature(sym)
                    doc = sym.get("d", "")
                    lines.append(f"### `{name}` ({kind})")
                    if sig:
                        lines += [f"```python", sig, "```", ""]
                    if doc:
                        lines += [doc, ""]

        # Source files reference
        if self.source_files:
            lines += ["---", "", "## Source Files", ""]
            for sf in self.source_files:
                lines.append(f"- `{sf}`")
            lines.append("")

        return "\n".join(lines)

809 

810 def _render_callable(self, lines: list[str], sym: dict, heading_level: int = 4) -> None: 

811 """Render a function or method symbol into the API reference lines list.""" 

812 prefix = "#" * heading_level 

813 name = sym.get("n", "") 

814 sig = _build_rich_signature(sym) 

815 doc = sym.get("d", "") 

816 params = sym.get("params", []) 

817 returns = sym.get("returns") 

818 decorators = sym.get("decorators", []) 

819 lineno = sym.get("l", "") 

820 

821 # Short name for methods: "Class.method" -> "method" 

822 display_name = name.split(".")[-1] if "." in name else name 

823 

824 lines.append(f"{prefix} `{display_name}`") 

825 if decorators: 

826 for dec in decorators: 

827 lines.append(f"`@{dec}` ") 

828 if sig: 

829 lines += ["", f"```python", sig, "```", ""] 

830 if lineno: 

831 lines.append(f"*Line {lineno}*") 

832 lines.append("") 

833 

834 if params: 

835 # Filter out 'self' and 'cls' for cleaner docs 

836 doc_params = [p for p in params if p.get("name") not in ("self", "cls")] 

837 if doc_params: 

838 lines.append("| Parameter | Type | Default |") 

839 lines.append("|-----------|------|---------|") 

840 for p in doc_params: 

841 pname = p.get("name", "") 

842 ptype = p.get("type") or "--" 

843 pdef = p.get("default") or "--" 

844 lines.append(f"| `{pname}` | `{ptype}` | `{pdef}` |") 

845 lines.append("") 

846 

847 if returns: 

848 lines.append(f"**Returns:** `{returns}`") 

849 lines.append("") 

850 

851 if doc: 

852 lines += [doc, ""] 

853 

854 # ------------------------------------------------------------------ 

855 # Metrics 

856 # ------------------------------------------------------------------ 

857 

858 def compression_ratio(self) -> float: 

859 """ 

860 Ratio of .mint serialized size vs equivalent markdown size. 

861 A ratio < 1.0 means .mint is smaller than plain markdown. 

862 """ 

863 # Equivalent markdown = narrative + api_schema + symbols rendered as text 

864 md_equivalent = self.to_llms_full_txt() 

865 md_size = len(md_equivalent.encode()) 

866 

867 # .mint size: approximate as JSON of all raw fields 

868 mint_payload = json.dumps( 

869 { 

870 "symbols": self.symbols, 

871 "narrative": self.narrative, 

872 "api_schema": self.api_schema, 

873 }, 

874 separators=(",", ":"), 

875 ) 

876 mint_size = len(mint_payload.encode()) 

877 

878 if md_size == 0: 

879 return 1.0 

880 return mint_size / md_size 

881 

882 # ------------------------------------------------------------------ 

883 # Binary serialization (.mint v2 binary format) 

884 # ------------------------------------------------------------------ 

885 

    # Binary section type tags (one byte each) for the .mint v2 container;
    # see to_binary for the payload each section carries.
    _SECTION_HDR = 0x01  # metadata header JSON
    _SECTION_SYM = 0x02  # gzip-compressed LSIF-compact symbol JSON
    _SECTION_NAR = 0x03  # narrative markdown (raw UTF-8)
    _SECTION_REF = 0x04  # references JSON (file -> line numbers)
    _SECTION_AIC = 0x05  # ai_context JSON
    _SECTION_API = 0x06  # api_schema text (raw UTF-8)
    _SECTION_DIF = 0x07  # drift history JSON
    _FOOTER_MARKER = 0xFF  # precedes the SHA-256 checksum in the footer

894 

895 def _build_section(self, section_type: int, payload: bytes) -> bytes: 

896 """Encode a single binary section: type (1B) + length (4B BE) + payload.""" 

897 return bytes([section_type]) + struct.pack(">I", len(payload)) + payload 

898 

    def to_binary(self) -> bytes:
        """
        Serialize this MintDocument to the .mint v2 binary format.

        Layout:
            HEADER (6B): b"MINT" + version(1B) + flags(1B)
            SECTIONS: type(1B) + length(4B BE) + payload(NB) (repeating)
            FOOTER (33B): 0xFF + SHA-256(all preceding bytes)

        Only the HDR section is always written; sections for empty fields
        are omitted entirely. The checksum covers every byte before the
        0xFF footer marker.
        """
        buf = bytearray()

        # --- Header ---
        buf += b"MINT"
        # Format version 0x02; the second byte is a flags field, currently 0.
        buf += bytes([0x02, 0x00])

        # --- Section 0x01: HDR (metadata JSON) ---
        # Normalize drift_status (plain string or structured dict) into a
        # uniform {any_stale, stale_sections} object.
        drift_obj: dict[str, Any]
        if isinstance(self.drift_status, dict):
            drift_obj = {
                "any_stale": self.drift_status.get("any_stale", False),
                "stale_sections": self.drift_status.get("stale_sections", []),
            }
        else:
            drift_obj = {
                "any_stale": self.drift_status == "STALE",
                "stale_sections": [],
            }

        hdr_payload = json.dumps(
            {
                "version": self.version,
                "language": self.language,
                "codebase_hash": self.codebase_hash,
                "source_files": self.source_files,
                "generated_at": self.generated_at,
                "drift": drift_obj,
            },
            separators=(",", ":"),
        ).encode("utf-8")
        buf += self._build_section(self._SECTION_HDR, hdr_payload)

        # --- Section 0x02: SYM (gzip-compressed LSIF-compact JSON) ---
        # Legacy dict-shaped symbols are not serialized in the binary format.
        sym_list = self.symbols if isinstance(self.symbols, list) else []
        if sym_list:
            sym_json = json.dumps(sym_list, separators=(",", ":")).encode("utf-8")
            sym_compressed = gzip.compress(sym_json)
            buf += self._build_section(self._SECTION_SYM, sym_compressed)

        # --- Section 0x03: NAR (raw markdown UTF-8) ---
        if self.narrative:
            buf += self._build_section(self._SECTION_NAR, self.narrative.encode("utf-8"))

        # --- Section 0x04: REF (JSON) ---
        if self.references:
            ref_payload = json.dumps(self.references, separators=(",", ":")).encode("utf-8")
            buf += self._build_section(self._SECTION_REF, ref_payload)

        # --- Section 0x05: AIC (JSON) ---
        if self.ai_context:
            aic_payload = json.dumps(self.ai_context, separators=(",", ":")).encode("utf-8")
            buf += self._build_section(self._SECTION_AIC, aic_payload)

        # --- Section 0x06: API (raw text UTF-8) ---
        if self.api_schema:
            buf += self._build_section(self._SECTION_API, self.api_schema.encode("utf-8"))

        # --- Section 0x07: DIF (JSON drift history) ---
        if self.drift_history:
            dif_payload = json.dumps(self.drift_history, separators=(",", ":")).encode("utf-8")
            buf += self._build_section(self._SECTION_DIF, dif_payload)

        # --- Footer ---
        # Checksum of everything written so far (header + all sections);
        # from_binary recomputes and verifies this.
        checksum = hashlib.sha256(bytes(buf)).digest()
        buf += bytes([self._FOOTER_MARKER])
        buf += checksum

        return bytes(buf)

976 

977 @classmethod 

978 def from_binary(cls, data: bytes) -> "MintDocument": 

979 """ 

980 Parse a .mint v2 binary file and return a MintDocument. 

981 

982 Validates the magic bytes and SHA-256 checksum. Raises ValueError 

983 on format errors or checksum mismatch. 

984 """ 

985 if len(data) < 6 + 33: # header + minimal footer 

986 raise ValueError("File too small to be a valid .mint file") 

987 

988 # --- Validate magic --- 

989 if data[:4] != b"MINT": 

990 raise ValueError("Not a .mint file (bad magic bytes)") 

991 

992 version_byte = data[4] 

993 if version_byte != 0x02: 

994 import warnings 

995 warnings.warn( 

996 f".mint file version {version_byte} differs from expected v2; " 

997 "parsing will proceed but may produce unexpected results.", 

998 stacklevel=2, 

999 ) 

1000 

1001 # --- Validate footer checksum --- 

1002 if data[-33] != cls._FOOTER_MARKER: 

1003 raise ValueError("Missing footer marker (0xFF)") 

1004 stored_checksum = data[-32:] 

1005 computed_checksum = hashlib.sha256(data[:-33]).digest() 

1006 if stored_checksum != computed_checksum: 

1007 raise ValueError( 

1008 "Checksum mismatch: file may be corrupted or truncated" 

1009 ) 

1010 

1011 # --- Parse sections --- 

1012 sections: dict[int, bytes] = {} 

1013 offset = 6 # after header 

1014 end = len(data) - 33 # before footer 

1015 

1016 while offset < end: 

1017 if offset + 5 > end: 

1018 raise ValueError( 

1019 f"Truncated section header at offset {offset}" 

1020 ) 

1021 section_type = data[offset] 

1022 payload_len = struct.unpack(">I", data[offset + 1 : offset + 5])[0] 

1023 offset += 5 

1024 if offset + payload_len > end: 

1025 raise ValueError( 

1026 f"Section 0x{section_type:02X} payload extends past footer " 

1027 f"(need {payload_len} bytes, only {end - offset} available)" 

1028 ) 

1029 sections[section_type] = data[offset : offset + payload_len] 

1030 offset += payload_len 

1031 

1032 # --- Reconstruct MintDocument --- 

1033 doc = cls() 

1034 

1035 # HDR (0x01) 

1036 if cls._SECTION_HDR in sections: 

1037 hdr = json.loads(sections[cls._SECTION_HDR].decode("utf-8")) 

1038 doc.version = hdr.get("version", "2.0") 

1039 doc.language = hdr.get("language", "unknown") 

1040 doc.codebase_hash = hdr.get("codebase_hash", "") 

1041 doc.source_files = hdr.get("source_files", []) 

1042 doc.generated_at = hdr.get("generated_at", "") 

1043 drift = hdr.get("drift", {}) 

1044 if isinstance(drift, dict): 

1045 doc.drift_status = drift 

1046 else: 

1047 doc.drift_status = "CLEAN" 

1048 

1049 # SYM (0x02) 

1050 if cls._SECTION_SYM in sections: 

1051 sym_json = gzip.decompress(sections[cls._SECTION_SYM]) 

1052 doc.symbols = json.loads(sym_json.decode("utf-8")) 

1053 else: 

1054 doc.symbols = [] 

1055 

1056 # NAR (0x03) 

1057 if cls._SECTION_NAR in sections: 

1058 doc.narrative = sections[cls._SECTION_NAR].decode("utf-8") 

1059 

1060 # REF (0x04) 

1061 if cls._SECTION_REF in sections: 

1062 doc.references = json.loads(sections[cls._SECTION_REF].decode("utf-8")) 

1063 

1064 # AIC (0x05) 

1065 if cls._SECTION_AIC in sections: 

1066 doc.ai_context = json.loads(sections[cls._SECTION_AIC].decode("utf-8")) 

1067 

1068 # API (0x06) 

1069 if cls._SECTION_API in sections: 

1070 doc.api_schema = sections[cls._SECTION_API].decode("utf-8") 

1071 

1072 # DIF (0x07) 

1073 if cls._SECTION_DIF in sections: 

1074 doc.drift_history = json.loads(sections[cls._SECTION_DIF].decode("utf-8")) 

1075 

1076 return doc 

1077 

1078 # ------------------------------------------------------------------ 

1079 # Validation 

1080 # ------------------------------------------------------------------ 

1081 

1082 def validate(self) -> list[str]: 

1083 """ 

1084 Return a list of validation error strings. An empty list means valid. 

1085 

1086 Checks: 

1087 - codebase_hash is a non-empty string 

1088 - generated_at is a parseable ISO 8601 datetime 

1089 - version is "1.0" or "2.0" 

1090 - language is non-empty 

1091 - If symbols is non-empty, each entry has at least {"n": str, "k": str} 

1092 """ 

1093 errors: list[str] = [] 

1094 

1095 # codebase_hash 

1096 if not self.codebase_hash or not isinstance(self.codebase_hash, str): 

1097 errors.append("codebase_hash must be a non-empty string") 

1098 

1099 # generated_at 

1100 if not self.generated_at: 

1101 errors.append("generated_at is missing") 

1102 else: 

1103 try: 

1104 datetime.fromisoformat(self.generated_at.replace("Z", "+00:00")) 

1105 except (ValueError, AttributeError): 

1106 errors.append( 

1107 f"generated_at is not a valid ISO datetime: {self.generated_at!r}" 

1108 ) 

1109 

1110 # version 

1111 if self.version not in ("1.0", "2.0"): 

1112 errors.append( 

1113 f"version must be '1.0' or '2.0', got {self.version!r}" 

1114 ) 

1115 

1116 # language 

1117 if not self.language or not isinstance(self.language, str): 

1118 errors.append("language must be a non-empty string") 

1119 

1120 # symbols 

1121 if isinstance(self.symbols, list) and self.symbols: 

1122 for i, sym in enumerate(self.symbols): 

1123 if not isinstance(sym, dict): 

1124 errors.append(f"symbols[{i}] is not a dict") 

1125 continue 

1126 if "n" not in sym or not isinstance(sym.get("n"), str): 

1127 errors.append(f"symbols[{i}] missing required 'n' (name) string") 

1128 if "k" not in sym or not isinstance(sym.get("k"), str): 

1129 errors.append(f"symbols[{i}] missing required 'k' (kind) string") 

1130 

1131 return errors