#!/usr/bin/env python
"""
This script builds Sphinx documentation in Markdown format and combines it into a single file
for use as context with Large Language Models (LLMs).
"""
import argparse
import glob
import logging
import os
import shutil
import subprocess
import sys
import tempfile
# Root logger setup: INFO and above, rendered as "[LEVEL] message".
logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)

# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
def parse_args():
    """Parse the command-line options of the documentation build script.

    Returns:
        argparse.Namespace: Parsed options with the attributes ``exclude``,
        ``output``, ``sphinx_source``, ``conf``, ``index``, ``notebook``,
        ``source_root`` and ``library_name``. ``output``, ``sphinx_source``
        and ``source_root`` are mandatory on the command line.
    """
    # (flag, extra keyword arguments), listed in the order shown by --help.
    # Every option is a plain string, so ``type=str`` is applied in the loop.
    option_table = (
        ("--exclude", {"default": "", "help": "Comma-separated list of files to exclude (without .md extension)"}),
        ("--output", {"required": True, "help": "Output file path"}),
        ("--sphinx-source", {"required": True, "help": "Path to Sphinx source directory (where conf.py and index.rst are)"}),
        ("--conf", {"default": None, "help": "Path to conf.py (default: <sphinx-source>/conf.py)"}),
        ("--index", {"default": None, "help": "Path to index.rst (default: <sphinx-source>/index.rst)"}),
        ("--notebook", {"default": None, "help": "Path to notebook to convert and append"}),
        ("--source-root", {"required": True, "help": "Absolute path to the root of the source code to add to sys.path for Sphinx autodoc."}),
        ("--library-name", {"default": None, "help": "Name of the library for the documentation title."}),
    )

    cli = argparse.ArgumentParser(description="Build Sphinx documentation in Markdown format for LLM context.")
    for flag, extra in option_table:
        cli.add_argument(flag, type=str, **extra)
    return cli.parse_args()
def build_markdown(sphinx_source, conf_path, source_root):
    """Run ``sphinx-build -b markdown`` into a fresh temporary directory.

    Args:
        sphinx_source: Sphinx source directory passed to ``sphinx-build``.
        conf_path: Path to ``conf.py``; its parent directory is passed to
            ``sphinx-build`` through ``-c``.
        source_root: Directory placed first on the child's ``PYTHONPATH``.

    Returns:
        str: Path of the temporary build directory. It is returned even when
        ``sphinx-build`` exits with a non-zero status (the failure is only
        logged). The directory is not removed here; that is left to the caller.
    """
    # mkdtemp() creates the directory itself, so no extra makedirs() is needed.
    build_dir = tempfile.mkdtemp(prefix="sphinx_build_")
    logger.info(f" 📄 Temporary build directory: {build_dir}")
    conf_dir = os.path.dirname(conf_path)
    logger.info(f" 📄 sphinx_source: {sphinx_source}")
    logger.info(f" 📄 conf_path: {conf_path}")
    logger.info(f" 📄 build_dir: {build_dir}")
    logger.info(f" 📄 sphinx-build command: sphinx-build -b markdown -c {conf_dir} {sphinx_source} {build_dir}")

    # Join the inherited PYTHONPATH only when it is non-empty. A trailing
    # separator would create an empty entry, which Python treats as the
    # current working directory on the module search path.
    search_path = [source_root]
    inherited = os.environ.get("PYTHONPATH")
    if inherited:
        search_path.append(inherited)
    child_env = {**os.environ, "PYTHONPATH": os.pathsep.join(search_path)}

    logger.info(" 📄 Running sphinx-build...")
    result = subprocess.run(
        ["sphinx-build", "-b", "markdown", "-c", conf_dir, sphinx_source, build_dir],
        capture_output=True,
        text=True,
        env=child_env,
    )
    if result.returncode != 0:
        logger.error(" 📄 sphinx-build failed with return code %s", result.returncode)
        logger.error(" 📄 stdout:\n%s", result.stdout)
        logger.error(" 📄 stderr:\n%s", result.stderr)
    else:
        logger.info(" ✅ sphinx-build completed successfully.")
    # Logged for both outcomes so a failed build still shows what was produced.
    logger.info(" 📄 Files in build_dir after sphinx-build: %s", os.listdir(build_dir))
    return build_dir
def combine_markdown(build_dir, exclude, output, index_path, library_name):
    """Concatenate the Markdown pages found in *build_dir* into one file.

    Page order in the combined file: ``index.md`` first (when present), then
    the pages named by ``extract_toctree_order(index_path)`` in that order,
    then every remaining page sorted by path. Each page is introduced by a
    ``## <file stem>`` heading and pages are separated by a ``---`` rule.

    Args:
        build_dir: Directory whose top-level ``*.md`` files are combined.
        exclude: Iterable of page names (without the ``.md`` extension) to
            leave out; blank entries are ignored.
        output: Path of the combined file. A bare filename is written to the
            current directory; missing parent directories are created.
        index_path: Path handed to ``extract_toctree_order``. A falsy value
            skips toctree-based ordering.
        library_name: Name interpolated into the top-level title line.
    """
    excluded_files = {f"{name.strip()}.md" for name in exclude if name.strip()}

    # Single pass over the build output: drop excluded pages, remember the
    # index page, and key every other page by its file stem.
    index_md = None
    name_to_file = {}
    for path in glob.glob(os.path.join(build_dir, "*.md")):
        filename = os.path.basename(path)
        if filename in excluded_files:
            continue
        if filename.lower() == "index.md":
            index_md = path
        else:
            name_to_file[os.path.splitext(filename)[0]] = path

    toctree_order = extract_toctree_order(index_path) if index_path else []
    ordered = []
    for doc in toctree_order:
        # pop() ensures a page listed twice in the toctree is emitted once.
        if doc in name_to_file:
            ordered.append(name_to_file.pop(doc))
    ordered.extend(sorted(name_to_file.values()))
    final_order = ([index_md] if index_md else []) + ordered

    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when *output* actually has a directory component.
    parent_dir = os.path.dirname(output)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output, "w", encoding="utf-8") as out:
        out.write(f"# - {library_name} | Complete Documentation -\n\n")
        for position, page in enumerate(final_order):
            if position > 0:
                out.write("\n\n---\n\n")
            section = os.path.splitext(os.path.basename(page))[0]
            out.write(f"## {section}\n\n")
            with open(page, encoding="utf-8") as infile:
                out.write(infile.read())
            out.write("\n\n")
    logger.info(f" 📄 Combined markdown written to {output}")
def convert_notebook(nb_path):
    """Convert a notebook to Markdown by invoking the ``jupytext`` CLI.

    Args:
        nb_path: Path of the notebook to convert.

    Returns:
        The notebook path with its extension replaced by ``.md`` when that
        file exists after a successful ``jupytext`` run. ``None`` when
        ``jupytext`` is not on ``PATH``, when it exits with a non-zero status,
        or when the expected file is missing afterwards; each failure is
        logged as an error.
    """
    if shutil.which("jupytext") is None:
        logger.error(" 📄 jupytext is required to convert notebooks.")
        return None

    # The converted file is expected next to the notebook, same stem, ".md".
    stem, _extension = os.path.splitext(nb_path)
    expected_md = stem + ".md"

    logger.info(f" 📄 Converting notebook {nb_path} to markdown...")
    result = subprocess.run(
        ["jupytext", "--to", "md", "--opt", "notebook_metadata_filter=-all", nb_path],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        logger.error(f" 📄 Failed to convert notebook:\n{result.stderr}")
        return None
    if not os.path.exists(expected_md):
        logger.error(f" 📄 Expected markdown file {expected_md} not found after conversion.")
        return None

    logger.info(f" ✅ Notebook converted to {expected_md}")
    return expected_md
def append_notebook_markdown(output_file, notebook_md):
    """Append a Markdown file to *output_file* under a ``# Notebook`` heading.

    Args:
        output_file: File opened in append mode; receives the heading, a
            ``---`` rule and then the notebook text.
        notebook_md: UTF-8 Markdown file whose full content is appended.
    """
    # The destination is opened before the source, and the heading is written
    # before the source is read.
    with open(output_file, "a", encoding="utf-8") as combined:
        with open(notebook_md, encoding="utf-8") as notebook:
            combined.write("\n\n# Notebook\n\n---\n\n")
            combined.write(notebook.read())
    logger.info(f" 📄 Appended notebook markdown from {notebook_md} to {output_file}")
def main():
    """Build the Markdown docs, combine them, and optionally append a notebook.

    Steps:
        1. Parse the command line and resolve paths. ``conf.py`` and
           ``index.rst`` default to files inside the Sphinx source directory,
           and the library name defaults to the basename of the source root.
        2. Run the Sphinx Markdown build into a temporary directory.
        3. Combine the generated pages into ``--output``.
        4. If ``--notebook`` was given and converts successfully, append its
           Markdown to the combined file.

    Returns:
        None.
    """
    args = parse_args()
    exclude = args.exclude.split(",") if args.exclude else []
    sphinx_source = os.path.abspath(args.sphinx_source)
    conf_path = os.path.abspath(args.conf) if args.conf else os.path.join(sphinx_source, "conf.py")
    index_path = os.path.abspath(args.index) if args.index else os.path.join(sphinx_source, "index.rst")
    source_root = os.path.abspath(args.source_root)
    library_name = args.library_name if args.library_name else os.path.basename(source_root)

    build_dir = build_markdown(sphinx_source, conf_path, source_root)
    try:
        combine_markdown(build_dir, exclude, args.output, index_path, library_name)
    finally:
        # build_markdown() returns a tempfile.mkdtemp() directory that nothing
        # else deletes; remove it once the pages have been combined, even if
        # combining fails, so repeated runs do not pile up temp directories.
        shutil.rmtree(build_dir, ignore_errors=True)

    if args.notebook:
        notebook_md = convert_notebook(args.notebook)
        # A failed conversion is already logged by convert_notebook(); the
        # combined documentation is still kept without the notebook.
        if notebook_md:
            append_notebook_markdown(args.output, notebook_md)
    logger.info(" ✅ Sphinx to Markdown conversion successful.")
if __name__ == "__main__":
    # Hand main()'s return value to sys.exit(); a None result means status 0.
    exit_status = main()
    sys.exit(exit_status)