# Source code for jeevesagent.loader.docx
"""DOCX loader → markdown.
Uses ``python-docx`` (lazy import). Walks the document body and
emits markdown:
* ``Heading 1`` / ``Heading 2`` / ... → ``#``, ``##``, ...
* Lists → ``-`` items
* Plain paragraphs → text
* Tables → markdown tables
"""
from __future__ import annotations
from pathlib import Path
from .base import Document
def _heading_level(style_name: str) -> int | None:
"""Return the heading level (1-6) for a Word style name, or None
if it isn't a heading."""
if not style_name:
return None
name = style_name.lower()
if name == "title":
return 1
if name.startswith("heading "):
try:
level = int(name[len("heading ") :])
except ValueError:
return None
return max(1, min(level, 6))
return None
def _is_list_paragraph(paragraph: object) -> bool:
"""Best-effort detect of list paragraphs. Word's list styles
aren't always reliable; this catches the common cases."""
style = getattr(paragraph, "style", None)
style_name = (
getattr(style, "name", "") if style is not None else ""
).lower()
return "list" in style_name or "bullet" in style_name
def _table_to_markdown(table: object) -> str:
"""Convert a ``docx.table.Table`` to a markdown table."""
rows = []
for row in getattr(table, "rows", []):
cells = [
(getattr(cell, "text", "") or "").replace("\n", " ").strip()
for cell in getattr(row, "cells", [])
]
cells = [c.replace("|", "\\|") for c in cells]
rows.append(cells)
if not rows:
return ""
header = rows[0]
out = [
"| " + " | ".join(header) + " |",
"| " + " | ".join("---" for _ in header) + " |",
]
for row in rows[1:]:
padded = row + [""] * (len(header) - len(row))
out.append("| " + " | ".join(padded) + " |")
return "\n".join(out)
# [docs]
def load_docx(path: str | Path) -> Document:
    """Load a ``.docx`` file → markdown.

    Requires ``python-docx``:
    ``pip install 'jeevesagent[loader-docx]'``.

    Body-level paragraphs and tables are emitted in document order:
    headings become ``#``-prefixed lines, list paragraphs become ``-``
    items, other paragraphs are emitted as plain text and tables as
    markdown tables. Runs of blank lines are collapsed to one. If the
    result does not already start with a level-1 heading, a ``# <title>``
    line is prepended, where the title comes from the document's core
    properties or, failing that, the file stem.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        A ``Document`` whose ``content`` is the markdown text and whose
        ``metadata`` carries ``source``, ``format``, ``title``,
        ``paragraph_count`` and ``table_count``.

    Raises:
        ImportError: If ``python-docx`` is not installed.
    """
    try:
        import docx  # type: ignore[import-not-found, import-untyped]
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "python-docx is not installed. "
            "Install with: pip install 'jeevesagent[loader-docx]'."
        ) from exc

    p = Path(path)
    document = docx.Document(str(p))

    paragraphs = list(document.paragraphs)
    tables = list(document.tables)

    # The high-level API exposes paragraphs and tables as two separate
    # lists, which loses their relative order. Index the wrappers by the
    # identity of their underlying XML element so the body can be walked
    # in order with one O(1) lookup per child instead of a linear scan.
    # The wrappers (kept alive by the lists above) hold the elements, so
    # the ids stay valid for the duration of the walk.
    para_by_element = {
        id(para._element): para for para in paragraphs  # noqa: SLF001
    }
    table_by_element = {
        id(table._element): table for table in tables  # noqa: SLF001
    }

    parts: list[str] = []
    for child in document.element.body.iterchildren():
        raw_tag = child.tag
        if not isinstance(raw_tag, str):
            # Non-element nodes (e.g. XML comments) have no string tag.
            continue
        tag = raw_tag.split("}", 1)[-1]  # drop the "{namespace}" prefix

        if tag == "p":
            para = para_by_element.get(id(child))
            if para is None:
                continue
            text = (para.text or "").strip()
            if not text:
                # Keep empty paragraphs as blank lines (collapsed below).
                parts.append("")
                continue
            level = _heading_level(para.style.name if para.style else "")
            if level is not None:
                parts.append(f"{'#' * level} {text}")
            elif _is_list_paragraph(para):
                parts.append(f"- {text}")
            else:
                parts.append(text)
        elif tag == "tbl":
            table = table_by_element.get(id(child))
            if table is None:
                continue
            md = _table_to_markdown(table)
            if md:
                # Blank lines on both sides keep the table separate from
                # the surrounding text.
                parts.extend(("", md, ""))

    # Collapse runs of blank lines.
    out: list[str] = []
    prev_blank = False
    for line in parts:
        is_blank = not line.strip()
        if is_blank and prev_blank:
            continue
        out.append(line)
        prev_blank = is_blank
    body_text = "\n".join(out).strip()

    # Document properties
    core_props = document.core_properties
    title = (core_props.title or "").strip() if core_props else ""
    if not title and p.stem:
        title = p.stem

    if body_text.startswith("# "):
        content = body_text + "\n"
    elif body_text:
        content = f"# {title}\n\n{body_text}\n"
    else:
        # Empty document: emit only the title line, without the stray
        # trailing blank lines an empty body would otherwise leave.
        content = f"# {title}\n"

    return Document(
        content=content,
        metadata={
            "source": str(p),
            "format": "docx",
            "title": title,
            "paragraph_count": len(paragraphs),
            "table_count": len(tables),
        },
    )