# Source code for jeevesagent.loader.csv

"""CSV / TSV loaders → markdown table.

Uses the stdlib ``csv`` module. The first row is treated as the
header. Each row becomes a markdown table row. For very large CSVs
(hundreds of thousands of rows), prefer pandas + manual chunking;
this loader is intended for documents an LLM will read end-to-end.
"""

from __future__ import annotations

import csv as _stdlib_csv
from pathlib import Path

from .base import Document


def _escape_pipe(s: str) -> str:
    """Make a cell value safe to place inside one markdown table row.

    Pipes are backslash-escaped because an unescaped ``|`` would be read
    as a column separator. Line breaks are replaced by a single space
    because a table row has to stay on one physical line; CRLF and bare
    CR are treated the same way as LF so no stray carriage return is left
    behind in the row.

    Args:
        s: Raw cell text.

    Returns:
        The cell text with pipes escaped and line breaks flattened.
    """
    escaped = s.replace("|", "\\|")
    # Handle CRLF first so a Windows line ending yields one space, not two.
    return escaped.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")


def _rows_to_markdown_table(rows: list[list[str]]) -> str:
    """Render parsed rows as a markdown table.

    The first row is used as the header and every later row as a body
    row. All rows, the header included, are padded with empty cells to
    the width of the widest row, so a body row with more cells than the
    header gets blank header columns instead of running past the header
    and delimiter rows.

    Args:
        rows: Parsed cell values, one inner list per row.

    Returns:
        The table lines joined with ``"\\n"`` (no trailing newline), or
        the literal ``"(empty)"`` when ``rows`` is empty.
    """
    if not rows:
        return "(empty)"

    # Size the table to the widest row so ragged input stays rectangular.
    width = max(len(row) for row in rows)

    def render(cells: list[str]) -> str:
        # Pad short rows so every line has the same number of cells.
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(_escape_pipe(c) for c in padded) + " |"

    header, *body = rows
    lines = [
        render(header),
        "| " + " | ".join(["---"] * width) + " |",
    ]
    lines.extend(render(row) for row in body)
    return "\n".join(lines)


def _load_delimited(
    path: Path, delimiter: str, format_name: str
) -> Document:
    """Read a delimited text file and wrap it as a markdown ``Document``.

    Args:
        path: File to read.
        delimiter: Field separator handed to ``csv.reader``.
        format_name: Value stored under the ``"format"`` metadata key.

    Returns:
        A ``Document`` whose content is a ``# <file name>`` heading
        followed by the markdown table, and whose metadata holds
        ``source``, ``format``, ``row_count`` (rows after the first) and
        ``column_count`` (cells in the first row).
    """
    # ``utf-8-sig`` strips a leading byte-order mark, which some tools
    # (e.g. spreadsheet exports) write, so it does not end up glued to the
    # first header cell; files without a BOM decode exactly as UTF-8.
    # ``newline=""`` leaves line-ending handling to the csv module, which
    # keeps line breaks inside quoted fields intact.
    with path.open("r", encoding="utf-8-sig", newline="") as fh:
        reader = _stdlib_csv.reader(fh, delimiter=delimiter)
        rows = list(reader)

    table = _rows_to_markdown_table(rows)
    content = f"# {path.name}\n\n{table}\n"
    return Document(
        content=content,
        metadata={
            "source": str(path),
            "format": format_name,
            "row_count": max(len(rows) - 1, 0),  # excluding header
            "column_count": len(rows[0]) if rows else 0,
        },
    )


def load_csv(path: str | Path) -> Document:
    """Load a comma-separated file → markdown table.

    Args:
        path: Location of the file, as a string or ``Path``.

    Returns:
        The ``Document`` built by ``_load_delimited`` with ``","`` as the
        delimiter and ``"csv"`` as the format name.
    """
    return _load_delimited(Path(path), delimiter=",", format_name="csv")


def load_tsv(path: str | Path) -> Document:
    """Load a tab-separated file → markdown table.

    Args:
        path: Location of the file, as a string or ``Path``.

    Returns:
        The ``Document`` built by ``_load_delimited`` with a tab as the
        delimiter and ``"tsv"`` as the format name.
    """
    return _load_delimited(Path(path), delimiter="\t", format_name="tsv")