Coverage for agentos/rag/loader.py: 21%
81 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
« prev ^ index » next coverage.py v7.14.3, created at 2026-07-02 09:59 +0800
1"""
2文档加载器 — PDF / DOCX / TXT / Markdown 解析与分块。
4零外部 HTTP 依赖,纯本地解析。
5"""
7from __future__ import annotations
9import os
10from pathlib import Path
11from typing import Iterator
class Document:
    """A piece of text extracted from a source, plus where it came from.

    Attributes:
        content: The text of this piece.
        source: Identifier of the origin (the loader passes a file path).
        page: Page number associated with the piece; defaults to 0.
        metadata: Free-form extra information about the piece.
    """

    def __init__(self, content: str, source: str = "", page: int = 0, metadata: dict | None = None):
        self.content, self.source, self.page = content, source, page
        # Any falsy ``metadata`` (None or an empty mapping) is replaced by a
        # fresh dict, so instances never share a default mapping.
        self.metadata = metadata if metadata else {}

    def __repr__(self):
        # Report the content length rather than the content to keep it short.
        return f"Document(source={self.source!r}, chars={len(self.content)})"
class DocumentLoader:
    """Load documents from disk and split them into overlapping text chunks.

    ``.pdf`` files are parsed with ``pypdf`` and ``.docx`` files with
    ``python-docx``; both packages are imported lazily, only when such a file
    is actually read. Every other supported suffix is read as UTF-8 text.

    Args:
        chunk_size: Maximum chunk length, in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Must be smaller than ``chunk_size`` for any text that needs
            splitting (checked when chunking, not at construction).
    """

    SUPPORTED_SUFFIXES = {".pdf", ".docx", ".txt", ".md", ".markdown", ".py", ".json", ".yaml", ".yml"}

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_file(self, path: str) -> list[Document]:
        """Load one file, choosing the parser from its (case-insensitive) suffix.

        Args:
            path: File to load; it is converted to an absolute path, which
                also becomes the ``source`` of every returned chunk.

        Returns:
            The file's text split into chunks.

        Raises:
            ValueError: If the suffix is not in ``SUPPORTED_SUFFIXES``, or the
                chunk settings cannot split the text (see ``_chunk``).
            ImportError: If the parser package for a PDF/DOCX file is missing.
        """
        path = os.path.abspath(path)
        suffix = Path(path).suffix.lower()
        if suffix not in self.SUPPORTED_SUFFIXES:
            raise ValueError(f"不支持的文件格式: {suffix}。支持: {self.SUPPORTED_SUFFIXES}")

        if suffix == ".pdf":
            text = self._read_pdf(path)
        elif suffix == ".docx":
            text = self._read_docx(path)
        else:
            # Undecodable bytes are replaced rather than aborting the load.
            with open(path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read()

        return self._chunk(text, source=path)

    def load_directory(self, dir_path: str, recursive: bool = True) -> list[Document]:
        """Load every supported file under a directory.

        Loading is best-effort: a file that fails to load is skipped and a
        warning is logged, so one bad file does not abort the whole walk.

        Args:
            dir_path: Directory to scan.
            recursive: When False, only files directly inside ``dir_path``
                are loaded; subdirectories are ignored.

        Returns:
            Chunks of all successfully loaded files, in sorted traversal order.
        """
        import logging  # function-scope import keeps the module's import block untouched

        logger = logging.getLogger(__name__)
        docs: list[Document] = []
        for root, dirs, files in os.walk(dir_path):
            # Sorting in place makes os.walk descend in a deterministic order,
            # matching the sorted file order below.
            dirs.sort()
            for fn in sorted(files):
                fp = os.path.join(root, fn)
                if Path(fp).suffix.lower() not in self.SUPPORTED_SUFFIXES:
                    continue
                try:
                    docs.extend(self.load_file(fp))
                except Exception as exc:
                    # Deliberately broad: keep going, but leave a trace
                    # instead of silently dropping the file.
                    logger.warning("Skipping %s: %s: %s", fp, type(exc).__name__, exc)
            if not recursive:
                # The first os.walk entry is dir_path itself; stop after it.
                break
        return docs

    def _read_pdf(self, path: str) -> str:
        """Return the text of a PDF, pages joined by blank lines.

        Pages for which no text is extracted are omitted.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        # Only the import is guarded, so an ImportError raised while parsing
        # is not misreported as "pypdf is not installed".
        try:
            import pypdf
        except ImportError as exc:
            raise ImportError("pypdf 未安装。运行: pip install pypdf") from exc

        reader = pypdf.PdfReader(path)
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n\n".join(text for text in page_texts if text)

    def _read_docx(self, path: str) -> str:
        """Return the text of a DOCX file, one non-blank paragraph per line.

        Raises:
            ImportError: If ``python-docx`` is not installed.
        """
        # Only the import is guarded (same reasoning as in ``_read_pdf``).
        try:
            from docx import Document as DocxDocument
        except ImportError as exc:
            raise ImportError("python-docx 未安装。运行: pip install python-docx") from exc

        doc = DocxDocument(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

    def _chunk(self, text: str, source: str = "") -> list[Document]:
        """Split text into fixed-size windows that overlap their neighbours.

        Text no longer than ``chunk_size`` is returned as a single stripped
        chunk. Longer text is cut into windows of ``chunk_size`` characters
        whose starts are ``chunk_size - chunk_overlap`` apart; each window is
        stripped and windows that end up empty are dropped.

        Args:
            text: Text to split.
            source: Value stored as ``source`` on every chunk.

        Returns:
            The chunks, in order of appearance.

        Raises:
            ValueError: If the text needs splitting but ``chunk_size`` is not
                positive or ``chunk_overlap`` is not smaller than it; the
                window would never advance.
        """
        if len(text) <= self.chunk_size:
            return [Document(content=text.strip(), source=source)]

        step = self.chunk_size - self.chunk_overlap
        if self.chunk_size <= 0 or step <= 0:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size}), and chunk_size must be positive"
            )

        chunks = []
        total = len(text)
        for start in range(0, total, step):
            end = min(start + self.chunk_size, total)
            piece = text[start:end].strip()
            if piece:
                chunks.append(Document(content=piece, source=source))
            if end >= total:
                # This window already reached the end; any further window
                # would only repeat a suffix of it.
                break
        return chunks
114# ── 便捷函数 ──────────────────────────────────────────────────
def load_file(path: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[Document]:
    """Convenience wrapper: load a single file with a throwaway loader.

    Args:
        path: File to load.
        chunk_size: Chunk size, in characters, passed to ``DocumentLoader``.
        chunk_overlap: Overlap, in characters, passed to ``DocumentLoader``.

    Returns:
        Whatever ``DocumentLoader.load_file`` returns for ``path``.
    """
    return DocumentLoader(chunk_size=chunk_size, chunk_overlap=chunk_overlap).load_file(path)
def load_directory(dir_path: str, recursive: bool = True, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[Document]:
    """Convenience wrapper: load a directory with a throwaway loader.

    Args:
        dir_path: Directory to scan.
        recursive: Forwarded to ``DocumentLoader.load_directory``.
        chunk_size: Chunk size, in characters, passed to ``DocumentLoader``.
        chunk_overlap: Overlap, in characters, passed to ``DocumentLoader``.

    Returns:
        Whatever ``DocumentLoader.load_directory`` returns for ``dir_path``.
    """
    settings = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return DocumentLoader(**settings).load_directory(dir_path, recursive=recursive)