Coverage for agentos/rag/loader.py: 21%

81 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-07-02 09:59 +0800

1""" 

2文档加载器 — PDF / DOCX / TXT / Markdown 解析与分块。 

3 

4零外部 HTTP 依赖,纯本地解析。 

5""" 

6 

7from __future__ import annotations 

8 

9import os 

10from pathlib import Path 

11from typing import Iterator 

12 

13 

class Document:
    """A piece of text extracted from a source, plus where it came from.

    Instances are plain attribute holders:

    * ``content``  - the text of this piece.
    * ``source``   - identifier of the origin (empty string by default).
    * ``page``     - integer page value (``0`` by default).
    * ``metadata`` - free-form dict; any falsy argument (``None`` or an empty
      dict) is replaced by a fresh empty dict.
    """

    def __init__(self, content: str, source: str = "", page: int = 0, metadata: dict | None = None):
        self.content, self.source, self.page = content, source, page
        # Falsy values (None, {}) both end up as a brand-new empty dict.
        self.metadata = metadata if metadata else {}

    def __repr__(self):
        # Show only the origin and the text length, never the text itself.
        char_count = len(self.content)
        return f"Document(source={self.source!r}, chars={char_count})"

25 

26 

class DocumentLoader:
    """Load files from disk and split their text into overlapping chunks.

    PDF files are read with ``pypdf`` and DOCX files with ``python-docx``
    (both imported lazily, only when such a file is loaded). Every other
    supported suffix is read as UTF-8 text.

    Args:
        chunk_size: Maximum chunk length, in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Must be smaller than ``chunk_size`` for texts that need splitting.
    """

    SUPPORTED_SUFFIXES = {".pdf", ".docx", ".txt", ".md", ".markdown", ".py", ".json", ".yaml", ".yml"}

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_file(self, path: str) -> list[Document]:
        """Load a single file, choosing the reader from its suffix.

        Args:
            path: File path; it is made absolute and used as the ``source``
                of every returned document.

        Returns:
            The chunks of the file's text as ``Document`` objects.

        Raises:
            ValueError: If the suffix is not in ``SUPPORTED_SUFFIXES``, or if
                the chunking parameters are invalid for a text that must be
                split.
            ImportError: If the optional PDF/DOCX library is not installed.
        """
        path = os.path.abspath(path)
        suffix = Path(path).suffix.lower()
        if suffix not in self.SUPPORTED_SUFFIXES:
            raise ValueError(f"不支持的文件格式: {suffix}。支持: {self.SUPPORTED_SUFFIXES}")

        if suffix == ".pdf":
            text = self._read_pdf(path)
        elif suffix == ".docx":
            text = self._read_docx(path)
        else:
            # Undecodable bytes are replaced rather than raising, so a file
            # with a few bad bytes still loads.
            with open(path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read()

        return self._chunk(text, source=path)

    def load_directory(self, dir_path: str, recursive: bool = True) -> list[Document]:
        """Load every supported file found under ``dir_path``.

        Loading is best-effort: a file that fails to load is skipped and a
        warning is logged, so one bad file does not abort the whole walk.

        Args:
            dir_path: Directory to scan.
            recursive: When false, only files directly inside ``dir_path``
                are loaded.

        Returns:
            The documents of all files that loaded successfully, in a
            deterministic (sorted) traversal order.
        """
        # Imported locally, like the optional readers, so this block does not
        # depend on a module-level import.
        import logging

        logger = logging.getLogger(__name__)
        docs: list[Document] = []
        for root, dirs, files in os.walk(dir_path):
            # os.walk honours in-place edits of ``dirs``; sorting makes the
            # sub-directory order deterministic, matching the sorted files.
            dirs.sort()
            for fn in sorted(files):
                fp = os.path.join(root, fn)
                if Path(fp).suffix.lower() not in self.SUPPORTED_SUFFIXES:
                    continue
                try:
                    docs.extend(self.load_file(fp))
                except Exception as exc:
                    # Keep going, but leave a trace instead of failing silently.
                    logger.warning("跳过无法加载的文件 %s: %s", fp, exc)
            if not recursive:
                # The first os.walk entry is dir_path itself; stop after it.
                break
        return docs

    def _read_pdf(self, path: str) -> str:
        """Return the text of a PDF, pages separated by a blank line.

        Pages for which no text is extracted are skipped.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        # Only the import is guarded, so an ImportError raised while parsing
        # is not misreported as "pypdf is not installed".
        try:
            import pypdf
        except ImportError as exc:
            raise ImportError("pypdf 未安装。运行: pip install pypdf") from exc

        reader = pypdf.PdfReader(path)
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n\n".join(text for text in page_texts if text)

    def _read_docx(self, path: str) -> str:
        """Return the text of a DOCX file, one non-blank paragraph per line.

        Raises:
            ImportError: If ``python-docx`` is not installed.
        """
        # Only the import is guarded (see _read_pdf for the rationale).
        try:
            from docx import Document as DocxDocument
        except ImportError as exc:
            raise ImportError("python-docx 未安装。运行: pip install python-docx") from exc

        doc = DocxDocument(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

    @staticmethod
    def _split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
        """Split ``text`` into stripped windows of ``chunk_size`` characters.

        Consecutive windows start ``chunk_size - chunk_overlap`` characters
        apart. Windows that are empty after stripping are dropped, and
        splitting stops at the first window that reaches the end of the text.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size`` (the window would never
                advance).
        """
        step = chunk_size - chunk_overlap
        if chunk_size <= 0 or step <= 0:
            raise ValueError(
                f"无效的分块参数: chunk_size={chunk_size}, chunk_overlap={chunk_overlap}。"
                "要求 chunk_size > 0 且 chunk_overlap < chunk_size"
            )

        total = len(text)
        pieces: list[str] = []
        start = 0
        while start < total:
            end = min(start + chunk_size, total)
            piece = text[start:end].strip()
            if piece:
                pieces.append(piece)
            if end == total:
                # Any later window would be a pure suffix of this one.
                break
            start += step
        return pieces

    def _chunk(self, text: str, source: str = "") -> list[Document]:
        """Wrap ``text`` into fixed-size, overlapping ``Document`` chunks.

        A text no longer than ``chunk_size`` always yields exactly one
        document (its stripped content may be empty). Longer texts are split
        with ``_split_text`` and every chunk carries the same ``source``.

        Raises:
            ValueError: If the text must be split and the chunking parameters
                are invalid (see ``_split_text``).
        """
        if len(text) <= self.chunk_size:
            return [Document(content=text.strip(), source=source)]

        pieces = self._split_text(text, self.chunk_size, self.chunk_overlap)
        return [Document(content=piece, source=source) for piece in pieces]

112 

113 

114# ── 便捷函数 ────────────────────────────────────────────────── 

115 

def load_file(path: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[Document]:
    """Convenience wrapper: load one file with a throwaway ``DocumentLoader``.

    Builds a loader from ``chunk_size`` and ``chunk_overlap`` and returns
    whatever its ``load_file(path)`` returns.
    """
    return DocumentLoader(chunk_size=chunk_size, chunk_overlap=chunk_overlap).load_file(path)

120 

121 

def load_directory(dir_path: str, recursive: bool = True, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[Document]:
    """Convenience wrapper: load a directory with a throwaway ``DocumentLoader``.

    Builds a loader from ``chunk_size`` and ``chunk_overlap`` and returns
    whatever its ``load_directory(dir_path, recursive=recursive)`` returns.
    """
    return DocumentLoader(chunk_size=chunk_size, chunk_overlap=chunk_overlap).load_directory(
        dir_path, recursive=recursive
    )