Coverage for src / dataknobs_xization / ingestion / config.py: 92%

120 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 16:16 -0700

1"""Configuration schema for knowledge base ingestion. 

2 

3This module provides configuration classes for loading and processing 

4documents from a directory into a knowledge base. 

5""" 

6 

7from __future__ import annotations 

8 

9import json 

10import logging 

11from dataclasses import dataclass, field 

12from pathlib import Path 

13from typing import Any 

14 

15logger = logging.getLogger(__name__) 

16 

17 

class IngestionConfigError(Exception):
    """Error related to ingestion configuration."""

22 

23 

@dataclass
class FilePatternConfig:
    """Configuration for a specific file pattern.

    Allows overriding chunking and metadata settings for files
    matching a glob pattern.

    Attributes:
        pattern: Glob pattern to match files (e.g., "api/**/*.json")
        enabled: Whether to process files matching this pattern
        chunking: Override chunking settings for matched files
        text_template: Jinja2 template for JSON text generation
        text_fields: Fields to use for text generation (JSON)
        metadata_fields: Fields to include in chunk metadata
    """

    pattern: str
    enabled: bool = True
    chunking: dict[str, Any] | None = None
    text_template: str | None = None
    text_fields: list[str] | None = None
    metadata_fields: list[str] | None = None

    # Optional attributes serialized only when they hold a truthy value.
    _OPTIONAL_KEYS = ("chunking", "text_template", "text_fields", "metadata_fields")

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation."""
        out: dict[str, Any] = {"pattern": self.pattern}
        # `enabled` defaults to True, so only the non-default value is emitted.
        if not self.enabled:
            out["enabled"] = False
        for key in self._OPTIONAL_KEYS:
            value = getattr(self, key)
            if value:
                out[key] = value
        return out

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FilePatternConfig:
        """Create from dictionary representation."""
        # Missing optional keys fall back to None, mirroring the defaults.
        optional = {key: data.get(key) for key in cls._OPTIONAL_KEYS}
        return cls(
            pattern=data["pattern"],
            enabled=data.get("enabled", True),
            **optional,
        )

73 

74 

@dataclass
class KnowledgeBaseConfig:
    """Configuration for knowledge base ingestion from a directory.

    Defines how documents in a directory should be processed, chunked,
    and prepared for embedding. Supports glob-pattern based overrides
    for different file types.

    Attributes:
        name: Name of the knowledge base
        default_chunking: Default chunking settings for all files
        default_quality_filter: Default quality filter settings
        patterns: List of file pattern configurations with overrides
        exclude_patterns: Glob patterns for files to skip
        default_metadata: Metadata to attach to all chunks

    Example:
        ```yaml
        name: product-docs
        default_chunking:
          max_chunk_size: 500
          chunk_overlap: 50

        patterns:
          - pattern: "api/**/*.json"
            text_template: "API: {{ method }} {{ path }}\\n{{ description }}"
            metadata_fields: [method, path, auth_required]

          - pattern: "guides/**/*.md"
            chunking:
              max_chunk_size: 800

        exclude_patterns:
          - "**/drafts/**"
          - "**/.git/**"
        ```
    """

    name: str
    default_chunking: dict[str, Any] = field(default_factory=lambda: {
        "max_chunk_size": 500,
        "chunk_overlap": 50,
    })
    default_quality_filter: dict[str, Any] | None = None
    patterns: list[FilePatternConfig] = field(default_factory=list)
    exclude_patterns: list[str] = field(default_factory=list)
    default_metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def load(cls, directory: str | Path) -> KnowledgeBaseConfig:
        """Load configuration from a directory.

        Looks for `knowledge_base.yaml`, `knowledge_base.yml`, or
        `knowledge_base.json` in the directory.

        Args:
            directory: Directory containing the config file

        Returns:
            Loaded KnowledgeBaseConfig instance

        Raises:
            IngestionConfigError: If config file is invalid or missing
        """
        directory = Path(directory)
        config_path = cls._find_config_file(directory)

        if config_path is None:
            # A missing config file is not an error: fall back to defaults
            # named after the directory itself.
            logger.debug(
                f"No knowledge_base config found in {directory}, using defaults"
            )
            return cls(name=directory.name)

        try:
            data = cls._load_file(config_path)
        except Exception as e:
            raise IngestionConfigError(
                f"Failed to load config from {config_path}: {e}"
            ) from e

        return cls.from_dict(data, default_name=directory.name)

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        default_name: str = "knowledge_base",
    ) -> KnowledgeBaseConfig:
        """Create from dictionary representation.

        Args:
            data: Configuration dictionary
            default_name: Default name if not specified in data

        Returns:
            KnowledgeBaseConfig instance
        """
        # Accept either plain dicts or pre-built FilePatternConfig objects.
        patterns = [
            FilePatternConfig.from_dict(p) if isinstance(p, dict) else p
            for p in data.get("patterns", [])
        ]

        return cls(
            name=data.get("name", default_name),
            default_chunking=data.get("default_chunking", {
                "max_chunk_size": 500,
                "chunk_overlap": 50,
            }),
            default_quality_filter=data.get("default_quality_filter"),
            patterns=patterns,
            exclude_patterns=data.get("exclude_patterns", []),
            default_metadata=data.get("default_metadata", {}),
        )

    @classmethod
    def _find_config_file(cls, directory: Path) -> Path | None:
        """Find the config file in a directory.

        Args:
            directory: Directory to search

        Returns:
            Path to config file, or None if not found
        """
        # Checked in priority order: .yaml wins over .yml wins over .json.
        for name in ["knowledge_base.yaml", "knowledge_base.yml", "knowledge_base.json"]:
            path = directory / name
            if path.exists():
                return path
        return None

    @classmethod
    def _load_file(cls, path: Path) -> dict[str, Any]:
        """Load and parse a config file.

        Args:
            path: Path to config file

        Returns:
            Parsed configuration dictionary

        Raises:
            IngestionConfigError: If PyYAML is missing for a YAML file, or
                the parsed content is not a dictionary.
        """
        with open(path, encoding="utf-8") as f:
            if path.suffix in [".yaml", ".yml"]:
                try:
                    import yaml
                    data = yaml.safe_load(f)
                except ImportError as e:
                    # Chain the original ImportError for easier debugging.
                    raise IngestionConfigError(
                        "PyYAML is required to load YAML config files. "
                        "Install with: pip install pyyaml"
                    ) from e
            else:
                data = json.load(f)

        if not isinstance(data, dict):
            raise IngestionConfigError(
                f"Config file must contain a dictionary: {path}"
            )

        return data

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation."""
        result: dict[str, Any] = {"name": self.name}

        if self.default_chunking:
            result["default_chunking"] = self.default_chunking

        if self.default_quality_filter:
            result["default_quality_filter"] = self.default_quality_filter

        if self.patterns:
            result["patterns"] = [p.to_dict() for p in self.patterns]

        if self.exclude_patterns:
            result["exclude_patterns"] = self.exclude_patterns

        if self.default_metadata:
            result["default_metadata"] = self.default_metadata

        return result

    def get_pattern_config(self, filepath: str | Path) -> FilePatternConfig | None:
        """Get the pattern config that matches a file path.

        Returns the first matching pattern config, or None if no pattern matches.
        Patterns are checked in order, so more specific patterns should come first.

        Args:
            filepath: Path to check (relative to knowledge base root)

        Returns:
            Matching FilePatternConfig, or None
        """
        filepath = Path(filepath)

        for pattern_config in self.patterns:
            if pattern_config.enabled and self._matches_pattern(filepath, pattern_config.pattern):
                return pattern_config

        return None

    def is_excluded(self, filepath: str | Path) -> bool:
        """Check if a file path matches any exclude pattern.

        Args:
            filepath: Path to check (relative to knowledge base root)

        Returns:
            True if file should be excluded
        """
        filepath = Path(filepath)

        for pattern in self.exclude_patterns:
            if self._matches_pattern(filepath, pattern):
                return True

        return False

    def _matches_pattern(self, filepath: Path, pattern: str) -> bool:
        """Check if a filepath matches a glob pattern.

        Handles both fnmatch-style and glob-style patterns including `**`,
        where `**` matches zero or more whole path segments.

        Args:
            filepath: Path to check
            pattern: Glob pattern

        Returns:
            True if path matches pattern
        """
        import re
        from fnmatch import fnmatch

        # Use POSIX separators so "/"-separated patterns also work on Windows.
        filepath_str = filepath.as_posix()

        if "**" in pattern:
            # BUG FIX: the previous Path.match() call does NOT treat "**" as a
            # recursive glob (per the pathlib docs it acts like a single "*"),
            # so "api/**/*.json" missed nested files such as "api/a/b/c.json".
            # Path.match() also matches relative patterns from the right,
            # over-matching unrelated prefixes. Translate to an anchored regex
            # where "**" spans any number of path segments instead.
            return re.fullmatch(self._glob_to_regex(pattern), filepath_str) is not None

        # Simple patterns keep the original fnmatch semantics.
        return fnmatch(filepath_str, pattern)

    @staticmethod
    def _glob_to_regex(pattern: str) -> str:
        """Translate a glob pattern containing `**` into a regex string.

        `**` matches zero or more path segments; `*` and `?` match within a
        single segment only (they never cross "/").

        Args:
            pattern: Glob pattern using "/" as the separator

        Returns:
            Regex source string suitable for `re.fullmatch`
        """
        import re

        segments = pattern.split("/")
        last = len(segments) - 1
        parts: list[str] = []
        for i, segment in enumerate(segments):
            if segment == "**":
                # Trailing "**" swallows the rest of the path; elsewhere it
                # matches zero or more complete segments (including none).
                parts.append(".*" if i == last else "(?:[^/]+/)*")
                continue
            piece = "".join(
                "[^/]*" if ch == "*" else "[^/]" if ch == "?" else re.escape(ch)
                for ch in segment
            )
            parts.append(piece if i == last else piece + "/")
        return "".join(parts)

    def get_chunking_config(self, filepath: str | Path) -> dict[str, Any]:
        """Get the effective chunking config for a file.

        Merges default chunking with any pattern-specific overrides.

        Args:
            filepath: Path to file

        Returns:
            Merged chunking configuration
        """
        # Copy so pattern overrides never mutate the shared defaults.
        config = self.default_chunking.copy()

        pattern_config = self.get_pattern_config(filepath)
        if pattern_config and pattern_config.chunking:
            config.update(pattern_config.chunking)

        return config

    def get_metadata(self, filepath: str | Path) -> dict[str, Any]:
        """Get the effective metadata for a file.

        Includes default metadata plus source file info.

        Args:
            filepath: Path to file

        Returns:
            Metadata dictionary
        """
        filepath = Path(filepath)
        metadata = self.default_metadata.copy()
        metadata["source"] = str(filepath)
        metadata["filename"] = filepath.name
        return metadata