Coverage for src / dataknobs_xization / ingestion / config.py: 92%
120 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-26 16:16 -0700
"""Configuration schema for knowledge base ingestion.

This module provides configuration classes for loading and processing
documents from a directory into a knowledge base.
"""
7from __future__ import annotations
9import json
10import logging
11from dataclasses import dataclass, field
12from pathlib import Path
13from typing import Any
15logger = logging.getLogger(__name__)
class IngestionConfigError(Exception):
    """Raised for problems with ingestion configuration (missing/invalid files, bad schema)."""
@dataclass
class FilePatternConfig:
    """Processing overrides for files matching one glob pattern.

    Files whose relative path matches ``pattern`` may get custom chunking
    or metadata handling, or be skipped entirely via ``enabled``.

    Attributes:
        pattern: Glob pattern to match files (e.g., "api/**/*.json")
        enabled: Whether to process files matching this pattern
        chunking: Override chunking settings for matched files
        text_template: Jinja2 template for JSON text generation
        text_fields: Fields to use for text generation (JSON)
        metadata_fields: Fields to include in chunk metadata
    """

    pattern: str
    enabled: bool = True
    chunking: dict[str, Any] | None = None
    text_template: str | None = None
    text_fields: list[str] | None = None
    metadata_fields: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary, omitting fields left at their defaults."""
        out: dict[str, Any] = {"pattern": self.pattern}
        if not self.enabled:
            out["enabled"] = False
        # Optional override fields are emitted only when truthy.
        for attr in ("chunking", "text_template", "text_fields", "metadata_fields"):
            value = getattr(self, attr)
            if value:
                out[attr] = value
        return out

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FilePatternConfig:
        """Build an instance from a dictionary; only "pattern" is required."""
        return cls(
            data["pattern"],
            data.get("enabled", True),
            data.get("chunking"),
            data.get("text_template"),
            data.get("text_fields"),
            data.get("metadata_fields"),
        )
@dataclass
class KnowledgeBaseConfig:
    """Configuration for knowledge base ingestion from a directory.

    Defines how documents in a directory should be processed, chunked,
    and prepared for embedding. Supports glob-pattern based overrides
    for different file types.

    Attributes:
        name: Name of the knowledge base
        default_chunking: Default chunking settings for all files
        default_quality_filter: Default quality filter settings
        patterns: List of file pattern configurations with overrides
        exclude_patterns: Glob patterns for files to skip
        default_metadata: Metadata to attach to all chunks

    Example:
        ```yaml
        name: product-docs
        default_chunking:
          max_chunk_size: 500
          chunk_overlap: 50

        patterns:
          - pattern: "api/**/*.json"
            text_template: "API: {{ method }} {{ path }}\\n{{ description }}"
            metadata_fields: [method, path, auth_required]

          - pattern: "guides/**/*.md"
            chunking:
              max_chunk_size: 800

        exclude_patterns:
          - "**/drafts/**"
          - "**/.git/**"
        ```
    """

    name: str
    default_chunking: dict[str, Any] = field(default_factory=lambda: {
        "max_chunk_size": 500,
        "chunk_overlap": 50,
    })
    default_quality_filter: dict[str, Any] | None = None
    patterns: list[FilePatternConfig] = field(default_factory=list)
    exclude_patterns: list[str] = field(default_factory=list)
    default_metadata: dict[str, Any] = field(default_factory=dict)

    # Candidate config filenames, checked in priority order.
    # (Unannotated, so the dataclass machinery does not treat it as a field.)
    _CONFIG_FILENAMES = (
        "knowledge_base.yaml",
        "knowledge_base.yml",
        "knowledge_base.json",
    )

    @classmethod
    def load(cls, directory: str | Path) -> KnowledgeBaseConfig:
        """Load configuration from a directory.

        Looks for `knowledge_base.yaml`, `knowledge_base.yml`, or
        `knowledge_base.json` in the directory.

        Args:
            directory: Directory containing the config file

        Returns:
            Loaded KnowledgeBaseConfig instance

        Raises:
            IngestionConfigError: If config file is invalid or missing
        """
        directory = Path(directory)
        config_path = cls._find_config_file(directory)

        if config_path is None:
            # A missing config file is not an error: fall back to defaults
            # named after the directory. Lazy %-style args avoid building
            # the message when debug logging is disabled.
            logger.debug(
                "No knowledge_base config found in %s, using defaults", directory
            )
            return cls(name=directory.name)

        try:
            data = cls._load_file(config_path)
        except IngestionConfigError:
            # _load_file's own errors are already descriptive; don't
            # double-wrap them in another IngestionConfigError.
            raise
        except Exception as e:
            raise IngestionConfigError(
                f"Failed to load config from {config_path}: {e}"
            ) from e

        return cls.from_dict(data, default_name=directory.name)

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        default_name: str = "knowledge_base",
    ) -> KnowledgeBaseConfig:
        """Create from dictionary representation.

        Args:
            data: Configuration dictionary
            default_name: Default name if not specified in data

        Returns:
            KnowledgeBaseConfig instance
        """
        # Pattern entries may already be FilePatternConfig instances
        # (e.g. when round-tripping programmatically built configs).
        patterns = [
            FilePatternConfig.from_dict(p) if isinstance(p, dict) else p
            for p in data.get("patterns", [])
        ]

        return cls(
            name=data.get("name", default_name),
            default_chunking=data.get("default_chunking", {
                "max_chunk_size": 500,
                "chunk_overlap": 50,
            }),
            default_quality_filter=data.get("default_quality_filter"),
            patterns=patterns,
            exclude_patterns=data.get("exclude_patterns", []),
            default_metadata=data.get("default_metadata", {}),
        )

    @classmethod
    def _find_config_file(cls, directory: Path) -> Path | None:
        """Find the config file in a directory.

        Args:
            directory: Directory to search

        Returns:
            Path to the first existing candidate file, or None if not found
        """
        for name in cls._CONFIG_FILENAMES:
            candidate = directory / name
            if candidate.exists():
                return candidate
        return None

    @classmethod
    def _load_file(cls, path: Path) -> dict[str, Any]:
        """Load and parse a YAML or JSON config file.

        Args:
            path: Path to config file

        Returns:
            Parsed configuration dictionary

        Raises:
            IngestionConfigError: If PyYAML is missing for a YAML file, or
                the parsed content is not a dictionary.
        """
        with open(path, encoding="utf-8") as f:
            if path.suffix in (".yaml", ".yml"):
                try:
                    import yaml
                except ImportError as e:
                    # Chain the original ImportError so the root cause is
                    # visible in tracebacks.
                    raise IngestionConfigError(
                        "PyYAML is required to load YAML config files. "
                        "Install with: pip install pyyaml"
                    ) from e
                data = yaml.safe_load(f)
            else:
                data = json.load(f)

        if not isinstance(data, dict):
            raise IngestionConfigError(
                f"Config file must contain a dictionary: {path}"
            )

        return data

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation, omitting empty sections."""
        result: dict[str, Any] = {"name": self.name}

        if self.default_chunking:
            result["default_chunking"] = self.default_chunking

        if self.default_quality_filter:
            result["default_quality_filter"] = self.default_quality_filter

        if self.patterns:
            result["patterns"] = [p.to_dict() for p in self.patterns]

        if self.exclude_patterns:
            result["exclude_patterns"] = self.exclude_patterns

        if self.default_metadata:
            result["default_metadata"] = self.default_metadata

        return result

    def get_pattern_config(self, filepath: str | Path) -> FilePatternConfig | None:
        """Get the pattern config that matches a file path.

        Returns the first matching pattern config, or None if no pattern matches.
        Patterns are checked in order, so more specific patterns should come first.

        Args:
            filepath: Path to check (relative to knowledge base root)

        Returns:
            Matching FilePatternConfig, or None
        """
        filepath = Path(filepath)

        for pattern_config in self.patterns:
            if pattern_config.enabled and self._matches_pattern(filepath, pattern_config.pattern):
                return pattern_config

        return None

    def is_excluded(self, filepath: str | Path) -> bool:
        """Check if a file path matches any exclude pattern.

        Args:
            filepath: Path to check (relative to knowledge base root)

        Returns:
            True if file should be excluded
        """
        filepath = Path(filepath)

        return any(
            self._matches_pattern(filepath, pattern)
            for pattern in self.exclude_patterns
        )

    def _matches_pattern(self, filepath: Path, pattern: str) -> bool:
        """Check if a filepath matches a glob pattern.

        Handles both fnmatch-style and glob-style patterns including `**`.

        Args:
            filepath: Path to check
            pattern: Glob pattern

        Returns:
            True if path matches pattern
        """
        from fnmatch import fnmatch

        if "**" in pattern:
            # NOTE(review): PurePath.match's treatment of "**" varies across
            # Python versions (fully recursive "**" matching is only
            # guaranteed in newer releases) — confirm behavior on the
            # minimum supported interpreter.
            return filepath.match(pattern)
        # Simple patterns go through fnmatch; note that fnmatch's "*" does
        # not treat "/" specially and can match across path separators.
        return fnmatch(str(filepath), pattern)

    def get_chunking_config(self, filepath: str | Path) -> dict[str, Any]:
        """Get the effective chunking config for a file.

        Merges default chunking with any pattern-specific overrides.

        Args:
            filepath: Path to file

        Returns:
            Merged chunking configuration (a fresh dict; safe to mutate)
        """
        config = self.default_chunking.copy()

        pattern_config = self.get_pattern_config(filepath)
        if pattern_config and pattern_config.chunking:
            config.update(pattern_config.chunking)

        return config

    def get_metadata(self, filepath: str | Path) -> dict[str, Any]:
        """Get the effective metadata for a file.

        Includes default metadata plus source file info.

        Args:
            filepath: Path to file

        Returns:
            Metadata dictionary with "source" and "filename" keys added
        """
        filepath = Path(filepath)
        metadata = self.default_metadata.copy()
        metadata["source"] = str(filepath)
        metadata["filename"] = filepath.name
        return metadata