Coverage for src / dataknobs_xization / content_transformer.py: 9%

216 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 15:46 -0700

1"""Content transformation utilities for converting various formats to markdown. 

2 

3This module provides tools for converting structured data formats (JSON, YAML, CSV) 

4into markdown format suitable for RAG ingestion and chunking. 

5 

6The ContentTransformer supports: 

7- Generic conversion that preserves structure through heading hierarchy 

8- Custom schemas for specialized formatting of known data structures 

9- Nested object and array handling 

10- Configurable heading levels and formatting options 

11 

12Example: 

13 >>> transformer = ContentTransformer() 

14 >>> 

15 >>> # Generic conversion 

16 >>> data = {"name": "Chain of Thought", "description": "Step by step reasoning"} 

17 >>> markdown = transformer.transform_json(data) 

18 >>> 

19 >>> # With custom schema 

20 >>> transformer.register_schema("pattern", { 

21 ... "title_field": "name", 

22 ... "sections": [ 

23 ... {"field": "description", "heading": "Description"}, 

24 ... {"field": "example", "heading": "Example", "format": "code"} 

25 ... ] 

26 ... }) 

27 >>> markdown = transformer.transform_json(data, schema="pattern") 

28""" 

29 

30import csv 

31import io 

32import json 

33import logging 

34from pathlib import Path 

35from typing import Any 

36 

37logger = logging.getLogger(__name__) 

38 

39 

40class ContentTransformer: 

41 """Transform structured content into markdown for RAG ingestion. 

42 

43 This class converts various data formats (JSON, YAML, CSV) into well-structured 

44 markdown that can be parsed by MarkdownParser and chunked by MarkdownChunker. 

45 

46 The transformer creates markdown with appropriate heading hierarchy so that 

47 the chunker can create semantic boundaries around logical content units. 

48 

49 Attributes: 

50 schemas: Dictionary of registered custom schemas 

51 config: Transformer configuration options 

52 """ 

53 

54 def __init__( 

55 self, 

56 base_heading_level: int = 2, 

57 include_field_labels: bool = True, 

58 code_block_fields: list[str] | None = None, 

59 list_fields: list[str] | None = None, 

60 ): 

61 """Initialize the content transformer. 

62 

63 Args: 

64 base_heading_level: Starting heading level for top-level items (default: 2) 

65 include_field_labels: Whether to bold field names in output (default: True) 

66 code_block_fields: Field names that should be rendered as code blocks 

67 list_fields: Field names that should be rendered as bullet lists 

68 """ 

69 self.base_heading_level = base_heading_level 

70 self.include_field_labels = include_field_labels 

71 self.code_block_fields = set(code_block_fields or ["example", "code", "snippet"]) 

72 self.list_fields = set(list_fields or ["items", "steps", "objectives", "symptoms", "solutions"]) 

73 self.schemas: dict[str, dict[str, Any]] = {} 

74 

75 def register_schema(self, name: str, schema: dict[str, Any]) -> None: 

76 """Register a custom schema for specialized content conversion. 

77 

78 Schemas define how to map JSON fields to markdown structure. 

79 

80 Args: 

81 name: Schema identifier 

82 schema: Schema definition with the following structure: 

83 - title_field: Field to use as the main heading (required) 

84 - description_field: Field for intro text (optional) 

85 - sections: List of section definitions, each with: 

86 - field: Source field name 

87 - heading: Section heading text 

88 - format: "text", "code", "list", or "subsections" (default: "text") 

89 - language: Code block language (for format="code") 

90 - metadata_fields: Fields to render as key-value metadata 

91 

92 Example: 

93 >>> transformer.register_schema("pattern", { 

94 ... "title_field": "name", 

95 ... "description_field": "description", 

96 ... "sections": [ 

97 ... {"field": "use_case", "heading": "When to Use"}, 

98 ... {"field": "example", "heading": "Example", "format": "code"} 

99 ... ], 

100 ... "metadata_fields": ["category", "difficulty"] 

101 ... }) 

102 """ 

103 self.schemas[name] = schema 

104 logger.debug(f"Registered schema: {name}") 

105 

106 def transform( 

107 self, 

108 content: Any, 

109 format: str = "json", 

110 schema: str | None = None, 

111 title: str | None = None, 

112 ) -> str: 

113 """Transform content to markdown. 

114 

115 Args: 

116 content: Content to transform (dict, list, string, or file path) 

117 format: Content format - "json", "yaml", or "csv" 

118 schema: Optional schema name for custom conversion 

119 title: Optional document title 

120 

121 Returns: 

122 Markdown formatted string 

123 

124 Raises: 

125 ValueError: If format is not supported 

126 """ 

127 if format == "json": 

128 if isinstance(content, (str, Path)): 

129 with open(content, encoding="utf-8") as f: 

130 data = json.load(f) 

131 else: 

132 data = content 

133 return self.transform_json(data, schema=schema, title=title) 

134 elif format == "yaml": 

135 return self.transform_yaml(content, schema=schema, title=title) 

136 elif format == "csv": 

137 return self.transform_csv(content, title=title) 

138 else: 

139 raise ValueError(f"Unsupported format: {format}. Use 'json', 'yaml', or 'csv'.") 

140 

141 def transform_json( 

142 self, 

143 data: dict[str, Any] | list[Any], 

144 schema: str | None = None, 

145 title: str | None = None, 

146 ) -> str: 

147 """Transform JSON data to markdown. 

148 

149 Args: 

150 data: JSON data (dict or list) 

151 schema: Optional schema name for custom conversion 

152 title: Optional document title 

153 

154 Returns: 

155 Markdown formatted string 

156 """ 

157 lines: list[str] = [] 

158 

159 # Add document title if provided 

160 if title: 

161 lines.extend([f"# {title}", ""]) 

162 

163 # Use custom schema if specified 

164 if schema and schema in self.schemas: 

165 return self._transform_with_schema(data, self.schemas[schema], title) 

166 

167 # Generic transformation 

168 if isinstance(data, list): 

169 for item in data: 

170 if isinstance(item, dict): 

171 lines.extend(self._transform_dict_generic(item, self.base_heading_level)) 

172 lines.extend(["---", ""]) 

173 else: 

174 lines.append(f"- {item}") 

175 lines.append("") 

176 elif isinstance(data, dict): 

177 lines.extend(self._transform_dict_generic(data, self.base_heading_level)) 

178 else: 

179 lines.append(str(data)) 

180 

181 return "\n".join(lines) 

182 

183 def transform_yaml( 

184 self, 

185 content: str | Path, 

186 schema: str | None = None, 

187 title: str | None = None, 

188 ) -> str: 

189 """Transform YAML content to markdown. 

190 

191 Args: 

192 content: YAML string or file path 

193 schema: Optional schema name for custom conversion 

194 title: Optional document title 

195 

196 Returns: 

197 Markdown formatted string 

198 

199 Raises: 

200 ImportError: If PyYAML is not installed 

201 """ 

202 try: 

203 import yaml 

204 except ImportError: 

205 raise ImportError("PyYAML is required for YAML transformation. Install with: pip install pyyaml") from None 

206 

207 if isinstance(content, (str, Path)) and Path(content).exists(): 

208 with open(content, encoding="utf-8") as f: 

209 data = yaml.safe_load(f) 

210 else: 

211 data = yaml.safe_load(content) 

212 

213 return self.transform_json(data, schema=schema, title=title) 

214 

215 def transform_csv( 

216 self, 

217 content: str | Path, 

218 title: str | None = None, 

219 title_field: str | None = None, 

220 ) -> str: 

221 """Transform CSV content to markdown. 

222 

223 Each row becomes a section with the first column (or title_field) as heading. 

224 

225 Args: 

226 content: CSV string or file path 

227 title: Optional document title 

228 title_field: Column to use as section title (default: first column) 

229 

230 Returns: 

231 Markdown formatted string 

232 """ 

233 lines: list[str] = [] 

234 

235 if title: 

236 lines.extend([f"# {title}", ""]) 

237 

238 # Read CSV 

239 if isinstance(content, Path) or (isinstance(content, str) and Path(content).exists()): 

240 with open(content, encoding="utf-8") as f: 

241 reader = csv.DictReader(f) 

242 rows = list(reader) 

243 else: 

244 reader = csv.DictReader(io.StringIO(content)) 

245 rows = list(reader) 

246 

247 if not rows: 

248 return "\n".join(lines) 

249 

250 # Determine title field 

251 fieldnames = list(rows[0].keys()) 

252 if title_field and title_field in fieldnames: 

253 title_col = title_field 

254 else: 

255 title_col = fieldnames[0] 

256 

257 # Transform each row 

258 for row in rows: 

259 row_title = row.get(title_col, "Untitled") 

260 lines.append(f"{'#' * self.base_heading_level} {row_title}") 

261 lines.append("") 

262 

263 for field, value in row.items(): 

264 if field == title_col or not value: 

265 continue 

266 

267 if self.include_field_labels: 

268 lines.append(f"**{self._format_field_name(field)}**: {value}") 

269 else: 

270 lines.append(value) 

271 lines.append("") 

272 

273 lines.extend(["---", ""]) 

274 

275 return "\n".join(lines) 

276 

277 def _transform_with_schema( 

278 self, 

279 data: dict[str, Any] | list[Any], 

280 schema: dict[str, Any], 

281 title: str | None = None, 

282 ) -> str: 

283 """Transform data using a custom schema. 

284 

285 Args: 

286 data: Data to transform (list or dict) 

287 - List format: [{"name": "Item", ...}, ...] 

288 - Dict format: {"Item": {...}, ...} (keys become title_field values) 

289 schema: Schema definition 

290 title: Optional document title 

291 

292 Returns: 

293 Markdown formatted string 

294 """ 

295 lines: list[str] = [] 

296 

297 if title: 

298 lines.extend([f"# {title}", ""]) 

299 

300 # Normalize dict-keyed format to list format 

301 # Dict format: {"Item Name": {"field": "value"}} -> [{"name": "Item Name", "field": "value"}] 

302 if isinstance(data, dict): 

303 # Check if this looks like a keyed dict (values are dicts) 

304 # vs a single item dict (values are primitive) 

305 if all(isinstance(v, dict) for v in data.values()): 

306 title_field = schema.get("title_field", "name") 

307 data = [ 

308 {title_field: key, **value} 

309 for key, value in data.items() 

310 ] 

311 logger.debug(f"Normalized dict-keyed data to list format with {len(data)} items") 

312 

313 items = data if isinstance(data, list) else [data] 

314 

315 for item in items: 

316 if not isinstance(item, dict): 

317 continue 

318 

319 # Title 

320 title_field = schema.get("title_field", "name") 

321 item_title = item.get(title_field, "Untitled") 

322 lines.append(f"{'#' * self.base_heading_level} {item_title}") 

323 lines.append("") 

324 

325 # Metadata fields (rendered as bold key-value pairs) 

326 metadata_fields = schema.get("metadata_fields", []) 

327 for field in metadata_fields: 

328 if item.get(field): 

329 formatted_name = self._format_field_name(field) 

330 lines.append(f"**{formatted_name}**: {item[field]}") 

331 if metadata_fields: 

332 lines.append("") 

333 

334 # Description field (intro text without heading) 

335 desc_field = schema.get("description_field") 

336 if desc_field and desc_field in item and item[desc_field]: 

337 lines.extend([item[desc_field], ""]) 

338 

339 # Sections 

340 for section in schema.get("sections", []): 

341 field = section.get("field") 

342 if field not in item or not item[field]: 

343 continue 

344 

345 heading = section.get("heading", self._format_field_name(field)) 

346 format_type = section.get("format", "text") 

347 

348 lines.append(f"{'#' * (self.base_heading_level + 1)} {heading}") 

349 lines.append("") 

350 

351 value = item[field] 

352 

353 if format_type == "code": 

354 language = section.get("language", "") 

355 lines.append(f"```{language}") 

356 lines.append(str(value)) 

357 lines.append("```") 

358 elif format_type == "list": 

359 if isinstance(value, list): 

360 for v in value: 

361 lines.append(f"- {v}") 

362 else: 

363 lines.append(f"- {value}") 

364 elif format_type == "subsections": 

365 # For nested objects 

366 if isinstance(value, dict): 

367 for k, v in value.items(): 

368 lines.append(f"**{self._format_field_name(k)}**: {v}") 

369 elif isinstance(value, list): 

370 for v in value: 

371 if isinstance(v, dict): 

372 name = v.get("name", v.get("title", "Item")) 

373 desc = v.get("description", "") 

374 lines.append(f"- **{name}**: {desc}") 

375 else: 

376 lines.append(f"- {v}") 

377 else: # text 

378 lines.append(str(value)) 

379 

380 lines.append("") 

381 

382 lines.extend(["---", ""]) 

383 

384 return "\n".join(lines) 

385 

386 def _transform_dict_generic( 

387 self, 

388 data: dict[str, Any], 

389 heading_level: int, 

390 ) -> list[str]: 

391 """Transform a dictionary to markdown using generic rules. 

392 

393 Args: 

394 data: Dictionary to transform 

395 heading_level: Current heading level 

396 

397 Returns: 

398 List of markdown lines 

399 """ 

400 lines: list[str] = [] 

401 

402 # Try to find a title field 

403 title = None 

404 title_candidates = ["name", "title", "id", "key"] 

405 for candidate in title_candidates: 

406 if candidate in data and isinstance(data[candidate], str): 

407 title = data[candidate] 

408 break 

409 

410 if title: 

411 lines.append(f"{'#' * heading_level} {title}") 

412 lines.append("") 

413 

414 # Process fields 

415 for key, value in data.items(): 

416 # Skip title field if we already used it 

417 if key in title_candidates and key == title: 

418 continue 

419 

420 if value is None or value == "": 

421 continue 

422 

423 formatted_key = self._format_field_name(key) 

424 

425 # Handle different value types 

426 if isinstance(value, dict): 

427 # Nested object becomes a subsection 

428 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}") 

429 lines.append("") 

430 lines.extend(self._transform_dict_generic(value, heading_level + 2)) 

431 

432 elif isinstance(value, list): 

433 if key in self.list_fields or all(isinstance(v, str) for v in value): 

434 # Render as bullet list 

435 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}") 

436 lines.append("") 

437 for item in value: 

438 if isinstance(item, dict): 

439 # Complex list item 

440 name = item.get("name", item.get("title", str(item))) 

441 desc = item.get("description", "") 

442 if desc: 

443 lines.append(f"- **{name}**: {desc}") 

444 else: 

445 lines.append(f"- {name}") 

446 else: 

447 lines.append(f"- {item}") 

448 lines.append("") 

449 else: 

450 # List of complex objects 

451 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}") 

452 lines.append("") 

453 for item in value: 

454 if isinstance(item, dict): 

455 lines.extend(self._transform_dict_generic(item, heading_level + 2)) 

456 else: 

457 lines.append(f"- {item}") 

458 lines.append("") 

459 

460 elif key in self.code_block_fields: 

461 # Render as code block 

462 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}") 

463 lines.append("") 

464 lines.append("```") 

465 lines.append(str(value)) 

466 lines.append("```") 

467 lines.append("") 

468 

469 else: 

470 # Simple value 

471 if self.include_field_labels: 

472 lines.append(f"**{formatted_key}**: {value}") 

473 else: 

474 lines.append(str(value)) 

475 lines.append("") 

476 

477 return lines 

478 

479 def _format_field_name(self, field: str) -> str: 

480 """Format a field name for display. 

481 

482 Converts snake_case and camelCase to Title Case. 

483 

484 Args: 

485 field: Field name to format 

486 

487 Returns: 

488 Formatted field name 

489 """ 

490 # Handle snake_case 

491 words = field.replace("_", " ").replace("-", " ") 

492 

493 # Handle camelCase 

494 result = [] 

495 for i, char in enumerate(words): 

496 if char.isupper() and i > 0 and words[i-1].islower(): 

497 result.append(" ") 

498 result.append(char) 

499 

500 return "".join(result).title() 

501 

502 

503# Convenience function for quick transformations 

def json_to_markdown(
    data: dict[str, Any] | list[Any],
    title: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render JSON-style data as markdown in a single call.

    Builds a throwaway ContentTransformer with the requested heading level
    and returns the result of its generic JSON conversion.

    Args:
        data: Parsed JSON data (dict or list) to render
        title: Optional document title
        base_heading_level: Heading level for top-level items (default: 2)

    Returns:
        Markdown formatted string

    Example:
        >>> patterns = [
        ...     {"name": "Chain of Thought", "description": "Step by step"},
        ...     {"name": "Few-Shot", "description": "Learning from examples"}
        ... ]
        >>> markdown = json_to_markdown(patterns, title="Prompt Patterns")
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_json(
        data, title=title
    )

531 

532 

def yaml_to_markdown(
    content: str | Path,
    title: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render YAML content as markdown in a single call.

    Builds a throwaway ContentTransformer with the requested heading level
    and returns the result of its YAML conversion.

    Args:
        content: YAML string or file path
        title: Optional document title
        base_heading_level: Heading level for top-level items (default: 2)

    Returns:
        Markdown formatted string
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_yaml(
        content, title=title
    )

550 

551 

def csv_to_markdown(
    content: str | Path,
    title: str | None = None,
    title_field: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render CSV content as markdown in a single call.

    Builds a throwaway ContentTransformer with the requested heading level
    and returns the result of its CSV conversion.

    Args:
        content: CSV string or file path
        title: Optional document title
        title_field: Column whose value heads each row's section
        base_heading_level: Heading level for top-level items (default: 2)

    Returns:
        Markdown formatted string
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_csv(
        content, title=title, title_field=title_field
    )