Coverage for src / dataknobs_xization / content_transformer.py: 9%
216 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-26 15:46 -0700
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-26 15:46 -0700
1"""Content transformation utilities for converting various formats to markdown.
3This module provides tools for converting structured data formats (JSON, YAML, CSV)
4into markdown format suitable for RAG ingestion and chunking.
6The ContentTransformer supports:
7- Generic conversion that preserves structure through heading hierarchy
8- Custom schemas for specialized formatting of known data structures
9- Nested object and array handling
10- Configurable heading levels and formatting options
12Example:
13 >>> transformer = ContentTransformer()
14 >>>
15 >>> # Generic conversion
16 >>> data = {"name": "Chain of Thought", "description": "Step by step reasoning"}
17 >>> markdown = transformer.transform_json(data)
18 >>>
19 >>> # With custom schema
20 >>> transformer.register_schema("pattern", {
21 ... "title_field": "name",
22 ... "sections": [
23 ... {"field": "description", "heading": "Description"},
24 ... {"field": "example", "heading": "Example", "format": "code"}
25 ... ]
26 ... })
27 >>> markdown = transformer.transform_json(data, schema="pattern")
28"""
30import csv
31import io
32import json
33import logging
34from pathlib import Path
35from typing import Any
37logger = logging.getLogger(__name__)
40class ContentTransformer:
41 """Transform structured content into markdown for RAG ingestion.
43 This class converts various data formats (JSON, YAML, CSV) into well-structured
44 markdown that can be parsed by MarkdownParser and chunked by MarkdownChunker.
46 The transformer creates markdown with appropriate heading hierarchy so that
47 the chunker can create semantic boundaries around logical content units.
49 Attributes:
50 schemas: Dictionary of registered custom schemas
51 config: Transformer configuration options
52 """
54 def __init__(
55 self,
56 base_heading_level: int = 2,
57 include_field_labels: bool = True,
58 code_block_fields: list[str] | None = None,
59 list_fields: list[str] | None = None,
60 ):
61 """Initialize the content transformer.
63 Args:
64 base_heading_level: Starting heading level for top-level items (default: 2)
65 include_field_labels: Whether to bold field names in output (default: True)
66 code_block_fields: Field names that should be rendered as code blocks
67 list_fields: Field names that should be rendered as bullet lists
68 """
69 self.base_heading_level = base_heading_level
70 self.include_field_labels = include_field_labels
71 self.code_block_fields = set(code_block_fields or ["example", "code", "snippet"])
72 self.list_fields = set(list_fields or ["items", "steps", "objectives", "symptoms", "solutions"])
73 self.schemas: dict[str, dict[str, Any]] = {}
75 def register_schema(self, name: str, schema: dict[str, Any]) -> None:
76 """Register a custom schema for specialized content conversion.
78 Schemas define how to map JSON fields to markdown structure.
80 Args:
81 name: Schema identifier
82 schema: Schema definition with the following structure:
83 - title_field: Field to use as the main heading (required)
84 - description_field: Field for intro text (optional)
85 - sections: List of section definitions, each with:
86 - field: Source field name
87 - heading: Section heading text
88 - format: "text", "code", "list", or "subsections" (default: "text")
89 - language: Code block language (for format="code")
90 - metadata_fields: Fields to render as key-value metadata
92 Example:
93 >>> transformer.register_schema("pattern", {
94 ... "title_field": "name",
95 ... "description_field": "description",
96 ... "sections": [
97 ... {"field": "use_case", "heading": "When to Use"},
98 ... {"field": "example", "heading": "Example", "format": "code"}
99 ... ],
100 ... "metadata_fields": ["category", "difficulty"]
101 ... })
102 """
103 self.schemas[name] = schema
104 logger.debug(f"Registered schema: {name}")
106 def transform(
107 self,
108 content: Any,
109 format: str = "json",
110 schema: str | None = None,
111 title: str | None = None,
112 ) -> str:
113 """Transform content to markdown.
115 Args:
116 content: Content to transform (dict, list, string, or file path)
117 format: Content format - "json", "yaml", or "csv"
118 schema: Optional schema name for custom conversion
119 title: Optional document title
121 Returns:
122 Markdown formatted string
124 Raises:
125 ValueError: If format is not supported
126 """
127 if format == "json":
128 if isinstance(content, (str, Path)):
129 with open(content, encoding="utf-8") as f:
130 data = json.load(f)
131 else:
132 data = content
133 return self.transform_json(data, schema=schema, title=title)
134 elif format == "yaml":
135 return self.transform_yaml(content, schema=schema, title=title)
136 elif format == "csv":
137 return self.transform_csv(content, title=title)
138 else:
139 raise ValueError(f"Unsupported format: {format}. Use 'json', 'yaml', or 'csv'.")
141 def transform_json(
142 self,
143 data: dict[str, Any] | list[Any],
144 schema: str | None = None,
145 title: str | None = None,
146 ) -> str:
147 """Transform JSON data to markdown.
149 Args:
150 data: JSON data (dict or list)
151 schema: Optional schema name for custom conversion
152 title: Optional document title
154 Returns:
155 Markdown formatted string
156 """
157 lines: list[str] = []
159 # Add document title if provided
160 if title:
161 lines.extend([f"# {title}", ""])
163 # Use custom schema if specified
164 if schema and schema in self.schemas:
165 return self._transform_with_schema(data, self.schemas[schema], title)
167 # Generic transformation
168 if isinstance(data, list):
169 for item in data:
170 if isinstance(item, dict):
171 lines.extend(self._transform_dict_generic(item, self.base_heading_level))
172 lines.extend(["---", ""])
173 else:
174 lines.append(f"- {item}")
175 lines.append("")
176 elif isinstance(data, dict):
177 lines.extend(self._transform_dict_generic(data, self.base_heading_level))
178 else:
179 lines.append(str(data))
181 return "\n".join(lines)
183 def transform_yaml(
184 self,
185 content: str | Path,
186 schema: str | None = None,
187 title: str | None = None,
188 ) -> str:
189 """Transform YAML content to markdown.
191 Args:
192 content: YAML string or file path
193 schema: Optional schema name for custom conversion
194 title: Optional document title
196 Returns:
197 Markdown formatted string
199 Raises:
200 ImportError: If PyYAML is not installed
201 """
202 try:
203 import yaml
204 except ImportError:
205 raise ImportError("PyYAML is required for YAML transformation. Install with: pip install pyyaml") from None
207 if isinstance(content, (str, Path)) and Path(content).exists():
208 with open(content, encoding="utf-8") as f:
209 data = yaml.safe_load(f)
210 else:
211 data = yaml.safe_load(content)
213 return self.transform_json(data, schema=schema, title=title)
215 def transform_csv(
216 self,
217 content: str | Path,
218 title: str | None = None,
219 title_field: str | None = None,
220 ) -> str:
221 """Transform CSV content to markdown.
223 Each row becomes a section with the first column (or title_field) as heading.
225 Args:
226 content: CSV string or file path
227 title: Optional document title
228 title_field: Column to use as section title (default: first column)
230 Returns:
231 Markdown formatted string
232 """
233 lines: list[str] = []
235 if title:
236 lines.extend([f"# {title}", ""])
238 # Read CSV
239 if isinstance(content, Path) or (isinstance(content, str) and Path(content).exists()):
240 with open(content, encoding="utf-8") as f:
241 reader = csv.DictReader(f)
242 rows = list(reader)
243 else:
244 reader = csv.DictReader(io.StringIO(content))
245 rows = list(reader)
247 if not rows:
248 return "\n".join(lines)
250 # Determine title field
251 fieldnames = list(rows[0].keys())
252 if title_field and title_field in fieldnames:
253 title_col = title_field
254 else:
255 title_col = fieldnames[0]
257 # Transform each row
258 for row in rows:
259 row_title = row.get(title_col, "Untitled")
260 lines.append(f"{'#' * self.base_heading_level} {row_title}")
261 lines.append("")
263 for field, value in row.items():
264 if field == title_col or not value:
265 continue
267 if self.include_field_labels:
268 lines.append(f"**{self._format_field_name(field)}**: {value}")
269 else:
270 lines.append(value)
271 lines.append("")
273 lines.extend(["---", ""])
275 return "\n".join(lines)
277 def _transform_with_schema(
278 self,
279 data: dict[str, Any] | list[Any],
280 schema: dict[str, Any],
281 title: str | None = None,
282 ) -> str:
283 """Transform data using a custom schema.
285 Args:
286 data: Data to transform (list or dict)
287 - List format: [{"name": "Item", ...}, ...]
288 - Dict format: {"Item": {...}, ...} (keys become title_field values)
289 schema: Schema definition
290 title: Optional document title
292 Returns:
293 Markdown formatted string
294 """
295 lines: list[str] = []
297 if title:
298 lines.extend([f"# {title}", ""])
300 # Normalize dict-keyed format to list format
301 # Dict format: {"Item Name": {"field": "value"}} -> [{"name": "Item Name", "field": "value"}]
302 if isinstance(data, dict):
303 # Check if this looks like a keyed dict (values are dicts)
304 # vs a single item dict (values are primitive)
305 if all(isinstance(v, dict) for v in data.values()):
306 title_field = schema.get("title_field", "name")
307 data = [
308 {title_field: key, **value}
309 for key, value in data.items()
310 ]
311 logger.debug(f"Normalized dict-keyed data to list format with {len(data)} items")
313 items = data if isinstance(data, list) else [data]
315 for item in items:
316 if not isinstance(item, dict):
317 continue
319 # Title
320 title_field = schema.get("title_field", "name")
321 item_title = item.get(title_field, "Untitled")
322 lines.append(f"{'#' * self.base_heading_level} {item_title}")
323 lines.append("")
325 # Metadata fields (rendered as bold key-value pairs)
326 metadata_fields = schema.get("metadata_fields", [])
327 for field in metadata_fields:
328 if item.get(field):
329 formatted_name = self._format_field_name(field)
330 lines.append(f"**{formatted_name}**: {item[field]}")
331 if metadata_fields:
332 lines.append("")
334 # Description field (intro text without heading)
335 desc_field = schema.get("description_field")
336 if desc_field and desc_field in item and item[desc_field]:
337 lines.extend([item[desc_field], ""])
339 # Sections
340 for section in schema.get("sections", []):
341 field = section.get("field")
342 if field not in item or not item[field]:
343 continue
345 heading = section.get("heading", self._format_field_name(field))
346 format_type = section.get("format", "text")
348 lines.append(f"{'#' * (self.base_heading_level + 1)} {heading}")
349 lines.append("")
351 value = item[field]
353 if format_type == "code":
354 language = section.get("language", "")
355 lines.append(f"```{language}")
356 lines.append(str(value))
357 lines.append("```")
358 elif format_type == "list":
359 if isinstance(value, list):
360 for v in value:
361 lines.append(f"- {v}")
362 else:
363 lines.append(f"- {value}")
364 elif format_type == "subsections":
365 # For nested objects
366 if isinstance(value, dict):
367 for k, v in value.items():
368 lines.append(f"**{self._format_field_name(k)}**: {v}")
369 elif isinstance(value, list):
370 for v in value:
371 if isinstance(v, dict):
372 name = v.get("name", v.get("title", "Item"))
373 desc = v.get("description", "")
374 lines.append(f"- **{name}**: {desc}")
375 else:
376 lines.append(f"- {v}")
377 else: # text
378 lines.append(str(value))
380 lines.append("")
382 lines.extend(["---", ""])
384 return "\n".join(lines)
386 def _transform_dict_generic(
387 self,
388 data: dict[str, Any],
389 heading_level: int,
390 ) -> list[str]:
391 """Transform a dictionary to markdown using generic rules.
393 Args:
394 data: Dictionary to transform
395 heading_level: Current heading level
397 Returns:
398 List of markdown lines
399 """
400 lines: list[str] = []
402 # Try to find a title field
403 title = None
404 title_candidates = ["name", "title", "id", "key"]
405 for candidate in title_candidates:
406 if candidate in data and isinstance(data[candidate], str):
407 title = data[candidate]
408 break
410 if title:
411 lines.append(f"{'#' * heading_level} {title}")
412 lines.append("")
414 # Process fields
415 for key, value in data.items():
416 # Skip title field if we already used it
417 if key in title_candidates and key == title:
418 continue
420 if value is None or value == "":
421 continue
423 formatted_key = self._format_field_name(key)
425 # Handle different value types
426 if isinstance(value, dict):
427 # Nested object becomes a subsection
428 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
429 lines.append("")
430 lines.extend(self._transform_dict_generic(value, heading_level + 2))
432 elif isinstance(value, list):
433 if key in self.list_fields or all(isinstance(v, str) for v in value):
434 # Render as bullet list
435 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
436 lines.append("")
437 for item in value:
438 if isinstance(item, dict):
439 # Complex list item
440 name = item.get("name", item.get("title", str(item)))
441 desc = item.get("description", "")
442 if desc:
443 lines.append(f"- **{name}**: {desc}")
444 else:
445 lines.append(f"- {name}")
446 else:
447 lines.append(f"- {item}")
448 lines.append("")
449 else:
450 # List of complex objects
451 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
452 lines.append("")
453 for item in value:
454 if isinstance(item, dict):
455 lines.extend(self._transform_dict_generic(item, heading_level + 2))
456 else:
457 lines.append(f"- {item}")
458 lines.append("")
460 elif key in self.code_block_fields:
461 # Render as code block
462 lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
463 lines.append("")
464 lines.append("```")
465 lines.append(str(value))
466 lines.append("```")
467 lines.append("")
469 else:
470 # Simple value
471 if self.include_field_labels:
472 lines.append(f"**{formatted_key}**: {value}")
473 else:
474 lines.append(str(value))
475 lines.append("")
477 return lines
479 def _format_field_name(self, field: str) -> str:
480 """Format a field name for display.
482 Converts snake_case and camelCase to Title Case.
484 Args:
485 field: Field name to format
487 Returns:
488 Formatted field name
489 """
490 # Handle snake_case
491 words = field.replace("_", " ").replace("-", " ")
493 # Handle camelCase
494 result = []
495 for i, char in enumerate(words):
496 if char.isupper() and i > 0 and words[i-1].islower():
497 result.append(" ")
498 result.append(char)
500 return "".join(result).title()
503# Convenience function for quick transformations
def json_to_markdown(
    data: dict[str, Any] | list[Any],
    title: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render JSON data as markdown in a single call.

    Shorthand for building a ContentTransformer with the requested heading
    level and running its generic JSON conversion (no custom schema).

    Args:
        data: JSON data to transform
        title: Optional document title
        base_heading_level: Starting heading level (default: 2)

    Returns:
        Markdown formatted string

    Example:
        >>> patterns = [
        ...     {"name": "Chain of Thought", "description": "Step by step"},
        ...     {"name": "Few-Shot", "description": "Learning from examples"}
        ... ]
        >>> markdown = json_to_markdown(patterns, title="Prompt Patterns")
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_json(
        data,
        title=title,
    )
def yaml_to_markdown(
    content: str | Path,
    title: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render YAML content as markdown in a single call.

    Shorthand for building a ContentTransformer with the requested heading
    level and running its YAML conversion (no custom schema).

    Args:
        content: YAML string or file path
        title: Optional document title
        base_heading_level: Starting heading level (default: 2)

    Returns:
        Markdown formatted string
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_yaml(
        content,
        title=title,
    )
def csv_to_markdown(
    content: str | Path,
    title: str | None = None,
    title_field: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render CSV content as markdown in a single call.

    Shorthand for building a ContentTransformer with the requested heading
    level and running its CSV conversion.

    Args:
        content: CSV string or file path
        title: Optional document title
        title_field: Column to use as section title
        base_heading_level: Starting heading level (default: 2)

    Returns:
        Markdown formatted string
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_csv(
        content,
        title=title,
        title_field=title_field,
    )