Coverage for src/prosemark/adapters/markdown_binder_parser.py: 100%
187 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-09-28 19:17 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-09-28 19:17 +0000
1# Copyright (c) 2024 Prosemark Contributors
2# This software is licensed under the MIT License
4"""Markdown binder parser for converting between binder structures and markdown text."""
6import re
7from typing import NoReturn
9from prosemark.domain.models import Binder, BinderItem, NodeId
10from prosemark.domain.parser_result import ParserResult, ParsingMetadata
11from prosemark.domain.position_anchor import PositionAnchor
12from prosemark.domain.preserved_text import PreservedText
13from prosemark.domain.structural_element import StructuralElement
14from prosemark.exceptions import BinderFormatError
15from prosemark.ports.enhanced_binder_parser import EnhancedBinderParserPort
class MarkdownBinderParser(EnhancedBinderParserPort):
    """Convert between Binder objects and a markdown unordered-list outline.

    Two parsing modes are offered:

    - ``parse_to_binder`` is strict: it validates bracket patterns and raises
      ``BinderFormatError`` when non-empty content holds no valid list item.
    - ``parse_with_preservation`` is lenient: every line that is not a
      structural list item is kept as ``PreservedText`` next to the binder.

    Supported markdown format:
    ```
    - [Title](file.md)
     - [Nested Item](nested.md)
    - [Another Root](another.md)
    ```

    Parsing rules visible in this class:
    - Hierarchy comes from indentation: an item becomes a child of the nearest
      preceding item that has strictly fewer leading whitespace characters.
    - A NodeId is taken from a link that ends in a lowercase hex
      8-4-4-4-12 identifier, optionally followed by ``.md``.
    - Items with an empty or missing link are placeholders (``node_id=None``).
    """

    # Pattern to match markdown list items with optional links.
    # Handles brackets in titles and empty links.
    # The leading group only accepts horizontal whitespace: with re.MULTILINE a
    # plain ``\s*`` also swallows preceding blank lines when the pattern is run
    # over a whole document, which inflated the captured indent and re-parented
    # items that merely followed a blank line.
    LIST_ITEM_PATTERN = re.compile(r'^([^\S\r\n]*)- \[(.*?)\](?:\(([^)]*)\))?(?:\s*)$', re.MULTILINE)

    # Pattern to extract NodeId from markdown links (assuming {id}.md format, possibly with path)
    NODE_ID_PATTERN = re.compile(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(?:\.md)?$')

    def parse_to_binder(self, markdown_content: str) -> Binder:
        """Parse markdown content into a Binder object.

        Args:
            markdown_content: Markdown text with unordered list structure

        Returns:
            Binder with the parsed hierarchy; an empty Binder when the content
            is empty or whitespace only

        Raises:
            BinderFormatError: If markdown format is invalid or malformed

        """
        try:
            # Reject malformed bracket patterns before attempting to match.
            MarkdownBinderParser._validate_markdown_format(markdown_content)

            # Each match is an (indent, title, link) tuple.
            matches = self.LIST_ITEM_PATTERN.findall(markdown_content)
            if not matches:
                # Raises when there is non-empty content without list items.
                MarkdownBinderParser._handle_no_matches(markdown_content)
                return Binder(roots=[])

            return self._build_binder_tree(matches)

        except BinderFormatError:
            raise
        except Exception as exc:  # noqa: BLE001
            MarkdownBinderParser._raise_parse_error(exc)

    def render_from_binder(self, binder: Binder) -> str:
        """Render Binder object as markdown list content.

        Args:
            binder: Binder object to render

        Returns:
            Markdown text with one list item per line, joined by newlines

        """
        lines: list[str] = []
        for root in binder.roots:
            self._render_item(root, 0, lines)
        return '\n'.join(lines)

    def parse_with_preservation(self, markdown_content: str) -> ParserResult:
        """Parse markdown content preserving all non-structural text.

        Lines are classified one by one (line numbers start at 1):
        blank lines and list items whose link carries no NodeId are preserved
        with the BETWEEN_ELEMENTS anchor, valid list items become structural
        elements, and every other line is preserved with an anchor computed
        from the structural elements seen so far.

        Args:
            markdown_content: Raw markdown text with mixed structural and narrative content

        Returns:
            ParserResult containing binder structure, preserved text and metadata

        Raises:
            BinderFormatError: If structural parsing fails

        """
        try:
            lines = markdown_content.split('\n')
            total_lines = len(lines)
            preserved_text: list[PreservedText] = []
            structural_elements: list[StructuralElement] = []

            # Counters reported through ParsingMetadata.
            malformed_count = 0
            uuid_failures = 0
            structural_count = 0

            for line_num, line in enumerate(lines, 1):
                stripped_line = line.strip()

                if not stripped_line:
                    # Preserve empty lines
                    preserved_text.append(
                        PreservedText(
                            content=line, line_number=line_num, position_anchor=PositionAnchor.BETWEEN_ELEMENTS
                        )
                    )
                    continue

                match = self.LIST_ITEM_PATTERN.match(line)
                if match:
                    indent_str, title, link = match.groups()

                    node_id = None
                    if link:  # pragma: no branch
                        node_id = self._extract_node_id(link)
                        if node_id is None:
                            uuid_failures += 1
                            # Links without a recognisable id are treated as extraneous text
                            preserved_text.append(
                                PreservedText(
                                    content=line, line_number=line_num, position_anchor=PositionAnchor.BETWEEN_ELEMENTS
                                )
                            )
                            continue

                    # Valid structural element (placeholder when node_id is None)
                    structural_elements.append(
                        StructuralElement(
                            indent_level=len(indent_str), title=title.strip(), node_id=node_id, line_number=line_num
                        )
                    )
                    structural_count += 1
                    continue

                # Looks like a list item but did not match the pattern.
                if '- [' in stripped_line or stripped_line.startswith('- '):
                    malformed_count += 1

                # All non-structural text is preserved
                preserved_text.append(
                    PreservedText(
                        content=line,
                        line_number=line_num,
                        position_anchor=MarkdownBinderParser._determine_position_anchor(
                            line_num, total_lines, structural_elements
                        ),
                    )
                )

            binder = MarkdownBinderParser._build_binder_from_elements(structural_elements)

            metadata = ParsingMetadata(
                malformed_elements_count=malformed_count,
                uuid_validation_failures=uuid_failures,
                original_line_count=total_lines,
                structural_line_count=structural_count,
            )

            return ParserResult(binder=binder, preserved_text=preserved_text, parsing_metadata=metadata)

        except Exception as exc:
            raise BinderFormatError('Failed to parse markdown with text preservation') from exc

    @staticmethod
    def _validate_markdown_format(markdown_content: str) -> None:
        """Run the bracket checks on every non-empty line.

        Raises:
            BinderFormatError: If a line contains a malformed bracket pattern

        """
        for line in markdown_content.strip().split('\n'):
            stripped_line = line.strip()
            if stripped_line:  # Skip empty lines
                MarkdownBinderParser._check_bracket_patterns(stripped_line)

    @staticmethod
    def _check_bracket_patterns(line: str) -> None:
        """Check a single line for malformed bracket patterns.

        Only lines containing ``- [`` are inspected.

        Raises:
            BinderFormatError: If '[' and ']' counts differ, or the line neither
                ends with ']' nor contains ')'

        """
        if '- [' not in line:
            return
        if line.count('[') != line.count(']'):
            MarkdownBinderParser._raise_malformed_error('unmatched brackets')
        if not line.endswith(']') and ')' not in line:
            MarkdownBinderParser._raise_malformed_error('unclosed bracket')

    @staticmethod
    def _handle_no_matches(markdown_content: str) -> None:
        """Handle the case where no list items were matched.

        Returns normally only when the content has no non-empty line.

        Raises:
            BinderFormatError: If any line contains a list marker ('- ' or '* '),
                or if there is any other non-empty content

        """
        stripped_lines = [line.strip() for line in markdown_content.strip().split('\n')]
        if any('- ' in text or '* ' in text for text in stripped_lines):
            MarkdownBinderParser._raise_malformed_error('invalid list item format')
        # Non-empty content without any valid list item is treated as malformed.
        if any(stripped_lines):
            MarkdownBinderParser._raise_malformed_error('content found but no valid list items')

    def _build_binder_tree(self, matches: list[tuple[str, str, str]]) -> Binder:
        """Build the binder tree structure from matched list items.

        Args:
            matches: (indent, title, link) tuples as returned by
                ``LIST_ITEM_PATTERN.findall``; link is empty for placeholders

        Returns:
            Constructed Binder with hierarchical structure

        """
        entries: list[tuple[int, BinderItem]] = []
        for indent_str, title, link in matches:
            node_id = self._extract_node_id(link) if link else None
            item = BinderItem(display_title=title.strip(), node_id=node_id, children=[])
            # Indent level is the number of leading whitespace characters.
            entries.append((len(indent_str), item))
        return Binder(roots=MarkdownBinderParser._assemble_tree(entries))

    @staticmethod
    def _assemble_tree(entries: list[tuple[int, BinderItem]]) -> list[BinderItem]:
        """Link (indent_level, item) pairs, given in document order, into a tree.

        Returns:
            The root items; children are appended to their parent's ``children``

        """
        root_items: list[BinderItem] = []
        item_stack: list[tuple[int, BinderItem]] = []  # (indent_level, item)

        for indent_level, item in entries:
            parent = MarkdownBinderParser._find_parent(item_stack, indent_level)
            if parent is None:
                root_items.append(item)
            else:
                parent.children.append(item)

            # The stack is strictly increasing in indent level, so dropping
            # same-or-deeper entries only ever removes from the end.
            while item_stack and item_stack[-1][0] >= indent_level:
                item_stack.pop()
            item_stack.append((indent_level, item))

        return root_items

    @staticmethod
    def _raise_malformed_error(issue: str) -> NoReturn:
        """Raise a BinderFormatError with malformed markdown message.

        Raises:
            BinderFormatError: Always raised with issue-specific message

        """
        msg = f'Malformed markdown: {issue}'
        raise BinderFormatError(msg)

    @staticmethod
    def _raise_parse_error(exc: Exception) -> NoReturn:
        """Raise a BinderFormatError for parse failures.

        Raises:
            BinderFormatError: Always raised, chained to ``exc``

        """
        msg = 'Failed to parse markdown binder content'
        raise BinderFormatError(msg) from exc

    @staticmethod
    def _format_item(item: BinderItem, depth: int) -> str:
        """Return the markdown list line for one item at the given depth."""
        # NOTE(review): indent unit reproduced exactly as seen in the reviewed
        # text; confirm it against the repository before relying on it.
        indent = ' ' * depth
        if item.node_id:
            return f'{indent}- [{item.display_title}]({item.node_id}.md)'
        # Placeholder item: rendered with an empty link.
        return f'{indent}- [{item.display_title}]()'

    def _render_item(self, item: BinderItem, depth: int, lines: list[str]) -> None:
        """Append the line for an item, then its children depth-first, to lines."""
        lines.append(MarkdownBinderParser._format_item(item, depth))
        for child in item.children:
            self._render_item(child, depth + 1, lines)

    def _extract_node_id(self, link: str) -> NodeId | None:
        """Extract NodeId from markdown link if valid UUID format.

        Returns:
            NodeId when the link ends in an identifier matched by
            ``NODE_ID_PATTERN``; None for an empty link, no match, or when
            NodeId rejects the value with ValueError

        """
        if not link:
            return None

        match = self.NODE_ID_PATTERN.search(link)
        if match is None:
            return None
        try:
            return NodeId(match.group(1))
        except ValueError:  # pragma: no cover
            # Invalid UUID format
            return None

    @staticmethod
    def _find_parent(item_stack: list[tuple[int, BinderItem]], indent_level: int) -> BinderItem | None:
        """Find the appropriate parent item based on indentation level.

        Returns:
            The most recently stacked item whose indent level is strictly less
            than ``indent_level``, or None if there is none

        """
        return next((item for level, item in reversed(item_stack) if level < indent_level), None)

    @staticmethod
    def _determine_position_anchor(
        line_num: int, total_lines: int, structural_elements: list[StructuralElement]
    ) -> PositionAnchor:
        """Determine the position anchor for preserved text based on context.

        Args:
            line_num: The line number of the preserved text
            total_lines: Total number of lines in the document
            structural_elements: List of structural elements found so far

        Returns:
            PositionAnchor indicating where this text appears relative to structure

        """
        if not structural_elements:
            # Nothing structural seen yet: fall back to the document midpoint.
            if line_num <= total_lines // 2:
                return PositionAnchor.BEFORE_STRUCTURE
            return PositionAnchor.AFTER_STRUCTURE

        has_before = any(elem.line_number < line_num for elem in structural_elements)
        has_after = any(elem.line_number > line_num for elem in structural_elements)

        if not has_before:
            return PositionAnchor.BEFORE_STRUCTURE  # pragma: no cover
        if not has_after:
            return PositionAnchor.AFTER_STRUCTURE
        return PositionAnchor.BETWEEN_ELEMENTS  # pragma: no cover

    @staticmethod
    def _build_binder_from_elements(structural_elements: list[StructuralElement]) -> Binder:
        """Build a Binder object from a list of structural elements.

        Args:
            structural_elements: Parsed structural elements in document order

        Returns:
            Binder object with hierarchical structure (empty for no elements)

        """
        entries = [
            (element.indent_level, BinderItem(display_title=element.title, node_id=element.node_id, children=[]))
            for element in structural_elements
        ]
        return Binder(roots=MarkdownBinderParser._assemble_tree(entries))

    def render_with_preservation(self, parser_result: ParserResult) -> str:  # noqa: PLR6301
        """Render ParserResult back to markdown, keeping preserved text on its lines.

        Preserved text is placed at its recorded line number. Structural items
        are placed on the line numbers not taken by preserved text (see
        ``_render_structural_elements_with_positions``). Any remaining gap up
        to the highest used line number becomes an empty line.

        Args:
            parser_result: Result from parse_with_preservation containing binder and preserved text

        Returns:
            Markdown text, or an empty string when there is nothing to render

        """
        line_content: dict[int, str] = {
            preserved.line_number: preserved.content for preserved in parser_result.preserved_text
        }

        structural_lines = MarkdownBinderParser._render_structural_elements_with_positions(
            parser_result.binder, set(line_content)
        )
        line_content.update(structural_lines)

        if not line_content:
            return ''

        last_line = max(line_content)
        return '\n'.join(line_content.get(line_num, '') for line_num in range(1, last_line + 1))

    @staticmethod
    def _render_structural_elements_with_positions(binder: Binder, preserved_lines: set[int]) -> dict[int, str]:
        """Render structural elements to available line positions.

        Args:
            binder: Binder object containing structural hierarchy
            preserved_lines: Set of line numbers already used by preserved text

        Returns:
            Dictionary mapping line numbers to rendered structural content

        """
        # Limitation of the current design: BinderItem does not carry the
        # original line number, so items are placed depth-first on the lowest
        # line numbers not occupied by preserved text. Content is preserved but
        # a perfect round trip is not guaranteed.
        structural_lines: dict[int, str] = {}
        line_counter = 1

        def render_items_recursively(items: list[BinderItem], depth: int) -> None:
            nonlocal line_counter
            for item in items:
                # Skip line numbers already taken by preserved text.
                while line_counter in preserved_lines:
                    line_counter += 1

                structural_lines[line_counter] = MarkdownBinderParser._format_item(item, depth)
                line_counter += 1

                if item.children:
                    render_items_recursively(item.children, depth + 1)  # pragma: no cover

        render_items_recursively(binder.roots, 0)

        return structural_lines