Coverage for repo_ctx / parser.py: 100%
43 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-25 17:42 +0100
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-25 17:42 +0100
1"""Documentation parser."""
2import re
3from typing import Optional
4from markdown_it import MarkdownIt
class Parser:
    """Select, parse and format documentation files.

    The class decides which repository paths count as documentation,
    pulls fenced code snippets out of markdown text, estimates token
    counts and renders a list of documents into a single text blob.
    """

    # Suffixes that mark a path as a documentation file (case-sensitive).
    _DOC_EXTENSIONS = ('.md', '.rst', '.txt', '.markdown')

    # Fenced code block: opening fence, optional info string up to the end
    # of the line, then the shortest body that ends at the next fence.
    # The info string is captured whole (not just ``\w+``) so that fences
    # such as "c++", "python title=x" or CRLF-terminated openers are still
    # recognised as openers; otherwise the scanner would pair a closing
    # fence with the next opening fence and return prose as code.
    _FENCE_RE = re.compile(r'```([^\n`]*)\n(.*?)```', re.DOTALL)

    def __init__(self):
        self.md = MarkdownIt()

    @staticmethod
    def _in_folder(path: str, folder: str) -> bool:
        """Return True if *path* has *folder* as a leading or inner directory."""
        # Accept "docs", "docs/" and "/docs" as the same folder entry.
        name = folder.strip("/")
        if not name:
            return False
        return path.startswith(f"{name}/") or f"/{name}/" in path

    @staticmethod
    def _is_file(path: str, file_name: str) -> bool:
        """Return True if *path* is *file_name* or ends with it as a full component."""
        name = file_name.lstrip("/")
        if not name:
            return False
        # Require a "/" boundary so "notes.md" does not match "release-notes.md".
        return path == name or path.endswith(f"/{name}")

    def should_include_file(self, path: str, config: Optional[dict] = None) -> bool:
        """Check if file should be included based on config.

        Args:
            path: Slash-separated file path to test.
            config: Optional mapping. The keys read are ``"excludeFolders"``,
                ``"excludeFiles"`` and ``"folders"``, each a list of strings.

        Returns:
            False for paths without a documentation suffix, for paths inside
            an excluded folder, and for paths naming an excluded file. When
            ``"folders"`` is non-empty the path must also lie inside one of
            them. True otherwise.
        """
        # Default: include markdown and text files only.
        if not path.endswith(self._DOC_EXTENSIONS):
            return False

        if not config:
            return True

        # Exclusions take precedence over the include list.
        if any(self._in_folder(path, folder)
               for folder in config.get("excludeFolders", [])):
            return False

        if any(self._is_file(path, name)
               for name in config.get("excludeFiles", [])):
            return False

        # An empty or missing include list means "everything not excluded".
        folders = config.get("folders", [])
        if folders:
            return any(self._in_folder(path, folder) for folder in folders)

        return True

    def parse_markdown(self, content: str) -> str:
        """Parse markdown and return formatted content.

        For the MVP the content is returned unchanged.
        """
        return content

    def extract_snippets(self, content: str) -> list[dict]:
        """Extract fenced code snippets from markdown.

        Args:
            content: Markdown text.

        Returns:
            One dict per fenced block, in document order, with the keys
            ``"language"`` (first word of the fence's info string, or
            ``"text"`` when there is none) and ``"code"`` (the block body
            with surrounding whitespace stripped).
        """
        snippets = []
        for match in self._FENCE_RE.finditer(content):
            # The info string may carry attributes after the language
            # ("python title=x") or a stray "\r"; keep only the first word.
            info_words = match.group(1).split()
            snippets.append({
                "language": info_words[0] if info_words else "text",
                "code": match.group(2).strip(),
            })
        return snippets

    def count_tokens(self, content: str) -> int:
        """Rough token count estimation (about 4 characters per token)."""
        return len(content) // 4

    def format_for_llm(self, documents: list, library_id: str) -> str:
        """Format documents for LLM consumption.

        Args:
            documents: Objects exposing ``file_path`` and ``content``
                attributes.
            library_id: Prefix used to build each document's source line.

        Returns:
            The concatenation, per document, of a ``###`` heading with the
            file path, a ``Source:`` line, the content and a 32-dash
            separator. An empty string when *documents* is empty.
        """
        separator = "\n\n" + "-" * 32 + "\n\n"
        return "".join(
            f"### {doc.file_path}\n"
            f"Source: {library_id}/{doc.file_path}\n\n"
            f"{doc.content}"
            f"{separator}"
            for doc in documents
        )