Coverage for repo_ctx / parser.py: 100%

43 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-25 17:42 +0100

1"""Documentation parser.""" 

2import re 

3from typing import Optional 

4from markdown_it import MarkdownIt 

5 

6 

class Parser:
    """Select documentation files, pull code snippets out of them and format them for an LLM."""

    # File extensions treated as documentation. Kept as a tuple so it can be
    # handed to ``str.endswith`` directly.
    _DOC_EXTENSIONS = (".md", ".rst", ".txt", ".markdown")

    # Fenced code block: opening fence, optional info string (only its first
    # word is captured as the language), end of line, lazily matched body,
    # closing fence. The info string may contain non-word characters ("c++",
    # 'python title="x"') and the line may end in "\r\n"; rejecting those would
    # make the scanner treat the block's *closing* fence as an opening one and
    # report the prose that follows as code.
    _FENCE_RE = re.compile(r"```[ \t]*([^\s`]*)[^\n`]*\n(.*?)```", re.DOTALL)

    def __init__(self):
        self.md = MarkdownIt()

    @staticmethod
    def _clean_entries(entries) -> list[str]:
        """Return config entries without surrounding slashes, dropping empty ones.

        ``None`` (e.g. a JSON ``null``) is treated as an empty list.
        """
        cleaned = (entry.strip("/") for entry in entries or [])
        return [entry for entry in cleaned if entry]

    @staticmethod
    def _in_folder(path: str, folder: str) -> bool:
        """Return True if ``folder`` occurs in ``path`` as a directory component."""
        return path.startswith(f"{folder}/") or f"/{folder}/" in path

    @staticmethod
    def _is_file(path: str, name: str) -> bool:
        """Return True if ``path`` is ``name`` or ends with ``/name``.

        Matching on a path-segment boundary keeps "CHANGELOG.md" from also
        matching "OLD_CHANGELOG.md".
        """
        return path == name or path.endswith(f"/{name}")

    def should_include_file(self, path: str, config: Optional[dict] = None) -> bool:
        """Check if file should be included based on config.

        Args:
            path: File path using "/" separators.
            config: Optional mapping with the list-valued keys
                ``excludeFolders``, ``excludeFiles`` and ``folders``.

        Returns:
            False when the extension is not a documentation extension, when the
            path lies in an excluded folder or names an excluded file, or when
            ``folders`` is non-empty and the path lies in none of them;
            True otherwise.
        """
        # Default: include markdown and text files only.
        if not path.endswith(self._DOC_EXTENSIONS):
            return False

        if not config:
            return True

        # Exclusions are checked first, so they win over ``folders``.
        excluded_folders = self._clean_entries(config.get("excludeFolders", []))
        if any(self._in_folder(path, folder) for folder in excluded_folders):
            return False

        excluded_files = self._clean_entries(config.get("excludeFiles", []))
        if any(self._is_file(path, name) for name in excluded_files):
            return False

        # A non-empty ``folders`` list acts as an allow-list.
        included_folders = self._clean_entries(config.get("folders", []))
        if included_folders:
            return any(self._in_folder(path, folder) for folder in included_folders)

        return True

    def parse_markdown(self, content: str) -> str:
        """Parse markdown and return formatted content.

        For the MVP the content is returned unchanged.
        """
        return content

    def extract_snippets(self, content: str) -> list[dict]:
        """Extract fenced code snippets from markdown.

        Returns:
            One ``{"language": ..., "code": ...}`` dict per fenced block, in
            document order. ``language`` is the first word of the fence's info
            string, or ``"text"`` when there is none; ``code`` is the block body
            with surrounding whitespace stripped.
        """
        return [
            {
                "language": match.group(1) or "text",
                "code": match.group(2).strip(),
            }
            for match in self._FENCE_RE.finditer(content)
        ]

    def count_tokens(self, content: str) -> int:
        """Rough token count estimation: one token per four characters, rounded down."""
        return len(content) // 4

    def format_for_llm(self, documents: list, library_id: str) -> str:
        """Format documents for LLM consumption.

        Args:
            documents: Objects exposing ``file_path`` and ``content`` attributes.
            library_id: Prefix used to build each document's ``Source:`` line.

        Returns:
            All documents concatenated; each one is rendered as a ``###``
            heading, a ``Source:`` line, the content and a 32-dash separator.
            An empty ``documents`` list yields an empty string.
        """
        separator = "\n\n" + "-" * 32 + "\n\n"
        parts = []
        for doc in documents:
            parts.extend((
                f"### {doc.file_path}\n",
                f"Source: {library_id}/{doc.file_path}\n\n",
                doc.content,
                separator,
            ))
        return "".join(parts)