Coverage for src \ truenex_memory \ ingestion \ parsers \ text_docs.py: 78%

65 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-19 10:21 +0200

1"""Parser for text-based project documentation sources. 

2 

3Handles source_type=project_docs. Walks a directory tree, filters to 

4supported text extensions, and produces one IngestionRecord per file. 

5""" 

6 

7from __future__ import annotations 

8 

9from datetime import datetime, timezone 

10import os 

11from pathlib import Path 

12 

13from truenex_memory.ingestion.manifest import IngestionRecord 

14from truenex_memory.ingestion.parsers import register 

15 

# File suffixes eligible for indexing. Suffixes are compared after
# lower-casing, so every entry here must be lower-case.
INDEX_EXTENSIONS: set[str] = {
    ".md",
    ".markdown",
    ".txt",
    ".py",
    ".toml",
    ".yaml",
    ".yml",
    ".json",
    ".rst",
    ".cfg",
    ".ini",
}

# Directory names that are never descended into (exact, case-sensitive match).
EXCLUDED_DIRS: set[str] = {
    ".agent",
    ".git",
    ".venv",
    "venv",
    "__pycache__",
    ".pytest_cache",
    ".truenex-memory",
    "node_modules",
    ".mypy_cache",
    ".tox",
    ".pytest-tmp",
    "pytest_tmp",
    ".task_work",
    ".task3_work",
    "site-packages",
    "dist-info",
    ".conda",
    "conda-meta",
    "dist",
    "build",
    ".eggs",
    ".ruff_cache",
    ".coverage",
}

# A directory whose name starts with any of these prefixes is excluded as well.
EXCLUDED_DIR_PREFIXES: tuple[str, ...] = (
    "task_work_",
    "pytest-task",
    "pytest-cache-files-",
    "venv",
)

# Exact file names skipped even though their suffix is indexable.
# NOTE(review): these look like tokenizer/vocabulary artifacts and
# package-manager manifests/lockfiles -- confirm the intent with the owner.
EXCLUDED_FILENAMES: frozenset[str] = frozenset(
    {
        "tokenizer.json",
        "tokenizer_config.json",
        "vocab.json",
        "special_tokens_map.json",
        "generation_config.json",
        "merges.txt",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "composer.lock",
        "Pipfile.lock",
        "poetry.lock",
        "Gemfile.lock",
        "package.json",
    }
)

# Files whose share of alphabetic characters (over all characters) falls
# below this ratio are skipped.
MIN_ALPHA_RATIO = 0.35

47 

48 

@register("project_docs")
def parse_project_docs(
    source_dir: Path,
    project: str,
    source_tool: str,
    privacy_scope: str,
) -> list[IngestionRecord]:
    """Build one ``IngestionRecord`` per indexable text file under ``source_dir``.

    ``source_dir`` is resolved first; a path that does not exist yields an
    empty list. A candidate file is skipped when its lower-cased suffix is not
    in ``INDEX_EXTENSIONS``, its name is in ``EXCLUDED_FILENAMES``, it cannot
    be read, it contains only whitespace, or its share of alphabetic
    characters is below ``MIN_ALPHA_RATIO``.

    Args:
        source_dir: Directory (or single file) to ingest.
        project: Project name copied onto every record.
        source_tool: Tool identifier copied onto every record.
        privacy_scope: Privacy scope copied onto every record.

    Returns:
        The records for all accepted files, in sorted path order.
    """
    root = source_dir.resolve()
    if not root.exists():
        return []

    collected: list[IngestionRecord] = []
    for candidate in _iter_candidate_files(root):
        indexable = candidate.suffix.lower() in INDEX_EXTENSIONS
        if not indexable or candidate.name in EXCLUDED_FILENAMES:
            continue

        try:
            # Undecodable bytes are replaced rather than raising.
            content = candidate.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        if not content.strip():
            continue

        # content is non-empty here, so the division is always defined.
        letters = sum(map(str.isalpha, content))
        if letters / len(content) < MIN_ALPHA_RATIO:
            continue

        # The file's mtime is used for both timestamps on the record.
        stamp = _file_mtime_iso(candidate)
        record = IngestionRecord(
            project=project,
            source_type="project_docs",
            source_path=str(candidate.resolve()),
            source_tool=source_tool,
            text=content,
            created_at=stamp,
            last_modified=stamp,
            privacy_scope=privacy_scope,
        )
        collected.append(record)

    return collected

92 

93 

def _iter_candidate_files(resolved: Path) -> list[Path]:
    """Collect the files under ``resolved`` as a sorted list of paths.

    When ``resolved`` is itself a file it is returned as a one-item list.
    Otherwise the tree is walked and excluded directory names are removed
    from the walk before descent, so their contents are never visited.
    """
    if resolved.is_file():
        return [resolved]

    found: list[Path] = []
    for current, subdirs, files in os.walk(resolved):
        kept = [entry for entry in subdirs if not _is_excluded_dir_name(entry)]
        # Slice-assign so os.walk sees the pruned list and skips those subtrees.
        subdirs[:] = kept
        base = Path(current)
        found.extend(base / leaf for leaf in files)

    found.sort()
    return found

109 

110 

def _is_excluded_path(path: Path, *, root: Path) -> bool:
    """Report whether any component of ``path`` is an excluded directory name.

    Components are taken relative to ``root``. When ``path`` does not lie
    under ``root``, every component of ``path`` itself is checked instead.
    The final component is checked too, not only the parent directories.
    """
    try:
        components = path.relative_to(root).parts
    except ValueError:
        components = path.parts
    return any(_is_excluded_dir_name(piece) for piece in components)

120 

121 

def _is_excluded_dir_name(part: str) -> bool:
    """Return True when ``part`` names a directory that must be skipped.

    A name is excluded when it appears in ``EXCLUDED_DIRS`` or starts with
    one of the ``EXCLUDED_DIR_PREFIXES``.
    """
    # str.startswith accepts a tuple and matches if any one prefix matches.
    return part in EXCLUDED_DIRS or part.startswith(EXCLUDED_DIR_PREFIXES)

126 

127 

128def _file_mtime_iso(path: Path) -> str: 

129 try: 

130 stat = path.stat() 

131 return datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat() 

132 except OSError: 

133 return datetime.now(timezone.utc).isoformat()