Coverage for src \ truenex_memory \ ingestion \ parsers \ text_docs.py: 78%
65 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-19 10:21 +0200
1"""Parser for text-based project documentation sources.
3Handles source_type=project_docs. Walks a directory tree, filters to
4supported text extensions, and produces one IngestionRecord per file.
5"""
7from __future__ import annotations
9from datetime import datetime, timezone
10import os
11from pathlib import Path
13from truenex_memory.ingestion.manifest import IngestionRecord
14from truenex_memory.ingestion.parsers import register
# File suffixes (compared lower-cased) that are eligible for indexing.
INDEX_EXTENSIONS = {
    ".md",
    ".markdown",
    ".txt",
    ".rst",
    ".py",
    ".toml",
    ".yaml",
    ".yml",
    ".json",
    ".cfg",
    ".ini",
}

# Directory names that are pruned from the walk when matched exactly.
# NOTE(review): "dist-info" is only matched as an exact name (or via the
# prefixes below), so directories named "<pkg>-<ver>.dist-info" are not
# caught by this entry -- confirm whether that is intended.
EXCLUDED_DIRS = {
    # Version control and agent/tool state
    ".agent",
    ".git",
    ".truenex-memory",
    ".task_work",
    ".task3_work",
    # Virtual environments and installed packages
    ".venv",
    "venv",
    ".conda",
    "conda-meta",
    "site-packages",
    "dist-info",
    "node_modules",
    # Caches and test scratch space
    "__pycache__",
    ".pytest_cache",
    ".pytest-tmp",
    "pytest_tmp",
    ".mypy_cache",
    ".ruff_cache",
    ".tox",
    ".coverage",
    # Build output
    "dist",
    "build",
    ".eggs",
}

# Directory names starting with any of these prefixes are pruned as well.
EXCLUDED_DIR_PREFIXES = (
    "task_work_",
    "pytest-task",
    "pytest-cache-files-",
    "venv",
)

# Exact file names that are skipped even when their suffix is indexable.
EXCLUDED_FILENAMES: frozenset[str] = frozenset(
    {
        # Tokenizer / model artefacts
        "tokenizer.json",
        "tokenizer_config.json",
        "vocab.json",
        "special_tokens_map.json",
        "generation_config.json",
        "merges.txt",
        # Dependency manifests and lock files
        "package.json",
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "composer.lock",
        "Pipfile.lock",
        "poetry.lock",
        "Gemfile.lock",
    }
)

# Files whose share of alphabetic characters is below this ratio are skipped.
MIN_ALPHA_RATIO = 0.35
@register("project_docs")
def parse_project_docs(
    source_dir: Path,
    project: str,
    source_tool: str,
    privacy_scope: str,
) -> list[IngestionRecord]:
    """Build one ``IngestionRecord`` per indexable text file under *source_dir*.

    *source_dir* is resolved first; a path that does not exist yields an
    empty list. A candidate file is skipped when:

    * its lower-cased suffix is not in ``INDEX_EXTENSIONS``;
    * its name is listed in ``EXCLUDED_FILENAMES``;
    * it cannot be read (``OSError``);
    * its content is empty or whitespace-only;
    * fewer than ``MIN_ALPHA_RATIO`` of its characters are alphabetic.

    Files are decoded as UTF-8 with undecodable bytes replaced. Both
    ``created_at`` and ``last_modified`` of each record are set to the
    file's modification time.
    """
    root = source_dir.resolve()
    collected: list[IngestionRecord] = []
    if not root.exists():
        return collected

    for candidate in _iter_candidate_files(root):
        indexable = (
            candidate.suffix.lower() in INDEX_EXTENSIONS
            and candidate.name not in EXCLUDED_FILENAMES
        )
        if not indexable:
            continue

        try:
            content = candidate.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Unreadable files are skipped rather than failing the whole run.
            continue

        if not content.strip():
            continue

        # ``content`` is known to be non-empty here, so the division is safe.
        letters = sum(map(str.isalpha, content))
        if letters / len(content) < MIN_ALPHA_RATIO:
            continue

        timestamp = _file_mtime_iso(candidate)
        record = IngestionRecord(
            project=project,
            source_type="project_docs",
            source_path=str(candidate.resolve()),
            source_tool=source_tool,
            text=content,
            created_at=timestamp,
            last_modified=timestamp,
            privacy_scope=privacy_scope,
        )
        collected.append(record)

    return collected
def _iter_candidate_files(resolved: Path) -> list[Path]:
    """Return the sorted list of files reachable from *resolved*.

    A path that is itself a file is returned as a single-element list.
    Otherwise the tree is walked top-down and excluded directories are
    removed before the walk descends into them.
    """
    if resolved.is_file():
        return [resolved]

    found: list[Path] = []
    for current_dir, subdirs, files in os.walk(resolved):
        # Assign through the slice: os.walk only honours pruning done in place.
        subdirs[:] = [
            subdir for subdir in subdirs
            if not _is_excluded_dir_name(subdir)
        ]
        here = Path(current_dir)
        found.extend(here / file_name for file_name in files)

    found.sort()
    return found
def _is_excluded_path(path: Path, *, root: Path) -> bool:
    """Return True if any component of *path* is an excluded directory name.

    Components are taken relative to *root* when *path* lies beneath it;
    otherwise every component of *path* is checked.
    """
    try:
        relative = path.relative_to(root)
    except ValueError:
        # ``path`` is not under ``root``: fall back to the full path.
        components = path.parts
    else:
        components = relative.parts
    return any(_is_excluded_dir_name(component) for component in components)
def _is_excluded_dir_name(part: str) -> bool:
    """Return True if *part* is in ``EXCLUDED_DIRS`` or starts with an excluded prefix."""
    # str.startswith accepts the whole prefix tuple in a single call.
    return part in EXCLUDED_DIRS or part.startswith(EXCLUDED_DIR_PREFIXES)
def _file_mtime_iso(path: Path) -> str:
    """Return the modification time of *path* as a UTC ISO-8601 string.

    This is best-effort and never raises for a bad file: if the file cannot
    be stat'ed, or its mtime cannot be represented as a ``datetime``, the
    current UTC time is returned instead.
    """
    try:
        modified = datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc)
    except (OSError, OverflowError, ValueError):
        # OSError: stat() failed (or the platform rejected the timestamp).
        # OverflowError / ValueError: the mtime is outside the range that
        # datetime can represent, e.g. a corrupt timestamp on disk.
        modified = datetime.now(timezone.utc)
    return modified.isoformat()