Coverage for little_loops / text_utils.py: 98%
51 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-05-22 16:19 -0500
« prev ^ index » next coverage.py v7.12.0, created at 2026-05-22 16:19 -0500
1"""Text extraction utilities for issue content.
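Provides shared functions for extracting file paths from markdown issue
content. Used by dependency_mapper, issue_history, and other modules that
need to identify file references in issue text. Also provides word
extraction, Jaccard word-overlap scoring, duration-string parsing, and
BM25 relevance scoring helpers.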
6"""
8from __future__ import annotations
10import math
11import re
12from pathlib import Path
# File path patterns for extraction from issue content.
#
# Backtick-quoted path. An optional ``:<line>`` suffix inside the backticks is
# matched but kept out of the captured group, so `path.py:123` yields path.py.
_BACKTICK_PATH = re.compile(r"`([^`\s]+\.[a-z]{2,4})(?::\d+)?`")
# "**File**: path" entries; the surrounding backticks are optional.
_BOLD_FILE_PATH = re.compile(r"\*\*File\*\*:\s*`?([^`\n]+\.[a-z]{2,4})`?")
# Bare path preceded by line start / whitespace. The trailing delimiter is a
# lookahead (not consumed) so that the whitespace separating two adjacent
# paths can still serve as the leading delimiter of the next match.
_STANDALONE_PATH = re.compile(
    r"(?:^|\s)([a-zA-Z_][\w/.-]*\.[a-z]{2,4})(?::\d+)?(?=\s|$|:|\))",
    re.MULTILINE,
)
# Fenced code block, including its delimiters (non-greedy, spans newlines).
_CODE_FENCE = re.compile(r"```[\s\S]*?```", re.MULTILINE)

# File extensions that indicate real source file paths
SOURCE_EXTENSIONS = frozenset(
    {
        ".py",
        ".ts",
        ".js",
        ".tsx",
        ".jsx",
        ".md",
        ".json",
        ".yaml",
        ".yml",
        ".toml",
        ".cfg",
        ".ini",
        ".html",
        ".css",
        ".scss",
        ".sh",
        ".bash",
        ".sql",
        ".go",
        ".rs",
        ".java",
        ".kt",
        ".rb",
        ".php",
    }
)


def extract_file_paths(content: str) -> set[str]:
    """Extract file paths from issue content.

    Searches for file paths in:
    - Backtick-quoted paths: `path/to/file.py`
    - Location section bold paths: **File**: `path/to/file.py`
    - Standalone paths with recognized extensions

    Code fence blocks are stripped before extraction to avoid
    matching paths inside example code. Line number suffixes
    (e.g., ``path.py:123``) are normalized by stripping the
    line number portion. Only paths whose extension (compared
    case-insensitively) is in ``SOURCE_EXTENSIONS`` are returned.

    Args:
        content: Issue file content

    Returns:
        Set of file paths found in the content (empty for empty input)
    """
    if not content:
        return set()

    # Strip code fences to avoid matching example paths
    searchable = _CODE_FENCE.sub("", content)

    paths: set[str] = set()
    for pattern in (_BOLD_FILE_PATH, _BACKTICK_PATH, _STANDALONE_PATH):
        for match in pattern.finditer(searchable):
            path = match.group(1).strip()
            # Normalize: remove line numbers (path.py:123 -> path.py). The
            # patterns already keep ``:<line>`` out of the group in the common
            # cases; this covers anything that still slips through.
            head, sep, tail = path.rpartition(":")
            if sep and tail.isdigit():
                path = head
            # Keep only paths with a recognized source-file extension.
            if Path(path).suffix.lower() in SOURCE_EXTENSIONS:
                paths.add(path)
    return paths
93# =============================================================================
94# Word Extraction and Overlap Scoring
95# =============================================================================
# Common stop words excluded from word extraction
_COMMON_WORDS = frozenset(
    {
        "the",
        "and",
        "for",
        "this",
        "that",
        "with",
        "from",
        "are",
        "was",
        "were",
        "been",
        "have",
        "has",
        "had",
        "not",
        "but",
        "can",
        "will",
        "should",
        "would",
        "could",
        "may",
        "might",
        "must",
        "file",
        "code",
        "issue",
    }
)

# Runs of 3+ ASCII lowercase letters delimited by word boundaries. Digits and
# underscores count as word characters, so "abc123" and "snake_case" yield
# no match at all rather than a partial one.
_WORD_RE = re.compile(r"\b[a-z]{3,}\b")


def extract_words(text: str) -> set[str]:
    """Extract significant words from text.

    The text is lowercased, then every alphabetic (a-z) word of 3+
    characters is collected, excluding common stop words. Useful for
    topic-based relevance scoring via Jaccard similarity.

    Args:
        text: Input text

    Returns:
        Set of lowercase words (3+ chars, excluding common words)
    """
    return set(_WORD_RE.findall(text.lower())) - _COMMON_WORDS
def calculate_word_overlap(words1: set[str], words2: set[str]) -> float:
    """Calculate Jaccard similarity between word sets.

    The score is ``|words1 & words2| / |words1 | words2|``. If either set
    is empty the score is defined as 0.0.

    Args:
        words1: First word set
        words2: Second word set

    Returns:
        Similarity score from 0.0 to 1.0
    """
    if not words1 or not words2:
        return 0.0
    shared = len(words1 & words2)
    # |A ∪ B| = |A| + |B| - |A ∩ B|; avoids building the union set. Both sets
    # are non-empty here, so the denominator is at least 1.
    return shared / (len(words1) + len(words2) - shared)
165# =============================================================================
166# Duration Parsing
167# =============================================================================
# Seconds per supported duration unit.
_DURATION_UNITS = {"s": 1, "m": 60, "h": 3600, "d": 86400}
_DURATION_RE = re.compile(r"^(\d+)([smhd])$")


def parse_duration(s: str) -> int:
    """Parse a duration string like '1h', '30m', '2d', '45s' into seconds.

    The whole string must be an unsigned integer immediately followed by a
    single lowercase unit letter; surrounding whitespace, signs, fractions
    and compound values such as '1h30m' are rejected.

    Args:
        s: Duration string with a numeric value followed by a unit (s/m/h/d)

    Returns:
        Number of seconds represented by the duration

    Raises:
        ValueError: If the string does not match the expected format
    """
    # fullmatch rather than match: "$" on its own also matches just before a
    # trailing newline, which would silently accept input like "1h" + newline.
    m = _DURATION_RE.fullmatch(s)
    if m is None:
        raise ValueError(f"Invalid duration: {s!r}. Use e.g. 1h, 30m, 2d, 45s")
    amount, unit = m.groups()
    return int(amount) * _DURATION_UNITS[unit]
def score_bm25(
    query_words: set[str],
    doc_words: set[str],
    doc_freq: dict[str, int],
    avg_doc_len: float,
    total_docs: int,
    k1: float = 1.5,
    b: float = 0.75,
) -> float:
    """Compute BM25 relevance score for a document against a query.

    Uses the Robertson BM25 formula with IDF smoothing. Since doc_words
    is a set (unique terms only), term frequency within the document is
    always 1 for matching terms.

    Args:
        query_words: Set of query terms
        doc_words: Set of document terms (unique words, from extract_words)
        doc_freq: Document frequency per term (number of docs containing each
            term); terms missing from the mapping are treated as frequency 0
        avg_doc_len: Average document length in unique words across corpus
        total_docs: Total number of documents in corpus
        k1: Term frequency saturation parameter (default: 1.5)
        b: Length normalization parameter (default: 0.75)

    Returns:
        BM25 score (float, unbounded above). Returns 0.0 when either word
        set is empty, when no query term occurs in the document, or when
        total_docs / avg_doc_len is not positive. With the default k1 and b
        the score is non-negative as long as no doc_freq count exceeds
        total_docs.
    """
    # Degenerate corpus statistics (zero or negative) cannot be scored.
    if not query_words or not doc_words or total_docs <= 0 or avg_doc_len <= 0:
        return 0.0

    matched = query_words & doc_words
    if not matched:
        return 0.0

    # TF = 1 for every matching term, so the saturation / length-normalization
    # factor depends only on the document and is computed once.
    doc_len = len(doc_words)
    tf_norm = (k1 + 1) / (1 + k1 * (1 - b + b * doc_len / avg_doc_len))

    def _idf(term: str) -> float:
        df = doc_freq.get(term, 0)
        # Robertson IDF with +1 smoothing to keep score non-negative
        return math.log((total_docs - df + 0.5) / (df + 0.5) + 1)

    # Set iteration order is not stable across runs; fsum makes the total
    # independent of the order in which the terms are visited.
    return math.fsum(_idf(term) * tf_norm for term in matched)