Coverage for little_loops / text_utils.py: 98%

51 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-05-22 16:19 -0500

1"""Text extraction utilities for issue content. 

2 

3Provides shared functions for extracting file paths from markdown issue 

4content. Used by dependency_mapper, issue_history, and other modules that 

5need to identify file references in issue text. 

6""" 

7 

8from __future__ import annotations 

9 

10import math 

11import re 

12from pathlib import Path 

13 

# File path patterns for extraction from issue content.
# In every pattern group 1 is the bare path; a ``:<line>`` suffix is never
# part of the captured text.
_BACKTICK_PATH = re.compile(r"`([^`\s]+\.[a-z]{2,4})(?::\d+)?`")
_BOLD_FILE_PATH = re.compile(r"\*\*File\*\*:\s*`?([^`\n]+\.[a-z]{2,4})`?")
_STANDALONE_PATH = re.compile(
    # The trailing delimiter is a lookahead (not consumed) so the whitespace
    # between two adjacent paths can still serve as the leading delimiter of
    # the next match; finditer() only yields non-overlapping matches.
    r"(?:^|\s)([a-zA-Z_][\w/.-]*\.[a-z]{2,4})(?::\d+)?(?=\s|$|:|\))",
    re.MULTILINE,
)
_CODE_FENCE = re.compile(r"```[\s\S]*?```", re.MULTILINE)

# File extensions that indicate real source file paths
SOURCE_EXTENSIONS = frozenset(
    {
        ".py",
        ".ts",
        ".js",
        ".tsx",
        ".jsx",
        ".md",
        ".json",
        ".yaml",
        ".yml",
        ".toml",
        ".cfg",
        ".ini",
        ".html",
        ".css",
        ".scss",
        ".sh",
        ".bash",
        ".sql",
        ".go",
        ".rs",
        ".java",
        ".kt",
        ".rb",
        ".php",
    }
)


def extract_file_paths(content: str) -> set[str]:
    """Extract file paths from issue content.

    Searches for file paths in:
    - Backtick-quoted paths: `path/to/file.py`
    - Location section bold paths: **File**: `path/to/file.py`
    - Standalone paths delimited by whitespace, ``:`` or ``)``

    Code fence blocks are stripped before extraction to avoid
    matching paths inside example code. Line number suffixes
    (e.g., ``path.py:123``) are dropped: the patterns match the
    suffix outside the captured path, both for backtick-quoted and
    standalone references.

    Only paths whose extension is listed in ``SOURCE_EXTENSIONS`` are
    returned; a directory separator is not required.

    Args:
        content: Issue file content

    Returns:
        Set of file paths found in the content (empty for empty input)
    """
    if not content:
        return set()

    # Strip code fences to avoid matching example paths
    stripped = _CODE_FENCE.sub("", content)

    paths: set[str] = set()
    for pattern in (_BOLD_FILE_PATH, _BACKTICK_PATH, _STANDALONE_PATH):
        for match in pattern.finditer(stripped):
            path = match.group(1).strip()
            # Keep only references whose extension marks a real source file.
            if Path(path).suffix.lower() in SOURCE_EXTENSIONS:
                paths.add(path)
    return paths

91 

92 

93# ============================================================================= 

94# Word Extraction and Overlap Scoring 

95# ============================================================================= 

96 

# Stop words that carry no topical signal and are dropped by extract_words()
_COMMON_WORDS = frozenset(
    (
        "the and for this that with from "
        "are was were been have has had "
        "not but can will should would could may might must "
        "file code issue"
    ).split()
)


def extract_words(text: str) -> set[str]:
    """Collect the significant words that occur in ``text``.

    The text is lower-cased, then every run of three or more ASCII
    letters standing between word boundaries is taken as a word. Words
    listed in ``_COMMON_WORDS`` are left out. The result is suitable
    for topic-based relevance scoring such as Jaccard similarity.

    Args:
        text: Input text

    Returns:
        Set of lowercase words (3+ chars, excluding common words)
    """
    lowered = text.lower()
    return {
        hit.group()
        for hit in re.finditer(r"\b[a-z]{3,}\b", lowered)
        if hit.group() not in _COMMON_WORDS
    }

146 

147 

def calculate_word_overlap(words1: set[str], words2: set[str]) -> float:
    """Return the Jaccard similarity of two word sets.

    The score is the number of words common to both sets divided by
    the number of distinct words across both sets.

    Args:
        words1: First word set
        words2: Second word set

    Returns:
        Similarity score from 0.0 to 1.0; 0.0 when either set is empty
    """
    # An empty side means there is nothing to compare (and avoids 0 / 0).
    if not (words1 and words2):
        return 0.0
    shared_count = len(words1 & words2)
    combined_count = len(words1 | words2)
    return shared_count / combined_count

163 

164 

165# ============================================================================= 

166# Duration Parsing 

167# ============================================================================= 

168 

# Seconds per supported unit suffix
_DURATION_UNITS = {"s": 1, "m": 60, "h": 60 * 60, "d": 24 * 60 * 60}
_DURATION_RE = re.compile(r"^(\d+)([smhd])$")


def parse_duration(s: str) -> int:
    """Convert a duration string such as '1h', '30m', '2d' or '45s' to seconds.

    Args:
        s: Duration string: an integer count immediately followed by one
            unit letter (s = seconds, m = minutes, h = hours, d = days)

    Returns:
        Number of seconds represented by the duration

    Raises:
        ValueError: If the string does not match the expected format
    """
    parsed = _DURATION_RE.match(s)
    if parsed is None:
        raise ValueError(f"Invalid duration: {s!r}. Use e.g. 1h, 30m, 2d, 45s")
    amount, unit = parsed.groups()
    return int(amount) * _DURATION_UNITS[unit]

189 

190 

def score_bm25(
    query_words: set[str],
    doc_words: set[str],
    doc_freq: dict[str, int],
    avg_doc_len: float,
    total_docs: int,
    k1: float = 1.5,
    b: float = 0.75,
) -> float:
    """Compute BM25 relevance score for a document against a query.

    Uses the Robertson BM25 formula with IDF smoothing. Since doc_words
    is a set (unique terms only), term frequency within the document is
    always 1 for matching terms.

    Args:
        query_words: Set of query terms
        doc_words: Set of document terms (unique words, from extract_words)
        doc_freq: Document frequency per term (number of docs containing
            each term). Terms missing from the mapping count as 0; values
            are clamped to the range ``0..total_docs``.
        avg_doc_len: Average document length in unique words across corpus
        total_docs: Total number of documents in corpus
        k1: Term frequency saturation parameter (default: 1.5)
        b: Length normalization parameter (default: 0.75)

    Returns:
        BM25 score (non-negative float for non-negative ``k1`` and ``b`` in
        ``0..1``, unbounded above). 0.0 when either word set is empty, when
        the sets share no term, or when ``total_docs`` / ``avg_doc_len``
        is not positive.
    """
    # Non-positive corpus statistics cannot produce a meaningful score.
    if not query_words or not doc_words or total_docs <= 0 or avg_doc_len <= 0:
        return 0.0

    matched_terms = query_words & doc_words
    if not matched_terms:
        return 0.0

    # TF = 1 for every matching term, so the length-normalised TF factor is
    # identical for all of them and is computed once.
    doc_len = len(doc_words)
    tf_norm = (k1 + 1) / (1 + k1 * (1 - b + b * doc_len / avg_doc_len))

    score = 0.0
    for term in matched_terms:
        # Clamp so stale or inconsistent counts (df > total_docs, df < 0)
        # cannot push the IDF below zero.
        df = min(max(doc_freq.get(term, 0), 0), total_docs)
        # Robertson IDF with +1 smoothing to keep score non-negative
        idf = math.log((total_docs - df + 0.5) / (df + 0.5) + 1)
        score += idf * tf_norm

    return score

231 

232 return score