Coverage for little_loops / issue_discovery / matching.py: 100%

72 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-05-22 16:19 -0500

1"""Issue matching types and text similarity helpers.""" 

2 

3from __future__ import annotations 

4 

5import re 

6from dataclasses import dataclass, field 

7from enum import Enum 

8from pathlib import Path 

9from typing import TYPE_CHECKING 

10 

11# Promoted to text_utils.py as public functions; aliased here for backward compat 

12from little_loops.text_utils import calculate_word_overlap as _calculate_word_overlap # noqa: F401 

13from little_loops.text_utils import extract_words as _extract_words # noqa: F401 

14 

15if TYPE_CHECKING: 

16 from little_loops.config import BRConfig 

17 

18 

19# ============================================================================= 

20# Enums 

21# ============================================================================= 

22 

23 

class MatchClassification(Enum):
    """Outcome of comparing a finding against the existing issues.

    The completed-issue outcomes (regression, invalid fix, unverified)
    separate the reasons a finding can match an issue that was already
    closed, so true duplicates are not confused with fixes that failed.
    """

    # No existing issue matches the finding.
    NEW_ISSUE = "new_issue"
    # An active issue already exists for the finding.
    DUPLICATE = "duplicate"
    # Completed issue; files were modified AFTER the fix (the fix broke).
    REGRESSION = "regression"
    # Completed issue; files were NOT modified after the fix (never worked).
    INVALID_FIX = "invalid_fix"
    # Completed issue; no fix commit is tracked (cannot be determined).
    UNVERIFIED = "unverified"

36 

37 

38# ============================================================================= 

39# Data Classes 

40# ============================================================================= 

41 

42 

@dataclass
class RegressionEvidence:
    """Facts used to tell a regression apart from an invalid fix.

    Every field has a default, so an instance can be created empty and
    filled in as evidence is gathered.

    Attributes:
        fix_commit_sha: SHA of the commit that fixed the original issue,
            or None when no SHA is recorded
        fix_commit_exists: Whether that fix commit exists in current history
        files_modified_since_fix: Files touched by the fix that were
            modified again after it
        days_since_fix: Number of days since the fix was applied
        related_commits: Commits that modified the relevant files after
            the fix
    """

    fix_commit_sha: str | None = None
    fix_commit_exists: bool = True
    # default_factory gives each instance its own list.
    files_modified_since_fix: list[str] = field(default_factory=list)
    days_since_fix: int = 0
    related_commits: list[str] = field(default_factory=list)

60 

61 

@dataclass
class FindingMatch:
    """Outcome of comparing one finding against an existing issue.

    The ``should_*`` properties turn the raw score into an action by
    comparing it with the two thresholds stored on the instance.

    Attributes:
        issue_path: Path to the matched issue file, or None if no match
        match_type: Type of match ("exact", "similar", "content", "none")
        match_score: Confidence score from 0.0 to 1.0
        is_completed: Whether the matched issue is in completed/
        matched_terms: Terms that matched (for debugging)
        classification: How to classify this match (regression,
            duplicate, etc.)
        regression_evidence: Evidence supporting regression classification
        exact_threshold: Score at or above which a finding is a
            duplicate (skip)
        similar_threshold: Score at or above which a finding should
            update an existing issue
    """

    issue_path: Path | None
    match_type: str
    match_score: float
    is_completed: bool = False
    matched_terms: list[str] = field(default_factory=list)
    classification: MatchClassification = MatchClassification.NEW_ISSUE
    regression_evidence: RegressionEvidence | None = None
    exact_threshold: float = 0.8
    similar_threshold: float = 0.5

    @property
    def should_skip(self) -> bool:
        """Whether the finding is a duplicate and should be skipped."""
        return self.exact_threshold <= self.match_score

    @property
    def should_update(self) -> bool:
        """Whether the finding should update the existing issue.

        True when the score is at least ``similar_threshold`` but still
        below ``exact_threshold``.
        """
        at_least_similar = self.similar_threshold <= self.match_score
        return at_least_similar and self.match_score < self.exact_threshold

    @property
    def should_create(self) -> bool:
        """Whether a new issue should be created."""
        return self.similar_threshold > self.match_score

    @property
    def should_reopen(self) -> bool:
        """Whether a completed issue should be reopened."""
        return self.is_completed and self.match_score >= self.similar_threshold

    @property
    def should_reopen_as_regression(self) -> bool:
        """Whether the issue should be reopened as a regression.

        A regression means the fix was applied but later code changes
        broke it.
        """
        return (
            self.should_reopen
            and self.classification == MatchClassification.REGRESSION
        )

    @property
    def should_reopen_as_invalid_fix(self) -> bool:
        """Whether the issue should be reopened due to an invalid fix.

        An invalid fix means the original fix never actually resolved
        the issue.
        """
        return (
            self.should_reopen
            and self.classification == MatchClassification.INVALID_FIX
        )

    @property
    def is_unverified(self) -> bool:
        """Whether regression status cannot be determined.

        Unverified means the completed issue has no fix commit tracked,
        so it cannot be classified as a regression or an invalid fix.
        """
        return (
            self.should_reopen
            and self.classification == MatchClassification.UNVERIFIED
        )

144 

145 

146# ============================================================================= 

147# Text Matching Helpers 

148# ============================================================================= 

149 

150 

def _normalize_text(text: str) -> str:
    """Return a canonical form of text for comparison.

    Args:
        text: Input text

    Returns:
        The text lowercased, stripped of leading and trailing whitespace,
        with each remaining whitespace run collapsed to a single space
    """
    lowered = text.lower()
    trimmed = lowered.strip()
    return re.sub(r"\s+", " ", trimmed)

161 

162 

def _extract_line_numbers(text: str) -> set[int]:
    """Extract line numbers referenced in text.

    Three notations are recognised, case-insensitively. For each, a
    trailing ``-N`` range end is collected along with the start:

    * ``**Line**: 42`` / ``**Line(s)**: 42-45``
    * ``:42`` / ``:42-45`` (as in ``path/to/file.py:42``)
    * ``line 42`` / ``lines 42-45``

    Args:
        text: Input text

    Returns:
        Set of line numbers found (empty if there are none)
    """
    patterns = (
        r"\*\*Line(?:\(s\))?\*\*:\s*(\d+)(?:-(\d+))?",  # **Line(s)**: 42-45
        r":(\d+)(?:-(\d+))?",  # :42-45 (in paths)
        # \b keeps words that merely end in "line" ("deadline 3",
        # "outline 12") from being read as line references.
        r"\blines?\s+(\d+)(?:-(\d+))?",  # line 42 / lines 42-45
    )
    numbers: set[int] = set()
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            # An optional range-end group that did not take part in the
            # match is None and is skipped.
            numbers.update(int(group) for group in match.groups() if group)
    return numbers

185 

186 

def _matches_issue_type(
    finding_type: str,
    issue_path: Path,
    config: BRConfig,
    is_completed: bool,
) -> bool:
    """Check if finding type matches issue path using configured categories.

    Completed issues always match. An active issue matches when some
    configured category has ``prefix`` equal to ``finding_type`` and its
    ``dir`` appears in the path as a ``/<dir>/`` segment.

    Args:
        finding_type: The type of finding (e.g., 'BUG', 'ENH', 'FEAT')
        issue_path: Path to the issue file
        config: Configuration with category definitions
        is_completed: Whether the issue is in the completed directory

    Returns:
        True if the finding type matches the issue path's category
    """
    if is_completed:
        return True

    # Probe a forward-slash rendering of the path: str(Path) uses
    # backslashes on Windows, which would make the "/<dir>/" test below
    # fail for every category.
    posix_path = Path(issue_path).as_posix()
    return any(
        finding_type == category.prefix and f"/{category.dir}/" in posix_path
        for category in config.issues.categories.values()
    )