Coverage for little_loops / issue_discovery / matching.py: 100%
72 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-05-22 16:19 -0500
« prev ^ index » next coverage.py v7.12.0, created at 2026-05-22 16:19 -0500
1"""Issue matching types and text similarity helpers."""
3from __future__ import annotations
5import re
6from dataclasses import dataclass, field
7from enum import Enum
8from pathlib import Path
9from typing import TYPE_CHECKING
11# Promoted to text_utils.py as public functions; aliased here for backward compat
12from little_loops.text_utils import calculate_word_overlap as _calculate_word_overlap # noqa: F401
13from little_loops.text_utils import extract_words as _extract_words # noqa: F401
15if TYPE_CHECKING:
16 from little_loops.config import BRConfig
19# =============================================================================
20# Enums
21# =============================================================================
class MatchClassification(Enum):
    """Outcome category assigned when a finding is compared to known issues.

    The last three members only apply when the finding matched a *completed*
    issue; they separate a fix that later broke from a fix that never worked
    and from a fix that cannot be assessed.
    """

    # Nothing on record matches the finding.
    NEW_ISSUE = "new_issue"
    # An active issue already covers the finding.
    DUPLICATE = "duplicate"
    # Completed issue whose files were modified AFTER the fix (fix broke).
    REGRESSION = "regression"
    # Completed issue whose files were NOT modified after the fix (never worked).
    INVALID_FIX = "invalid_fix"
    # Completed issue with no fix commit tracked (can't determine).
    UNVERIFIED = "unverified"
38# =============================================================================
39# Data Classes
40# =============================================================================
@dataclass
class RegressionEvidence:
    """Facts gathered to tell a regression apart from an invalid fix.

    Every field has a default, so an instance can be created empty and
    populated as evidence is collected.

    Attributes:
        fix_commit_sha: SHA of the commit that fixed the original issue,
            or None when no SHA is recorded.
        fix_commit_exists: Whether that commit is present in the current
            history.
        files_modified_since_fix: Files touched by the fix that were
            modified again after it.
        days_since_fix: Days elapsed since the fix was applied.
        related_commits: Commits that modified the relevant files after
            the fix.
    """

    fix_commit_sha: str | None = None
    fix_commit_exists: bool = True
    # default_factory gives each instance its own list.
    files_modified_since_fix: list[str] = field(default_factory=list)
    days_since_fix: int = 0
    related_commits: list[str] = field(default_factory=list)
@dataclass
class FindingMatch:
    """Outcome of comparing one finding against the existing issues.

    The ``should_*`` / ``is_*`` properties turn ``match_score`` into a
    decision by comparing it with the two thresholds stored on the instance.

    Attributes:
        issue_path: Path to the matched issue file, or None if no match.
        match_type: Type of match ("exact", "similar", "content", "none").
        match_score: Confidence score from 0.0 to 1.0.
        is_completed: Whether the matched issue is in completed/.
        matched_terms: Terms that matched (for debugging).
        classification: How this match is classified (regression,
            duplicate, etc.).
        regression_evidence: Evidence supporting a regression classification.
        exact_threshold: Score at or above which the finding is a duplicate
            and is skipped.
        similar_threshold: Score at or above which the finding should update
            an existing issue.
    """

    issue_path: Path | None
    match_type: str
    match_score: float
    is_completed: bool = False
    matched_terms: list[str] = field(default_factory=list)
    classification: MatchClassification = MatchClassification.NEW_ISSUE
    regression_evidence: RegressionEvidence | None = None
    exact_threshold: float = 0.8
    similar_threshold: float = 0.5

    def _reopens_with(self, wanted: MatchClassification) -> bool:
        """Return True if the reopen condition holds and classification is *wanted*."""
        return self.should_reopen and self.classification == wanted

    @property
    def should_skip(self) -> bool:
        """Whether the finding is a duplicate (score at or above exact_threshold)."""
        score = self.match_score
        return score >= self.exact_threshold

    @property
    def should_update(self) -> bool:
        """Whether the finding should update the existing issue.

        True when the score is at or above similar_threshold but still
        below exact_threshold.
        """
        lower, upper = self.similar_threshold, self.exact_threshold
        return lower <= self.match_score < upper

    @property
    def should_create(self) -> bool:
        """Whether a new issue should be created (score below similar_threshold)."""
        score = self.match_score
        return score < self.similar_threshold

    @property
    def should_reopen(self) -> bool:
        """Whether a completed issue should be reopened.

        Requires a completed issue and a score at or above similar_threshold.
        """
        return self.is_completed and self.match_score >= self.similar_threshold

    @property
    def should_reopen_as_regression(self) -> bool:
        """Whether the issue should be reopened as a regression.

        A regression means the fix was applied but later code changes broke it.
        """
        return self._reopens_with(MatchClassification.REGRESSION)

    @property
    def should_reopen_as_invalid_fix(self) -> bool:
        """Whether the issue should be reopened because the fix was invalid.

        An invalid fix means the original fix never actually resolved the issue.
        """
        return self._reopens_with(MatchClassification.INVALID_FIX)

    @property
    def is_unverified(self) -> bool:
        """Whether regression status cannot be determined.

        Unverified means the completed issue has no fix commit tracked, so it
        cannot be decided whether this is a regression or an invalid fix.
        """
        return self._reopens_with(MatchClassification.UNVERIFIED)
146# =============================================================================
147# Text Matching Helpers
148# =============================================================================
151def _normalize_text(text: str) -> str:
152 """Normalize text for comparison.
154 Args:
155 text: Input text
157 Returns:
158 Lowercase text with normalized whitespace
159 """
160 return re.sub(r"\s+", " ", text.lower().strip())
163def _extract_line_numbers(text: str) -> set[int]:
164 """Extract line numbers from text.
166 Args:
167 text: Input text
169 Returns:
170 Set of line numbers found
171 """
172 numbers: set[int] = set()
173 # Match line number patterns
174 patterns = [
175 r"\*\*Line(?:\(s\))?\*\*:\s*(\d+)(?:-(\d+))?", # **Line(s)**: 42-45
176 r":(\d+)(?:-(\d+))?", # :42-45 (in paths)
177 r"line\s+(\d+)", # line 42
178 ]
179 for pattern in patterns:
180 for match in re.finditer(pattern, text, re.IGNORECASE):
181 numbers.add(int(match.group(1)))
182 if match.lastindex and match.lastindex >= 2 and match.group(2):
183 numbers.add(int(match.group(2)))
184 return numbers
187def _matches_issue_type(
188 finding_type: str,
189 issue_path: Path,
190 config: BRConfig,
191 is_completed: bool,
192) -> bool:
193 """Check if finding type matches issue path using configured categories.
195 Args:
196 finding_type: The type of finding (e.g., 'BUG', 'ENH', 'FEAT')
197 issue_path: Path to the issue file
198 config: Configuration with category definitions
199 is_completed: Whether the issue is in the completed directory
201 Returns:
202 True if the finding type matches the issue path's category
203 """
204 if is_completed:
205 return True
207 path_str = str(issue_path)
208 for category in config.issues.categories.values():
209 if finding_type == category.prefix and f"/{category.dir}/" in path_str:
210 return True
211 return False