Coverage for src / tracekit / quality / scoring.py: 34%

199 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-11 23:04 +0000

1"""Analysis quality scoring for TraceKit. 

2 

3This module provides quality scoring and reliability categorization for 

4analysis results, enabling users to assess confidence in automated findings. 

5 

6 

7Example: 

8 >>> from tracekit.quality.scoring import AnalysisQualityScore, ReliabilityCategory 

9 >>> score = AnalysisQualityScore( 

10 ... confidence=0.85, 

11 ... category=ReliabilityCategory.HIGH, 

12 ... data_quality_factor=0.9, 

13 ... sample_sufficiency=0.8, 

14 ... method_reliability=0.85, 

15 ... ) 

16 >>> print(score.explain()) 

17 >>> recommendations = score.get_recommendations() 

18 

19References: 

20 - Quality scoring for automated analysis results 

21""" 

22 

23from __future__ import annotations 

24 

25import logging 

26from dataclasses import dataclass, field 

27from enum import Enum 

28from typing import TYPE_CHECKING, Any 

29 

30import numpy as np 

31 

32if TYPE_CHECKING: 

33 from numpy.typing import NDArray 

34 

35logger = logging.getLogger(__name__) 

36 

37 

class ReliabilityCategory(Enum):
    """Reliability categories for analysis results.

    Attributes:
        HIGH: Result is highly reliable (confidence >= 0.8)
        MEDIUM: Result has moderate reliability (0.6 <= confidence < 0.8)
        LOW: Result has low reliability (0.4 <= confidence < 0.6)
        UNRELIABLE: Result is unreliable (confidence < 0.4)
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    UNRELIABLE = "unreliable"

    @classmethod
    def from_confidence(cls, confidence: float) -> ReliabilityCategory:
        """Map a confidence value onto its reliability category.

        Args:
            confidence: Confidence value in range [0, 1]

        Returns:
            Appropriate ReliabilityCategory
        """
        # Thresholds are checked from highest to lowest; first match wins.
        for threshold, category in (
            (0.8, cls.HIGH),
            (0.6, cls.MEDIUM),
            (0.4, cls.LOW),
        ):
            if confidence >= threshold:
                return category
        return cls.UNRELIABLE

71 

72 

@dataclass
class AnalysisQualityScore:
    """Quality score for an analysis result.

    Attributes:
        confidence: Overall confidence in result (0-1)
        category: Reliability category
        data_quality_factor: Quality of input data (0-1)
        sample_sufficiency: Sufficiency of sample count (0-1)
        method_reliability: Inherent reliability of method (0-1)
        factors: Additional contributing factors
        warnings: Quality warnings
        metadata: Additional metadata

    Example:
        >>> score = AnalysisQualityScore(
        ...     confidence=0.85,
        ...     category=ReliabilityCategory.HIGH,
        ...     data_quality_factor=0.9,
        ...     sample_sufficiency=0.8,
        ...     method_reliability=0.85,
        ... )
        >>> if score.is_reliable:
        ...     print("Result is reliable")
    """

    confidence: float
    category: ReliabilityCategory
    data_quality_factor: float
    sample_sufficiency: float
    method_reliability: float
    factors: dict[str, float] = field(default_factory=dict)
    warnings: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate that all score components lie in [0, 1].

        Raises:
            ValueError: If confidence or any component factor is out of range.
        """
        # Ensure confidence is in valid range
        if not 0 <= self.confidence <= 1:
            raise ValueError(f"Confidence must be in [0, 1], got {self.confidence}")

        # Validate the three named component factors
        for name, value in [
            ("data_quality_factor", self.data_quality_factor),
            ("sample_sufficiency", self.sample_sufficiency),
            ("method_reliability", self.method_reliability),
        ]:
            if not 0 <= value <= 1:
                raise ValueError(f"{name} must be in [0, 1], got {value}")

    @property
    def is_reliable(self) -> bool:
        """Check if result is reliable (medium confidence or higher).

        Returns:
            True if category is HIGH or MEDIUM
        """
        return self.category in (ReliabilityCategory.HIGH, ReliabilityCategory.MEDIUM)

    def _has_warning(self, keyword: str) -> bool:
        """Return True if any warning message contains *keyword* (case-insensitive)."""
        return any(keyword in warning.lower() for warning in self.warnings)

    def explain(self, include_factors: bool = True) -> str:
        """Generate human-readable explanation of the quality score.

        Args:
            include_factors: Whether to include factor breakdown

        Returns:
            Human-readable explanation string

        Example:
            >>> print(score.explain())
            ✓ High confidence result (85.0%)

            Contributing factors:
              ✓ Data Quality Factor: 90.0%
              ✓ Sample Sufficiency: 80.0%
              ✓ Method Reliability: 85.0%
        """
        lines = []

        # Overall assessment, one symbol per category
        if self.category == ReliabilityCategory.HIGH:
            lines.append(f"✓ High confidence result ({self.confidence:.1%})")
        elif self.category == ReliabilityCategory.MEDIUM:
            lines.append(f"◐ Medium confidence result ({self.confidence:.1%})")
        elif self.category == ReliabilityCategory.LOW:
            lines.append(f"◯ Low confidence result ({self.confidence:.1%})")
        else:
            lines.append(f"✗ Unreliable result ({self.confidence:.1%})")

        # Factor breakdown (symbol thresholds: >=0.7 good, >=0.4 middling)
        if include_factors and self.factors:
            lines.append("\nContributing factors:")
            for factor_name, factor_value in sorted(self.factors.items()):
                status = "✓" if factor_value >= 0.7 else "◐" if factor_value >= 0.4 else "✗"
                lines.append(
                    f"  {status} {factor_name.replace('_', ' ').title()}: {factor_value:.1%}"
                )

        # Warnings
        if self.warnings:
            lines.append("\nWarnings:")
            for warning in self.warnings:
                lines.append(warning)

        return "\n".join(lines)

    def get_recommendations(self) -> list[str]:
        """Get actionable recommendations to improve result quality.

        Returns:
            List of recommendation strings

        Example:
            >>> recommendations = score.get_recommendations()
            >>> for rec in recommendations:
            ...     print(rec)
            Consider improving input signal quality (filtering, averaging)
        """
        recommendations = []

        if self.data_quality_factor < 0.5:
            recommendations.append("Consider improving input signal quality (filtering, averaging)")

        if self.sample_sufficiency < 0.5:
            recommendations.append("Capture more data points for reliable analysis")

        # Keyword checks scan the individual warning strings rather than
        # str()-ifying the whole list, which was fragile.
        if self._has_warning("snr"):
            recommendations.append("Use a bandpass filter to improve SNR")

        if self._has_warning("clipping"):
            recommendations.append("Adjust input gain to avoid signal clipping")

        if not recommendations:
            recommendations.append("Result quality is acceptable")

        return recommendations

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization.

        Returns:
            Dictionary representation of quality score.
        """
        return {
            "confidence": self.confidence,
            "category": self.category.value,
            "is_reliable": self.is_reliable,
            "data_quality_factor": self.data_quality_factor,
            "method_reliability": self.method_reliability,
            "sample_sufficiency": self.sample_sufficiency,
            "factors": self.factors,
            "warnings": self.warnings,
            "metadata": self.metadata,
        }

227 

228 

def calculate_quality_score(
    data_quality_factor: float,
    sample_sufficiency: float,
    method_reliability: float,
    *,
    weights: tuple[float, float, float] | None = None,
    warnings: list[str] | None = None,
    factors: dict[str, float] | None = None,
    metadata: dict[str, Any] | None = None,
) -> AnalysisQualityScore:
    """Calculate overall quality score from component factors.

    Args:
        data_quality_factor: Quality of input data (0-1)
        sample_sufficiency: Sufficiency of sample count (0-1)
        method_reliability: Inherent reliability of method (0-1)
        weights: Optional custom weights (data, sample, method), defaults to
            (0.4, 0.3, 0.3). Weights are normalized by their sum, so any
            positive weighting is accepted.
        warnings: Optional quality warnings
        factors: Optional additional factors
        metadata: Optional metadata

    Returns:
        AnalysisQualityScore with computed confidence

    Raises:
        ValueError: If the weights sum to zero or a negative value.

    Example:
        >>> score = calculate_quality_score(
        ...     data_quality_factor=0.9,
        ...     sample_sufficiency=0.8,
        ...     method_reliability=0.85,
        ... )
        >>> print(f"Confidence: {score.confidence:.1%}")
    """
    if weights is None:
        weights = (0.4, 0.3, 0.3)

    w_data, w_sample, w_method = weights

    # Normalize so the confidence is a true weighted average in [0, 1] even
    # when callers pass weights that do not sum to exactly 1 (previously such
    # weights silently scaled the confidence, or tripped the dataclass range
    # validation when the sum exceeded 1).
    total = w_data + w_sample + w_method
    if total <= 0:
        raise ValueError(f"Weights must sum to a positive value, got {total}")

    confidence = (
        w_data * data_quality_factor
        + w_sample * sample_sufficiency
        + w_method * method_reliability
    ) / total

    # Determine category
    category = ReliabilityCategory.from_confidence(confidence)

    # Build factors dictionary; caller-supplied entries may extend/override
    all_factors = {
        "data_quality_factor": data_quality_factor,
        "sample_sufficiency": sample_sufficiency,
        "method_reliability": method_reliability,
    }
    if factors:
        all_factors.update(factors)

    return AnalysisQualityScore(
        confidence=confidence,
        category=category,
        data_quality_factor=data_quality_factor,
        sample_sufficiency=sample_sufficiency,
        method_reliability=method_reliability,
        factors=all_factors,
        warnings=warnings or [],
        metadata=metadata or {},
    )

293 

294 

@dataclass
class DataQualityMetrics:
    """Metrics describing input data quality.

    QUAL-002: Data quality assessment

    Attributes:
        snr_db: Signal-to-noise ratio in decibels
        sample_count: Number of samples in the data
        has_clipping: Whether the signal shows clipping
        has_saturation: Whether the signal shows saturation
        noise_floor: Estimated noise floor level
        completeness: Fraction of non-NaN values (0-1)
    """

    snr_db: float | None = None
    sample_count: int = 0
    has_clipping: bool = False
    has_saturation: bool = False
    noise_floor: float | None = None
    completeness: float = 1.0  # Fraction of non-NaN values

    def to_factor(self) -> float:
        """Convert metrics to single quality factor (0-1).

        Returns:
            Quality factor between 0 and 1.
        """
        components: list[float] = []

        # SNR contributes linearly up to 40 dB, clamped into [0, 1].
        if self.snr_db is not None:
            components.append(min(1.0, max(0.0, self.snr_db / 40.0)))

        # Sample count on a log10 scale: full credit at 10^4 samples.
        count = max(1, self.sample_count)
        components.append(min(1.0, np.log10(count) / 4.0))

        # Clipping/saturation act as fixed penalty terms in the average.
        if self.has_clipping:
            components.append(0.7)
        if self.has_saturation:
            components.append(0.6)

        # Completeness contributes directly.
        components.append(self.completeness)

        # Fallback kept for safety, though components is never empty here.
        return float(np.mean(components)) if components else 0.5

344 

345 

# Method reliability scores (based on algorithm characteristics)
#: Method reliability tracking
# Baseline reliability assigned per analysis method; score_analysis_result()
# looks methods up here and falls back to 0.7 for unknown names.
METHOD_RELIABILITY: dict[str, float] = {
    # High reliability methods
    "fft": 0.95,
    "welch": 0.90,
    "autocorrelation": 0.85,
    "histogram": 0.95,
    "statistics": 0.95,
    # Medium reliability methods
    "edge_detection": 0.80,
    "zero_crossing": 0.75,
    "peak_detection": 0.70,
    "pattern_matching": 0.75,
    # Lower reliability methods (heuristic-based)
    "protocol_inference": 0.60,
    "signal_classification": 0.65,
    "anomaly_detection": 0.60,
}

365 

366 

def assess_data_quality(
    data: NDArray[np.float64], sample_rate: float | None = None
) -> DataQualityMetrics:
    """Assess quality of input data.

    QUAL-002: Data quality assessment

    Args:
        data: Input data array
        sample_rate: Sample rate in Hz (optional; not used by the current
            heuristics, kept for interface compatibility)

    Returns:
        DataQualityMetrics with quality assessment
    """
    metrics = DataQualityMetrics()

    try:
        # Sample count
        metrics.sample_count = len(data)

        # Empty input: nothing to assess. Report zero completeness instead of
        # letting np.mean over an empty mask produce NaN.
        if metrics.sample_count == 0:
            metrics.completeness = 0.0
            return metrics

        # Fraction of finite (non-NaN/Inf) values
        valid_mask = np.isfinite(data)
        metrics.completeness = float(np.mean(valid_mask))

        # Too little valid data to compute meaningful statistics
        if metrics.completeness < 0.01:
            return metrics

        valid_data = data[valid_mask]

        # Clipping heuristic: more than 1% of samples pinned at either extreme
        data_range = np.ptp(valid_data)
        if data_range > 0:
            min_count = np.sum(valid_data == np.min(valid_data))
            max_count = np.sum(valid_data == np.max(valid_data))
            clip_threshold = 0.01 * len(valid_data)
            metrics.has_clipping = min_count > clip_threshold or max_count > clip_threshold

        # Robust noise estimate: median absolute deviation scaled by 1.4826
        # (consistency factor relating MAD to std dev for Gaussian noise)
        median = np.median(valid_data)
        mad = np.median(np.abs(valid_data - median)) * 1.4826
        metrics.noise_floor = float(mad)

        # SNR from signal variance vs the robust noise power estimate
        signal_power = float(np.var(valid_data))
        noise_power = mad**2

        if noise_power > 0:
            snr_linear = signal_power / noise_power
            # 1e-10 floor keeps log10 finite for vanishing signal power
            metrics.snr_db = float(10 * np.log10(max(1e-10, snr_linear)))

    except Exception as e:
        # Best-effort assessment: a quality check must never crash analysis.
        logger.debug("Error assessing data quality: %s", e)

    return metrics

421 

422 

def score_analysis_result(
    result: Any,
    method_name: str,
    data: NDArray[np.float64] | None = None,
    data_quality: DataQualityMetrics | None = None,
    min_samples: int = 10,
) -> AnalysisQualityScore:
    """Score the quality of an analysis result.

    QUAL-001: Quality scoring foundation

    Args:
        result: The analysis result to score
        method_name: Name of the analysis method (dotted paths allowed,
            e.g. "tracekit.analysis.fft")
        data: Input data (for quality assessment)
        data_quality: Pre-computed data quality metrics
        min_samples: Minimum samples for reliable result

    Returns:
        AnalysisQualityScore with confidence and factors
    """
    factors: dict[str, float] = {}
    warnings: list[str] = []

    # Compute data quality from raw data if metrics were not supplied
    if data_quality is None and data is not None:
        data_quality = assess_data_quality(data)

    # Data quality factor (neutral 0.5 when no data is available)
    if data_quality is not None:
        data_factor = data_quality.to_factor()

        if data_quality.has_clipping:
            warnings.append("Input data shows clipping")
        if data_quality.snr_db is not None and data_quality.snr_db < 20:
            warnings.append(f"Low SNR ({data_quality.snr_db:.1f} dB)")
    else:
        data_factor = 0.5
    factors["data_quality"] = data_factor

    # Method reliability: exact lookup on the last dotted component first.
    # Fix: the previous code stripped underscores from the key, so exact
    # lookups for multi-word methods (e.g. "edge_detection") never matched.
    method_lower = method_name.lower()
    method_key = method_lower.rsplit(".", 1)[-1]
    method_reliability = METHOD_RELIABILITY.get(method_key, 0.7)

    # Substring fallback only when the exact lookup missed, so an exact
    # match can no longer be overridden (e.g. "run_fft_analysis" -> "fft")
    if method_key not in METHOD_RELIABILITY:
        for key, reliability in METHOD_RELIABILITY.items():
            if key in method_lower:
                method_reliability = reliability
                break

    factors["method_reliability"] = method_reliability

    # Sample sufficiency: full credit at 10x the minimum sample count
    if data_quality is not None:
        sample_sufficiency = min(1.0, data_quality.sample_count / (min_samples * 10))
        if data_quality.sample_count < min_samples:
            warnings.append(f"Insufficient samples ({data_quality.sample_count} < {min_samples})")
    else:
        sample_sufficiency = 0.5
    factors["sample_sufficiency"] = sample_sufficiency

    # Result-specific validity (NaN/Inf values, empty containers, ...)
    result_factor = _score_result_value(result)
    factors["result_validity"] = result_factor

    # Fixed weighting of the four contributing factors
    confidence = (
        data_factor * 0.3
        + method_reliability * 0.25
        + sample_sufficiency * 0.25
        + result_factor * 0.2
    )

    # Determine category from confidence
    category = ReliabilityCategory.from_confidence(confidence)

    return AnalysisQualityScore(
        confidence=confidence,
        category=category,
        data_quality_factor=data_factor,
        method_reliability=method_reliability,
        sample_sufficiency=sample_sufficiency,
        factors=factors,
        warnings=warnings,
    )

509 

510 

511def _score_result_value(result: Any) -> float: 

512 """Score result validity based on value characteristics. 

513 

514 Args: 

515 result: Analysis result to score. 

516 

517 Returns: 

518 Validity score between 0 and 1. 

519 """ 

520 if result is None: 

521 return 0.0 

522 

523 # Handle numeric results 

524 if isinstance(result, int | float): 

525 if np.isnan(result) or np.isinf(result): 

526 return 0.0 

527 return 1.0 

528 

529 # Handle array results 

530 if isinstance(result, np.ndarray): 

531 valid_ratio = np.mean(np.isfinite(result)) 

532 return float(valid_ratio) 

533 

534 # Handle dict results 

535 if isinstance(result, dict): 

536 if not result: 

537 return 0.3 

538 return 1.0 

539 

540 # Handle list results 

541 if isinstance(result, list): 

542 if not result: 

543 return 0.3 

544 return 1.0 

545 

546 return 0.7 # Default for other types 

547 

548 

def combine_quality_scores(
    scores: list[AnalysisQualityScore],
    weights: list[float] | None = None,
) -> AnalysisQualityScore:
    """Combine multiple quality scores into one weighted aggregate.

    Args:
        scores: List of quality scores to combine
        weights: Optional weights for each score (defaults to equal weights)

    Returns:
        Combined quality score

    Raises:
        ValueError: If the weights sum to zero or a negative value, or if
            the number of weights does not match the number of scores.
    """
    # No inputs: return an explicitly unreliable score rather than failing
    if not scores:
        return AnalysisQualityScore(
            confidence=0.0,
            category=ReliabilityCategory.UNRELIABLE,
            data_quality_factor=0.0,
            method_reliability=0.0,
            sample_sufficiency=0.0,
        )

    if weights is None:
        weights = [1.0] * len(scores)

    total_weight = sum(weights)
    # Guard against ZeroDivisionError (and nonsensical negative totals)
    if total_weight <= 0:
        raise ValueError(f"Weights must sum to a positive value, got {total_weight}")

    def _weighted_mean(values: list[float]) -> float:
        # zip(strict=True) raises if len(weights) != len(scores)
        return sum(v * w for v, w in zip(values, weights, strict=True)) / total_weight

    combined_confidence = _weighted_mean([s.confidence for s in scores])
    combined_data = _weighted_mean([s.data_quality_factor for s in scores])
    combined_method = _weighted_mean([s.method_reliability for s in scores])
    combined_samples = _weighted_mean([s.sample_sufficiency for s in scores])

    # Deduplicate warnings while preserving first-seen order (the previous
    # list(set(...)) round-trip made the order nondeterministic)
    unique_warnings = list(
        dict.fromkeys(w for score in scores for w in score.warnings)
    )

    # Determine category
    category = ReliabilityCategory.from_confidence(combined_confidence)

    return AnalysisQualityScore(
        confidence=combined_confidence,
        category=category,
        data_quality_factor=combined_data,
        method_reliability=combined_method,
        sample_sufficiency=combined_samples,
        warnings=unique_warnings,
    )

605 

606 

# Explicit public API of this module.
__all__ = [
    "METHOD_RELIABILITY",
    "AnalysisQualityScore",
    "DataQualityMetrics",
    "ReliabilityCategory",
    "assess_data_quality",
    "calculate_quality_score",
    "combine_quality_scores",
    "score_analysis_result",
]