Coverage for src/tracekit/reporting/summary_generator.py: 96%

138 statements  

coverage.py v7.13.1, created at 2026-01-11 23:04 +0000

1"""Natural language summary generation for signal analysis. 

2 

3This module generates human-readable descriptions of measurements and analysis 

4results that avoid jargon and explain findings in accessible language. 

5 

6 

7Example: 

8 >>> from tracekit.reporting import generate_summary 

9 >>> trace = load("capture.wfm") 

10 >>> summary = generate_summary(trace) 

11 >>> print(summary.text) 

12 

13References: 

14 TraceKit Auto-Discovery Specification 

15""" 

16 

17from __future__ import annotations 

18 

19from dataclasses import dataclass, field 

20from typing import TYPE_CHECKING, Any 

21 

22import numpy as np 

23 

24if TYPE_CHECKING: 

25 from tracekit.core.types import WaveformTrace 

26 

27 

@dataclass
class Finding:
    """Individual analysis finding.

    Attributes:
        title: Short title for the finding.
        description: Plain language description.
        confidence: Confidence score (0.0-1.0).
        severity: Severity level (INFO, WARNING, CRITICAL).
    """

    title: str
    description: str
    confidence: float = 1.0
    severity: str = "INFO"


@dataclass
class Summary:
    """Natural language summary of signal analysis.

    Attributes:
        text: Complete summary text (2-3 sentences, 100-200 words).
        overview: High-level overview sentence.
        findings: List of key findings (minimum 3).
        recommendations: Actionable insights and next steps.
        word_count: Number of words in summary text.
        grade_level: Flesch-Kincaid grade level.
    """

    text: str
    overview: str
    findings: list[Finding] = field(default_factory=list)
    recommendations: list[str] = field(default_factory=list)
    word_count: int = 0
    grade_level: float = 0.0
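
# A quick illustration of the two containers above (illustrative comment,
# not part of the original module):
#
#     >>> f = Finding(title="Noise", description="High noise level detected")
#     >>> (f.confidence, f.severity)
#     (1.0, 'INFO')
#
# Summary behaves the same way: only text and overview are required, and the
# list fields get fresh empty lists via field(default_factory=list).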


def _estimate_grade_level(text: str) -> float:
    """Estimate Flesch-Kincaid grade level.

    Simple approximation based on sentence and word length.

    Args:
        text: Text to analyze.

    Returns:
        Estimated grade level.
    """
    # Split into sentences (simple split on period)
    sentences = [s.strip() for s in text.split(".") if s.strip()]
    if not sentences:
        return 0.0

    # Split into words
    words = text.split()
    if not words:  # coverage: 84 ↛ 85, condition was never true
        return 0.0

    # Count syllables (approximation: count vowel groups)
    total_syllables = 0
    for word in words:
        word_lower = word.lower()
        syllable_count = 0
        previous_was_vowel = False

        for char in word_lower:
            is_vowel = char in "aeiouy"
            if is_vowel and not previous_was_vowel:
                syllable_count += 1
            previous_was_vowel = is_vowel

        # Minimum 1 syllable per word
        total_syllables += max(1, syllable_count)

    # Flesch-Kincaid formula
    avg_words_per_sentence = len(words) / len(sentences)
    avg_syllables_per_word = total_syllables / len(words)

    grade_level = 0.39 * avg_words_per_sentence + 11.8 * avg_syllables_per_word - 15.59

    return max(0.0, grade_level)
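
# A worked check of the formula above (illustrative, not part of the module):
#
#     >>> _estimate_grade_level("The cat sat on the mat.")
#     0.0
#
# The splitter sees 1 sentence, 6 words, and 6 vowel groups, so the raw score
# is 0.39 * 6 + 11.8 * 1.0 - 15.59 = -1.45, which max() clamps to 0.0.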


def _characterize_signal_type(trace: WaveformTrace) -> tuple[str, float]:
    """Characterize basic signal type.

    Simple heuristic-based signal type detection.

    Args:
        trace: Waveform to analyze.

    Returns:
        Tuple of (signal_type, confidence).
    """
    data = trace.data.astype(np.float64)

    # Check if digital (only 2-3 distinct levels)
    unique_values = len(np.unique(np.round(data, decimals=2)))
    value_range = np.ptp(data)

    if unique_values <= 3 and value_range > 0.1:
        # Likely digital
        return "digital", 0.85
    elif value_range < 0.01:
        # Constant signal
        return "DC level", 0.90
    else:
        # Analog: check for periodicity
        if len(data) > 100:  # coverage: 138 ↛ 146, condition was always true
            autocorr = np.correlate(data - np.mean(data), data - np.mean(data), mode="full")
            autocorr = autocorr[len(autocorr) // 2 :]
            autocorr = autocorr / autocorr[0]

            # Look for peaks in autocorrelation
            if len(autocorr) > 10 and np.max(autocorr[10:]) > 0.5:
                return "periodic analog", 0.75
        return "analog", 0.70


def _assess_quality(trace: WaveformTrace) -> tuple[str, list[str]]:
    """Assess signal quality.

    Args:
        trace: Waveform to analyze.

    Returns:
        Tuple of (quality_level, issues).
    """
    data = trace.data.astype(np.float64)
    issues: list[str] = []

    # Check for sufficient data
    if len(data) < 100:
        issues.append("Very short capture (less than 100 samples)")

    # Check noise level (standard deviation relative to range)
    data_range = np.ptp(data)
    if data_range > 0:
        noise_ratio = np.std(data) / data_range
        if noise_ratio > 0.2:
            issues.append("High noise level detected")

    # Check for clipping
    if len(data) > 0:  # coverage: 173 ↛ 187, condition was always true
        data_min = np.min(data)
        data_max = np.max(data)

        # Check if many samples at min/max (possible clipping)
        at_min = np.sum(data == data_min)
        at_max = np.sum(data == data_max)

        if at_min > len(data) * 0.05:
            issues.append("Possible clipping at minimum level")
        if at_max > len(data) * 0.05:
            issues.append("Possible clipping at maximum level")

    # Determine quality level
    if not issues:
        quality = "excellent"
    elif len(issues) == 1:
        quality = "good"
    elif len(issues) == 2:
        quality = "fair"
    else:
        quality = "poor"

    return quality, issues
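
# Worked example of how the checks combine (illustrative; again any object
# with an array-valued .data attribute works for a quick test):
#
#     >>> _assess_quality(SimpleNamespace(data=np.zeros(50)))[0]
#     'poor'
#
# Every sample equals both the minimum and the maximum, so both clipping
# checks fire alongside the short-capture check: three issues, hence "poor".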


def _format_frequency(freq_hz: float) -> str:
    """Format frequency in human-readable form.

    Args:
        freq_hz: Frequency in Hz.

    Returns:
        Formatted string.
    """
    if freq_hz >= 1e9:
        return f"{freq_hz / 1e9:.1f} GHz"
    elif freq_hz >= 1e6:
        return f"{freq_hz / 1e6:.1f} MHz"
    elif freq_hz >= 1e3:
        return f"{freq_hz / 1e3:.1f} kHz"
    else:
        return f"{freq_hz:.1f} Hz"


def generate_summary(
    trace: WaveformTrace,
    *,
    context: dict[str, Any] | None = None,
    detail_level: str = "summary",
    max_words: int = 200,
    include_sections: list[str] | None = None,
) -> Summary:
    """Generate natural language summary of signal analysis.

    Creates a plain-English description of the signal and analysis results,
    avoiding technical jargon and explaining findings in accessible terms.

    Args:
        trace: Waveform to summarize.
        context: Optional analysis context (characterization, anomalies, etc.).
        detail_level: Summary detail level ("summary", "intermediate", "expert").
        max_words: Maximum word count for summary text.
        include_sections: Sections to include (default: all).

    Returns:
        Summary object with natural language description.

    Example:
        >>> trace = load("uart_signal.wfm")
        >>> summary = generate_summary(trace)
        >>> print(summary.text)
        This is a digital signal with two voltage levels...

    References:
        DISC-003: Natural Language Summaries
    """
    context = context or {}
    include_sections = include_sections or ["overview", "findings", "recommendations"]

    # Characterize signal type
    signal_type, type_confidence = _characterize_signal_type(trace)

    # Assess quality
    quality_level, quality_issues = _assess_quality(trace)

    # Build overview
    sample_rate = trace.metadata.sample_rate
    duration_ms = len(trace.data) / sample_rate * 1000

    overview = (
        f"This is a {signal_type} signal captured at {_format_frequency(sample_rate)} "
        f"sample rate for {duration_ms:.1f} milliseconds."
    )

    # Build findings
    findings: list[Finding] = []

    # Signal type finding
    findings.append(
        Finding(
            title="Signal Type",
            description=f"Identified as {signal_type}",
            confidence=type_confidence,
            severity="INFO",
        )
    )

    # Quality finding
    quality_desc = f"Signal quality is {quality_level}"
    if quality_issues:
        quality_desc += f" with {len(quality_issues)} issue(s) noted"

    findings.append(
        Finding(
            title="Signal Quality",
            description=quality_desc,
            confidence=0.85,
            severity="WARNING" if quality_issues else "INFO",
        )
    )

    # Voltage levels
    v_min = float(np.min(trace.data))
    v_max = float(np.max(trace.data))
    v_range = v_max - v_min

    findings.append(
        Finding(
            title="Voltage Range",
            description=f"Signal ranges from {v_min:.3f}V to {v_max:.3f}V (swing: {v_range:.3f}V)",
            confidence=1.0,
            severity="INFO",
        )
    )

    # Build recommendations
    recommendations: list[str] = []

    if "very short" in str(quality_issues).lower():
        recommendations.append("Capture a longer duration to enable more detailed analysis")

    if "noise" in str(quality_issues).lower():
        recommendations.append(
            "Check signal integrity and consider using better probes or shielding"
        )

    if "clipping" in str(quality_issues).lower():
        recommendations.append("Adjust voltage range to prevent signal clipping and data loss")

    if signal_type == "digital" and not recommendations:  # coverage: 320 ↛ 321, condition was never true
        recommendations.append("Signal appears clean and suitable for digital protocol analysis")
    elif signal_type in ["analog", "periodic analog"] and not recommendations:
        recommendations.append("Consider spectral analysis to identify frequency components")

    # Build complete summary text
    summary_parts = []

    if "overview" in include_sections:  # coverage: 328 ↛ 331, condition was always true
        summary_parts.append(overview)

    if "findings" in include_sections and findings:
        key_findings = findings[:3]  # Top 3 findings
        findings_text = " ".join(
            [f"{finding.title}: {finding.description}." for finding in key_findings]
        )
        summary_parts.append(findings_text)

    if "recommendations" in include_sections and recommendations:
        rec_text = "Recommended next steps: " + "; ".join(recommendations[:2]) + "."
        summary_parts.append(rec_text)

    full_text = " ".join(summary_parts)

    # Truncate to max_words if needed
    words = full_text.split()
    if len(words) > max_words:
        words = words[:max_words]
        full_text = " ".join(words) + "..."

    # Calculate statistics
    word_count = len(full_text.split())
    grade_level = _estimate_grade_level(full_text)

    return Summary(
        text=full_text,
        overview=overview,
        findings=findings,
        recommendations=recommendations,
        word_count=word_count,
        grade_level=grade_level,
    )
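
# End-to-end sketch (illustrative: a real WaveformTrace would normally come
# from load(); a SimpleNamespace stands in because generate_summary only
# touches .data and .metadata.sample_rate):
#
#     >>> trace = SimpleNamespace(
#     ...     data=np.sin(2 * np.pi * 50_000 * np.linspace(0, 1e-3, 10_000, endpoint=False)),
#     ...     metadata=SimpleNamespace(sample_rate=10_000_000.0),
#     ... )
#     >>> generate_summary(trace).overview
#     'This is a periodic analog signal captured at 10.0 MHz sample rate for 1.0 milliseconds.'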


__all__ = [
    "Finding",
    "Summary",
    "generate_summary",
]