Coverage for src/tracekit/core/cache.py: 98%

189 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-11 23:04 +0000

1"""Persistent LRU cache with disk spillover for TraceKit intermediate results. 

2 

3This module provides memory-bounded caching for expensive operations like FFT, 

4spectrograms, and filtered traces with automatic disk spillover when memory 

5limits are exceeded. 

6 

7 

8Example: 

9 >>> from tracekit.core.cache import TraceKitCache, get_cache 

10 >>> cache = get_cache(max_memory="2GB") 

11 >>> result = cache.get_or_compute("fft_key", compute_fft, signal, 1024) 

12 >>> cache.show_stats() 

13 Cache Statistics: 42 hits, 15 misses (73.7% hit rate) 

14 

15References: 

16 Python functools.lru_cache 

17 Python pickle for serialization 

18""" 

19 

20from __future__ import annotations 

21 

22import contextlib 

23import hashlib 

24import pickle 

25import tempfile 

26import threading 

27import time 

28from collections import OrderedDict 

29from dataclasses import dataclass 

30from pathlib import Path 

31from typing import TYPE_CHECKING, Any, TypeVar 

32 

33import numpy as np 

34 

35if TYPE_CHECKING: 

36 from collections.abc import Callable 

37 

38 

39T = TypeVar("T") 

40 

41 

@dataclass
class CacheEntry:
    """One cached value together with its bookkeeping metadata.

    An entry is either resident (``in_memory`` True, ``value`` holds the
    object) or spilled (``in_memory`` False, ``value`` is None and
    ``disk_path`` points at the serialized copy).
    """

    key: str  # cache key (typically a hash of the inputs)
    value: Any | None  # cached object, or None once spilled to disk
    disk_path: Path | None  # pickle file location if spilled, else None
    size_bytes: int  # estimated size of the value in bytes
    created_at: float  # creation timestamp (time.time())
    last_accessed: float  # timestamp of the most recent access
    access_count: int  # number of times this entry has been read
    in_memory: bool  # True while the value is held in memory

65 

66 

@dataclass
class CacheStats:
    """Snapshot of the cache's performance counters.

    Attributes:
        hits: Number of cache hits.
        misses: Number of cache misses.
        evictions: Number of entries evicted.
        disk_spills: Number of entries spilled to disk.
        current_memory: Current memory usage (bytes).
        current_entries: Number of entries in cache.
        disk_entries: Number of entries on disk.
    """

    hits: int
    misses: int
    evictions: int
    disk_spills: int
    current_memory: int
    current_entries: int
    disk_entries: int

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from cache, in [0.0, 1.0].

        Returns 0.0 when no lookups have happened yet.
        """
        lookups = self.hits + self.misses
        if lookups == 0:
            return 0.0
        return self.hits / lookups

    def __str__(self) -> str:
        """Render the counters as a multi-line, human-readable report."""
        report = [
            "Cache Statistics:",
            f" Hits: {self.hits}",
            f" Misses: {self.misses}",
            f" Hit Rate: {self.hit_rate * 100:.1f}%",
            f" Evictions: {self.evictions}",
            f" Disk Spills: {self.disk_spills}",
            f" Memory Usage: {self.current_memory / 1e9:.2f} GB",
            f" Entries (Memory): {self.current_entries}",
            f" Entries (Disk): {self.disk_entries}",
        ]
        return "\n".join(report) + "\n"

109 

110 

class TraceKitCache:
    """LRU cache with disk spillover for intermediate results.

    Caches expensive computation results with automatic memory management.
    When the memory limit is exceeded, least-recently-used entries have
    their values spilled to disk; the entries stay in the cache and are
    transparently reloaded on the next hit. Disk files are removed on
    context-manager exit when ``auto_cleanup`` is enabled.

    Args:
        max_memory: Maximum memory for cache (bytes or string like "2GB").
        cache_dir: Directory for disk cache (default: <tmpdir>/tracekit_cache).
        auto_cleanup: Clean up disk cache on exit (default: True).

    Example:
        >>> cache = TraceKitCache(max_memory="1GB")
        >>> result = cache.get_or_compute("key", expensive_func, arg1, arg2)
        >>> stats = cache.get_stats()
        >>> cache.clear()

    Security Note:
        Cache files use pickle serialization; unpickling a tampered file
        can execute arbitrary code. Keep the cache directory in a secure
        location with appropriate permissions and do not share it across
        security boundaries. The cache is intended for single-user, local
        computation only.

    References:
        MEM-031: Persistent Cache (Disk-Based)
        MEM-029: LRU Cache for Intermediate Results
    """

    def __init__(
        self,
        max_memory: int | str = "2GB",
        *,
        cache_dir: str | Path | None = None,
        auto_cleanup: bool = True,
    ):
        """Initialize cache.

        Args:
            max_memory: Maximum memory (bytes or string like "500MB").
            cache_dir: Directory for disk cache.
            auto_cleanup: Clean up on context-manager exit.
        """
        # Parse max_memory
        if isinstance(max_memory, str):
            self.max_memory = self._parse_memory_string(max_memory)
        else:
            self.max_memory = int(max_memory)

        # Set up cache directory
        if cache_dir is None:
            self.cache_dir = Path(tempfile.gettempdir()) / "tracekit_cache"
        else:
            self.cache_dir = Path(cache_dir)

        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.auto_cleanup = auto_cleanup

        # LRU order via OrderedDict: first item = least recently used.
        self._cache: OrderedDict[str, CacheEntry] = OrderedDict()

        # Reentrant lock: public methods take it, then call locking helpers (MEM-031).
        self._lock = threading.RLock()

        # Statistics. _current_memory counts ONLY in-memory values; spilled
        # entries contribute nothing until reloaded.
        self._hits = 0
        self._misses = 0
        self._evictions = 0
        self._disk_spills = 0
        self._current_memory = 0

    def __enter__(self) -> TraceKitCache:
        """Enter context."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:  # type: ignore[no-untyped-def]
        """Exit context and clean up if enabled."""
        # exc_* intentionally unused; signature required by the protocol.
        if self.auto_cleanup:
            self.clear()

    def get(self, key: str) -> Any | None:
        """Get value from cache.

        Args:
            key: Cache key.

        Returns:
            Cached value or None if not found.

        Example:
            >>> value = cache.get("my_key")
            >>> if value is None:
            ...     value = compute_value()
            ...     cache.put("my_key", value)

        References:
            MEM-031: Persistent Cache (Disk-Based)
        """
        with self._lock:
            if key not in self._cache:
                self._misses += 1
                return None

            # Cache hit
            self._hits += 1
            entry = self._cache[key]

            # Update access metadata and promote to most-recently-used.
            entry.last_accessed = time.time()
            entry.access_count += 1
            self._cache.move_to_end(key)

            if not entry.in_memory:
                # Reload from disk; this may push us over the memory limit.
                # NOTE(review): if the disk file was removed externally this
                # raises FileNotFoundError; the cache dir is assumed private.
                entry.value = self._load_from_disk(entry.disk_path)  # type: ignore[arg-type]
                entry.in_memory = True
                self._current_memory += entry.size_bytes

                # Capture the value BEFORE rebalancing: if this entry alone
                # exceeds the budget, _ensure_memory_limit() spills it again
                # and sets entry.value to None — previously a hit could
                # return None here.
                value = entry.value
                self._ensure_memory_limit()
                return value

            return entry.value

    def put(self, key: str, value: Any) -> None:
        """Put value in cache.

        Args:
            key: Cache key.
            value: Value to cache.

        Example:
            >>> cache.put("result_key", computed_result)

        References:
            MEM-031: Persistent Cache (Disk-Based)
        """
        # Calculate size outside lock (pickling can be expensive).
        size_bytes = self._estimate_size(value)

        with self._lock:
            # Remove old entry if it exists.
            old_entry = self._cache.pop(key, None)
            if old_entry is not None:
                # Only in-memory entries are counted in _current_memory;
                # subtracting for a spilled entry would drive the counter
                # negative and let the cache exceed its limit.
                if old_entry.in_memory:
                    self._current_memory -= old_entry.size_bytes
                if old_entry.disk_path is not None and old_entry.disk_path.exists():
                    with contextlib.suppress(OSError):
                        old_entry.disk_path.unlink()

            now = time.time()
            entry = CacheEntry(
                key=key,
                value=value,
                disk_path=None,
                size_bytes=size_bytes,
                created_at=now,
                last_accessed=now,
                access_count=0,
                in_memory=True,
            )

            self._cache[key] = entry
            self._current_memory += size_bytes

            self._ensure_memory_limit()

    def get_or_compute(
        self,
        key: str,
        compute_fn: Callable[..., T],
        *args: Any,
        **kwargs: Any,
    ) -> T:
        """Get cached value or compute and cache it.

        Args:
            key: Cache key.
            compute_fn: Function to compute value if not cached.
            args: Arguments to compute_fn.
            kwargs: Keyword arguments to compute_fn.

        Returns:
            Cached or computed value.

        Note:
            None is the miss sentinel, so a computed value of None is
            stored but never served from cache — it is recomputed on
            every call.

        Example:
            >>> result = cache.get_or_compute("fft_key", np.fft.fft, signal)

        References:
            MEM-029: LRU Cache for Intermediate Results
        """
        value = self.get(key)
        if value is not None:
            return value  # type: ignore[no-any-return]

        value = compute_fn(*args, **kwargs)
        self.put(key, value)
        return value

    def clear(self) -> None:
        """Clear all cache entries and disk files.

        Example:
            >>> cache.clear()

        References:
            MEM-031: Persistent Cache (Disk-Based)
        """
        with self._lock:
            # Remove disk files (best-effort).
            for entry in self._cache.values():
                if entry.disk_path and entry.disk_path.exists():
                    with contextlib.suppress(OSError):
                        entry.disk_path.unlink()

            self._cache.clear()
            self._current_memory = 0

        # Remove the cache directory if it is empty (best-effort).
        try:
            if self.cache_dir.exists() and not any(self.cache_dir.iterdir()):
                self.cache_dir.rmdir()
        except OSError:
            pass

    def get_stats(self) -> CacheStats:
        """Get cache statistics.

        Returns:
            CacheStats snapshot of the current counters.

        Example:
            >>> stats = cache.get_stats()
            >>> print(f"Hit rate: {stats.hit_rate * 100:.1f}%")

        References:
            MEM-031: Persistent Cache (Disk-Based)
        """
        with self._lock:
            disk_entries = sum(1 for e in self._cache.values() if not e.in_memory)
            return CacheStats(
                hits=self._hits,
                misses=self._misses,
                evictions=self._evictions,
                disk_spills=self._disk_spills,
                current_memory=self._current_memory,
                current_entries=len(self._cache),
                disk_entries=disk_entries,
            )

    def show_stats(self) -> None:
        """Print cache statistics.

        Example:
            >>> cache.show_stats()
            Cache Statistics: 42 hits, 15 misses (73.7% hit rate)

        References:
            MEM-031: Persistent Cache (Disk-Based)
        """
        print(self.get_stats())

    def compute_key(self, *args: Any, **kwargs: Any) -> str:
        """Compute cache key from arguments.

        Creates a SHA-256 hash key from arbitrary arguments for cache
        lookups. Keyword arguments are hashed in sorted order so the key
        is independent of call-site ordering.

        Args:
            args: Positional arguments.
            kwargs: Keyword arguments.

        Returns:
            Hex digest key string.

        Example:
            >>> key = cache.compute_key("operation", param1=10, param2="value")

        References:
            MEM-029: LRU Cache for Intermediate Results
        """
        hash_obj = hashlib.sha256()

        for arg in args:
            hash_obj.update(self._make_hashable(arg))

        for k in sorted(kwargs.keys()):
            hash_obj.update(k.encode())
            hash_obj.update(self._make_hashable(kwargs[k]))

        return hash_obj.hexdigest()

    def _ensure_memory_limit(self) -> None:
        """Spill least-recently-used in-memory values until within limit.

        Entries are demoted to disk rather than destroyed: previously,
        disk-only entries were popped from the cache and their files
        orphaned, and an entry that already had a disk copy was silently
        dropped. An entry with an existing disk copy is demoted without
        re-serializing.
        """
        if self._current_memory <= self.max_memory:
            return

        # Iterate in LRU -> MRU order; spilling never reorders the dict.
        for key in list(self._cache):
            if self._current_memory <= self.max_memory:
                break
            entry = self._cache[key]
            if not entry.in_memory:
                # Already on disk: occupies no memory, nothing to free.
                continue
            if entry.disk_path is None:
                entry.disk_path = self._spill_to_disk(key, entry.value)
                self._disk_spills += 1
            entry.value = None
            entry.in_memory = False
            self._current_memory -= entry.size_bytes
            self._evictions += 1

    def _spill_to_disk(self, key: str, value: Any) -> Path:
        """Serialize value to a file in the cache directory.

        The filename is the SHA-256 digest of the key, so arbitrary
        user-supplied keys (including ones containing path separators)
        are always filesystem-safe.

        Args:
            key: Cache key.
            value: Value to write.

        Returns:
            Path to the created pickle file.
        """
        fname = hashlib.sha256(key.encode()).hexdigest()
        disk_path = self.cache_dir / f"{fname}.pkl"
        with open(disk_path, "wb") as f:
            pickle.dump(value, f, protocol=pickle.HIGHEST_PROTOCOL)
        return disk_path

    def _load_from_disk(self, disk_path: Path) -> Any:
        """Load a previously spilled value from disk.

        Args:
            disk_path: Path to the pickle file.

        Returns:
            Deserialized value.
        """
        # Trusted input only: this file was written by _spill_to_disk in a
        # directory assumed private (see class Security Note).
        with open(disk_path, "rb") as f:
            return pickle.load(f)

    def _estimate_size(self, value: Any) -> int:
        """Estimate size of value in bytes.

        ndarray uses nbytes; containers are summed recursively; anything
        else falls back to its pickled length (1024 if unpicklable).
        """
        if isinstance(value, np.ndarray):
            return value.nbytes  # type: ignore[no-any-return]
        elif isinstance(value, list | tuple):
            return sum(self._estimate_size(item) for item in value)
        elif isinstance(value, dict):
            return sum(self._estimate_size(k) + self._estimate_size(v) for k, v in value.items())
        else:
            try:
                return len(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL))
            except (TypeError, pickle.PicklingError):
                return 1024  # Default estimate for unpicklable objects

    def _make_hashable(self, obj: Any) -> bytes:
        """Convert an object to bytes for hashing.

        Includes a dtype/shape header for arrays and delimiters for
        sequences so that structurally different inputs (e.g. the same
        buffer reshaped, or ["ab"] vs ["a", "b"]) do not collide.
        """
        if isinstance(obj, np.ndarray):
            header = f"nd:{obj.dtype}:{obj.shape}:".encode()
            return header + obj.tobytes()
        elif isinstance(obj, str | bytes):
            data = obj.encode() if isinstance(obj, str) else obj
            return b"s:" + data
        elif isinstance(obj, int | float | bool):
            return str(obj).encode()
        elif isinstance(obj, list | tuple):
            parts = [self._make_hashable(item) for item in obj]
            return b"(" + b"|".join(parts) + b")"
        else:
            # Fallback: string representation (assumes a stable __repr__).
            return str(obj).encode()

    def _parse_memory_string(self, memory_str: str) -> int:
        """Parse a memory string like '2GB' to bytes.

        Supports GB/MB/KB suffixes (decimal, SI) or a bare integer byte
        count. Raises ValueError for anything else.
        """
        memory_str = memory_str.strip().upper()

        if memory_str.endswith("GB"):
            return int(float(memory_str[:-2]) * 1e9)
        elif memory_str.endswith("MB"):
            return int(float(memory_str[:-2]) * 1e6)
        elif memory_str.endswith("KB"):
            return int(float(memory_str[:-2]) * 1e3)
        else:
            return int(memory_str)

514 

515 

# Module-level singleton managed by get_cache() / clear_cache() / show_cache_stats().
_global_cache: TraceKitCache | None = None

518 

519 

def get_cache(
    max_memory: int | str = "2GB",
    *,
    cache_dir: str | Path | None = None,
) -> TraceKitCache:
    """Return the process-wide cache, creating it on first use.

    Args:
        max_memory: Maximum memory for cache.
        cache_dir: Cache directory.

    Returns:
        Global TraceKitCache instance.

    Note:
        The arguments only take effect on the call that actually creates
        the cache; later calls return the existing instance unchanged.

    Example:
        >>> cache = get_cache(max_memory="1GB")
        >>> result = cache.get_or_compute("key", compute_fn, args)

    References:
        MEM-031: Persistent Cache (Disk-Based)
    """
    global _global_cache

    cache = _global_cache
    if cache is None:
        cache = TraceKitCache(max_memory, cache_dir=cache_dir)
        _global_cache = cache
    return cache

548 

549 

def clear_cache() -> None:
    """Clear and discard the global cache, if one exists.

    Example:
        >>> clear_cache()

    References:
        MEM-031: Persistent Cache (Disk-Based)
    """
    global _global_cache

    cache = _global_cache
    if cache is None:
        return
    cache.clear()
    _global_cache = None

564 

565 

def show_cache_stats() -> None:
    """Print statistics for the global cache, or a placeholder message.

    Example:
        >>> show_cache_stats()
        Cache Statistics: 42 hits, 15 misses (73.7% hit rate)

    References:
        MEM-031: Persistent Cache (Disk-Based)
    """
    cache = _global_cache
    if cache is None:
        print("Cache not initialized")
        return
    cache.show_stats()

581 

582 

# Public API surface; kept alphabetized (classes first, then functions).
__all__ = [
    "CacheEntry",
    "CacheStats",
    "TraceKitCache",
    "clear_cache",
    "get_cache",
    "show_cache_stats",
]