violawake_sdk
ViolaWake SDK — Open-source wake word detection + voice pipeline.
Public API surface:
- WakeDetector — detect a wake word in an audio stream
- AsyncWakeDetector — async wrapper for asyncio-based applications
- DetectorConfig — advanced configuration (ensemble, adaptive, speaker, power)
- VADEngine — voice activity detection (WebRTC, Silero, RMS)
- TTSEngine — on-device text-to-speech (Kokoro-82M)
- STTEngine — speech-to-text (faster-whisper)
- VoicePipeline — bundled Wake→VAD→STT→TTS orchestration
- NoiseProfiler — noise-adaptive threshold adjustment
- PowerManager — battery-aware power management
- FusionStrategy — multi-model ensemble scoring
- list_models() — discover available wake word models
- list_voices() — discover available TTS voices
Quick start::
    from violawake_sdk import WakeDetector

    with WakeDetector(threshold=0.80) as detector:
        for chunk in detector.stream_mic():
            if detector.detect(chunk):
                print("Wake word detected!")
                break
See README.md or https://github.com/GeeIHadAGoodTime/ViolaWake for full documentation.
"""
ViolaWake SDK — Open-source wake word detection + voice pipeline.

Public API surface:
    WakeDetector        — detect a wake word in an audio stream
    AsyncWakeDetector   — async wrapper for asyncio-based applications
    DetectorConfig      — advanced configuration (ensemble, adaptive, speaker, power)
    VADEngine           — voice activity detection (WebRTC, Silero, RMS)
    TTSEngine           — on-device text-to-speech (Kokoro-82M)
    STTEngine           — speech-to-text (faster-whisper)
    VoicePipeline       — bundled Wake→VAD→STT→TTS orchestration
    NoiseProfiler       — noise-adaptive threshold adjustment
    PowerManager        — battery-aware power management
    FusionStrategy      — multi-model ensemble scoring
    list_models()       — discover available wake word models
    list_voices()       — discover available TTS voices

Quick start::

    from violawake_sdk import WakeDetector

    with WakeDetector(threshold=0.80) as detector:
        for chunk in detector.stream_mic():
            if detector.detect(chunk):
                print("Wake word detected!")
                break

See README.md or https://github.com/GeeIHadAGoodTime/ViolaWake for full documentation.
"""

from __future__ import annotations

__version__ = "0.2.2"
__author__ = "ViolaWake Contributors"
__license__ = "Apache-2.0"

from violawake_sdk._exceptions import (
    AudioCaptureError,
    ModelLoadError,
    ModelNotFoundError,
    PipelineError,
    VADBackendError,
    ViolaWakeError,
)
from violawake_sdk.async_detector import AsyncWakeDetector
from violawake_sdk.confidence import ConfidenceLevel, ConfidenceResult
from violawake_sdk.ensemble import FusionStrategy
from violawake_sdk.noise_profiler import NoiseProfiler
from violawake_sdk.pipeline import VoicePipeline
from violawake_sdk.power_manager import PowerManager
from violawake_sdk.vad import VADEngine
from violawake_sdk.wake_detector import (
    DetectorConfig,
    WakeDecisionPolicy,
    WakeDetector,
    WakewordDetector,  # noqa: F401 — backward compat
    validate_audio_chunk,
)

# Optional extras: when the import fails the public name is still bound
# (to None) so that ``from violawake_sdk import TTSEngine`` never raises.
try:
    from violawake_sdk.tts import TTSEngine
except ImportError:
    TTSEngine = None  # type: ignore[assignment,misc]

try:
    from violawake_sdk.stt import STTEngine, StreamingSTTEngine
except ImportError:
    STTEngine = None  # type: ignore[assignment,misc]
    StreamingSTTEngine = None  # type: ignore[assignment,misc]


def list_models() -> list[dict[str, str]]:
    """List the wake word models a user can select.

    Walks ``MODEL_REGISTRY`` in its iteration order and returns one dict per
    distinct ``spec.name`` with the keys ``name``, ``description`` and
    ``version``. Entries are left out when their description contains
    ``"DEPRECATED"`` or when they are one of the internally managed,
    non-wake-word models.

    The reported ``name`` is the registry key under which a spec was first
    encountered, not ``spec.name``.

    Example::

        >>> from violawake_sdk import list_models
        >>> for m in list_models():
        ...     print(f"{m['name']:20s} {m['description']}")
    """
    from violawake_sdk.models import MODEL_REGISTRY

    # Package-managed / non-wake-word models that are never advertised.
    hidden_names = ("oww_backbone", "kokoro_v1_0", "kokoro_voices_v1_0")

    emitted: set[str] = set()
    catalog: list[dict[str, str]] = []
    for registry_key, spec in MODEL_REGISTRY.items():
        canonical = spec.name
        # Same check order as before: alias already listed, then deprecated,
        # then hidden. ``or`` short-circuits, so later checks are not
        # evaluated once an earlier one matches.
        if (
            canonical in emitted
            or "DEPRECATED" in spec.description
            or canonical in hidden_names
        ):
            continue
        emitted.add(canonical)
        entry = {
            "name": registry_key,
            "description": spec.description,
            "version": spec.version,
        }
        catalog.append(entry)
    return catalog


def list_voices() -> list[str]:
    """Return the TTS voice names usable with ``TTSEngine``.

    Actual synthesis requires the ``[tts]`` extra; this function is meant
    for discovery only and returns a fresh list copied from
    ``violawake_sdk.tts.AVAILABLE_VOICES``.

    NOTE(review): this imports ``violawake_sdk.tts``, the same module whose
    import is guarded against ``ImportError`` at package level — confirm
    that discovery really works without the extra installed.

    Example::

        >>> from violawake_sdk import list_voices
        >>> list_voices()
        ['af_heart', 'af_bella', 'af_sarah', ...]
    """
    from violawake_sdk.tts import AVAILABLE_VOICES

    return [*AVAILABLE_VOICES]


__all__ = [
    # Core detection
    "DetectorConfig",
    "WakeDetector",
    "AsyncWakeDetector",
    "WakeDecisionPolicy",
    "validate_audio_chunk",
    # Confidence & scoring
    "ConfidenceResult",
    "ConfidenceLevel",
    "FusionStrategy",
    # Advanced features
    "NoiseProfiler",
    "PowerManager",
    # Pipeline components
    "VADEngine",
    "TTSEngine",
    "STTEngine",
    "StreamingSTTEngine",
    "VoicePipeline",
    # Exceptions
    "ViolaWakeError",
    "ModelNotFoundError",
    "AudioCaptureError",
    "ModelLoadError",
    "PipelineError",
    "VADBackendError",
    # Discovery
    "list_models",
    "list_voices",
    "__version__",
]
@dataclass
class DetectorConfig:
    """Advanced configuration for WakeDetector.

    Basic usage needs no config -- just use ``WakeDetector(threshold=0.80)``.
    Use ``DetectorConfig`` to opt-in to advanced features without cluttering
    the constructor.

    An instance is handed to ``WakeDetector`` through its ``config=`` keyword.
    ``WakeDetector`` raises ``ValueError`` when ``config=`` is combined with
    any of the same options passed as individual keyword arguments. The
    defaults below match the defaults ``WakeDetector`` applies to those
    individual keyword arguments.

    Example::

        # Simple (80% of users):
        det = WakeDetector(model="temporal_cnn", threshold=0.80)

        # Advanced (adaptive threshold + multi-window confirmation):
        det = WakeDetector(
            model="temporal_cnn",
            config=DetectorConfig(
                adaptive_threshold=True,
                confirm_count=3,
            ),
        )

    Attributes:
        models: Additional model paths for multi-model ensemble (K3).
            Ensemble scoring is only set up when this is non-empty; the
            primary model is then added to the ensemble as well.
        fusion_strategy: Score fusion strategy for ensemble (K3). A string
            is converted with ``FusionStrategy(value)`` when an ensemble is
            built.
        fusion_weights: Per-model weights for weighted_average fusion (K3).
        adaptive_threshold: Enable dynamic threshold based on noise (K4).
            When enabled without a ``noise_profiler``, ``WakeDetector``
            creates a ``NoiseProfiler`` seeded with its own threshold.
        noise_profiler: Custom NoiseProfiler instance (K4). It is only
            consulted during detection when ``adaptive_threshold`` is also
            true.
        speaker_verify_fn: Post-detection speaker verification callback (K5).
            Called with the flattened embedding; a falsy return rejects the
            detection.
        power_manager: Power management controller for duty cycling (K7).
        confirm_count: Consecutive above-threshold scores required (K2).
            Must be >= 1; this is validated by ``WakeDetector``, not here.
        score_history_size: Number of recent scores to retain (K2).
    """

    # K3: Multi-model ensemble
    models: list[str] | None = None
    fusion_strategy: FusionStrategy | str = FusionStrategy.AVERAGE
    fusion_weights: list[float] | None = None

    # K4: Adaptive threshold
    adaptive_threshold: bool = False
    noise_profiler: NoiseProfiler | None = None

    # K5: Speaker verification
    speaker_verify_fn: Callable[..., bool] | None = None

    # K7: Power management
    power_manager: PowerManager | None = None

    # K2: Confidence tracking
    confirm_count: int = 1
    score_history_size: int = 50

    def build(self, model: str = "temporal_cnn", **kwargs: Any) -> WakeDetector:
        """Build a WakeDetector from this config.

        Convenience method that passes ``self`` as the ``config=`` argument.

        Args:
            model: Model name or path (default: ``"temporal_cnn"``).
            **kwargs: Additional WakeDetector constructor arguments
                (threshold, cooldown_s, etc.). Must not repeat any option
                that this config already carries (see Raises).

        Returns:
            Configured WakeDetector instance.

        Raises:
            ValueError: Raised by ``WakeDetector`` if ``kwargs`` contains one
                of the advanced options held by this config (for example
                ``confirm_count``), since those conflict with ``config=``.
            TypeError: If ``kwargs`` contains ``config``, because that
                keyword is already supplied here.
        """
        return WakeDetector(model=model, config=self, **kwargs)
Advanced configuration for WakeDetector.
Basic usage needs no config -- just use WakeDetector(threshold=0.80).
Use DetectorConfig to opt-in to advanced features without cluttering
the constructor.
Example::
    # Simple (80% of users):
    det = WakeDetector(model="temporal_cnn", threshold=0.80)

    # Advanced (adaptive threshold + multi-window confirmation):
    det = WakeDetector(
        model="temporal_cnn",
        config=DetectorConfig(
            adaptive_threshold=True,
            confirm_count=3,
        ),
    )
Attributes:
- models: Additional model paths for multi-model ensemble (K3).
- fusion_strategy: Score fusion strategy for ensemble (K3).
- fusion_weights: Per-model weights for weighted_average fusion (K3).
- adaptive_threshold: Enable dynamic threshold based on noise (K4).
- noise_profiler: Custom NoiseProfiler instance (K4).
- speaker_verify_fn: Post-detection speaker verification callback (K5).
- power_manager: Power management controller for duty cycling (K7).
- confirm_count: Consecutive above-threshold scores required (K2).
- score_history_size: Number of recent scores to retain (K2).
    def build(self, model: str = "temporal_cnn", **kwargs: Any) -> WakeDetector:
        """Build a WakeDetector from this config.

        Convenience method that passes ``self`` as the ``config=`` argument.

        Args:
            model: Model name or path (default: ``"temporal_cnn"``).
            **kwargs: Additional WakeDetector constructor arguments
                (threshold, cooldown_s, etc.). Must not repeat any option
                that this config already carries (see Raises).

        Returns:
            Configured WakeDetector instance.

        Raises:
            ValueError: Raised by ``WakeDetector`` if ``kwargs`` contains one
                of the advanced options held by this config (for example
                ``confirm_count``), since those conflict with ``config=``.
            TypeError: If ``kwargs`` contains ``config``, because that
                keyword is already supplied here.
        """
        return WakeDetector(model=model, config=self, **kwargs)
Build a WakeDetector from this config.
Convenience method that passes self as the config= argument.
Arguments:
- model: Model name or path (default: "temporal_cnn").
- **kwargs: Additional WakeDetector constructor arguments (threshold, cooldown_s, etc.).
Returns:
Configured WakeDetector instance.
272class WakeDetector: 273 """Wake word detector using ViolaWake MLP on OpenWakeWord embeddings. 274 275 Supports pluggable inference backends (ONNX Runtime, TFLite) via the 276 ``backend`` parameter. The default ``"auto"`` mode tries ONNX Runtime 277 first, then falls back to TFLite, so users on edge devices can run 278 without installing ``onnxruntime``. 279 280 Also supports optional competitive features (all opt-in, backward compatible): 281 282 - **K2 Confidence API**: ``get_confidence()`` and ``last_scores`` property. 283 - **K3 Multi-model ensemble**: ``models`` parameter with fusion strategies. 284 - **K4 Adaptive threshold**: ``adaptive_threshold`` parameter with noise profiling. 285 - **K5 Speaker verification**: ``speaker_verify_fn`` callback for post-detection. 286 - **K6 Audio source abstraction**: ``from_source()`` class method factory. 287 - **K7 Power management**: ``power_manager`` parameter for duty cycling. 288 289 **Threshold tuning guide:** 290 291 - 0.70 = sensitive (more detections, more false positives) 292 - 0.80 = balanced (default, recommended starting point) 293 - 0.85 = conservative (fewer false positives, may miss some) 294 - 0.90+ = very conservative (for noisy environments) 295 296 Start at 0.80 and adjust based on your false accept rate. 297 298 Args: 299 model: Model name from the registry, or a path to a model file. 300 threshold: Detection confidence threshold in [0.0, 1.0]. 301 cooldown_s: Minimum seconds between consecutive detections. 302 providers: ONNX Runtime execution providers (ignored for TFLite). 303 backend: Inference backend selector (``"onnx"``, ``"tflite"``, ``"auto"``). 304 config: A ``DetectorConfig`` instance bundling all advanced options. 305 Mutually exclusive with the individual advanced kwargs below. 306 models: Additional model paths for ensemble scoring (K3). 307 fusion_strategy: Score fusion strategy for ensemble (K3). 308 fusion_weights: Per-model weights for weighted_average fusion (K3). 
309 adaptive_threshold: Enable dynamic threshold based on noise (K4). 310 noise_profiler: Custom NoiseProfiler instance (K4). 311 speaker_verify_fn: Post-detection speaker verification callback (K5). 312 power_manager: Power management controller for duty cycling (K7). 313 confirm_count: Consecutive above-threshold scores required for detection (K2). 314 score_history_size: Number of recent scores to retain (K2). 315 """ 316 317 _VALID_BACKENDS = ("onnx", "tflite", "auto") 318 319 def __init__( 320 self, 321 model: str = "temporal_cnn", 322 threshold: float = DEFAULT_THRESHOLD, 323 cooldown_s: float = DEFAULT_COOLDOWN_S, 324 providers: list[str] | None = None, 325 backend: str = "auto", 326 *, 327 config: DetectorConfig | None = None, 328 # K3: Multi-model ensemble (individual kwargs, backwards compat) 329 models: list[str] | None = _UNSET, 330 fusion_strategy: FusionStrategy | str = _UNSET, 331 fusion_weights: list[float] | None = _UNSET, 332 # K4: Adaptive threshold 333 adaptive_threshold: bool = _UNSET, 334 noise_profiler: NoiseProfiler | None = _UNSET, 335 # K5: Speaker verification 336 speaker_verify_fn: Callable[[np.ndarray], bool] | None = _UNSET, 337 # K7: Power management 338 power_manager: PowerManager | None = _UNSET, 339 # K2: Confidence tracking 340 confirm_count: int = _UNSET, 341 score_history_size: int = _UNSET, 342 ) -> None: 343 # --- Resolve config vs individual kwargs ------------------------- 344 # Detect if any advanced kwarg was explicitly passed (not _UNSET) 345 _locals = { 346 "models": models, 347 "fusion_strategy": fusion_strategy, 348 "fusion_weights": fusion_weights, 349 "adaptive_threshold": adaptive_threshold, 350 "noise_profiler": noise_profiler, 351 "speaker_verify_fn": speaker_verify_fn, 352 "power_manager": power_manager, 353 "confirm_count": confirm_count, 354 "score_history_size": score_history_size, 355 } 356 explicit_kwargs = {name for name, val in _locals.items() if val is not _UNSET} 357 if config is not None and 
explicit_kwargs: 358 raise ValueError( 359 f"Cannot specify both config= and individual advanced kwargs. " 360 f"Conflicting kwargs: {sorted(explicit_kwargs)}. " 361 f"Either use config=DetectorConfig(...) or pass kwargs directly, not both." 362 ) 363 364 if config is not None: 365 # Unpack from DetectorConfig 366 models = config.models 367 fusion_strategy = config.fusion_strategy 368 fusion_weights = config.fusion_weights 369 adaptive_threshold = config.adaptive_threshold 370 noise_profiler = config.noise_profiler 371 speaker_verify_fn = config.speaker_verify_fn 372 power_manager = config.power_manager 373 confirm_count = config.confirm_count 374 score_history_size = config.score_history_size 375 else: 376 # Apply defaults for any _UNSET values (backwards compat path) 377 if models is _UNSET: 378 models = None 379 if fusion_strategy is _UNSET: 380 fusion_strategy = FusionStrategy.AVERAGE 381 if fusion_weights is _UNSET: 382 fusion_weights = None 383 if adaptive_threshold is _UNSET: 384 adaptive_threshold = False 385 if noise_profiler is _UNSET: 386 noise_profiler = None 387 if speaker_verify_fn is _UNSET: 388 speaker_verify_fn = None 389 if power_manager is _UNSET: 390 power_manager = None 391 if confirm_count is _UNSET: 392 confirm_count = 1 393 if score_history_size is _UNSET: 394 score_history_size = 50 395 396 # G1: Input validation for public constructor parameters 397 if not isinstance(threshold, (int, float)): 398 raise TypeError(f"threshold must be a number, got {type(threshold).__name__}") 399 if not 0.0 <= threshold <= 1.0: 400 raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}") 401 if not isinstance(cooldown_s, (int, float)): 402 raise TypeError(f"cooldown_s must be a number, got {type(cooldown_s).__name__}") 403 if cooldown_s < 0: 404 raise ValueError(f"cooldown_s must be >= 0, got {cooldown_s!r}") 405 if backend not in self._VALID_BACKENDS: 406 raise ValueError(f"backend must be one of {self._VALID_BACKENDS}, got {backend!r}") 407 
if confirm_count < 1: 408 raise ValueError(f"confirm_count must be >= 1, got {confirm_count}") 409 410 self.threshold = threshold 411 self._lock = threading.Lock() 412 self._backbone_lock = threading.Lock() 413 self._policy = WakeDecisionPolicy(threshold=threshold, cooldown_s=cooldown_s) 414 self._providers = providers or ["CPUExecutionProvider"] 415 self._backend: InferenceBackend = get_backend(backend, providers=self._providers) 416 417 # K2: Confidence tracking 418 self._score_tracker = ScoreTracker( 419 threshold=threshold, 420 history_size=score_history_size, 421 ) 422 self._confirm_required = confirm_count 423 self._confirm_counter = 0 424 425 # K3: Ensemble support 426 self._ensemble: EnsembleScorer | None = None 427 if models and len(models) > 0: 428 if isinstance(fusion_strategy, str): 429 fusion_strategy = FusionStrategy(fusion_strategy) 430 self._ensemble = EnsembleScorer( 431 strategy=fusion_strategy, 432 weights=fusion_weights, 433 ) 434 435 # K4: Noise profiler / adaptive threshold 436 self._adaptive_threshold = adaptive_threshold 437 if noise_profiler is not None: 438 self._noise_profiler: NoiseProfiler | None = noise_profiler 439 elif adaptive_threshold: 440 self._noise_profiler = NoiseProfiler(base_threshold=threshold) 441 else: 442 self._noise_profiler = None 443 444 # K5: Speaker verification 445 self._speaker_verify_fn = speaker_verify_fn 446 447 # K7: Power manager 448 self._power_manager = power_manager 449 450 # Warn on deprecated models 451 if model in MODEL_REGISTRY and "DEPRECATED" in MODEL_REGISTRY[model].description: 452 import warnings 453 454 warnings.warn( 455 f"Model '{model}' is deprecated: {MODEL_REGISTRY[model].description}. 
" 456 f"Use model='temporal_cnn' instead.", 457 DeprecationWarning, 458 stacklevel=2, 459 ) 460 461 # Load models 462 self._oww_backbone = self._create_oww_backbone() 463 self._mlp_session = self._load_session(model) 464 self._mlp_input_name = self._mlp_session.get_inputs()[0].name 465 self._last_score = 0.0 466 467 # Detect temporal vs MLP model from input shape 468 mlp_input_shape = self._mlp_session.get_inputs()[0].shape 469 if len(mlp_input_shape) == 3: 470 # Temporal model: input is (batch, seq_len, embedding_dim) 471 self._is_temporal = True 472 self._temporal_seq_len = ( 473 mlp_input_shape[1] 474 if isinstance(mlp_input_shape[1], int) 475 else _TEMPORAL_SEQ_LEN_DEFAULT 476 ) 477 self._embedding_buffer: collections.deque[np.ndarray] = collections.deque( 478 maxlen=self._temporal_seq_len, 479 ) 480 logger.info( 481 "Temporal model detected: seq_len=%d", 482 self._temporal_seq_len, 483 ) 484 else: 485 self._is_temporal = False 486 self._temporal_seq_len = 0 487 488 # K3: Load additional ensemble models 489 if models and self._ensemble is not None: 490 # Add primary model to ensemble 491 self._ensemble.add_session(self._mlp_session, self._mlp_input_name) 492 for extra_model in models: 493 extra_session = self._load_session(extra_model) 494 extra_input_name = extra_session.get_inputs()[0].name 495 self._ensemble.add_session(extra_session, extra_input_name) 496 497 logger.info( 498 "WakeDetector initialized: model=%s, threshold=%.2f, backend=%s", 499 model, 500 threshold, 501 self._backend.name, 502 ) 503 self._warn_on_oww_backbone_change(self._resolve_model_path(model)) 504 505 # ------------------------------------------------------------------ 506 # Context manager support 507 # ------------------------------------------------------------------ 508 509 def __enter__(self) -> WakeDetector: 510 """Enter sync context manager. 
Returns self.""" 511 return self 512 513 def __exit__( 514 self, 515 exc_type: type[BaseException] | None, 516 exc_val: BaseException | None, 517 exc_tb: object, 518 ) -> None: 519 """Exit sync context manager. Releases sessions and resets state.""" 520 self.close() 521 522 def close(self) -> None: 523 """Release inference sessions and reset internal state. 524 525 After calling close(), the detector should not be used for inference. 526 This is called automatically when using WakeDetector as a context 527 manager. 528 """ 529 self.reset() 530 # Release inference session references so the underlying runtime 531 # (ONNX / TFLite) can free memory immediately rather than waiting 532 # for garbage collection. 533 self._mlp_session = None # type: ignore[assignment] 534 if self._ensemble is not None: 535 self._ensemble.clear() 536 self._oww_backbone = None # type: ignore[assignment] 537 538 def _create_oww_backbone(self) -> OpenWakeWordBackbone: 539 """Create the shared OpenWakeWord backbone.""" 540 return OpenWakeWordBackbone(self._backend) 541 542 def _warn_on_oww_backbone_change(self, model_path: Path) -> None: 543 """Warn when the installed OWW backbone differs from the training config.""" 544 config_path = model_path.with_suffix(".config.json") 545 if not config_path.exists(): 546 return 547 548 try: 549 with config_path.open(encoding="utf-8") as f: 550 config = json.load(f) 551 except (OSError, json.JSONDecodeError): 552 return 553 554 expected_mel = config.get("oww_mel_sha256") 555 expected_emb = config.get("oww_emb_sha256") 556 if not isinstance(expected_mel, str) or not isinstance(expected_emb, str): 557 return 558 559 try: 560 current_hashes = get_openwakeword_backbone_hashes("onnx") 561 except Exception: 562 return 563 564 if ( 565 current_hashes["oww_mel_sha256"] != expected_mel 566 or current_hashes["oww_emb_sha256"] != expected_emb 567 ): 568 logger.warning( 569 "OWW backbone version changed since training. Model may produce degraded results." 
570 ) 571 572 def _load_session(self, model: str) -> BackendSession: 573 """Load a model file via the configured backend. 574 575 Resolves *model* to a file path (direct path, .onnx/.tflite suffix, 576 or registry lookup), then delegates to ``self._backend.load()``. 577 578 For TFLite backends, if only a ``.onnx`` file exists in the cache 579 the method looks for a sibling ``.tflite`` file with the same stem. 580 """ 581 model_path = self._resolve_model_path(model) 582 583 # When using the TFLite backend, prefer a .tflite sibling if the 584 # resolved path is an .onnx file. 585 if self._backend.name == "tflite" and model_path.suffix == ".onnx": 586 tflite_sibling = model_path.with_suffix(".tflite") 587 if tflite_sibling.exists(): 588 model_path = tflite_sibling 589 logger.debug("TFLite backend: using .tflite sibling %s", model_path) 590 else: 591 logger.warning( 592 "TFLite backend selected but only .onnx file found at %s. " 593 "Convert with: python -c " 594 '"from violawake_sdk.backends.tflite_backend import ' 595 "convert_onnx_to_tflite; convert_onnx_to_tflite('%s')\"", 596 model_path, 597 model_path, 598 ) 599 600 try: 601 session = self._backend.load(model_path) 602 except Exception as e: 603 raise ModelLoadError(f"Failed to load model {model_path}: {e}") from e 604 logger.debug("Loaded model via %s backend: %s", self._backend.name, model_path) 605 return session 606 607 @staticmethod 608 def _resolve_model_path(model: str) -> Path: 609 """Resolve a model name or path string to a concrete file path. 610 611 Resolution order: 612 1. If *model* is an existing file path, use it directly. 613 2. If *model* ends with ``.onnx`` or ``.tflite``, treat as a path 614 (raise if not found). 615 3. Otherwise, look up *model* in the model registry / cache. 
616 """ 617 if Path(model).is_file(): 618 return Path(model) 619 620 if model.endswith((".onnx", ".tflite")): 621 path = Path(model) 622 if not path.exists(): 623 raise ModelNotFoundError( 624 f"Model file not found: {model}. " 625 f"If this is a named model, omit the file extension." 626 ) 627 return path 628 629 try: 630 return get_model_path(model) 631 except FileNotFoundError as e: 632 raise ModelNotFoundError( 633 f"Model '{model}' not found in cache and auto-download failed or is disabled. " 634 f"Run: violawake-download --model {model}" 635 ) from e 636 637 def _get_embedding(self, audio_frame: bytes | np.ndarray) -> np.ndarray: 638 """Extract the OWW embedding from an audio frame. 639 640 Returns the raw embedding vector before MLP scoring. 641 Used internally for speaker verification (K5). 642 """ 643 with self._backbone_lock: 644 embedding = self._oww_backbone.last_embedding 645 if embedding is None: 646 _, embedding = self._oww_backbone.push_audio(audio_frame) 647 if embedding is None: 648 return np.zeros(EMBEDDING_DIM, dtype=np.float32) 649 return embedding 650 651 @staticmethod 652 def _needs_int16_normalization(audio_frame: bytes | np.ndarray) -> bool: 653 """Check whether audio_frame requires int16-to-float normalization.""" 654 return isinstance(audio_frame, bytes) or ( 655 isinstance(audio_frame, np.ndarray) and audio_frame.dtype == np.int16 656 ) 657 658 @staticmethod 659 def _prepare_model_audio(audio_frame: bytes | np.ndarray) -> np.ndarray: 660 """Validate an audio frame and normalize it for model inference.""" 661 pcm = validate_audio_chunk(audio_frame) 662 if WakeDetector._needs_int16_normalization(audio_frame): 663 return pcm / 32768.0 664 return pcm 665 666 def process(self, audio_frame: bytes | np.ndarray) -> float: 667 """Process a 20ms audio frame and return the wake word detection score. 668 669 If ensemble mode is active (K3), returns the fused score. 
670 The score is recorded for confidence tracking (K2) and reported 671 to the power manager (K7) if configured. 672 673 Thread-safe: protects internal state mutation with a lock. 674 675 Raises: 676 TypeError: If audio_frame is not bytes or ndarray. 677 ValueError: If audio_frame is empty, malformed, or drastically 678 larger than the supported streaming frame size. 679 """ 680 return self._process_core(self._prepare_model_audio(audio_frame), audio_frame) 681 682 def _process_core(self, pcm: np.ndarray, raw_audio_frame: bytes | np.ndarray) -> float: 683 """Internal scoring engine operating on pre-validated, normalized PCM. 684 685 Args: 686 pcm: Float32 array, already validated and normalized by _prepare_model_audio. 687 raw_audio_frame: Original audio frame passed through to OWW backbone. 688 """ 689 if pcm.shape[0] != FRAME_SAMPLES: 690 # Reject pathologically large or empty frames first 691 if pcm.shape[0] == 0 or pcm.shape[0] > _MAX_PROCESS_FRAME_SAMPLES: 692 raise ValueError( 693 "Audio frame length is too far from the expected streaming size: " 694 f"expected {FRAME_SAMPLES} samples, got {pcm.shape[0]} " 695 f"(maximum non-pathological size is {_MAX_PROCESS_FRAME_SAMPLES})" 696 ) 697 # Non-multiples of 320 indicate wrong sample rate — return 0.0 698 if pcm.shape[0] % FRAME_SAMPLES != 0: 699 logger.warning( 700 "Audio frame has %d samples (not a multiple of %d). " 701 "Expected 16kHz, 20ms frames. 
Returning score 0.0.", 702 pcm.shape[0], 703 FRAME_SAMPLES, 704 ) 705 return 0.0 706 with self._backbone_lock: 707 produced_embedding, embedding = self._oww_backbone.push_audio(raw_audio_frame) 708 if embedding is None: 709 score = self._last_score 710 elif self._ensemble is not None and self._ensemble.model_count > 0: 711 score = ( 712 self._ensemble.score(embedding.flatten()) 713 if produced_embedding 714 else self._last_score 715 ) 716 elif self._is_temporal: 717 if produced_embedding: 718 self._embedding_buffer.append(embedding.flatten()) 719 if len(self._embedding_buffer) >= self._temporal_seq_len: 720 temporal_input = np.stack(list(self._embedding_buffer)) 721 temporal_input = temporal_input.reshape( 722 1, 723 self._temporal_seq_len, 724 EMBEDDING_DIM, 725 ).astype(np.float32) 726 score = float( 727 self._mlp_session.run(None, {self._mlp_input_name: temporal_input})[ 728 0 729 ].flatten()[0] 730 ) 731 else: 732 score = 0.0 733 else: 734 score = self._last_score 735 else: 736 if produced_embedding: 737 mlp_input = embedding.reshape(1, EMBEDDING_DIM).astype(np.float32) 738 score_output = self._mlp_session.run(None, {self._mlp_input_name: mlp_input})[0] 739 score = float(np.asarray(score_output).reshape(-1)[0]) 740 else: 741 score = self._last_score 742 743 with self._lock: 744 self._last_score = score 745 # K2: Record score for confidence tracking 746 self._score_tracker.record(score) 747 748 # K7: Report score to power manager for activity detection 749 if self._power_manager is not None: 750 self._power_manager.report_score(score) 751 752 return score 753 754 def detect(self, audio_frame: bytes | np.ndarray, is_playing: bool = False) -> bool: 755 """Process a frame and apply the full decision policy. 756 757 Integrates adaptive threshold (K4), multi-window confirmation (K2), 758 speaker verification (K5), and power management (K7) when configured. 759 760 Thread-safe: protects internal state mutation with a lock. 
761 762 Raises: 763 TypeError: If audio_frame is not bytes or ndarray. 764 ValueError: If audio_frame is empty or has invalid format. 765 """ 766 # G1: Input validation (single pass — process_core skips re-validation) 767 pcm = validate_audio_chunk(audio_frame) 768 769 # Compute RMS on int16-scale PCM for the rms_floor comparison. 770 # rms_floor=1.0 is calibrated for int16 scale (speech ≈ 500–5000, 771 # silence ≈ 0–5). Float32 input in [-1, 1] is scaled up so the 772 # same rms_floor works regardless of input format. 773 rms = float(np.sqrt(np.mean(pcm**2))) 774 if not self._needs_int16_normalization(audio_frame): 775 # Float32/float64 input: RMS is in [0, ~0.7] — scale to int16 range 776 rms *= 32768.0 777 778 # Normalize for model inference 779 model_pcm = pcm / 32768.0 if self._needs_int16_normalization(audio_frame) else pcm 780 781 # K7: Power management -- skip frame if power manager says so 782 if self._power_manager is not None and not self._power_manager.should_process(pcm): 783 return False 784 785 # K4: Update noise profiler and get adaptive threshold 786 if self._noise_profiler is not None and self._adaptive_threshold: 787 adapted = self._noise_profiler.update(pcm) 788 self._policy.threshold = adapted 789 790 score = self._process_core(model_pcm, audio_frame) 791 792 # K5: Pre-fetch embedding outside _lock to avoid ABBA deadlock 793 # (_process_core acquires _backbone_lock -> _lock; we must not 794 # acquire _backbone_lock while holding _lock). 795 # 796 # Trade-off: Under concurrent access, _get_embedding reads the 797 # backbone's last_embedding which may have been overwritten by 798 # another thread's _process_core call since our score was computed. 799 # This means the embedding used for speaker verification may not 800 # correspond to the score just returned by _process_core. 
This is 801 # accepted for performance — taking _backbone_lock across both 802 # _process_core and _get_embedding would serialize all detection, 803 # and the mismatch is benign (embeddings from adjacent frames are 804 # nearly identical in practice). 805 speaker_embedding: np.ndarray | None = None 806 if self._speaker_verify_fn is not None: 807 speaker_embedding = self._get_embedding(audio_frame) 808 809 with self._lock: 810 # K2: Multi-window confirmation 811 if score >= self._policy.threshold: 812 self._confirm_counter += 1 813 else: 814 self._confirm_counter = 0 815 816 effective_detected = self._confirm_counter >= self._confirm_required 817 818 if effective_detected: 819 detected = self._policy.evaluate(score=score, rms=rms, is_playing=is_playing) 820 else: 821 detected = False 822 823 if detected: 824 # K5: Speaker verification post-detection 825 if self._speaker_verify_fn is not None and speaker_embedding is not None: # noqa: SIM102 826 if not self._speaker_verify_fn(speaker_embedding.flatten()): 827 logger.debug("Speaker verification rejected detection") 828 return False 829 830 self._confirm_counter = 0 831 return True 832 833 return False 834 835 def reset_cooldown(self) -> None: 836 """Reset the cooldown window without clearing confirmation state or buffers.""" 837 with self._lock: 838 self._policy.reset_cooldown() 839 840 def reset(self) -> None: 841 """Reset cooldown, confirmation state, score history, and temporal buffers. 842 843 Lock ordering: _backbone_lock then _lock, matching _process_core 844 to prevent ABBA deadlock. 
845 """ 846 with self._backbone_lock, self._lock: 847 self._policy.reset_cooldown() 848 self._confirm_counter = 0 849 self._last_score = 0.0 850 self._score_tracker.reset() 851 self._oww_backbone.reset() 852 if self._is_temporal: 853 self._embedding_buffer.clear() 854 855 # ------------------------------------------------------------------ 856 # K2: Confidence API 857 # ------------------------------------------------------------------ 858 859 def get_confidence(self) -> ConfidenceResult: 860 """Return a confidence assessment of the current detection state. 861 862 Includes the raw MLP score, multi-window confirmation count, 863 and a classified confidence level (LOW/MEDIUM/HIGH/CERTAIN). 864 """ 865 return self._score_tracker.classify( 866 confirm_count=self._confirm_counter, 867 confirm_required=self._confirm_required, 868 ) 869 870 @property 871 def last_scores(self) -> tuple[float, ...]: 872 """Return the recent score history (most recent last).""" 873 return self._score_tracker.last_scores 874 875 # ------------------------------------------------------------------ 876 # K5: Speaker verification helpers 877 # ------------------------------------------------------------------ 878 879 def enroll_speaker(self, speaker_id: str, audio_frames: list[bytes | np.ndarray]) -> int: 880 """Enroll a speaker by extracting embeddings from audio frames. 881 882 Requires a ``SpeakerVerificationHook`` as the ``speaker_verify_fn``. 883 884 Args: 885 speaker_id: Unique identifier for the speaker. 886 audio_frames: Audio frames to extract embeddings from. 887 888 Returns: 889 Total enrollment count for this speaker. 890 891 Raises: 892 RuntimeError: If no SpeakerVerificationHook is configured. 
893 """ 894 from violawake_sdk.speaker import SpeakerVerificationHook 895 896 hook = self._speaker_verify_fn 897 if not isinstance(hook, SpeakerVerificationHook): 898 raise RuntimeError( 899 "enroll_speaker requires a SpeakerVerificationHook as speaker_verify_fn" 900 ) 901 902 embeddings = [] 903 for frame in audio_frames: 904 emb = self._get_embedding(frame) 905 embeddings.append(emb.flatten()) 906 907 return hook.enroll_speaker(speaker_id, embeddings) 908 909 def verify_speaker(self, audio_frame: bytes | np.ndarray) -> SpeakerVerifyResult: 910 """Verify the speaker in an audio frame against enrolled profiles. 911 912 Args: 913 audio_frame: Audio frame to verify. 914 915 Returns: 916 SpeakerVerifyResult with match details. 917 918 Raises: 919 RuntimeError: If no SpeakerVerificationHook is configured. 920 """ 921 from violawake_sdk.speaker import SpeakerVerificationHook 922 923 hook = self._speaker_verify_fn 924 if not isinstance(hook, SpeakerVerificationHook): 925 raise RuntimeError( 926 "verify_speaker requires a SpeakerVerificationHook as speaker_verify_fn" 927 ) 928 929 embedding = self._get_embedding(audio_frame) 930 return hook.verify_speaker(embedding.flatten()) 931 932 # ------------------------------------------------------------------ 933 # K6: Audio source factory 934 # ------------------------------------------------------------------ 935 936 @classmethod 937 def from_source( 938 cls, 939 source: AudioSource, 940 model: str = "temporal_cnn", 941 threshold: float = DEFAULT_THRESHOLD, 942 cooldown_s: float = DEFAULT_COOLDOWN_S, 943 **kwargs: Any, 944 ) -> _SourceDetector: 945 """Create a WakeDetector bound to an AudioSource. 946 947 The returned object wraps a WakeDetector and provides a ``run()`` 948 method that reads frames from the source and runs detection. 949 950 Args: 951 source: Any object implementing the AudioSource protocol. 952 model: Model name or path. 953 threshold: Detection threshold. 954 cooldown_s: Cooldown between detections. 
955 **kwargs: Additional WakeDetector keyword arguments. 956 957 Returns: 958 A _SourceDetector wrapping both the source and detector. 959 """ 960 detector = cls( 961 model=model, 962 threshold=threshold, 963 cooldown_s=cooldown_s, 964 **kwargs, 965 ) 966 return _SourceDetector(detector=detector, source=source) 967 968 # ------------------------------------------------------------------ 969 # Original methods 970 # ------------------------------------------------------------------ 971 972 def stream_mic(self, device_index: int | None = None) -> Generator[bytes, None, None]: 973 """Generator that yields 20ms audio frames from the default microphone.""" 974 try: 975 import pyaudio 976 except ImportError: 977 raise ImportError( 978 "pyaudio is required for microphone features. " 979 "Install with: pip install violawake[audio]" 980 ) from None 981 pa = pyaudio.PyAudio() 982 try: 983 stream = pa.open( 984 format=pyaudio.paInt16, 985 channels=1, 986 rate=SAMPLE_RATE, 987 input=True, 988 frames_per_buffer=FRAME_SAMPLES, 989 input_device_index=device_index, 990 ) 991 except Exception as e: 992 pa.terminate() 993 raise AudioCaptureError( 994 f"Failed to open microphone: {e}. " 995 f"Check that a microphone is connected and not in use by another application." 996 ) from e 997 logger.info("Microphone capture started (16kHz, mono, 20ms frames)") 998 _MAX_CONSECUTIVE_ERRORS = 10 999 consecutive_errors = 0 1000 try: 1001 while True: 1002 try: 1003 yield stream.read(FRAME_SAMPLES, exception_on_overflow=False) 1004 consecutive_errors = 0 1005 except Exception as e: 1006 consecutive_errors += 1 1007 logger.warning( 1008 "Mic read error (%d/%d): %s", consecutive_errors, _MAX_CONSECUTIVE_ERRORS, e 1009 ) 1010 if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS: 1011 raise AudioCaptureError( 1012 f"Microphone read failed {_MAX_CONSECUTIVE_ERRORS} consecutive times. 
" 1013 f"Last error: {e}" 1014 ) from e 1015 continue 1016 finally: 1017 stream.stop_stream() 1018 stream.close() 1019 pa.terminate() 1020 logger.info("Microphone capture stopped")
Wake word detector using ViolaWake MLP on OpenWakeWord embeddings.
Supports pluggable inference backends (ONNX Runtime, TFLite) via the
backend parameter. The default "auto" mode tries ONNX Runtime
first, then falls back to TFLite, so users on edge devices can run
without installing onnxruntime.
Also supports optional competitive features (all opt-in, backward compatible):
- K2 Confidence API: `get_confidence()` and `last_scores` property.
- K3 Multi-model ensemble: `models` parameter with fusion strategies.
- K4 Adaptive threshold: `adaptive_threshold` parameter with noise profiling.
- K5 Speaker verification: `speaker_verify_fn` callback for post-detection.
- K6 Audio source abstraction: `from_source()` class method factory.
- K7 Power management: `power_manager` parameter for duty cycling.
Threshold tuning guide:
- 0.70 = sensitive (more detections, more false positives)
- 0.80 = balanced (default, recommended starting point)
- 0.85 = conservative (fewer false positives, may miss some)
- 0.90+ = very conservative (for noisy environments)
Start at 0.80 and adjust based on your false accept rate.
Arguments:
- model: Model name from the registry, or a path to a model file.
- threshold: Detection confidence threshold in [0.0, 1.0].
- cooldown_s: Minimum seconds between consecutive detections.
- providers: ONNX Runtime execution providers (ignored for TFLite).
- backend: Inference backend selector (`"onnx"`, `"tflite"`, `"auto"`).
- config: A `DetectorConfig` instance bundling all advanced options. Mutually exclusive with the individual advanced kwargs below.
- models: Additional model paths for ensemble scoring (K3).
- fusion_strategy: Score fusion strategy for ensemble (K3).
- fusion_weights: Per-model weights for weighted_average fusion (K3).
- adaptive_threshold: Enable dynamic threshold based on noise (K4).
- noise_profiler: Custom NoiseProfiler instance (K4).
- speaker_verify_fn: Post-detection speaker verification callback (K5).
- power_manager: Power management controller for duty cycling (K7).
- confirm_count: Consecutive above-threshold scores required for detection (K2).
- score_history_size: Number of recent scores to retain (K2).
def __init__(
    self,
    model: str = "temporal_cnn",
    threshold: float = DEFAULT_THRESHOLD,
    cooldown_s: float = DEFAULT_COOLDOWN_S,
    providers: list[str] | None = None,
    backend: str = "auto",
    *,
    config: DetectorConfig | None = None,
    # K3: Multi-model ensemble (individual kwargs, backwards compat)
    models: list[str] | None = _UNSET,
    fusion_strategy: FusionStrategy | str = _UNSET,
    fusion_weights: list[float] | None = _UNSET,
    # K4: Adaptive threshold
    adaptive_threshold: bool = _UNSET,
    noise_profiler: NoiseProfiler | None = _UNSET,
    # K5: Speaker verification
    speaker_verify_fn: Callable[[np.ndarray], bool] | None = _UNSET,
    # K7: Power management
    power_manager: PowerManager | None = _UNSET,
    # K2: Confidence tracking
    confirm_count: int = _UNSET,
    score_history_size: int = _UNSET,
) -> None:
    """Validate the arguments, build the policy/backend, and load the models.

    The advanced options can be supplied either bundled in ``config`` or as
    individual keyword arguments, never both. ``_UNSET`` is the sentinel
    that distinguishes "not passed" from an explicit ``None``/``False``.

    Raises:
        ValueError: If ``config`` is combined with any individual advanced
            kwarg, if ``threshold`` is outside [0.0, 1.0], if ``cooldown_s``
            is negative, if ``backend`` is not in ``_VALID_BACKENDS``, or if
            ``confirm_count`` is below 1.
        TypeError: If ``threshold`` or ``cooldown_s`` is not a number.
    """
    # --- Resolve config vs individual kwargs -------------------------
    # Detect if any advanced kwarg was explicitly passed (not _UNSET)
    _locals = {
        "models": models,
        "fusion_strategy": fusion_strategy,
        "fusion_weights": fusion_weights,
        "adaptive_threshold": adaptive_threshold,
        "noise_profiler": noise_profiler,
        "speaker_verify_fn": speaker_verify_fn,
        "power_manager": power_manager,
        "confirm_count": confirm_count,
        "score_history_size": score_history_size,
    }
    explicit_kwargs = {name for name, val in _locals.items() if val is not _UNSET}
    if config is not None and explicit_kwargs:
        raise ValueError(
            f"Cannot specify both config= and individual advanced kwargs. "
            f"Conflicting kwargs: {sorted(explicit_kwargs)}. "
            f"Either use config=DetectorConfig(...) or pass kwargs directly, not both."
        )

    if config is not None:
        # Unpack from DetectorConfig
        models = config.models
        fusion_strategy = config.fusion_strategy
        fusion_weights = config.fusion_weights
        adaptive_threshold = config.adaptive_threshold
        noise_profiler = config.noise_profiler
        speaker_verify_fn = config.speaker_verify_fn
        power_manager = config.power_manager
        confirm_count = config.confirm_count
        score_history_size = config.score_history_size
    else:
        # Apply defaults for any _UNSET values (backwards compat path)
        if models is _UNSET:
            models = None
        if fusion_strategy is _UNSET:
            fusion_strategy = FusionStrategy.AVERAGE
        if fusion_weights is _UNSET:
            fusion_weights = None
        if adaptive_threshold is _UNSET:
            adaptive_threshold = False
        if noise_profiler is _UNSET:
            noise_profiler = None
        if speaker_verify_fn is _UNSET:
            speaker_verify_fn = None
        if power_manager is _UNSET:
            power_manager = None
        if confirm_count is _UNSET:
            confirm_count = 1
        if score_history_size is _UNSET:
            score_history_size = 50

    # G1: Input validation for public constructor parameters
    if not isinstance(threshold, (int, float)):
        raise TypeError(f"threshold must be a number, got {type(threshold).__name__}")
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}")
    if not isinstance(cooldown_s, (int, float)):
        raise TypeError(f"cooldown_s must be a number, got {type(cooldown_s).__name__}")
    if cooldown_s < 0:
        raise ValueError(f"cooldown_s must be >= 0, got {cooldown_s!r}")
    if backend not in self._VALID_BACKENDS:
        raise ValueError(f"backend must be one of {self._VALID_BACKENDS}, got {backend!r}")
    if confirm_count < 1:
        raise ValueError(f"confirm_count must be >= 1, got {confirm_count}")

    self.threshold = threshold
    # Two locks: reset() documents the acquisition order as
    # _backbone_lock first, then _lock.
    self._lock = threading.Lock()
    self._backbone_lock = threading.Lock()
    self._policy = WakeDecisionPolicy(threshold=threshold, cooldown_s=cooldown_s)
    # None or an empty list falls back to the CPU execution provider.
    self._providers = providers or ["CPUExecutionProvider"]
    self._backend: InferenceBackend = get_backend(backend, providers=self._providers)

    # K2: Confidence tracking
    self._score_tracker = ScoreTracker(
        threshold=threshold,
        history_size=score_history_size,
    )
    self._confirm_required = confirm_count
    self._confirm_counter = 0

    # K3: Ensemble support
    # The scorer is only created here; its sessions are attached further
    # down, once the primary model has been loaded.
    self._ensemble: EnsembleScorer | None = None
    if models and len(models) > 0:
        if isinstance(fusion_strategy, str):
            fusion_strategy = FusionStrategy(fusion_strategy)
        self._ensemble = EnsembleScorer(
            strategy=fusion_strategy,
            weights=fusion_weights,
        )

    # K4: Noise profiler / adaptive threshold
    # An explicitly supplied profiler wins; otherwise one is created only
    # when adaptive thresholding was requested.
    self._adaptive_threshold = adaptive_threshold
    if noise_profiler is not None:
        self._noise_profiler: NoiseProfiler | None = noise_profiler
    elif adaptive_threshold:
        self._noise_profiler = NoiseProfiler(base_threshold=threshold)
    else:
        self._noise_profiler = None

    # K5: Speaker verification
    self._speaker_verify_fn = speaker_verify_fn

    # K7: Power manager
    self._power_manager = power_manager

    # Warn on deprecated models
    if model in MODEL_REGISTRY and "DEPRECATED" in MODEL_REGISTRY[model].description:
        import warnings

        warnings.warn(
            f"Model '{model}' is deprecated: {MODEL_REGISTRY[model].description}. "
            f"Use model='temporal_cnn' instead.",
            DeprecationWarning,
            stacklevel=2,
        )

    # Load models
    self._oww_backbone = self._create_oww_backbone()
    self._mlp_session = self._load_session(model)
    self._mlp_input_name = self._mlp_session.get_inputs()[0].name
    self._last_score = 0.0

    # Detect temporal vs MLP model from input shape
    mlp_input_shape = self._mlp_session.get_inputs()[0].shape
    if len(mlp_input_shape) == 3:
        # Temporal model: input is (batch, seq_len, embedding_dim)
        self._is_temporal = True
        # A non-int seq_len dimension (e.g. a symbolic/dynamic axis) falls
        # back to the module default.
        self._temporal_seq_len = (
            mlp_input_shape[1]
            if isinstance(mlp_input_shape[1], int)
            else _TEMPORAL_SEQ_LEN_DEFAULT
        )
        self._embedding_buffer: collections.deque[np.ndarray] = collections.deque(
            maxlen=self._temporal_seq_len,
        )
        logger.info(
            "Temporal model detected: seq_len=%d",
            self._temporal_seq_len,
        )
    else:
        # NOTE: _embedding_buffer is intentionally not created on this path;
        # users of it must check _is_temporal first.
        self._is_temporal = False
        self._temporal_seq_len = 0

    # K3: Load additional ensemble models
    if models and self._ensemble is not None:
        # Add primary model to ensemble
        self._ensemble.add_session(self._mlp_session, self._mlp_input_name)
        for extra_model in models:
            extra_session = self._load_session(extra_model)
            extra_input_name = extra_session.get_inputs()[0].name
            self._ensemble.add_session(extra_session, extra_input_name)

    logger.info(
        "WakeDetector initialized: model=%s, threshold=%.2f, backend=%s",
        model,
        threshold,
        self._backend.name,
    )
    self._warn_on_oww_backbone_change(self._resolve_model_path(model))
def close(self) -> None:
    """Release inference sessions and reset internal state.

    After calling close(), the detector should not be used for inference.
    This is called automatically when using WakeDetector as a context
    manager.

    Idempotent: a second call returns immediately. Without this guard the
    second call would run ``reset()``, which dereferences the backbone
    that the first call already released.
    """
    if self._oww_backbone is None:
        # Already closed -- nothing left to reset or release.
        return

    self.reset()
    # Release inference session references so the underlying runtime
    # (ONNX / TFLite) can free memory immediately rather than waiting
    # for garbage collection.
    self._mlp_session = None  # type: ignore[assignment]
    if self._ensemble is not None:
        self._ensemble.clear()
    self._oww_backbone = None  # type: ignore[assignment]
Release inference sessions and reset internal state.
After calling close(), the detector should not be used for inference. This is called automatically when using WakeDetector as a context manager.
def process(self, audio_frame: bytes | np.ndarray) -> float:
    """Score one 20ms audio frame for the wake word.

    In ensemble mode (K3) the returned value is the fused score. The
    score is also recorded for confidence tracking (K2) and reported to
    the power manager (K7) when one is configured.

    Thread-safe: protects internal state mutation with a lock.

    Raises:
        TypeError: If audio_frame is not bytes or ndarray.
        ValueError: If audio_frame is empty, malformed, or drastically
            larger than the supported streaming frame size.
    """
    # Convert the raw frame into the representation the model consumes,
    # then hand both the prepared and the original frame to the core.
    model_audio = self._prepare_model_audio(audio_frame)
    return self._process_core(model_audio, audio_frame)
Process a 20ms audio frame and return the wake word detection score.
If ensemble mode is active (K3), returns the fused score. The score is recorded for confidence tracking (K2) and reported to the power manager (K7) if configured.
Thread-safe: protects internal state mutation with a lock.
Raises:
- TypeError: If audio_frame is not bytes or ndarray.
- ValueError: If audio_frame is empty, malformed, or drastically larger than the supported streaming frame size.
def detect(self, audio_frame: bytes | np.ndarray, is_playing: bool = False) -> bool:
    """Process a frame and apply the full decision policy.

    Integrates adaptive threshold (K4), multi-window confirmation (K2),
    speaker verification (K5), and power management (K7) when configured.

    Thread-safe: protects internal state mutation with a lock.

    Args:
        audio_frame: Raw audio frame (bytes or ndarray).
        is_playing: Forwarded to the decision policy, which suppresses
            detections while playback is active.

    Returns:
        True only when the frame passes confirmation, the decision policy
        and (if configured) speaker verification; False otherwise,
        including when the power manager skips the frame.

    Raises:
        TypeError: If audio_frame is not bytes or ndarray.
        ValueError: If audio_frame is empty or has invalid format.
    """
    # G1: Input validation (single pass — process_core skips re-validation)
    pcm = validate_audio_chunk(audio_frame)

    # K7: Power management -- skip frame if power manager says so.
    # Checked before any further per-frame math so a skipped frame costs
    # nothing beyond validation.
    if self._power_manager is not None and not self._power_manager.should_process(pcm):
        return False

    # Evaluate the input-format check once; it drives both the RMS scaling
    # and the model normalization below.
    needs_normalization = self._needs_int16_normalization(audio_frame)

    # Compute RMS on int16-scale PCM for the rms_floor comparison.
    # rms_floor=1.0 is calibrated for int16 scale (speech ≈ 500–5000,
    # silence ≈ 0–5). Float32 input in [-1, 1] is scaled up so the
    # same rms_floor works regardless of input format.
    rms = float(np.sqrt(np.mean(pcm**2)))
    if not needs_normalization:
        # Float32/float64 input: RMS is in [0, ~0.7] — scale to int16 range
        rms *= 32768.0

    # Normalize for model inference
    model_pcm = pcm / 32768.0 if needs_normalization else pcm

    # K4: Update noise profiler and get adaptive threshold
    if self._noise_profiler is not None and self._adaptive_threshold:
        self._policy.threshold = self._noise_profiler.update(pcm)

    score = self._process_core(model_pcm, audio_frame)

    # K5: Pre-fetch embedding outside _lock to avoid ABBA deadlock
    # (_process_core acquires _backbone_lock -> _lock; we must not
    # acquire _backbone_lock while holding _lock).
    #
    # Trade-off: Under concurrent access, _get_embedding reads the
    # backbone's last_embedding which may have been overwritten by
    # another thread's _process_core call since our score was computed.
    # This means the embedding used for speaker verification may not
    # correspond to the score just returned by _process_core. This is
    # accepted for performance — taking _backbone_lock across both
    # _process_core and _get_embedding would serialize all detection,
    # and the mismatch is benign (embeddings from adjacent frames are
    # nearly identical in practice).
    speaker_embedding: np.ndarray | None = None
    if self._speaker_verify_fn is not None:
        speaker_embedding = self._get_embedding(audio_frame)

    with self._lock:
        # K2: Multi-window confirmation -- count consecutive frames at or
        # above the (possibly adapted) threshold.
        if score >= self._policy.threshold:
            self._confirm_counter += 1
        else:
            self._confirm_counter = 0

        if self._confirm_counter < self._confirm_required:
            return False

        if not self._policy.evaluate(score=score, rms=rms, is_playing=is_playing):
            return False

        # K5: Speaker verification post-detection
        # NOTE(review): the policy's evaluate() has already started the
        # cooldown window at this point, so a speaker-rejected detection
        # still consumes the cooldown -- confirm this is intended.
        if self._speaker_verify_fn is not None and speaker_embedding is not None:  # noqa: SIM102
            if not self._speaker_verify_fn(speaker_embedding.flatten()):
                logger.debug("Speaker verification rejected detection")
                return False

        self._confirm_counter = 0
        return True
Process a frame and apply the full decision policy.
Integrates adaptive threshold (K4), multi-window confirmation (K2), speaker verification (K5), and power management (K7) when configured.
Thread-safe: protects internal state mutation with a lock.
Raises:
- TypeError: If audio_frame is not bytes or ndarray.
- ValueError: If audio_frame is empty or has invalid format.
def reset_cooldown(self) -> None:
    """Clear only the cooldown timer; confirmation state and buffers are untouched."""
    policy = self._policy
    with self._lock:
        policy.reset_cooldown()
Reset the cooldown window without clearing confirmation state or buffers.
def reset(self) -> None:
    """Return the detector to a fresh state.

    Clears the cooldown, the confirmation counter, the last score, the
    score history, the backbone state and (for temporal models) the
    embedding buffer.

    Lock ordering: _backbone_lock then _lock, matching _process_core
    to prevent ABBA deadlock.
    """
    with self._backbone_lock:
        with self._lock:
            self._policy.reset_cooldown()
            self._confirm_counter = 0
            self._last_score = 0.0
            self._score_tracker.reset()
            self._oww_backbone.reset()
            if not self._is_temporal:
                # Non-temporal models have no embedding buffer to clear.
                return
            self._embedding_buffer.clear()
Reset cooldown, confirmation state, score history, and temporal buffers.
Lock ordering: _backbone_lock then _lock, matching _process_core to prevent ABBA deadlock.
def get_confidence(self) -> ConfidenceResult:
    """Classify the current detection state into a confidence result.

    The result covers the raw MLP score, the multi-window confirmation
    count, and a confidence level (LOW/MEDIUM/HIGH/CERTAIN).
    """
    seen = self._confirm_counter
    needed = self._confirm_required
    return self._score_tracker.classify(confirm_count=seen, confirm_required=needed)
Return a confidence assessment of the current detection state.
Includes the raw MLP score, multi-window confirmation count, and a classified confidence level (LOW/MEDIUM/HIGH/CERTAIN).
@property
def last_scores(self) -> tuple[float, ...]:
    """Recent score history as kept by the score tracker (most recent last)."""
    tracker = self._score_tracker
    return tracker.last_scores
Return the recent score history (most recent last).
def enroll_speaker(self, speaker_id: str, audio_frames: list[bytes | np.ndarray]) -> int:
    """Enroll a speaker from the embeddings of the given audio frames.

    Requires a ``SpeakerVerificationHook`` as the ``speaker_verify_fn``.

    Args:
        speaker_id: Unique identifier for the speaker.
        audio_frames: Audio frames to extract embeddings from.

    Returns:
        Total enrollment count for this speaker.

    Raises:
        RuntimeError: If no SpeakerVerificationHook is configured.
    """
    from violawake_sdk.speaker import SpeakerVerificationHook

    verifier = self._speaker_verify_fn
    if isinstance(verifier, SpeakerVerificationHook):
        # One flattened embedding per frame, in input order.
        flat_embeddings = [self._get_embedding(frame).flatten() for frame in audio_frames]
        return verifier.enroll_speaker(speaker_id, flat_embeddings)

    raise RuntimeError(
        "enroll_speaker requires a SpeakerVerificationHook as speaker_verify_fn"
    )
Enroll a speaker by extracting embeddings from audio frames.
Requires a SpeakerVerificationHook as the speaker_verify_fn.
Arguments:
- speaker_id: Unique identifier for the speaker.
- audio_frames: Audio frames to extract embeddings from.
Returns:
Total enrollment count for this speaker.
Raises:
- RuntimeError: If no SpeakerVerificationHook is configured.
def verify_speaker(self, audio_frame: bytes | np.ndarray) -> SpeakerVerifyResult:
    """Check the speaker of one audio frame against the enrolled profiles.

    Args:
        audio_frame: Audio frame to verify.

    Returns:
        SpeakerVerifyResult with match details.

    Raises:
        RuntimeError: If no SpeakerVerificationHook is configured.
    """
    from violawake_sdk.speaker import SpeakerVerificationHook

    verifier = self._speaker_verify_fn
    if isinstance(verifier, SpeakerVerificationHook):
        flat_embedding = self._get_embedding(audio_frame).flatten()
        return verifier.verify_speaker(flat_embedding)

    raise RuntimeError(
        "verify_speaker requires a SpeakerVerificationHook as speaker_verify_fn"
    )
Verify the speaker in an audio frame against enrolled profiles.
Arguments:
- audio_frame: Audio frame to verify.
Returns:
SpeakerVerifyResult with match details.
Raises:
- RuntimeError: If no SpeakerVerificationHook is configured.
@classmethod
def from_source(
    cls,
    source: AudioSource,
    model: str = "temporal_cnn",
    threshold: float = DEFAULT_THRESHOLD,
    cooldown_s: float = DEFAULT_COOLDOWN_S,
    **kwargs: Any,
) -> _SourceDetector:
    """Build a detector and bind it to an AudioSource.

    The returned object wraps a WakeDetector and provides a ``run()``
    method that reads frames from the source and runs detection.

    Args:
        source: Any object implementing the AudioSource protocol.
        model: Model name or path.
        threshold: Detection threshold.
        cooldown_s: Cooldown between detections.
        **kwargs: Additional WakeDetector keyword arguments.

    Returns:
        A _SourceDetector wrapping both the source and detector.
    """
    return _SourceDetector(
        detector=cls(model=model, threshold=threshold, cooldown_s=cooldown_s, **kwargs),
        source=source,
    )
Create a WakeDetector bound to an AudioSource.
The returned object wraps a WakeDetector and provides a run()
method that reads frames from the source and runs detection.
Arguments:
- source: Any object implementing the AudioSource protocol.
- model: Model name or path.
- threshold: Detection threshold.
- cooldown_s: Cooldown between detections.
- **kwargs: Additional WakeDetector keyword arguments.
Returns:
A _SourceDetector wrapping both the source and detector.
def stream_mic(self, device_index: int | None = None) -> Generator[bytes, None, None]:
    """Generator that yields 20ms audio frames captured from a microphone.

    Each yielded item is one read of ``FRAME_SAMPLES`` mono int16 samples
    at ``SAMPLE_RATE``. The stream and the PyAudio instance are released
    when the generator is closed or exhausted by an error.

    Args:
        device_index: PyAudio input device index, passed through as
            ``input_device_index``. ``None`` leaves the choice to PyAudio
            (its default input device).

    Raises:
        ImportError: If pyaudio is not installed.
        AudioCaptureError: If the microphone cannot be opened, or if
            reading fails ``_MAX_CONSECUTIVE_ERRORS`` times in a row.
    """
    try:
        import pyaudio
    except ImportError:
        raise ImportError(
            "pyaudio is required for microphone features. "
            "Install with: pip install violawake[audio]"
        ) from None
    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=FRAME_SAMPLES,
            input_device_index=device_index,
        )
    except Exception as e:
        pa.terminate()
        raise AudioCaptureError(
            f"Failed to open microphone: {e}. "
            f"Check that a microphone is connected and not in use by another application."
        ) from e
    logger.info("Microphone capture started (16kHz, mono, 20ms frames)")
    _MAX_CONSECUTIVE_ERRORS = 10
    consecutive_errors = 0
    try:
        while True:
            # Only the read itself is guarded. The yield stays outside the
            # try so an exception thrown into the generator by the consumer
            # propagates instead of being counted as a mic read error.
            try:
                frame = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
            except Exception as e:
                consecutive_errors += 1
                logger.warning(
                    "Mic read error (%d/%d): %s", consecutive_errors, _MAX_CONSECUTIVE_ERRORS, e
                )
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    raise AudioCaptureError(
                        f"Microphone read failed {_MAX_CONSECUTIVE_ERRORS} consecutive times. "
                        f"Last error: {e}"
                    ) from e
                continue
            consecutive_errors = 0
            yield frame
    finally:
        # Nested so that a failure in stop_stream()/close() can no longer
        # skip pa.terminate() and leak the PortAudio handle.
        try:
            try:
                stream.stop_stream()
            finally:
                stream.close()
        finally:
            pa.terminate()
            logger.info("Microphone capture stopped")
Generator that yields 20ms audio frames from the default microphone.
class AsyncWakeDetector:
    """Async wrapper around ``WakeDetector`` for asyncio-based applications.

    All CPU-bound inference is dispatched to a background thread via
    ``loop.run_in_executor``. The wrapper accepts the same constructor
    arguments as ``WakeDetector`` (as keyword arguments) and exposes
    ``detect``/``process`` with ``async`` signatures.

    Args:
        **kwargs: Forwarded to ``WakeDetector.__init__``.
    """

    def __init__(self, **kwargs: Any) -> None:
        # Assign the bookkeeping attributes first: if WakeDetector(**kwargs)
        # raises, __del__ still runs on the half-built object and must find
        # them.
        self._executor: ThreadPoolExecutor | None = None
        self._closed = False
        self._detector = WakeDetector(**kwargs)

    def _get_executor(self) -> ThreadPoolExecutor:
        """Return the single-worker executor, creating it on first use."""
        if self._executor is None:
            self._executor = ThreadPoolExecutor(max_workers=1)
        return self._executor

    async def __aenter__(self) -> AsyncWakeDetector:
        """Enter async context manager."""
        return self

    async def __aexit__(
        self, exc_type: type | None, exc_val: BaseException | None, exc_tb: object
    ) -> None:
        """Exit async context manager, shutting down the executor."""
        self.close()

    async def detect(
        self,
        audio_frame: bytes | np.ndarray,
        is_playing: bool = False,
    ) -> bool:
        """Async version of ``WakeDetector.detect``."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._get_executor(),
            lambda: self._detector.detect(audio_frame, is_playing),
        )

    async def process(
        self,
        audio_frame: bytes | np.ndarray,
    ) -> float:
        """Async version of ``WakeDetector.process``."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._get_executor(),
            lambda: self._detector.process(audio_frame),
        )

    async def stream(
        self,
        source: AsyncIterator[bytes | np.ndarray],
    ) -> AsyncIterator[bool]:
        """Async generator that yields detection results from an async audio source.

        Usage::

            async for detected in detector.stream(audio_source):
                if detected:
                    print("Wake word!")

        Args:
            source: An async iterator yielding audio frames.

        Yields:
            Boolean detection result for each frame.
        """
        async for frame in source:
            yield await self.detect(frame)

    def reset_cooldown(self) -> None:
        """Reset the cooldown window (delegates to WakeDetector public API)."""
        self._detector.reset_cooldown()

    @property
    def threshold(self) -> float:
        """Current detection threshold."""
        return self._detector.threshold

    def get_confidence(self) -> ConfidenceResult:
        """Return confidence assessment of the current detection state (K2)."""
        return self._detector.get_confidence()

    @property
    def last_scores(self) -> tuple[float, ...]:
        """Return the recent score history (most recent last)."""
        return self._detector.last_scores

    def close(self) -> None:
        """Shut down the background executor and release detector resources.

        Safe to call multiple times: the wrapped detector is closed only on
        the first call.
        """
        executor = getattr(self, "_executor", None)
        if executor is not None:
            executor.shutdown(wait=True)
            self._executor = None
        if getattr(self, "_closed", False):
            return
        self._closed = True
        self._detector.close()

    def __del__(self) -> None:
        # Use wait=False in __del__ to avoid blocking the GC/finalizer thread.
        # The explicit close() method still uses wait=True for graceful shutdown.
        # getattr: __init__ may have raised before _executor was assigned.
        executor = getattr(self, "_executor", None)
        if executor is not None:
            executor.shutdown(wait=False)
            self._executor = None
Async wrapper around WakeDetector for asyncio-based applications.
All CPU-bound inference is dispatched to a background thread via
loop.run_in_executor. The wrapper is fully transparent -- it
accepts the same constructor arguments and exposes the same methods
as WakeDetector, but with async signatures.
Arguments:
- **kwargs: Forwarded to
WakeDetector.__init__.
async def detect(
    self,
    audio_frame: bytes | np.ndarray,
    is_playing: bool = False,
) -> bool:
    """Run ``WakeDetector.detect`` on the background executor and await it."""

    def _run_detect() -> bool:
        return self._detector.detect(audio_frame, is_playing)

    event_loop = asyncio.get_running_loop()
    pending = event_loop.run_in_executor(self._get_executor(), _run_detect)
    return await pending
Async version of WakeDetector.detect.
async def process(
    self,
    audio_frame: bytes | np.ndarray,
) -> float:
    """Run ``WakeDetector.process`` on the background executor and await it."""

    def _run_process() -> float:
        return self._detector.process(audio_frame)

    event_loop = asyncio.get_running_loop()
    pending = event_loop.run_in_executor(self._get_executor(), _run_process)
    return await pending
Async version of WakeDetector.process.
async def stream(
    self,
    source: AsyncIterator[bytes | np.ndarray],
) -> AsyncIterator[bool]:
    """Yield one detection result per frame pulled from an async audio source.

    Usage::

        async for detected in detector.stream(audio_source):
            if detected:
                print("Wake word!")

    Args:
        source: An async iterator yielding audio frames.

    Yields:
        Boolean detection result for each frame.
    """
    async for audio_frame in source:
        detected = await self.detect(audio_frame)
        yield detected
Async generator that yields detection results from an async audio source.
Usage::
async for detected in detector.stream(audio_source):
if detected:
print("Wake word!")
Arguments:
- source: An async iterator yielding audio frames.
Yields:
Boolean detection result for each frame.
def reset_cooldown(self) -> None:
    """Clear the cooldown window by delegating to the wrapped detector."""
    wrapped = self._detector
    wrapped.reset_cooldown()
Reset the cooldown window (delegates to WakeDetector public API).
@property
def threshold(self) -> float:
    """Detection threshold currently reported by the wrapped detector."""
    wrapped = self._detector
    return wrapped.threshold
Current detection threshold.
def get_confidence(self) -> ConfidenceResult:
    """Return the wrapped detector's confidence assessment (K2), unmodified."""
    wrapped = self._detector
    return wrapped.get_confidence()
Return confidence assessment of the current detection state (K2).
@property
def last_scores(self) -> tuple[float, ...]:
    """Recent score history from the wrapped detector (most recent last)."""
    wrapped = self._detector
    return wrapped.last_scores
Return the recent score history (most recent last).
def close(self) -> None:
    """Stop the background executor, then close the wrapped detector.

    The executor is shut down with ``wait=True`` and the reference is cleared,
    so a repeated call skips the shutdown step. The wrapped detector's
    ``close()`` is invoked on every call.
    """
    worker_pool = self._executor
    if worker_pool is not None:
        worker_pool.shutdown(wait=True)
        self._executor = None
    self._detector.close()
Shut down the background executor and release detector resources.
Safe to call multiple times.
class WakeDecisionPolicy:
    """4-gate core decision pipeline (RMS floor, threshold, cooldown, playback suppression).

    Extended by WakeDetector with optional confirmation (K2), adaptive
    threshold (K4), and speaker verification (K5).

    Gate 1: Zero-input guard -- skip if RMS < rms_floor (silence / DC offset artifact)
    Gate 2: Score threshold  -- skip if model score < threshold
    Gate 3: Cooldown         -- ignore events within cooldown_s of last detection
    Gate 4: Listening gate   -- suppress during active playback (optional)

    A NaN ``rms`` or ``score`` is always rejected; it can never trigger a
    detection.
    """

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        cooldown_s: float = DEFAULT_COOLDOWN_S,
        rms_floor: float = 1.0,
    ) -> None:
        """Configure the gates.

        Args:
            threshold: Minimum model score for a detection, in [0.0, 1.0].
            cooldown_s: Seconds after a detection during which further
                detections are ignored.
            rms_floor: Frames whose RMS is below this value are rejected.

        Raises:
            ValueError: If ``threshold`` is outside [0.0, 1.0] (NaN included).
        """
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}")

        self.threshold = threshold
        self.cooldown_s = cooldown_s
        self.rms_floor = rms_floor
        # "Never detected" sentinel. time.monotonic() has an undefined origin,
        # so 0.0 could look like a recent detection while the clock is still
        # below cooldown_s; -inf is always outside the cooldown window.
        self._last_detection: float = float("-inf")

    def evaluate(
        self,
        score: float,
        rms: float = 100.0,
        is_playing: bool = False,
    ) -> bool:
        """Evaluate whether a wake word event should be triggered.

        Args:
            score: Model score for the current frame.
            rms: RMS energy of the current frame.
            is_playing: True while playback is active (Gate 4).

        Returns:
            True if all four gates pass. In that case the cooldown timestamp
            is updated and the detection is logged at INFO level.
        """
        # Written as "not >=" rather than "<" so that NaN fails the gate.
        if not (rms >= self.rms_floor):
            logger.debug("Gate 1 reject: RMS %.1f below floor %.1f", rms, self.rms_floor)
            return False
        if not (score >= self.threshold):
            return False
        now = time.monotonic()
        if now - self._last_detection < self.cooldown_s:
            logger.debug(
                "Gate 3 reject: cooldown active (%.1fs remaining)",
                self.cooldown_s - (now - self._last_detection),
            )
            return False
        if is_playing:
            logger.debug("Gate 4 reject: playback active")
            return False
        self._last_detection = now
        logger.info("Wake word detected! score=%.3f", score)
        return True

    def reset_cooldown(self) -> None:
        """Reset the cooldown window (useful for testing)."""
        self._last_detection = float("-inf")
4-gate core decision pipeline (RMS floor, threshold, cooldown, playback suppression).
Extended by WakeDetector with optional confirmation (K2), adaptive threshold (K4), and speaker verification (K5).
- Gate 1: Zero-input guard -- skip if RMS < 1.0 (silence / DC offset artifact)
- Gate 2: Score threshold -- skip if model score < threshold
- Gate 3: Cooldown -- ignore events within cooldown_s of last detection
- Gate 4: Listening gate -- suppress during active playback (optional)
def __init__(
    self,
    threshold: float = DEFAULT_THRESHOLD,
    cooldown_s: float = DEFAULT_COOLDOWN_S,
    rms_floor: float = 1.0,
) -> None:
    """Configure the decision gates.

    Args:
        threshold: Minimum model score for a detection, in [0.0, 1.0].
        cooldown_s: Seconds after a detection during which further
            detections are ignored.
        rms_floor: Frames whose RMS is below this value are rejected.

    Raises:
        ValueError: If ``threshold`` is outside [0.0, 1.0] (NaN included).
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}")

    self.threshold = threshold
    self.cooldown_s = cooldown_s
    self.rms_floor = rms_floor
    # "Never detected" sentinel. time.monotonic() has an undefined origin, so
    # 0.0 could look like a recent detection while the clock is still below
    # cooldown_s; -inf is always outside the cooldown window.
    self._last_detection: float = float("-inf")
def evaluate(
    self,
    score: float,
    rms: float = 100.0,
    is_playing: bool = False,
) -> bool:
    """Evaluate whether a wake word event should be triggered.

    Gates are checked in order: RMS floor, score threshold, cooldown,
    playback. A NaN ``rms`` or ``score`` is rejected.

    Args:
        score: Model score for the current frame.
        rms: RMS energy of the current frame.
        is_playing: True while playback is active.

    Returns:
        True if every gate passes. In that case ``_last_detection`` is set to
        the current monotonic time and the detection is logged at INFO level.
    """
    # Written as "not >=" rather than "<" so that NaN fails the gate; with
    # "<", a NaN score compares False and would fall through to a detection.
    if not (rms >= self.rms_floor):
        logger.debug("Gate 1 reject: RMS %.1f below floor %.1f", rms, self.rms_floor)
        return False
    if not (score >= self.threshold):
        return False
    now = time.monotonic()
    if now - self._last_detection < self.cooldown_s:
        logger.debug(
            "Gate 3 reject: cooldown active (%.1fs remaining)",
            self.cooldown_s - (now - self._last_detection),
        )
        return False
    if is_playing:
        logger.debug("Gate 4 reject: playback active")
        return False
    self._last_detection = now
    logger.info("Wake word detected! score=%.3f", score)
    return True
Evaluate whether a wake word event should be triggered.
def validate_audio_chunk(data: bytes | bytearray | np.ndarray) -> np.ndarray:
    """Validate and normalize an audio chunk for use with WakeDetector.

    Accepts raw int16 PCM (``bytes`` or ``bytearray``) or numpy arrays
    (int16, float32, float64). Returns a float32 numpy array suitable for
    processing. Sample values are not rescaled.

    Non-finite float samples (NaN/inf) are not an error: they are replaced
    with 0 and a warning is logged.

    Args:
        data: Audio chunk as int16 little-endian PCM bytes, or a 1-D numpy array.

    Returns:
        Validated float32 numpy array.

    Raises:
        TypeError: If data is not bytes, bytearray, or ndarray.
        ValueError: If data is empty, has an odd byte length, is not 1-D,
            has an unsupported dtype, or exceeds the maximum chunk size.
    """
    if isinstance(data, (bytes, bytearray)):
        if len(data) == 0:
            raise ValueError("Audio chunk is empty (0 bytes)")
        if len(data) % 2 != 0:
            raise ValueError(
                f"Audio bytes length must be even (int16 = 2 bytes/sample), got {len(data)}"
            )
        # "<i2" pins little-endian int16 as documented, independent of the
        # host's native byte order.
        pcm = np.frombuffer(data, dtype="<i2").astype(np.float32)
    elif isinstance(data, np.ndarray):
        if data.size == 0:
            raise ValueError("Audio chunk is empty (0 samples)")
        if data.ndim != 1:
            raise ValueError(
                f"Audio chunk must be 1-D, got {data.ndim}-D array with shape {data.shape}"
            )
        _ALLOWED_DTYPES = (np.int16, np.float32, np.float64)
        if data.dtype not in _ALLOWED_DTYPES:
            raise ValueError(
                f"Audio chunk dtype must be one of {[str(d) for d in _ALLOWED_DTYPES]}, "
                f"got {data.dtype}"
            )
        if np.issubdtype(data.dtype, np.floating) and not np.all(np.isfinite(data)):
            # Sanitize instead of rejecting: one bad sample should not drop the frame.
            data = np.where(np.isfinite(data), data, 0.0)
            logger.warning("Audio chunk contained non-finite values (NaN/inf); replaced with 0")
        pcm = data.astype(np.float32)
    else:
        raise TypeError(f"Audio chunk must be bytes or numpy ndarray, got {type(data).__name__}")

    if len(pcm) > _MAX_CHUNK_SAMPLES:
        raise ValueError(
            f"Audio chunk too large: {len(pcm)} samples "
            f"(max {_MAX_CHUNK_SAMPLES} = {_MAX_CHUNK_SAMPLES // SAMPLE_RATE}s at {SAMPLE_RATE}Hz)"
        )

    return pcm
Validate and normalize an audio chunk for use with WakeDetector.
Accepts bytes (int16 PCM) or numpy arrays (int16, float32, float64). Returns a float32 numpy array suitable for processing.
Arguments:
- data: Audio chunk as bytes (int16 little-endian PCM) or numpy array.
Returns:
Validated float32 numpy array.
Raises:
- TypeError: If data is not bytes or ndarray.
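- ValueError: If data is empty, has an odd byte length, is not 1-D, has an unsupported dtype, or exceeds the maximum chunk size. Non-finite values (NaN/inf) do not raise; they are replaced with 0 and a warning is logged.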
@dataclass(frozen=True)
class ConfidenceResult:
    """Result from get_confidence() with full detection context.

    Instances are frozen: fields cannot be reassigned after construction.

    Attributes:
        raw_score: The most recent MLP/CNN output score in [0.0, 1.0].
        confirm_count: Number of consecutive above-threshold scores in the
            current multi-window confirmation sequence.
        confirm_required: Total consecutive scores required for detection.
        confidence: Classified confidence level.
        score_history: Recent score history (most recent last).
    """

    # Field order is the positional constructor order; do not reorder.
    raw_score: float
    confirm_count: int
    confirm_required: int
    confidence: ConfidenceLevel
    score_history: tuple[float, ...]
Result from get_confidence() with full detection context.
Attributes:
- raw_score: The most recent MLP/CNN output score in [0.0, 1.0].
- confirm_count: Number of consecutive above-threshold scores in the current multi-window confirmation sequence.
- confirm_required: Total consecutive scores required for detection.
- confidence: Classified confidence level.
- score_history: Recent score history (most recent last).
class ConfidenceLevel(str, Enum):
    """Confidence classification for wake word detection.

    Members also subclass ``str``, so each compares equal to its string
    value (e.g. ``ConfidenceLevel.LOW == "LOW"``).
    """

    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CERTAIN = "CERTAIN"
Confidence classification for wake word detection.
class FusionStrategy(str, Enum):
    """Score fusion strategy for multi-model ensemble.

    Members also subclass ``str``, so each compares equal to its lowercase
    string value (e.g. ``FusionStrategy.MAX == "max"``).
    """

    AVERAGE = "average"
    MAX = "max"
    VOTING = "voting"
    WEIGHTED_AVERAGE = "weighted_average"
Score fusion strategy for multi-model ensemble.
class NoiseProfiler:
    """Estimates ambient noise and adjusts detection threshold.

    The profiler maintains a rolling window of RMS energy measurements.
    The noise floor is estimated as the 10th percentile of recent RMS values
    (capturing the quietest frames, which are likely ambient noise).

    Threshold adjustment logic:
    - High SNR (signal clearly above noise): lower threshold slightly to
      improve sensitivity.
    - Low SNR (signal barely above noise): raise threshold to reduce
      false alarms from noise bursts.
    - The adjusted threshold is always clamped to
      ``[min_threshold, max_threshold]``.

    Adaptive threshold bounds:
    - ``min_threshold`` defaults to ``0.60``. In very quiet environments, or
      when the current frame is far above the estimated noise floor, the
      threshold may be lowered to improve sensitivity, but it will never be
      reduced below this floor.
    - ``max_threshold`` defaults to ``0.95``. In very noisy environments, or
      when the current frame is close to the estimated noise floor, the
      threshold may be raised to suppress false accepts, but it will never be
      increased above this ceiling.
    - Before enough history is collected (fewer than 10 RMS frames), the
      profiler returns ``base_threshold`` with no adaptation.

    Args:
        base_threshold: The default detection threshold (e.g. 0.80).
        noise_window_s: Seconds of audio history for noise estimation.
        min_threshold: Floor for adaptive threshold. Default 0.60.
        max_threshold: Ceiling for adaptive threshold. Default 0.95.
        snr_boost_db: SNR above this value enables threshold lowering.
        snr_penalty_db: SNR below this value enables threshold raising.
        frames_per_second: Expected audio frames per second (default 50 for 20ms).
    """

    # Minimum number of RMS samples before any adaptation is applied.
    _MIN_HISTORY_FRAMES = 10

    def __init__(
        self,
        base_threshold: float = 0.80,
        noise_window_s: float = DEFAULT_NOISE_WINDOW_S,
        min_threshold: float = DEFAULT_MIN_THRESHOLD,
        max_threshold: float = DEFAULT_MAX_THRESHOLD,
        snr_boost_db: float = DEFAULT_SNR_BOOST_DB,
        snr_penalty_db: float = DEFAULT_SNR_PENALTY_DB,
        frames_per_second: float = 50.0,
    ) -> None:
        self._base_threshold = base_threshold
        self._min_threshold = min_threshold
        self._max_threshold = max_threshold
        self._snr_boost_db = snr_boost_db
        self._snr_penalty_db = snr_penalty_db

        # The rolling window holds at least one frame, whatever the inputs.
        window_frames = max(1, int(noise_window_s * frames_per_second))
        self._rms_history: deque[float] = deque(maxlen=window_frames)
        self._current_rms: float = 0.0
        self._noise_floor_rms: float = 0.0

    @property
    def base_threshold(self) -> float:
        """The unadjusted detection threshold."""
        return self._base_threshold

    @property
    def noise_floor(self) -> float:
        """Current estimated noise floor RMS."""
        return self._noise_floor_rms

    def update(self, audio_frame: np.ndarray) -> float:
        """Update noise estimate with a new audio frame and return adjusted threshold.

        The audio should be float32 values. Both normalized [-1,1] and int16-range
        float32 are accepted; the profiler works on relative ratios so absolute
        scale doesn't matter.

        Empty frames, and frames whose RMS is not finite (e.g. containing NaN),
        are ignored: the history is left untouched and the threshold for the
        current state is returned.

        Args:
            audio_frame: 1-D float32 audio samples (any length).

        Returns:
            The adaptively adjusted detection threshold.
        """
        # np.mean of an empty array is NaN; a NaN in the history would break
        # the percentile sort below, so such frames are never recorded.
        if audio_frame.size == 0:
            return self._compute_adjusted_threshold()
        rms = float(np.sqrt(np.mean(audio_frame.astype(np.float64) ** 2)))
        if not math.isfinite(rms):
            return self._compute_adjusted_threshold()

        self._current_rms = rms
        self._rms_history.append(rms)

        # Estimate noise floor as 10th percentile of recent RMS values; with
        # too little history fall back to the quietest frame seen so far.
        history_len = len(self._rms_history)
        if history_len >= self._MIN_HISTORY_FRAMES:
            self._noise_floor_rms = sorted(self._rms_history)[int(history_len * 0.10)]
        else:
            self._noise_floor_rms = min(self._rms_history)

        return self._compute_adjusted_threshold()

    def _compute_adjusted_threshold(self) -> float:
        """Compute threshold adjustment based on current SNR estimate."""
        # No adjustment if we don't have enough data
        if len(self._rms_history) < self._MIN_HISTORY_FRAMES:
            return self._base_threshold

        snr_db = self._estimate_snr_db()

        if snr_db > self._snr_boost_db:
            # High SNR: lower threshold proportionally (max 0.10 reduction)
            excess = snr_db - self._snr_boost_db
            reduction = min(0.10, excess * 0.01)
            adjusted = self._base_threshold - reduction
        elif snr_db < self._snr_penalty_db:
            # Low SNR: raise threshold proportionally (max 0.10 increase)
            deficit = self._snr_penalty_db - snr_db
            increase = min(0.10, deficit * 0.02)
            adjusted = self._base_threshold + increase
        else:
            adjusted = self._base_threshold

        return max(self._min_threshold, min(self._max_threshold, adjusted))

    def _estimate_snr_db(self) -> float:
        """Estimate signal-to-noise ratio in decibels.

        Uses the current frame RMS as signal and the noise floor as noise.

        Returns:
            ``snr_boost_db`` if the noise floor is effectively zero (no noise
            estimate yet), 0.0 if the current frame RMS is effectively zero,
            otherwise ``20 * log10(current_rms / noise_floor)``.
        """
        if self._noise_floor_rms < 1e-10:
            # No noise estimate yet. The boost branch requires strictly more
            # than snr_boost_db, so this value never triggers it.
            return self._snr_boost_db
        if self._current_rms < 1e-10:
            return 0.0

        ratio = self._current_rms / self._noise_floor_rms
        return 20.0 * math.log10(max(ratio, 1e-10))

    def get_profile(self) -> NoiseProfile:
        """Return a snapshot of the current noise state.

        Returns:
            NoiseProfile with noise floor, signal RMS, SNR, and adjusted threshold.
        """
        snr_db = self._estimate_snr_db()
        adjusted = self._compute_adjusted_threshold()

        return NoiseProfile(
            noise_rms=self._noise_floor_rms,
            signal_rms=self._current_rms,
            snr_db=snr_db,
            adjusted_threshold=adjusted,
            base_threshold=self._base_threshold,
        )

    def reset(self) -> None:
        """Clear noise history and reset estimates."""
        self._rms_history.clear()
        self._current_rms = 0.0
        self._noise_floor_rms = 0.0
Estimates ambient noise and adjusts detection threshold.
The profiler maintains a rolling window of RMS energy measurements. The noise floor is estimated as the 10th percentile of recent RMS values (capturing the quietest frames, which are likely ambient noise).
Threshold adjustment logic:
- High SNR (signal clearly above noise): lower threshold slightly to improve sensitivity.
- Low SNR (signal barely above noise): raise threshold to reduce false alarms from noise bursts.
- The adjusted threshold is always clamped to
[min_threshold, max_threshold].
Adaptive threshold bounds:
- `min_threshold` defaults to `0.60`. In very quiet environments, or when the current frame is far above the estimated noise floor, the threshold may be lowered to improve sensitivity, but it will never be reduced below this floor.
- `max_threshold` defaults to `0.95`. In very noisy environments, or when the current frame is close to the estimated noise floor, the threshold may be raised to suppress false accepts, but it will never be increased above this ceiling.
- Before enough history is collected (fewer than 10 RMS frames), the profiler returns `base_threshold` with no adaptation.
Arguments:
- base_threshold: The default detection threshold (e.g. 0.80).
- noise_window_s: Seconds of audio history for noise estimation.
- min_threshold: Floor for adaptive threshold. Default 0.60.
- max_threshold: Ceiling for adaptive threshold. Default 0.95.
- snr_boost_db: SNR above this value enables threshold lowering.
- snr_penalty_db: SNR below this value enables threshold raising.
- frames_per_second: Expected audio frames per second (default 50 for 20ms).
def __init__(
    self,
    base_threshold: float = 0.80,
    noise_window_s: float = DEFAULT_NOISE_WINDOW_S,
    min_threshold: float = DEFAULT_MIN_THRESHOLD,
    max_threshold: float = DEFAULT_MAX_THRESHOLD,
    snr_boost_db: float = DEFAULT_SNR_BOOST_DB,
    snr_penalty_db: float = DEFAULT_SNR_PENALTY_DB,
    frames_per_second: float = 50.0,
) -> None:
    """Store the threshold configuration and allocate the RMS history window.

    The window capacity is ``int(noise_window_s * frames_per_second)``,
    with a minimum of one frame.
    """
    # Rolling RMS window, sized in frames; never smaller than one entry.
    history_capacity = int(noise_window_s * frames_per_second)
    if history_capacity < 1:
        history_capacity = 1
    self._rms_history: deque[float] = deque(maxlen=history_capacity)

    # Running estimates start at zero until the first update() call.
    self._current_rms: float = 0.0
    self._noise_floor_rms: float = 0.0

    # Threshold configuration.
    self._base_threshold = base_threshold
    self._min_threshold = min_threshold
    self._max_threshold = max_threshold

    # SNR breakpoints, in dB.
    self._snr_boost_db = snr_boost_db
    self._snr_penalty_db = snr_penalty_db
@property
def base_threshold(self) -> float:
    """Configured detection threshold before any noise-based adjustment."""
    configured = self._base_threshold
    return configured
The unadjusted detection threshold.
@property
def noise_floor(self) -> float:
    """Most recently estimated noise-floor RMS."""
    estimate = self._noise_floor_rms
    return estimate
Current estimated noise floor RMS.
def update(self, audio_frame: np.ndarray) -> float:
    """Update noise estimate with a new audio frame and return adjusted threshold.

    The audio should be float32 values. Both normalized [-1,1] and int16-range
    float32 are accepted; the profiler works on relative ratios so absolute
    scale doesn't matter.

    Empty frames, and frames whose RMS is not finite (e.g. containing NaN),
    are ignored: the history is left untouched and the threshold for the
    current state is returned.

    Args:
        audio_frame: 1-D float32 audio samples (any length).

    Returns:
        The adaptively adjusted detection threshold.
    """
    # np.mean of an empty array is NaN; a NaN in the history would break the
    # percentile sort below, so such frames are never recorded.
    if audio_frame.size == 0:
        return self._compute_adjusted_threshold()
    rms = float(np.sqrt(np.mean(audio_frame.astype(np.float64) ** 2)))
    if not np.isfinite(rms):
        return self._compute_adjusted_threshold()

    self._current_rms = rms
    self._rms_history.append(rms)

    # Estimate noise floor as 10th percentile of recent RMS values; with fewer
    # than 10 samples fall back to the quietest frame seen so far. The history
    # is never empty here because a value was just appended.
    history_len = len(self._rms_history)
    if history_len >= 10:
        self._noise_floor_rms = sorted(self._rms_history)[int(history_len * 0.10)]
    else:
        self._noise_floor_rms = min(self._rms_history)

    return self._compute_adjusted_threshold()
Update noise estimate with a new audio frame and return adjusted threshold.
The audio should be float32 values. Both normalized [-1,1] and int16-range float32 are accepted; the profiler works on relative ratios so absolute scale doesn't matter.
Arguments:
- audio_frame: 1-D float32 audio samples (any length).
Returns:
The adaptively adjusted detection threshold.
def get_profile(self) -> NoiseProfile:
    """Build a ``NoiseProfile`` snapshot from the profiler's current state.

    Returns:
        NoiseProfile carrying the noise floor, the current signal RMS, the
        estimated SNR, and both the adjusted and base thresholds.
    """
    current_snr = self._estimate_snr_db()
    threshold_now = self._compute_adjusted_threshold()
    snapshot_fields = dict(
        noise_rms=self._noise_floor_rms,
        signal_rms=self._current_rms,
        snr_db=current_snr,
        adjusted_threshold=threshold_now,
        base_threshold=self._base_threshold,
    )
    return NoiseProfile(**snapshot_fields)
Return a snapshot of the current noise state.
Returns:
NoiseProfile with noise floor, signal RMS, SNR, and adjusted threshold.
class PowerManager:
    """Energy-aware inference controller.

    Reduces inference frequency based on battery level, silence detection,
    and explicit duty cycling configuration.

    Modes of power saving:
    1. **Duty cycling**: Process every Nth frame when idle (no recent detections).
       When a score at or above ``activity_threshold`` is reported, switches to
       full-rate processing for ``active_window_s`` seconds.
    2. **Silence skipping**: Skip inference when audio RMS is below
       ``silence_rms`` (no speech possible).
    3. **Battery-aware**: When on battery and below ``battery_low_pct``,
       increase the duty cycle factor by ``battery_multiplier``.

    All mutable counters and flags are guarded by a single ``threading.Lock``.

    Args:
        duty_cycle_n: Base duty cycle (process every Nth frame). Default 1 (no skipping).
        silence_rms: RMS threshold in int16 scale (typical range 0-32768) below which
            frames are skipped. Default 10.0 filters near-silence.
        activity_threshold: Score at or above which the system enters "active" mode.
            Default 0.3.
        active_window_s: Seconds to stay in full-rate mode after activity. Default 3.0.
        battery_low_pct: Battery percent below which power saving kicks in. Default 20.
        battery_multiplier: Multiply duty_cycle_n by this when on low battery. Default 3.
        check_battery_interval_s: How often to re-check battery. Default 60.

    Raises:
        ValueError: If ``duty_cycle_n`` is less than 1.
    """

    def __init__(
        self,
        duty_cycle_n: int = 1,
        silence_rms: float = 10.0,
        activity_threshold: float = 0.3,
        active_window_s: float = 3.0,
        battery_low_pct: int = 20,
        battery_multiplier: int = 3,
        check_battery_interval_s: float = 60.0,
    ) -> None:
        if duty_cycle_n < 1:
            raise ValueError(f"duty_cycle_n must be >= 1, got {duty_cycle_n}")

        # Configuration: written once here and only read afterwards.
        self._base_duty = duty_cycle_n
        self._silence_rms = silence_rms
        self._activity_threshold = activity_threshold
        self._active_window_s = active_window_s
        self._battery_low_pct = battery_low_pct
        self._battery_multiplier = battery_multiplier
        self._check_interval = check_battery_interval_s

        # Lock protects all mutable state below
        self._lock = threading.Lock()

        # State
        self._frame_counter = 0
        self._frames_processed = 0
        self._frames_skipped = 0
        self._silence_skipped = 0
        self._last_activity_time = 0.0
        self._is_active = False

        # Battery state (cached)
        # NOTE(review): -1 presumably means "battery level unknown"; the
        # battery-scaling check requires a percentage >= 0 -- confirm
        # against _get_battery_info().
        self._battery_pct = -1
        self._is_on_battery = False
        self._last_battery_check = 0.0

    @property
    def effective_duty_cycle(self) -> int:
        """Current effective duty cycle considering battery and activity state."""
        with self._lock:
            return self._effective_duty_cycle_unlocked()

    def _effective_duty_cycle_unlocked(self) -> int:
        """Compute duty cycle without acquiring the lock (caller must hold it).

        Returns 1 while active; otherwise the base duty cycle, multiplied by
        ``battery_multiplier`` when on battery with a known level below
        ``battery_low_pct``. The result is never less than 1.
        """
        if self._is_active:
            return 1  # Full rate when active

        base = self._base_duty

        # Battery scaling
        if self._is_on_battery and 0 <= self._battery_pct < self._battery_low_pct:
            base = base * self._battery_multiplier

        return max(1, base)

    def should_process(self, audio_frame: np.ndarray) -> bool:
        """Decide whether this frame should be processed or skipped.

        Call this before running inference. If it returns False, skip the
        frame to save CPU/power.

        Every call increments the frame counter and exactly one of the
        processed/skipped counters. The silence gate is checked before the
        duty-cycle gate, so a silent frame is skipped even in active mode.

        Args:
            audio_frame: 1-D audio samples (int16-range float32 or actual int16).

        Returns:
            True if inference should run on this frame.
        """
        # Compute RMS outside the lock (pure computation on immutable input)
        rms = float(np.sqrt(np.mean(audio_frame.astype(np.float32) ** 2)))

        with self._lock:
            self._frame_counter += 1

            # Periodically check battery
            # NOTE(review): _get_battery_info() runs while the lock is held;
            # it presumably returns (percent, is_on_battery) -- confirm.
            now = time.monotonic()
            if now - self._last_battery_check > self._check_interval:
                self._battery_pct, self._is_on_battery = _get_battery_info()
                self._last_battery_check = now

            # Check if active window has expired
            if self._is_active and (now - self._last_activity_time > self._active_window_s):
                self._is_active = False

            # Silence gate: skip if audio is very quiet
            if rms < self._silence_rms:
                self._frames_skipped += 1
                self._silence_skipped += 1
                return False

            # Duty cycling: process every Nth frame
            # The counter includes silence-skipped frames, so "every Nth" is
            # measured over all frames received.
            duty = self._effective_duty_cycle_unlocked()
            if duty > 1 and (self._frame_counter % duty) != 0:
                self._frames_skipped += 1
                return False

            self._frames_processed += 1
            return True

    def report_score(self, score: float) -> None:
        """Report a detection score to the power manager.

        If the score is at or above the activity threshold, the manager marks
        itself active and records the current monotonic time; ``should_process``
        then runs at full rate until ``active_window_s`` seconds have passed.

        Args:
            score: Detection score from the model.
        """
        if score >= self._activity_threshold:
            with self._lock:
                self._is_active = True
                self._last_activity_time = time.monotonic()

    def get_state(self) -> PowerState:
        """Return current power management state snapshot.

        ``effective_rate`` is processed / (processed + skipped), or 0.0 when
        no frames have been counted yet.
        """
        with self._lock:
            total = self._frames_processed + self._frames_skipped
            # Returns 0.0 if no frames received (no data yet to measure).
            rate = self._frames_processed / total if total > 0 else 0.0

            return PowerState(
                battery_percent=self._battery_pct,
                is_on_battery=self._is_on_battery,
                duty_cycle_n=self._effective_duty_cycle_unlocked(),
                frames_processed=self._frames_processed,
                frames_skipped=self._frames_skipped,
                silence_skipped=self._silence_skipped,
                effective_rate=rate,
            )

    def reset(self) -> None:
        """Reset the frame counters and the activity state.

        The cached battery reading and its timestamp are left unchanged.
        """
        with self._lock:
            self._frame_counter = 0
            self._frames_processed = 0
            self._frames_skipped = 0
            self._silence_skipped = 0
            self._last_activity_time = 0.0
            self._is_active = False
Energy-aware inference controller.
Reduces inference frequency based on battery level, silence detection, and explicit duty cycling configuration.
Modes of power saving:
- Duty cycling: Process every Nth frame when idle (no recent detections). When a score at or above `activity_threshold` is detected, switches to full-rate processing for `active_window_s` seconds.
- Silence skipping: Skip inference when audio RMS is below `silence_rms` (no speech possible).
- Battery-aware: When on battery and below `battery_low_pct`, increase the duty cycle factor by `battery_multiplier`.
Arguments:
- duty_cycle_n: Base duty cycle (process every Nth frame). Default 1 (no skipping).
- silence_rms: RMS threshold in int16 scale (typical range 0-32768) below which frames are skipped. Default 10.0 filters near-silence.
- activity_threshold: Score above which the system enters "active" mode. Default 0.3.
- active_window_s: Seconds to stay in full-rate mode after activity. Default 3.0.
- battery_low_pct: Battery percent below which power saving kicks in. Default 20.
- battery_multiplier: Multiply duty_cycle_n by this when on low battery. Default 3.
- check_battery_interval_s: How often to re-check battery. Default 60.
125 def __init__( 126 self, 127 duty_cycle_n: int = 1, 128 silence_rms: float = 10.0, 129 activity_threshold: float = 0.3, 130 active_window_s: float = 3.0, 131 battery_low_pct: int = 20, 132 battery_multiplier: int = 3, 133 check_battery_interval_s: float = 60.0, 134 ) -> None: 135 if duty_cycle_n < 1: 136 raise ValueError(f"duty_cycle_n must be >= 1, got {duty_cycle_n}") 137 138 self._base_duty = duty_cycle_n 139 self._silence_rms = silence_rms 140 self._activity_threshold = activity_threshold 141 self._active_window_s = active_window_s 142 self._battery_low_pct = battery_low_pct 143 self._battery_multiplier = battery_multiplier 144 self._check_interval = check_battery_interval_s 145 146 # Lock protects all mutable state below 147 self._lock = threading.Lock() 148 149 # State 150 self._frame_counter = 0 151 self._frames_processed = 0 152 self._frames_skipped = 0 153 self._silence_skipped = 0 154 self._last_activity_time = 0.0 155 self._is_active = False 156 157 # Battery state (cached) 158 self._battery_pct = -1 159 self._is_on_battery = False 160 self._last_battery_check = 0.0
@property
def effective_duty_cycle(self) -> int:
    """Duty cycle currently in effect, computed while holding the state lock."""
    self._lock.acquire()
    try:
        return self._effective_duty_cycle_unlocked()
    finally:
        self._lock.release()
Current effective duty cycle considering battery and activity state.
def should_process(self, audio_frame: np.ndarray) -> bool:
    """Decide whether this frame should be processed or skipped.

    Call this before running inference. If it returns False, skip the
    frame to save CPU/power.

    Every call increments the frame counter and exactly one of the
    processed/skipped counters. Empty frames, and frames whose RMS is not a
    number, are treated as silence and skipped.

    Args:
        audio_frame: 1-D audio samples (int16-range float32 or actual int16).

    Returns:
        True if inference should run on this frame.
    """
    # Compute RMS outside the lock (pure computation on the input frame).
    # An empty frame has no defined RMS (np.mean would give NaN plus a
    # RuntimeWarning), so it is flagged here and skipped below.
    is_empty = audio_frame.size == 0
    rms = 0.0 if is_empty else float(np.sqrt(np.mean(audio_frame.astype(np.float32) ** 2)))

    with self._lock:
        self._frame_counter += 1

        # Periodically check battery
        now = time.monotonic()
        if now - self._last_battery_check > self._check_interval:
            self._battery_pct, self._is_on_battery = _get_battery_info()
            self._last_battery_check = now

        # Check if active window has expired
        if self._is_active and (now - self._last_activity_time > self._active_window_s):
            self._is_active = False

        # Silence gate: skip if audio is very quiet. Written as "not >=" so a
        # NaN RMS is also skipped rather than passed on to inference.
        if is_empty or not (rms >= self._silence_rms):
            self._frames_skipped += 1
            self._silence_skipped += 1
            return False

        # Duty cycling: process every Nth frame
        duty = self._effective_duty_cycle_unlocked()
        if duty > 1 and (self._frame_counter % duty) != 0:
            self._frames_skipped += 1
            return False

        self._frames_processed += 1
        return True
Decide whether this frame should be processed or skipped.
Call this before running inference. If it returns False, skip the frame to save CPU/power.
Arguments:
- audio_frame: 1-D audio samples (int16-range float32 or actual int16).
Returns:
True if inference should run on this frame.
def report_score(self, score: float) -> None:
    """Feed a model score to the power manager.

    A score at or above the activity threshold marks the manager active and
    stamps the current monotonic time; any other score is ignored.

    Args:
        score: Detection score from the model.
    """
    # "not >=" (instead of "<") so a NaN score is ignored as well.
    if not score >= self._activity_threshold:
        return

    self._lock.acquire()
    try:
        self._is_active = True
        self._last_activity_time = time.monotonic()
    finally:
        self._lock.release()
Report a detection score to the power manager.
If the score is above the activity threshold, the manager switches
to full-rate processing mode for active_window_s seconds.
Arguments:
- score: Detection score from the model.
def get_state(self) -> PowerState:
    """Build a ``PowerState`` snapshot while holding the state lock.

    ``effective_rate`` is processed / (processed + skipped), or 0.0 when no
    frames have been counted yet.
    """
    with self._lock:
        processed = self._frames_processed
        skipped = self._frames_skipped
        seen = processed + skipped

        if seen > 0:
            processed_fraction = processed / seen
        else:
            # Nothing measured yet.
            processed_fraction = 0.0

        return PowerState(
            battery_percent=self._battery_pct,
            is_on_battery=self._is_on_battery,
            duty_cycle_n=self._effective_duty_cycle_unlocked(),
            frames_processed=processed,
            frames_skipped=skipped,
            silence_skipped=self._silence_skipped,
            effective_rate=processed_fraction,
        )
Return current power management state snapshot.
def reset(self) -> None:
    """Zero the frame counters and clear the activity state, under the lock.

    The cached battery reading is not touched.
    """
    with self._lock:
        # Activity tracking back to idle.
        self._is_active = False
        self._last_activity_time = 0.0
        # Frame counters back to zero.
        self._frame_counter = self._frames_processed = 0
        self._frames_skipped = self._silence_skipped = 0
Reset all counters and state.
class VADEngine:
    """Voice Activity Detection engine.

    Auto-selects the best available backend unless explicitly specified.
    Can be used as a context manager; leaving the ``with`` block calls
    ``close()``.

    Example::

        vad = VADEngine(backend="webrtc")  # or "silero", "rms", "auto"
        prob = vad.process_frame(audio_20ms_bytes)
        is_speech = prob > 0.5
    """

    def __init__(
        self,
        backend: str | VADBackend = VADBackend.AUTO,
        **backend_kwargs: object,
    ) -> None:
        """Initialize the VAD engine.

        Args:
            backend: One of "auto", "webrtc", "silero", "rms".
                "auto" selects the best available backend.
            **backend_kwargs: Backend-specific arguments.
                For "webrtc": aggressiveness (0–3, default 2)
                For "rms": speech_threshold, silence_threshold
        """
        # Strings are converted to the enum first, so an unknown name fails
        # here rather than inside backend creation.
        if isinstance(backend, str):
            backend = VADBackend(backend)

        # _create_backend returns both the backend actually chosen and its
        # instance; the chosen name is what backend_name reports.
        self._backend_name, self._backend = _create_backend(backend, **backend_kwargs)

    @property
    def backend_name(self) -> str:
        """Name of the active backend."""
        return self._backend_name.value

    def process_frame(self, audio: bytes | np.ndarray) -> float:
        """Process a 20ms audio frame.

        Args:
            audio: 320 samples of 16kHz mono audio. Accepted formats:
                - bytes/bytearray: int16 PCM (640 bytes for 20ms)
                - np.ndarray float32/float64: assumed normalized to [-1.0, 1.0],
                  scaled by 32768 to int16. Use int16 dtype for int16-range data.
                - np.ndarray int16: converted to bytes directly

        Returns:
            Speech probability in [0.0, 1.0].
            1.0 = definitely speech, 0.0 = definitely silence.
        """
        # Input is converted by _coerce_to_bytes before it reaches the backend.
        audio_bytes = _coerce_to_bytes(audio)
        return self._backend.process_frame(audio_bytes)

    def is_speech(self, audio: bytes | np.ndarray, threshold: float = 0.5) -> bool:
        """Convenience method: True if speech probability is at or above threshold."""
        return self.process_frame(audio) >= threshold

    def reset(self) -> None:
        """Reset internal state (useful between utterances)."""
        self._backend.reset()

    def close(self) -> None:
        """Release backend resources.

        Only the reference to the backend is dropped; no method is called on
        the backend itself. After ``close()`` the backend attribute is None,
        so ``process_frame``, ``is_speech`` and ``reset`` must not be called.
        """
        self._backend = None  # type: ignore[assignment]

    def __enter__(self) -> VADEngine:
        """Enter sync context manager. Returns self."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager. Releases backend resources."""
        self.close()
Voice Activity Detection engine.
Auto-selects the best available backend unless explicitly specified.
Example::
vad = VADEngine(backend="webrtc") # or "silero", "rms", "auto"
prob = vad.process_frame(audio_20ms_bytes)
is_speech = prob > 0.5
def __init__(
    self,
    backend: str | VADBackend = VADBackend.AUTO,
    **backend_kwargs: object,
) -> None:
    """Set up the engine with the requested (or auto-selected) backend.

    Args:
        backend: One of "auto", "webrtc", "silero", "rms", given either as
            a string or as a ``VADBackend`` member. "auto" selects the best
            available backend.
        **backend_kwargs: Backend-specific arguments, forwarded unchanged.
            For "webrtc": aggressiveness (0–3, default 2)
            For "rms": speech_threshold, silence_threshold
    """
    # Strings are normalized to the enum before backend creation.
    requested = VADBackend(backend) if isinstance(backend, str) else backend
    resolved = _create_backend(requested, **backend_kwargs)
    self._backend_name, self._backend = resolved
Initialize the VAD engine.
Arguments:
- backend: One of "auto", "webrtc", "silero", "rms". "auto" selects the best available backend.
- **backend_kwargs: Backend-specific arguments. For "webrtc": aggressiveness (0–3, default 2) For "rms": speech_threshold, silence_threshold
@property
def backend_name(self) -> str:
    """String value of the active backend's identifier."""
    active = self._backend_name
    return active.value
Name of the active backend.
def process_frame(self, audio: bytes | np.ndarray) -> float:
    """Score one 20ms audio frame for speech.

    The frame is first coerced to raw bytes and then handed to the active
    backend.

    Args:
        audio: 320 samples of 16kHz mono audio. Accepted formats:
            - bytes/bytearray: int16 PCM (640 bytes for 20ms)
            - np.ndarray float32/float64: assumed normalized to [-1.0, 1.0],
              scaled by 32768 to int16. Use int16 dtype for int16-range data.
            - np.ndarray int16: converted to bytes directly

    Returns:
        Speech probability in [0.0, 1.0].
        1.0 = definitely speech, 0.0 = definitely silence.
    """
    return self._backend.process_frame(_coerce_to_bytes(audio))
Process a 20ms audio frame.
Arguments:
- audio: 320 samples of 16kHz mono audio. Accepted formats:
- bytes/bytearray: int16 PCM (640 bytes for 20ms)
- np.ndarray float32/float64: assumed normalized to [-1.0, 1.0], scaled by 32768 to int16. Use int16 dtype for int16-range data.
- np.ndarray int16: converted to bytes directly
Returns:
Speech probability in [0.0, 1.0]. 1.0 = definitely speech, 0.0 = definitely silence.
def is_speech(self, audio: bytes | np.ndarray, threshold: float = 0.5) -> bool:
    """Return True if the frame's speech probability is at or above ``threshold``.

    Convenience wrapper around ``process_frame()``. The comparison is
    inclusive (``>=``): a probability exactly equal to ``threshold`` counts
    as speech.

    Args:
        audio: A single audio frame, in any format ``process_frame()`` accepts.
        threshold: Minimum probability that is treated as speech.

    Returns:
        ``True`` when ``process_frame(audio) >= threshold``, else ``False``.
    """
    # bool() keeps the declared return type even if the backend yields a
    # NumPy scalar, whose comparison result would otherwise be np.bool_.
    return bool(self.process_frame(audio) >= threshold)
Convenience method: returns True if speech probability is at or above the threshold (the comparison is inclusive).
class TTSEngine:
    """On-device TTS using Kokoro-82M (Apache 2.0 model).

    Thread-safe: multiple threads can call ``synthesize()`` concurrently.
    Calls are serialized via ``_synthesis_lock`` since kokoro-onnx is not
    guaranteed to be thread-safe. Model initialization is separately guarded
    by ``_lock`` (lazy load on first use).

    Model files required (auto-downloaded on first use):
      - ``kokoro_v1_0.onnx`` — Kokoro-82M model (~326MB)
      - ``kokoro_voices_v1_0.bin`` — Voice embeddings (~28MB)

    Example::

        tts = TTSEngine(voice="af_heart")
        audio = tts.synthesize("Hello, world!")  # returns np.ndarray
        tts.play(audio)                          # blocking by default
        tts.play_async(audio)                    # optional non-blocking playback
    """

    def __init__(
        self,
        voice: str = DEFAULT_VOICE,
        speed: float = 1.0,
        sample_rate: int = TARGET_SAMPLE_RATE,
    ) -> None:
        """Initialize the TTS engine.

        Args:
            voice: Kokoro voice name. Default "af_heart".
                See ``AVAILABLE_VOICES`` for full list.
            speed: Speech speed multiplier. 1.0 = normal, 1.2 = 20% faster.
            sample_rate: Output sample rate. Default 16kHz (pipeline standard).
                Kokoro outputs 24kHz; resampled if different.

        Raises:
            ValueError: If ``voice`` is not in ``AVAILABLE_VOICES`` or
                ``speed`` is outside ``[0.1, 3.0]``.
        """
        if voice not in AVAILABLE_VOICES:
            raise ValueError(f"Unknown voice '{voice}'. Available: {', '.join(AVAILABLE_VOICES)}")

        # Written as a negated range check so that NaN is rejected too.
        if not (0.1 <= speed <= 3.0):
            raise ValueError(f"Speed must be between 0.1 and 3.0, got {speed}")

        self.voice = voice
        self.speed = speed
        self.sample_rate = sample_rate
        self._lock = threading.Lock()
        self._synthesis_lock = threading.Lock()
        self._kokoro: object | None = None

        # Lazy initialization — load model on first use
        logger.info("TTSEngine created: voice=%s, speed=%.1f", voice, speed)

    def _get_kokoro(self) -> object:
        """Lazy-load the Kokoro model (thread-safe)."""
        with self._lock:
            if self._kokoro is None:
                self._kokoro = self._load_kokoro()
            return self._kokoro

    def _load_kokoro(self) -> object:
        """Load the Kokoro ONNX model.

        Raises:
            ImportError: If ``kokoro_onnx`` is not installed.
            ModelNotFoundError: If either model file cannot be located.
            ModelLoadError: If ``kokoro_onnx.Kokoro`` fails to construct.
        """
        try:
            import kokoro_onnx
        except ImportError as e:
            raise ImportError(
                "kokoro-onnx is not installed. Install with: pip install 'violawake[tts]'"
            ) from e

        try:
            model_path = get_model_path("kokoro_v1_0")
            voices_path = get_model_path("kokoro_voices_v1_0")
        except FileNotFoundError as e:
            raise ModelNotFoundError(
                "Kokoro models not found. Run:\n"
                " violawake-download --model kokoro_v1_0\n"
                " violawake-download --model kokoro_voices_v1_0"
            ) from e

        try:
            kokoro = kokoro_onnx.Kokoro(str(model_path), str(voices_path))
        except Exception as e:
            raise ModelLoadError(f"Failed to load Kokoro model: {e}") from e

        logger.info("Kokoro-82M loaded: %s", model_path)
        return kokoro

    def synthesize(self, text: str) -> np.ndarray:
        """Synthesize text to audio.

        Args:
            text: Text to synthesize. May be multi-sentence.
                Long text is processed as a single batch call.

        Returns:
            Audio samples as float32 numpy array at ``self.sample_rate``.
            An empty array is returned for empty or whitespace-only text.

        Raises:
            RuntimeError: If the underlying model call fails.
        """
        if not text.strip():
            return np.zeros(0, dtype=np.float32)

        kokoro = self._get_kokoro()

        # Hold synthesis lock to serialize access to the kokoro model,
        # which is not guaranteed to be thread-safe by kokoro-onnx.
        with self._synthesis_lock:
            try:
                # kokoro-onnx API: returns (samples, sample_rate)
                audio, sr = kokoro.create(  # type: ignore[attr-defined]
                    text,
                    voice=self.voice,
                    speed=self.speed,
                    lang="en-us",
                )
            except Exception as e:
                logger.exception("TTS synthesis failed for text: %.50s...", text)
                raise RuntimeError(f"TTS synthesis failed: {e}") from e

        audio = np.asarray(audio, dtype=np.float32)

        # Resample if needed
        if sr != self.sample_rate:
            audio = self._resample(audio, sr, self.sample_rate)

        return audio

    def synthesize_chunked(self, text: str) -> Generator[np.ndarray, None, None]:
        """Synthesize text sentence-by-sentence for lower latency.

        Splits text at sentence boundaries and yields audio for each sentence
        as soon as it's synthesized. This allows playback to begin before
        the full text is processed — matching the pattern from production Viola.

        Args:
            text: Text to synthesize. May be multi-sentence.

        Yields:
            Audio chunks (one per sentence) as float32 numpy arrays.
        """
        for sentence in self._split_sentences(text):
            if sentence.strip():
                audio = self.synthesize(sentence)
                if audio.size > 0:
                    yield audio

    def play(self, audio: np.ndarray, *, blocking: bool = True) -> None:
        """Play audio through the default output device.

        Uses ``sounddevice`` when it can be imported and falls back to
        PyAudio otherwise.

        Args:
            audio: Float32 numpy array of audio samples.
            blocking: If True, wait for playback to finish. If False, return
                immediately after starting playback.

        Raises:
            ImportError: If neither playback backend is installed.
        """
        try:
            import sounddevice as sd
        except ImportError as sd_err:
            logger.debug("sounddevice not available (%s), falling back to pyaudio", sd_err)
            try:
                self._play_pyaudio(audio, blocking=blocking)
            except ImportError as e:
                raise ImportError(
                    "No audio playback backend is installed. "
                    "Install sounddevice with: pip install sounddevice "
                    "or install violawake[audio] for PyAudio playback."
                ) from e
            return

        # Copy to prevent mutation of caller's array during async playback
        sd.play(audio.copy(), samplerate=self.sample_rate, blocking=blocking)

    def play_async(self, audio: np.ndarray) -> None:
        """Play audio without blocking the calling thread."""
        self.play(audio, blocking=False)

    def _play_pyaudio(self, audio: np.ndarray, *, blocking: bool = True) -> None:
        """Play audio using pyaudio as fallback.

        Non-blocking playback is implemented by re-running this method in
        blocking mode on a daemon thread, using a copy of ``audio``.

        Raises:
            ImportError: If ``pyaudio`` is not installed.
        """
        try:
            import pyaudio
        except ImportError:
            raise ImportError(
                "pyaudio is required for audio playback. Install with: pip install violawake[audio]"
            ) from None

        if not blocking:
            thread = threading.Thread(
                target=self._play_pyaudio,
                args=(audio.copy(),),
                kwargs={"blocking": True},
                daemon=True,
            )
            thread.start()
            return

        # Clip before scaling so out-of-range samples cannot wrap around in int16.
        clipped = np.clip(audio, -1.0, 1.0)
        pcm = (clipped * 32767).astype(np.int16)
        pa = pyaudio.PyAudio()
        try:
            # Opening the stream is inside the outer try so that a failure
            # here (e.g. no output device) still terminates PyAudio.
            stream = pa.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                output=True,
            )
            try:
                stream.write(pcm.tobytes())
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            pa.terminate()

    @staticmethod
    def _resample(audio: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
        """Resample audio from ``src_rate`` to ``dst_rate`` using scipy.

        Raises:
            ImportError: If scipy is not installed.
        """
        import math

        try:
            from scipy.signal import resample_poly
        except ImportError as e:
            raise ImportError(
                "scipy is required for audio resampling. Install with: pip install scipy"
            ) from e
        # Reduce the rate ratio to lowest terms for the polyphase filter.
        gcd = math.gcd(src_rate, dst_rate)
        return resample_poly(audio, dst_rate // gcd, src_rate // gcd).astype(np.float32)

    @staticmethod
    def _split_sentences(text: str) -> list[str]:
        """Split text at sentence boundaries for chunked synthesis.

        A boundary is sentence-ending punctuation (``.``, ``!``, ``?``)
        followed by whitespace and an uppercase letter, or by the end of the
        string. Decimals ("3.14") and URLs are left intact because no
        whitespace follows their dots. A fragment whose last word is a
        common title abbreviation ("Dr.", "Mr.", "Mrs.", "Ms.", "Prof.") is
        joined back onto the following fragment, so "Dr. Smith" stays in
        one sentence.

        Returns:
            The non-empty sentences, each stripped of surrounding whitespace.
        """
        import re

        # Titles that are followed by a capitalized name and therefore look
        # like a sentence boundary to the regex below.
        titles = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof."}

        # Split on sentence-ending punctuation followed by space+uppercase or end of string
        pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])\s*$"

        sentences: list[str] = []
        pending = ""  # fragment ending in a title, waiting for its continuation
        for part in re.split(pattern, text):
            piece = part.strip() if part else ""
            if not piece:
                continue
            if pending:
                piece = f"{pending} {piece}"
                pending = ""
            if piece.rsplit(None, 1)[-1] in titles:
                pending = piece
            else:
                sentences.append(piece)
        if pending:
            # Text ended on a title; keep it rather than dropping content.
            sentences.append(pending)
        return sentences

    def close(self) -> None:
        """Release model resources.

        Takes ``_lock`` so the reference is not cleared while another thread
        is in the middle of lazily loading the model.
        """
        with self._lock:
            self._kokoro = None

    def __enter__(self) -> TTSEngine:
        """Enter sync context manager. Returns self."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager. Releases model resources."""
        self.close()
On-device TTS using Kokoro-82M (Apache 2.0 model).
Thread-safe: multiple threads can call synthesize() concurrently.
Calls are serialized via _synthesis_lock since kokoro-onnx is not
guaranteed to be thread-safe. Model initialization is separately guarded
by _lock (lazy load on first use).
Model files required (auto-downloaded on first use):
- kokoro_v1_0.onnx — Kokoro-82M model (~326MB)
- kokoro_voices_v1_0.bin — Voice embeddings (~28MB)
Example::
tts = TTSEngine(voice="af_heart")
audio = tts.synthesize("Hello, world!") # returns np.ndarray
tts.play(audio) # blocking by default
tts.play_async(audio) # optional non-blocking playback
def __init__(
    self,
    voice: str = DEFAULT_VOICE,
    speed: float = 1.0,
    sample_rate: int = TARGET_SAMPLE_RATE,
) -> None:
    """Validate the settings and prepare a lazily-loaded TTS engine.

    No model is loaded here; ``_kokoro`` starts as ``None``.

    Args:
        voice: Kokoro voice name. Default "af_heart".
            See ``AVAILABLE_VOICES`` for full list.
        speed: Speech speed multiplier. 1.0 = normal, 1.2 = 20% faster.
        sample_rate: Output sample rate. Default 16kHz (pipeline standard).
            Kokoro outputs 24kHz; resampled if different.

    Raises:
        ValueError: If ``voice`` is not in ``AVAILABLE_VOICES`` or ``speed``
            is outside ``[0.1, 3.0]``.
    """
    if voice not in AVAILABLE_VOICES:
        known = ", ".join(AVAILABLE_VOICES)
        raise ValueError(f"Unknown voice '{voice}'. Available: {known}")

    # Kept as a negated chained comparison so NaN fails the check as well.
    speed_ok = 0.1 <= speed <= 3.0
    if not speed_ok:
        raise ValueError(f"Speed must be between 0.1 and 3.0, got {speed}")

    # Model handle and the two locks (model load / synthesis).
    self._kokoro: object | None = None
    self._lock = threading.Lock()
    self._synthesis_lock = threading.Lock()

    # Public settings.
    self.voice, self.speed, self.sample_rate = voice, speed, sample_rate

    logger.info("TTSEngine created: voice=%s, speed=%.1f", voice, speed)
Initialize the TTS engine.
Arguments:
- voice: Kokoro voice name. Default "af_heart".
See AVAILABLE_VOICES for full list.
- speed: Speech speed multiplier. 1.0 = normal, 1.2 = 20% faster.
- sample_rate: Output sample rate. Default 16kHz (pipeline standard). Kokoro outputs 24kHz; resampled if different.
def synthesize(self, text: str) -> np.ndarray:
    """Turn ``text`` into a float32 waveform.

    Args:
        text: Text to synthesize. May be multi-sentence; the whole string is
            sent to the model in a single call.

    Returns:
        Audio samples as a float32 numpy array at ``self.sample_rate``.
        Empty or whitespace-only text yields an empty float32 array without
        touching the model.

    Raises:
        RuntimeError: If the model's ``create`` call raises; the original
            exception is chained as the cause.
    """
    if text.strip() == "":
        return np.zeros(0, dtype=np.float32)

    engine = self._get_kokoro()

    # Model calls are serialized: kokoro-onnx does not guarantee thread safety.
    with self._synthesis_lock:
        try:
            # kokoro-onnx returns a (samples, sample_rate) pair.
            samples, native_rate = engine.create(  # type: ignore[attr-defined]
                text,
                voice=self.voice,
                speed=self.speed,
                lang="en-us",
            )
        except Exception as e:
            logger.exception("TTS synthesis failed for text: %.50s...", text)
            raise RuntimeError(f"TTS synthesis failed: {e}") from e

    waveform = np.asarray(samples, dtype=np.float32)
    if native_rate == self.sample_rate:
        return waveform
    # Model rate differs from the requested output rate.
    return self._resample(waveform, native_rate, self.sample_rate)
Synthesize text to audio.
Arguments:
- text: Text to synthesize. May be multi-sentence. Long text is processed as a single batch call.
Returns:
Audio samples as float32 numpy array at self.sample_rate.
def synthesize_chunked(self, text: str) -> Generator[np.ndarray, None, None]:
    """Yield synthesized audio one sentence at a time.

    The text is split with ``_split_sentences`` and each sentence is passed
    to ``synthesize`` in order, so a consumer can start playback before the
    remaining sentences have been processed.

    Args:
        text: Text to synthesize. May be multi-sentence.

    Yields:
        One float32 numpy array per sentence. Blank sentences and sentences
        that produce no samples are skipped.
    """
    for sentence in self._split_sentences(text):
        if not sentence.strip():
            continue
        chunk = self.synthesize(sentence)
        if chunk.size > 0:
            yield chunk
Synthesize text sentence-by-sentence for lower latency.
Splits text at sentence boundaries and yields audio for each sentence as soon as it's synthesized. This allows playback to begin before the full text is processed — matching the pattern from production Viola.
Arguments:
- text: Text to synthesize. May be multi-sentence.
Yields:
Audio chunks (one per sentence) as float32 numpy arrays.
def play(self, audio: np.ndarray, *, blocking: bool = True) -> None:
    """Send audio to the default output device.

    ``sounddevice`` is preferred; if importing it fails, playback is
    delegated to ``_play_pyaudio``.

    Args:
        audio: Float32 numpy array of audio samples.
        blocking: If True, wait for playback to finish. If False, return
            immediately after starting playback.

    Raises:
        ImportError: If ``sounddevice`` is missing and the PyAudio fallback
            also raises ``ImportError``.
    """
    try:
        import sounddevice as sd
    except ImportError as sd_err:
        logger.debug("sounddevice not available (%s), falling back to pyaudio", sd_err)
        try:
            self._play_pyaudio(audio, blocking=blocking)
        except ImportError as e:
            raise ImportError(
                "No audio playback backend is installed. "
                "Install sounddevice with: pip install sounddevice "
                "or install violawake[audio] for PyAudio playback."
            ) from e
    else:
        # Hand sounddevice its own copy so the caller may keep mutating
        # their array while non-blocking playback is in progress.
        private_copy = audio.copy()
        sd.play(private_copy, samplerate=self.sample_rate, blocking=blocking)
Play audio through the default output device.
Arguments:
- audio: Float32 numpy array of audio samples.
- blocking: If True, wait for playback to finish. If False, return immediately after starting playback.
class STTEngine:
    """Speech-to-text transcription via faster-whisper.

    Thread-safe: ``WhisperModel`` is thread-safe for concurrent ``transcribe()`` calls.

    Model is loaded once and reused. First call includes model load time
    (~1-3s). Subsequent calls are ~380ms (base model, CPU, 3s audio).

    Example::

        stt = STTEngine(model="base")
        text = stt.transcribe(audio_np_float32)
        print(text)  # "what's the weather today"
    """

    def __init__(
        self,
        model: str = DEFAULT_MODEL,
        device: str = "cpu",
        compute_type: str = "int8",
        language: str | None = None,
        language_cache_ttl_s: float = 60.0,
    ) -> None:
        """Initialize the STT engine.

        Args:
            model: Whisper model size. One of: tiny, base, small, medium, large-v3.
                Default "base" — good accuracy/speed balance (WER ~9%).
            device: "cpu" or "cuda". Default "cpu".
            compute_type: CTranslate2 compute type. "int8" (default), "float16", "float32".
                "int8" is fastest on CPU with minimal accuracy loss.
            language: Force a specific language (e.g., "en"). None = auto-detect.
            language_cache_ttl_s: Cache detected language for N seconds to avoid
                per-call language detection overhead.

        Raises:
            ValueError: If ``model`` is not a key of ``MODEL_PROFILES``.
        """
        if model not in MODEL_PROFILES:
            available = ", ".join(MODEL_PROFILES.keys())
            raise ValueError(f"Unknown model '{model}'. Available: {available}")

        self.model_name = model
        self.device = device
        self.compute_type = compute_type
        self.forced_language = language
        self._language_cache: tuple[str, float] | None = None  # (lang, cached_at)
        self._language_cache_ttl = language_cache_ttl_s
        self._model: WhisperModel | None = None
        self._model_lock = threading.Lock()

        profile = MODEL_PROFILES[model]
        logger.info(
            "STTEngine created: model=%s, device=%s (WER~%.0f%%, %dMB)",
            model,
            device,
            profile["wer"],
            profile["vram_mb"],
        )

    def _get_model(self) -> WhisperModel:
        """Lazy-load the Whisper model on first use (thread-safe).

        Raises:
            ImportError: If ``faster_whisper`` is not installed.
        """
        # Work on a local reference throughout so that a concurrent close()
        # cannot turn the value we return into None.
        model = self._model
        if model is not None:
            return model

        with self._model_lock:
            # Another thread may have loaded the model while we waited
            # for the lock.
            model = self._model
            if model is not None:
                return model

            try:
                from faster_whisper import WhisperModel  # type: ignore[import]
            except ImportError as e:
                raise ImportError(
                    "faster-whisper is not installed. Install with: pip install 'violawake[stt]'"
                ) from e

            logger.info("Loading Whisper model '%s'...", self.model_name)
            t0 = time.perf_counter()
            model = WhisperModel(
                self.model_name,
                device=self.device,
                compute_type=self.compute_type,
            )
            self._model = model
            elapsed_ms = (time.perf_counter() - t0) * 1000
            logger.info("Whisper model loaded in %.0f ms", elapsed_ms)

        return model

    @staticmethod
    def _to_mono(audio: np.ndarray, channels_first: bool | None) -> np.ndarray:
        """Convert input to float32 and average multi-channel audio down to mono.

        Args:
            audio: 1-D mono samples or 2-D multi-channel samples.
            channels_first: ``True`` = (channels, samples); ``False`` =
                (samples, channels); anything else falls back to the shape
                heuristic (the smaller dimension is assumed to be channels).

        Returns:
            Float32 array; 1-D input is returned without averaging.
        """
        samples = np.asarray(audio, dtype=np.float32)
        if samples.ndim <= 1:
            return samples

        if channels_first is True:
            # Explicit: (channels, samples) — e.g. shape (2, 48000)
            channel_axis = 0
        elif channels_first is False:
            # Explicit: (samples, channels) — e.g. shape (48000, 2)
            channel_axis = 1
        else:
            # Legacy heuristic: channels axis is the smaller dimension.
            channel_axis = 0 if samples.shape[0] < samples.shape[1] else 1
        return samples.mean(axis=channel_axis)

    def _remember_language(
        self,
        requested: str | None,
        detected: str,
        probability: float,
    ) -> None:
        """Cache an auto-detected language when the detection is confident.

        Nothing is cached when a language was supplied for the call
        (``requested`` is not ``None``) or when ``probability`` is not above
        0.5. The cache write is protected by ``_model_lock``.
        """
        if requested is None and probability > 0.5:
            with self._model_lock:
                self._language_cache = (detected, time.monotonic())

    def transcribe(self, audio: np.ndarray) -> str:
        """Transcribe audio to text.

        Note:
            This method decodes with a fixed progressive temperature fallback
            of ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]``, which can trigger up to 6
            decoding passes and increase latency. It takes no decoding
            options; for low-latency use cases call
            ``transcribe_streaming(audio, temperature=[0.0])`` to get a
            single-pass configuration.

        Args:
            audio: Float32 numpy array at 16kHz mono. Values should be in [-1.0, 1.0].

        Returns:
            Transcribed text as string. Empty string if no speech detected.
        """
        result = self.transcribe_full(audio)
        return result.text

    def transcribe_streaming(
        self,
        audio: np.ndarray,
        channels_first: bool | None = None,
        beam_size: int = 5,
        best_of: int = 5,
        temperature: list[float] | None = None,
    ) -> Iterator[TranscriptSegment]:
        """Stream transcription segments as they become available.

        Uses faster-whisper's generator mode: ``model.transcribe()`` returns a
        ``(segments_iterator, info)`` tuple. This method yields each
        ``TranscriptSegment`` one at a time as faster-whisper decodes it,
        instead of collecting all segments first.

        This is useful when:
        - You want to display partial results before full transcription completes.
        - You need to pipe segments to a downstream consumer (TTS, logging, etc.)
          without waiting for the full buffer to finish.

        Note:
            Segments with ``no_speech_prob`` above ``NO_SPEECH_THRESHOLD`` are
            silently skipped (not yielded).

        Args:
            audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
            channels_first: Layout hint for 2-D stereo audio (same semantics as
                ``transcribe_full``).
            beam_size: Beam search width. Default 5.
            best_of: Number of candidates when sampling. Default 5.
            temperature: Temperature schedule. Default ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]``.

        Yields:
            TranscriptSegment — one per decoded segment, in time order.

        Example::

            stt = STTEngine(model="base")
            for seg in stt.transcribe_streaming(audio_np):
                print(f"[{seg.start:.1f}s] {seg.text}")
        """
        if temperature is None:
            temperature = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

        audio = self._to_mono(audio, channels_first)

        language = self._get_language()
        model = self._get_model()

        logger.debug("transcribe_streaming: starting generator on %d samples", len(audio))

        segments_gen, info = model.transcribe(
            audio,
            language=language,
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 500},
            word_timestamps=False,
            beam_size=beam_size,
            best_of=best_of,
            temperature=temperature,
        )

        # Update the language cache before consuming the generator so the
        # cache is primed for subsequent calls.
        self._remember_language(language, info.language, info.language_probability)

        for seg in segments_gen:
            if seg.no_speech_prob > NO_SPEECH_THRESHOLD:
                logger.debug(
                    "Skipping silent segment [%.1f-%.1f] no_speech_prob=%.2f",
                    seg.start,
                    seg.end,
                    seg.no_speech_prob,
                )
                continue

            text = seg.text.strip()
            logger.debug("Streaming segment [%.1f-%.1f]: '%s'", seg.start, seg.end, text)
            yield TranscriptSegment(
                text=text,
                start=seg.start,
                end=seg.end,
                no_speech_prob=seg.no_speech_prob,
            )

    def transcribe_full(
        self,
        audio: np.ndarray,
        channels_first: bool | None = None,
    ) -> TranscriptResult:
        """Transcribe audio and return full result with segments, timing, and metadata.

        Args:
            audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
            channels_first: Layout hint for 2-D stereo audio.
                ``True`` = (channels, samples) e.g. shape (2, 48000).
                ``False`` = (samples, channels) e.g. shape (48000, 2) — the
                standard layout.
                ``None`` (default) = fall back to a shape heuristic (smaller
                dimension is assumed to be channels). Prefer passing an
                explicit value to avoid ambiguity with short audio clips.

        Returns:
            TranscriptResult with text, segments, language, and no_speech_prob.
            ``no_speech_prob`` is the maximum over all segments; ``text`` is
            empty when that maximum exceeds ``NO_SPEECH_THRESHOLD``.
        """
        audio = self._to_mono(audio, channels_first)

        # Determine language (use cache if available)
        language = self._get_language()

        model = self._get_model()
        t0 = time.perf_counter()

        segments_gen, info = model.transcribe(
            audio,
            language=language,
            vad_filter=True,  # Use Silero VAD for silence removal
            vad_parameters={"min_silence_duration_ms": 500},
            word_timestamps=False,
            beam_size=5,
            best_of=5,
            temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],  # Progressive fallback
        )

        # Consume the generator (transcription happens here)
        segments = list(segments_gen)
        elapsed_ms = (time.perf_counter() - t0) * 1000

        # Update language cache (protected by _model_lock for thread safety)
        self._remember_language(language, info.language, info.language_probability)

        transcript_segments = [
            TranscriptSegment(
                text=s.text.strip(),
                start=s.start,
                end=s.end,
                no_speech_prob=s.no_speech_prob,
            )
            for s in segments
        ]

        full_text = " ".join(s.text for s in transcript_segments).strip()
        overall_no_speech = max((s.no_speech_prob for s in transcript_segments), default=0.0)

        # NOTE(review): this uses the maximum across segments, so a single
        # low-confidence segment blanks the whole transcript, whereas
        # transcribe_streaming() filters segment by segment — confirm this
        # difference is intended.
        if overall_no_speech > NO_SPEECH_THRESHOLD:
            logger.debug(
                "No speech detected (no_speech_prob=%.2f) — returning empty",
                overall_no_speech,
            )
            full_text = ""

        logger.debug(
            "Transcribed in %.0f ms: '%s'",
            elapsed_ms,
            full_text[:60] + "..." if len(full_text) > 60 else full_text,
        )

        return TranscriptResult(
            text=full_text,
            segments=transcript_segments,
            language=info.language,
            language_prob=info.language_probability,
            duration_s=info.duration,
            no_speech_prob=overall_no_speech,
        )

    def _get_language(self) -> str | None:
        """Return the forced or cached language, or None for auto-detection.

        Thread-safe: reads ``_language_cache`` under ``_model_lock``. A cached
        entry is used only while it is younger than the configured TTL.
        """
        if self.forced_language:
            return self.forced_language

        with self._model_lock:
            if self._language_cache is not None:
                lang, cached_at = self._language_cache
                if time.monotonic() - cached_at < self._language_cache_ttl:
                    return lang

        return None  # auto-detect

    def prewarm(self) -> None:
        """Load the model eagerly (avoids cold-start latency on first transcription)."""
        self._get_model()
        logger.info("STTEngine prewarmed: model '%s' loaded", self.model_name)

    def close(self) -> None:
        """Release model resources."""
        with self._model_lock:
            self._model = None

    def __enter__(self) -> STTEngine:
        """Enter sync context manager. Returns self."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager. Releases model resources."""
        self.close()
Speech-to-text transcription via faster-whisper.
Thread-safe: WhisperModel is thread-safe for concurrent transcribe() calls.
Model is loaded once and reused. First call includes model load time (~1-3s). Subsequent calls are ~380ms (base model, CPU, 3s audio).
Example::
stt = STTEngine(model="base")
text = stt.transcribe(audio_np_float32)
print(text) # "what's the weather today"
def __init__(
    self,
    model: str = DEFAULT_MODEL,
    device: str = "cpu",
    compute_type: str = "int8",
    language: str | None = None,
    language_cache_ttl_s: float = 60.0,
) -> None:
    """Validate the model name and record the engine's configuration.

    The Whisper model itself is not loaded here; ``_model`` starts as
    ``None``.

    Args:
        model: Whisper model size. One of: tiny, base, small, medium, large-v3.
            Default "base" — good accuracy/speed balance (WER ~9%).
        device: "cpu" or "cuda". Default "cpu".
        compute_type: CTranslate2 compute type. "int8" (default), "float16", "float32".
            "int8" is fastest on CPU with minimal accuracy loss.
        language: Force a specific language (e.g., "en"). None = auto-detect.
        language_cache_ttl_s: Cache detected language for N seconds to avoid
            per-call language detection overhead.

    Raises:
        ValueError: If ``model`` is not a key of ``MODEL_PROFILES``.
    """
    if model not in MODEL_PROFILES:
        raise ValueError(
            f"Unknown model '{model}'. Available: {', '.join(MODEL_PROFILES.keys())}"
        )

    # Lazily-created model handle and the lock guarding it.
    self._model: WhisperModel | None = None
    self._model_lock = threading.Lock()

    # Detected-language cache: (lang, cached_at) plus its time-to-live.
    self._language_cache: tuple[str, float] | None = None
    self._language_cache_ttl = language_cache_ttl_s

    # Public configuration.
    self.model_name, self.device = model, device
    self.compute_type = compute_type
    self.forced_language = language

    stats = MODEL_PROFILES[model]
    logger.info(
        "STTEngine created: model=%s, device=%s (WER~%.0f%%, %dMB)",
        model,
        device,
        stats["wer"],
        stats["vram_mb"],
    )
Initialize the STT engine.
Arguments:
- model: Whisper model size. One of: tiny, base, small, medium, large-v3. Default "base" — good accuracy/speed balance (WER ~9%).
- device: "cpu" or "cuda". Default "cpu".
- compute_type: CTranslate2 compute type. "int8" (default), "float16", "float32". "int8" is fastest on CPU with minimal accuracy loss.
- language: Force a specific language (e.g., "en"). None = auto-detect.
- language_cache_ttl_s: Cache detected language for N seconds to avoid per-call language detection overhead.
def transcribe(self, audio: np.ndarray) -> str:
    """Transcribe audio to text.

    Thin wrapper that runs ``transcribe_full()`` and returns only its
    ``text`` field.

    Note:
        Decoding uses a fixed progressive temperature fallback of
        ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]``, which can trigger up to 6
        decoding passes and increase latency. This method takes no decoding
        options; for low-latency use cases call
        ``transcribe_streaming(audio, temperature=[0.0])``, which accepts a
        single-pass temperature schedule.

    Args:
        audio: Float32 numpy array at 16kHz mono. Values should be in [-1.0, 1.0].

    Returns:
        Transcribed text as string. Empty string if no speech detected.
    """
    return self.transcribe_full(audio).text
Transcribe audio to text.
Note:
This engine uses a progressive temperature fallback of
[0.0, 0.2, 0.4, 0.6, 0.8, 1.0] during decoding, which can trigger up to 6 decoding passes and increase latency. For low-latency use cases, prefer a single-pass configuration such as passing temperature=[0.0] to transcribe_streaming().
Arguments:
- audio: Float32 numpy array at 16kHz mono. Values should be in [-1.0, 1.0].
Returns:
Transcribed text as string. Empty string if no speech detected.
def transcribe_streaming(
    self,
    audio: np.ndarray,
    channels_first: bool | None = None,
    beam_size: int = 5,
    best_of: int = 5,
    temperature: list[float] | None = None,
) -> Iterator[TranscriptSegment]:
    """Yield transcript segments one at a time as they are decoded.

    ``model.transcribe()`` returns a ``(segments_iterator, info)`` tuple;
    this generator walks that iterator and converts each accepted segment
    to a ``TranscriptSegment`` instead of collecting them all first.

    Useful when partial results should be shown, or piped to a downstream
    consumer (TTS, logging, etc.), before the whole buffer is transcribed.

    Note:
        Segments with ``no_speech_prob`` above ``NO_SPEECH_THRESHOLD`` are
        silently skipped (not yielded).

    Args:
        audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
        channels_first: Layout hint for 2-D stereo audio (same semantics as
            ``transcribe_full``).
        beam_size: Beam search width. Default 5.
        best_of: Number of candidates when sampling. Default 5.
        temperature: Temperature schedule. Default ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]``.

    Yields:
        TranscriptSegment — one per decoded segment, in time order.

    Example::

        stt = STTEngine(model="base")
        for seg in stt.transcribe_streaming(audio_np):
            print(f"[{seg.start:.1f}s] {seg.text}")
    """
    if temperature is None:
        temperature = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        # Pick the axis that holds the channels, then average it away.
        if channels_first is True:
            channel_axis = 0
        elif channels_first is False:
            channel_axis = 1
        elif audio.shape[0] < audio.shape[1]:
            # No hint: the smaller dimension is taken to be the channels.
            channel_axis = 0
        else:
            channel_axis = 1
        audio = audio.mean(axis=channel_axis)

    language = self._get_language()
    model = self._get_model()

    logger.debug("transcribe_streaming: starting generator on %d samples", len(audio))

    decode_options = {
        "language": language,
        "vad_filter": True,
        "vad_parameters": {"min_silence_duration_ms": 500},
        "word_timestamps": False,
        "beam_size": beam_size,
        "best_of": best_of,
        "temperature": temperature,
    }
    segments_gen, info = model.transcribe(audio, **decode_options)

    # Prime the language cache before the segment iterator is consumed, so
    # it is already available to subsequent calls.
    auto_detected = language is None
    if auto_detected and info.language_probability > 0.5:
        with self._model_lock:
            self._language_cache = (info.language, time.monotonic())

    for seg in segments_gen:
        if seg.no_speech_prob > NO_SPEECH_THRESHOLD:
            logger.debug(
                "Skipping silent segment [%.1f-%.1f] no_speech_prob=%.2f",
                seg.start,
                seg.end,
                seg.no_speech_prob,
            )
        else:
            text = seg.text.strip()
            logger.debug("Streaming segment [%.1f-%.1f]: '%s'", seg.start, seg.end, text)
            yield TranscriptSegment(
                text=text,
                start=seg.start,
                end=seg.end,
                no_speech_prob=seg.no_speech_prob,
            )
Stream transcription segments as they become available.
Uses faster-whisper's generator mode: model.transcribe() returns a
(segments_iterator, info) tuple. This method yields each
TranscriptSegment one at a time as faster-whisper decodes it,
instead of collecting all segments first.
This is useful when:
- You want to display partial results before full transcription completes.
- You need to pipe segments to a downstream consumer (TTS, logging, etc.) without waiting for the full buffer to finish.
Note:
Segments with no_speech_prob above NO_SPEECH_THRESHOLD are silently skipped (not yielded).
Arguments:
- audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
- channels_first: Layout hint for 2-D stereo audio (same semantics as transcribe_full).
- beam_size: Beam search width. Default 5.
- best_of: Number of candidates when sampling. Default 5.
- temperature: Temperature schedule. Default
[0.0, 0.2, 0.4, 0.6, 0.8, 1.0].
Yields:
TranscriptSegment — one per decoded segment, in time order.
Example::
stt = STTEngine(model="base")
for seg in stt.transcribe_streaming(audio_np):
print(f"[{seg.start:.1f}s] {seg.text}")
def transcribe_full(
    self,
    audio: np.ndarray,
    channels_first: bool | None = None,
) -> TranscriptResult:
    """Transcribe audio and return full result with segments, timing, and metadata.

    The returned ``text`` is built only from segments whose
    ``no_speech_prob`` does not exceed ``NO_SPEECH_THRESHOLD`` — the same
    per-segment rule ``transcribe_streaming`` applies — so a single silent
    segment no longer wipes out speech recognised elsewhere in the clip.
    ``segments`` still lists every decoded segment, and ``no_speech_prob``
    is still the maximum over all of them.

    Args:
        audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
        channels_first: Layout hint for 2-D stereo audio.
            ``True``  = (channels, samples) e.g. shape (2, 48000).
            ``False`` = (samples, channels) e.g. shape (48000, 2) — the
            standard layout.
            ``None`` (default) = fall back to a shape heuristic (smaller
            dimension is assumed to be channels). Prefer passing an
            explicit value to avoid ambiguity with short audio clips.

    Returns:
        TranscriptResult with text, segments, language, language_prob,
        duration_s and no_speech_prob. ``text`` is empty when no segment
        passes the no-speech filter.
    """
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        if channels_first is True:
            # Explicit: (channels, samples) — e.g. shape (2, 48000)
            channel_axis = 0
        elif channels_first is False:
            # Explicit: (samples, channels) — e.g. shape (48000, 2)
            channel_axis = 1
        else:
            # Legacy heuristic: channels axis is the smaller dimension.
            channel_axis = 0 if audio.shape[0] < audio.shape[1] else 1
        audio = audio.mean(axis=channel_axis)

    # Determine language (use cache if available)
    language = self._get_language()

    model = self._get_model()
    t0 = time.perf_counter()

    segments_gen, info = model.transcribe(
        audio,
        language=language,
        vad_filter=True,  # Use Silero VAD for silence removal
        vad_parameters={"min_silence_duration_ms": 500},
        word_timestamps=False,
        beam_size=5,
        best_of=5,
        temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],  # Progressive fallback
    )

    # Consume the generator (transcription happens here)
    segments = list(segments_gen)
    elapsed_ms = (time.perf_counter() - t0) * 1000

    # Update language cache (protected by _model_lock for thread safety)
    if language is None and info.language_probability > 0.5:
        with self._model_lock:
            self._language_cache = (info.language, time.monotonic())

    transcript_segments = [
        TranscriptSegment(
            text=s.text.strip(),
            start=s.start,
            end=s.end,
            no_speech_prob=s.no_speech_prob,
        )
        for s in segments
    ]

    overall_no_speech = max((s.no_speech_prob for s in transcript_segments), default=0.0)

    # Filter per segment rather than blanking everything on the maximum:
    # one silent stretch must not discard speech decoded in other segments.
    spoken_texts = [
        s.text for s in transcript_segments if not s.no_speech_prob > NO_SPEECH_THRESHOLD
    ]
    full_text = " ".join(spoken_texts).strip()

    if transcript_segments and not spoken_texts:
        logger.debug(
            "No speech detected (no_speech_prob=%.2f) — returning empty",
            overall_no_speech,
        )

    logger.debug(
        "Transcribed in %.0f ms: '%s'",
        elapsed_ms,
        full_text[:60] + "..." if len(full_text) > 60 else full_text,
    )

    return TranscriptResult(
        text=full_text,
        segments=transcript_segments,
        language=info.language,
        language_prob=info.language_probability,
        duration_s=info.duration,
        no_speech_prob=overall_no_speech,
    )
Transcribe audio and return full result with segments, timing, and metadata.
Arguments:
- audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
- channels_first: Layout hint for 2-D stereo audio.
True = (channels, samples) e.g. shape (2, 48000). False = (samples, channels) e.g. shape (48000, 2) — the standard layout. None (default) = fall back to a shape heuristic (smaller dimension is assumed to be channels). Prefer passing an explicit value to avoid ambiguity with short audio clips.
Returns:
TranscriptResult with text, segments, language, and no_speech_prob.
def prewarm(self) -> None:
    """Trigger model loading now instead of on the first transcription.

    Calls ``_get_model()`` purely for its side effect (the return value is
    discarded) and then logs the configured model name at info level.
    """
    _ = self._get_model()
    logger.info("STTEngine prewarmed: model '%s' loaded", self.model_name)
Load the model eagerly (avoids cold-start latency on first transcription).
@dataclass
class StreamingSTTEngine:
    """Incremental streaming STT: accepts audio chunks, yields segments.

    Audio chunks are pushed one at a time via :meth:`push_chunk`. When the
    accumulated buffer reaches ``min_buffer_seconds``, iterating the value
    returned by :meth:`push_chunk` transcribes the buffer and yields any new
    segments. You can also force a transcription at any time with
    :meth:`flush`.

    A sliding-window approach is supported via ``stride_seconds``: after each
    regular transcription pass the engine retains the last ``stride_seconds``
    of audio so that words near the boundary are not lost on the next pass.
    Set ``stride_seconds=0.0`` to discard all audio after each pass. A forced
    pass (:meth:`flush`) always empties the buffer.

    Segments are passed through from ``STTEngine.transcribe_streaming``
    unchanged, so their ``start``/``end`` values refer to the audio of the
    pass that produced them, not to the beginning of the stream.

    Thread safety: **not** thread-safe. Call from a single thread or protect
    externally with a lock.

    Args:
        model: Whisper model size. One of ``tiny``, ``base``, ``small``,
               ``medium``, ``large-v3``. Default ``"base"``.
        device: ``"cpu"`` or ``"cuda"``. Default ``"cpu"``.
        compute_type: CTranslate2 compute type. Default ``"int8"``.
        language: Force a specific language code (e.g. ``"en"``). ``None``
                  for auto-detect.
        min_buffer_seconds: Minimum seconds of audio to accumulate before
                            attempting a transcription pass. Shorter values
                            mean lower latency but more frequent (and
                            potentially noisier) passes. Default ``2.0``.
        stride_seconds: Seconds of audio overlap to retain between passes
                        (sliding-window). Defaults to ``_DEFAULT_STRIDE_S``.
        sample_rate: Sample rate of incoming audio chunks. Default ``16000``.

    Example::

        streaming = StreamingSTTEngine(model="base", min_buffer_seconds=2.0)
        for chunk in mic_chunks:
            for segment in streaming.push_chunk(chunk):
                print(f"[{segment.start:.1f}s] {segment.text}")

        # Force final transcription when done
        for segment in streaming.flush():
            print(f"[{segment.start:.1f}s] {segment.text}")
    """

    model: str = "base"
    device: str = "cpu"
    compute_type: str = "int8"
    language: str | None = None
    min_buffer_seconds: float = 2.0
    stride_seconds: float = _DEFAULT_STRIDE_S
    sample_rate: int = 16_000

    # Internal state — populated post-init; not part of the public constructor.
    _engine: STTEngine = field(init=False, repr=False)
    _buffer: list[np.ndarray] = field(init=False, repr=False, default_factory=list)
    _buffer_samples: int = field(init=False, repr=False, default=0)

    def __post_init__(self) -> None:
        self._engine = STTEngine(
            model=self.model,
            device=self.device,
            compute_type=self.compute_type,
            language=self.language,
        )
        self._buffer = []
        self._buffer_samples = 0
        logger.info(
            "StreamingSTTEngine created: model=%s, min_buffer=%.1fs, stride=%.1fs",
            self.model,
            self.min_buffer_seconds,
            self.stride_seconds,
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    @property
    def buffer_duration_s(self) -> float:
        """Current accumulated audio duration in seconds."""
        return self._buffer_samples / self.sample_rate

    def push_chunk(self, chunk: np.ndarray | bytes) -> Iterator[TranscriptSegment]:
        """Push an audio chunk into the buffer.

        The chunk is appended to the buffer immediately, at call time, even
        if the returned iterator is never consumed. Transcription itself is
        lazy: when the iterator is advanced and the buffer holds at least
        ``min_buffer_seconds`` of audio, a transcription pass is run and its
        segments are yielded. Otherwise the iterator is empty.

        Args:
            chunk: Float32 numpy array (16kHz mono) **or** raw ``int16`` PCM
                   bytes. Bytes are automatically converted to float32.

        Returns:
            Iterator of TranscriptSegment — segments decoded in this pass
            (may be empty).
        """
        arr = self._coerce_chunk(chunk)
        self._buffer.append(arr)
        self._buffer_samples += len(arr)

        def _segments() -> Iterator[TranscriptSegment]:
            # Threshold is evaluated when iteration starts, so an iterator
            # consumed after another pass drained the buffer yields nothing.
            min_samples = int(self.min_buffer_seconds * self.sample_rate)
            if self._buffer and self._buffer_samples >= min_samples:
                yield from self._run_pass()

        return _segments()

    def flush(self) -> Iterator[TranscriptSegment]:
        """Transcribe whatever remains in the buffer and clear it.

        Call this when the audio stream ends to ensure trailing audio is
        transcribed. Unlike a regular pass, no stride tail is retained, so a
        second ``flush()`` right after the first yields nothing.

        Yields:
            TranscriptSegment — segments from the remaining buffer.
        """
        if self._buffer_samples == 0:
            return

        logger.debug("StreamingSTTEngine.flush: %.2f s buffered", self.buffer_duration_s)
        yield from self._run_pass(force=True)

    def reset(self) -> None:
        """Discard the current buffer without transcribing."""
        self._buffer = []
        self._buffer_samples = 0
        logger.debug("StreamingSTTEngine buffer reset")

    def prewarm(self) -> None:
        """Eagerly load the underlying Whisper model."""
        self._engine.prewarm()

    def close(self) -> None:
        """Release model resources and discard the buffer."""
        self.reset()
        self._engine.close()

    def __enter__(self) -> StreamingSTTEngine:
        """Enter sync context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager. Releases engine resources."""
        self.close()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _coerce_chunk(self, chunk: np.ndarray | bytes) -> np.ndarray:
        """Convert raw int16 bytes or ensure float32 array."""
        if isinstance(chunk, (bytes, bytearray)):
            arr = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32768.0
            return arr
        arr = np.asarray(chunk, dtype=np.float32)
        if arr.ndim > 1:
            # Best-effort stereo → mono using the shape heuristic from STTEngine
            if arr.shape[0] < arr.shape[1]:
                arr = arr.mean(axis=0)
            else:
                arr = arr.mean(axis=1)
        return arr

    def _run_pass(self, *, force: bool = False) -> Iterator[TranscriptSegment]:
        """Concatenate buffer, transcribe, apply sliding window, yield segments.

        Args:
            force: ``True`` for a final pass requested by :meth:`flush`; the
                   buffer is then emptied completely instead of keeping the
                   stride tail.
        """
        audio = np.concatenate(self._buffer)

        logger.debug(
            "StreamingSTTEngine pass: %.2f s (force=%s)",
            len(audio) / self.sample_rate,
            force,
        )

        yield from self._engine.transcribe_streaming(audio)

        # Sliding window: retain the last stride_seconds of audio so that
        # words near the boundary are not cut off on the next pass. A forced
        # pass has no "next pass", so nothing is kept — otherwise flush()
        # would leave audio behind and re-transcribe it when called again.
        stride_samples = 0 if force else int(self.stride_seconds * self.sample_rate)
        if stride_samples > 0 and len(audio) > stride_samples:
            retained = audio[-stride_samples:]
            self._buffer = [retained]
            self._buffer_samples = len(retained)
        else:
            self._buffer = []
            self._buffer_samples = 0
Incremental streaming STT: accepts audio chunks, yields segments.
Audio chunks are pushed one at a time via push_chunk(). When the
accumulated buffer reaches min_buffer_seconds, push_chunk()
transparently transcribes the buffer and yields any new segments. You can
also force a transcription at any time with flush().
A sliding-window approach is supported via stride_seconds: after each
transcription pass the engine retains the last stride_seconds of audio
so that words near the boundary are not lost on the next pass. Set
stride_seconds=0.0 (default) to discard all audio after each pass.
Thread safety: not thread-safe. Call from a single thread or protect externally with a lock.
Arguments:
- model: Whisper model size. One of tiny, base, small, medium, large-v3. Default "base".
- device: "cpu" or "cuda". Default "cpu".
- compute_type: CTranslate2 compute type. Default "int8".
- language: Force a specific language code (e.g. "en"). None for auto-detect.
- min_buffer_seconds: Minimum seconds of audio to accumulate before attempting a transcription pass. Shorter values mean lower latency but more frequent (and potentially noisier) passes. Default 2.0.
- stride_seconds: Seconds of audio overlap to retain between passes (sliding-window). Default 0.0 (no overlap).
- sample_rate: Sample rate of incoming audio chunks. Default 16000.
Example::
streaming = StreamingSTTEngine(model="base", min_buffer_seconds=2.0)
for chunk in mic_chunks:
for segment in streaming.push_chunk(chunk):
print(f"[{segment.start:.1f}s] {segment.text}")
# Force final transcription when done
for segment in streaming.flush():
print(f"[{segment.start:.1f}s] {segment.text}")
@property
def buffer_duration_s(self) -> float:
    """Seconds of audio currently buffered (buffered samples / sample rate)."""
    buffered_samples, rate = self._buffer_samples, self.sample_rate
    return buffered_samples / rate
Current accumulated audio duration in seconds.
def push_chunk(self, chunk: np.ndarray | bytes) -> Iterator[TranscriptSegment]:
    """Push an audio chunk into the buffer.

    The chunk is appended to the buffer immediately, at call time, even if
    the returned iterator is never consumed (previously this was a generator
    function, so an un-iterated call buffered nothing). Transcription itself
    stays lazy: when the iterator is advanced and the buffer holds at least
    ``min_buffer_seconds`` of audio, a transcription pass is run and its
    segments are yielded. Otherwise the iterator is empty.

    Args:
        chunk: Float32 numpy array (16kHz mono) **or** raw ``int16`` PCM
               bytes. Bytes are automatically converted to float32.

    Returns:
        Iterator of TranscriptSegment — segments decoded in this pass
        (may be empty).
    """
    arr = self._coerce_chunk(chunk)
    self._buffer.append(arr)
    self._buffer_samples += len(arr)

    def _segments() -> Iterator[TranscriptSegment]:
        # Threshold is evaluated when iteration starts, so an iterator that
        # is consumed after another pass drained the buffer yields nothing.
        min_samples = int(self.min_buffer_seconds * self.sample_rate)
        if self._buffer and self._buffer_samples >= min_samples:
            yield from self._run_pass()

    return _segments()
Push an audio chunk into the buffer.
If the buffer has accumulated at least min_buffer_seconds of audio,
a transcription pass is run and any yielded segments are returned.
Otherwise, no segments are yielded and the chunk is silently buffered.
Arguments:
- chunk: Float32 numpy array (16kHz mono) or raw int16 PCM bytes. Bytes are automatically converted to float32.
Yields:
TranscriptSegment — segments decoded in this pass (may be empty).
def flush(self) -> Iterator[TranscriptSegment]:
    """Transcribe whatever remains in the buffer and clear it.

    Call this when the audio stream ends to ensure trailing audio is
    transcribed. Once the forced pass has been fully consumed the buffer is
    emptied unconditionally, including any stride tail the pass may have
    retained, so a second ``flush()`` yields nothing.

    This is a generator function: no work happens until the returned
    iterator is advanced.

    Yields:
        TranscriptSegment — segments from the remaining buffer.
    """
    if self._buffer_samples == 0:
        return

    logger.debug("StreamingSTTEngine.flush: %.2f s buffered", self.buffer_duration_s)
    yield from self._run_pass(force=True)

    # A flush marks the end of the stream, so there is no next pass for a
    # sliding-window tail to overlap with; drop whatever the pass kept.
    self._buffer = []
    self._buffer_samples = 0
Transcribe whatever remains in the buffer and clear it.
Call this when the audio stream ends to ensure trailing audio is transcribed.
Yields:
TranscriptSegment — segments from the remaining buffer.
def reset(self) -> None:
    """Drop all buffered audio without running a transcription pass.

    Replaces the chunk list with a new empty list, zeroes the sample
    counter, and emits a debug log line.
    """
    self._buffer, self._buffer_samples = [], 0
    logger.debug("StreamingSTTEngine buffer reset")
Discard the current buffer without transcribing.
class VoicePipeline:
    """Orchestrated Wake→VAD→STT→TTS voice pipeline.

    State Machine
    =============
    Four states with the following transitions::

        IDLE ──(wake detected)──→ LISTENING
        LISTENING ──(silence/timeout)──→ TRANSCRIBING
        TRANSCRIBING ──(text ready)──→ RESPONDING
        RESPONDING ──(handlers done)──→ IDLE

    Thread ownership of transitions:
    - **Main thread** (``_run_loop``): IDLE→LISTENING, LISTENING→TRANSCRIBING.
      Owns the mic capture loop and wake/VAD detection.
    - **Worker thread** (``_transcribe_and_respond``): TRANSCRIBING→RESPONDING→IDLE.
      Spawned by ``_start_worker()`` as a daemon thread. Runs STT, dispatches
      command handlers, and triggers TTS playback. Only one worker thread is
      active at a time (guarded by ``_worker_lock``). The worker always
      returns the state to IDLE on exit, including when the STT engine
      cannot be created.

    Threading Model
    ===============
    - The main loop runs in the calling thread via ``run()`` (blocking).
    - When a command recording ends, ``_start_worker()`` spawns a single daemon
      thread for STT transcription + handler dispatch + TTS playback.
    - ``_state_lock`` guards all reads/writes of ``_state``.
    - ``_worker_lock`` guards ``_worker_thread`` reference management.

    Known Limitation
    ================
    There is a brief window between the worker thread completing (setting state
    back to IDLE) and the next wake detection where the pipeline is technically
    idle but the worker thread reference may not yet be cleared. During this
    window, a new wake detection will proceed normally since the state is IDLE.

    Usage::

        pipeline = VoicePipeline()

        @pipeline.on_command
        def handle(text: str) -> str | None:
            return f"You said: {text}"

        pipeline.run()
    """

    def __init__(
        self,
        wake_word: str = "viola",
        stt_model: str = "base",
        tts_voice: str = "af_heart",
        threshold: float = DEFAULT_THRESHOLD,
        vad_backend: str = "auto",
        vad_threshold: float = 0.4,
        enable_tts: bool = True,
        device_index: int | None = None,
        on_wake: WakeCallback | None = None,
        streaming_stt: bool = False,
    ) -> None:
        """Initialize the voice pipeline.

        Args:
            wake_word: Wake word model name. Default "viola".
            stt_model: Whisper model size. Default "base".
            tts_voice: TTS voice name. Default "af_heart".
            threshold: Wake word detection threshold. Default 0.80.
            vad_backend: VAD backend. Default "auto".
            vad_threshold: VAD speech detection threshold. Default 0.4.
            enable_tts: If True, speak responses via TTS. If False, skip TTS.
            device_index: Microphone device index. None = system default.
            on_wake: Optional callback fired after wake-word detection and
                     before command transcription begins.
            streaming_stt: If True, use ``STTEngine.transcribe_streaming()``
                           to yield segments incrementally instead of waiting
                           for the full transcription. The command text passed
                           to handlers is the concatenation of all yielded
                           segment texts, and each segment is logged as it
                           arrives. Default False.
        """
        self._wake_detector = WakeDetector(model=wake_word, threshold=threshold)
        self._vad = VADEngine(backend=vad_backend)
        self._vad_threshold = vad_threshold
        self._enable_tts = enable_tts
        self._device_index = device_index
        self._stt_model = stt_model
        self._tts_voice = tts_voice
        self._on_wake = on_wake
        self._streaming_stt = streaming_stt

        # Typed as Any because STTEngine/TTSEngine are lazily imported to avoid
        # hard dependencies on optional extras (violawake[stt], violawake[tts]).
        self._stt: Any | None = None  # lazy — violawake_sdk.stt.STTEngine
        self._tts: Any | None = None  # lazy — violawake_sdk.tts.TTSEngine

        self._state = _STATE_IDLE
        self._state_lock = threading.Lock()
        self._stop_event = threading.Event()
        self._worker_lock = threading.Lock()
        self._worker_thread: threading.Thread | None = None

        self._command_handlers: list[CommandHandler] = []

        logger.info(
            "VoicePipeline initialized: wake=%s, stt=%s, tts=%s, streaming_stt=%s",
            wake_word,
            stt_model,
            tts_voice,
            streaming_stt,
        )

    def on_command(self, handler: CommandHandler) -> CommandHandler:
        """Decorator to register a command handler.

        The handler receives the transcribed text and may return a string
        response (which is spoken via TTS) or None (no TTS response).

        Example::

            @pipeline.on_command
            def handle(text: str) -> str | None:
                if "weather" in text:
                    return "It's sunny!"
                return None
        """
        self._command_handlers.append(handler)
        return handler

    def run(self) -> None:
        """Run the pipeline. Blocks until ``stop()`` is called or Ctrl+C.

        Raises:
            PipelineError: If the pipeline encounters an unrecoverable error.
        """
        logger.info("VoicePipeline started. Say the wake word to begin.")
        self._stop_event.clear()

        try:
            self._run_loop()
        except KeyboardInterrupt:
            logger.info("Pipeline interrupted by user.")
        except Exception as e:
            raise PipelineError(f"Pipeline error: {e}") from e
        finally:
            self.stop()
            with self._state_lock:
                self._state = _STATE_IDLE
            logger.info("VoicePipeline stopped.")

    def stop(self, timeout: float = 5.0) -> None:
        """Signal the pipeline to stop and wait briefly for worker cleanup.

        Args:
            timeout: Maximum seconds to wait for the worker thread to exit.
        """
        self._stop_event.set()
        worker = self._get_worker_thread()
        # Joining the current thread would deadlock (stop() may be reached
        # from inside a command handler running on the worker).
        if worker is None or worker is threading.current_thread():
            return

        worker.join(timeout=timeout)
        if worker.is_alive():
            logger.warning("VoicePipeline worker thread did not exit within %.1f s", timeout)
        else:
            # Clear reference only after the worker has fully exited
            with self._worker_lock:
                if self._worker_thread is worker:
                    self._worker_thread = None

    def close(self) -> None:
        """Stop the pipeline and release all engine resources."""
        self.stop()
        self._wake_detector.close()
        self._stt = None
        self._tts = None

    def __enter__(self) -> VoicePipeline:
        """Enter sync context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager. Stops pipeline and releases resources."""
        self.close()

    def speak(self, text: str) -> None:
        """Synthesize and play text via TTS (called from within command handlers)."""
        if not self._enable_tts or self._stop_event.is_set():
            return

        tts = self._get_tts()
        if tts is None:
            logger.warning("TTS not available — install 'violawake[tts]'")
            return

        try:
            audio = tts.synthesize(text)
            tts.play(audio)
        except Exception as e:
            logger.exception("TTS playback failed for text '%.50s': %s", text, e)

    def _run_loop(self) -> None:
        """Main mic capture and detection loop."""
        recording_buffer: list[bytes] = []
        silence_count = 0

        for frame in self._wake_detector.stream_mic(device_index=self._device_index):
            if self._stop_event.is_set():
                break

            with self._state_lock:
                state = self._state

            if state == _STATE_IDLE:
                # Listening for wake word
                if self._wake_detector.detect(frame, is_playing=False):
                    logger.info("Wake word detected → listening for command")
                    recording_buffer.clear()
                    silence_count = 0
                    if self._on_wake is not None:
                        try:
                            self._on_wake()
                        except Exception:
                            logger.exception("on_wake callback failed")
                    with self._state_lock:
                        self._state = _STATE_LISTENING

            elif state == _STATE_LISTENING:
                # Recording command
                recording_buffer.append(frame)
                is_speech = self._vad.is_speech(frame, threshold=self._vad_threshold)

                if not is_speech:
                    silence_count += 1
                else:
                    silence_count = 0

                # Check stop conditions
                total_frames = len(recording_buffer)

                if silence_count >= SILENCE_FRAMES_STOP or total_frames >= MAX_RECORDING_FRAMES:
                    audio_bytes = b"".join(recording_buffer)
                    with self._state_lock:
                        self._state = _STATE_TRANSCRIBING

                    # Transcribe in a worker thread to not block the mic loop
                    self._start_worker(audio_bytes)

            # _STATE_TRANSCRIBING and _STATE_RESPONDING: keep looping (don't detect wake)

    def _transcribe_and_respond(self, audio_bytes: bytes) -> None:
        """Transcribe audio and dispatch to command handlers.

        Runs on the worker thread. Whatever happens — missing STT extra,
        engine construction failure, transcription error, stop request — the
        ``finally`` clause clears the worker reference and puts the state
        back to IDLE so the main loop can detect the next wake word.
        """
        try:
            # Engine creation lives inside the try block: if it raised outside,
            # the worker would die with the state stuck in TRANSCRIBING and
            # the pipeline would never listen for the wake word again.
            stt = self._get_stt()
            if stt is None:
                logger.warning("STT not available — install 'violawake[stt]'")
                return

            if len(audio_bytes) % 2 != 0:
                logger.warning(
                    "Audio buffer length %d is not a multiple of 2 bytes (int16); "
                    "truncating to even boundary",
                    len(audio_bytes),
                )
                audio_bytes = audio_bytes[: len(audio_bytes) & ~1]
            if len(audio_bytes) == 0:
                logger.warning("Empty audio buffer — skipping transcription")
                return
            pcm = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

            if self._streaming_stt:
                # Streaming mode: consume the generator and log each segment as it arrives.
                segment_texts: list[str] = []
                for seg in stt.transcribe_streaming(pcm):
                    if self._stop_event.is_set():
                        logger.debug("Pipeline stopping; aborting streaming transcription")
                        return
                    logger.debug("Streaming segment [%.1f-%.1f]: '%s'", seg.start, seg.end, seg.text)
                    segment_texts.append(seg.text)
                text = " ".join(segment_texts).strip()
            else:
                text = stt.transcribe(pcm)

            if self._stop_event.is_set():
                logger.debug("Pipeline stopping; dropping transcription result")
                return

            if text.strip():
                logger.info("Command: '%s'", text)
                self._dispatch_command(text)
            else:
                logger.debug("Empty transcription — returning to idle")

        except Exception:
            logger.exception("Transcription failed")
        finally:
            self._clear_worker_thread()
            with self._state_lock:
                self._state = _STATE_IDLE

    def _dispatch_command(self, text: str) -> None:
        """Call all registered command handlers.

        Handlers run in registration order. A failing handler is logged and
        does not prevent the remaining handlers from running.
        """
        if self._stop_event.is_set():
            return

        with self._state_lock:
            self._state = _STATE_RESPONDING

        try:
            for handler in self._command_handlers:
                if self._stop_event.is_set():
                    break
                try:
                    response = handler(text)
                    if response and self._enable_tts and not self._stop_event.is_set():
                        self.speak(response)
                except Exception:
                    # Not every callable has __name__ (functools.partial,
                    # callable instances); fall back to repr so that logging
                    # the failure cannot itself raise and skip later handlers.
                    handler_name = getattr(handler, "__name__", repr(handler))
                    logger.exception("Command handler '%s' failed", handler_name)
        finally:
            with self._state_lock:
                self._state = _STATE_IDLE

    def _start_worker(self, audio_bytes: bytes) -> None:
        """Start the STT/TTS worker thread and retain it for shutdown.

        If a previous worker is still alive, skip spawning to prevent
        concurrent transcription.
        """
        with self._worker_lock:
            if self._worker_thread is not None and self._worker_thread.is_alive():
                logger.warning("Previous worker thread still alive — skipping new spawn")
                with self._state_lock:
                    self._state = _STATE_IDLE
                return
            worker = threading.Thread(
                target=self._transcribe_and_respond,
                args=(audio_bytes,),
                daemon=True,
            )
            self._worker_thread = worker
            worker.start()

    def _get_worker_thread(self) -> threading.Thread | None:
        """Return the active worker thread, if any."""
        with self._worker_lock:
            return self._worker_thread

    def _clear_worker_thread(self) -> None:
        """Clear the worker reference once the worker exits."""
        current = threading.current_thread()
        with self._worker_lock:
            if self._worker_thread is current:
                self._worker_thread = None

    def _get_stt(self) -> Any | None:
        """Lazy-load STT engine. Returns None if the STT extra is not importable."""
        if self._stt is None:
            try:
                from violawake_sdk.stt import STTEngine

                self._stt = STTEngine(model=self._stt_model)
                self._stt.prewarm()
            except ImportError:
                return None
        return self._stt

    def _get_tts(self) -> Any | None:
        """Lazy-load TTS engine. Returns None if TTS is disabled or not importable."""
        if self._tts is None and self._enable_tts:
            try:
                from violawake_sdk.tts import TTSEngine

                self._tts = TTSEngine(voice=self._tts_voice)
            except ImportError:
                return None
        return self._tts
Orchestrated Wake→VAD→STT→TTS voice pipeline.
State Machine
Four states with the following transitions::
IDLE ──(wake detected)──→ LISTENING
LISTENING ──(silence/timeout)──→ TRANSCRIBING
TRANSCRIBING ──(text ready)──→ RESPONDING
RESPONDING ──(handlers done)──→ IDLE
Thread ownership of transitions:
- Main thread (_run_loop): IDLE→LISTENING, LISTENING→TRANSCRIBING. Owns the mic capture loop and wake/VAD detection.
- Worker thread (_transcribe_and_respond): TRANSCRIBING→RESPONDING→IDLE. Spawned by _start_worker() as a daemon thread. Runs STT, dispatches command handlers, and triggers TTS playback. Only one worker thread is active at a time (guarded by _worker_lock).
Threading Model
- The main loop runs in the calling thread via run() (blocking).
- When a command recording ends, _start_worker() spawns a single daemon thread for STT transcription + handler dispatch + TTS playback.
- _state_lock guards all reads/writes of _state.
- _worker_lock guards _worker_thread reference management.
Known Limitation
There is a brief window between the worker thread completing (setting state back to IDLE) and the next wake detection where the pipeline is technically idle but the worker thread reference may not yet be cleared. During this window, a new wake detection will proceed normally since the state is IDLE.
Usage::
pipeline = VoicePipeline()
@pipeline.on_command
def handle(text: str) -> str | None:
return f"You said: {text}"
pipeline.run()
def __init__(
    self,
    wake_word: str = "viola",
    stt_model: str = "base",
    tts_voice: str = "af_heart",
    threshold: float = DEFAULT_THRESHOLD,
    vad_backend: str = "auto",
    vad_threshold: float = 0.4,
    enable_tts: bool = True,
    device_index: int | None = None,
    on_wake: WakeCallback | None = None,
    streaming_stt: bool = False,
) -> None:
    """Set up a voice pipeline; the STT and TTS engines are created lazily.

    The wake detector and the VAD engine are constructed immediately.
    Everything else is stored as configuration, and the pipeline starts
    in the idle state with no worker thread and no command handlers.

    Args:
        wake_word: Wake word model name, passed to ``WakeDetector``.
            Default "viola".
        stt_model: Whisper model size. Default "base".
        tts_voice: TTS voice name. Default "af_heart".
        threshold: Wake word detection threshold, passed to
            ``WakeDetector``. Defaults to ``DEFAULT_THRESHOLD``.
        vad_backend: VAD backend, passed to ``VADEngine``. Default "auto".
        vad_threshold: VAD speech detection threshold. Default 0.4.
        enable_tts: If True, speak responses via TTS. If False, skip TTS.
        device_index: Microphone device index. None = system default.
        on_wake: Optional callback fired after wake-word detection and
            before command transcription begins.
        streaming_stt: If True, use ``STTEngine.transcribe_streaming()``
            to yield segments incrementally instead of waiting for the
            full transcription. Handlers still receive the concatenation
            of all segment texts (identical to non-streaming behaviour),
            but each segment is logged as it arrives. Default False.
    """
    # Eagerly built components: wake detector first, then VAD.
    self._wake_detector = WakeDetector(model=wake_word, threshold=threshold)
    self._vad = VADEngine(backend=vad_backend)

    # Engine selection, kept for the lazy loaders.
    self._stt_model = stt_model
    self._tts_voice = tts_voice
    self._streaming_stt = streaming_stt
    self._enable_tts = enable_tts

    # Capture / detection settings and the optional wake callback.
    self._vad_threshold = vad_threshold
    self._device_index = device_index
    self._on_wake = on_wake

    # Typed as Any because STTEngine/TTSEngine are imported lazily so the
    # optional extras (violawake[stt], violawake[tts]) stay optional.
    self._stt: Any | None = None  # lazy — violawake_sdk.stt.STTEngine
    self._tts: Any | None = None  # lazy — violawake_sdk.tts.TTSEngine

    # Synchronisation primitives and state-machine bookkeeping.
    self._state_lock = threading.Lock()
    self._worker_lock = threading.Lock()
    self._stop_event = threading.Event()
    self._state = _STATE_IDLE
    self._worker_thread: threading.Thread | None = None

    self._command_handlers: list[CommandHandler] = []

    logger.info(
        "VoicePipeline initialized: wake=%s, stt=%s, tts=%s, streaming_stt=%s",
        wake_word,
        stt_model,
        tts_voice,
        streaming_stt,
    )
Initialize the voice pipeline.
Arguments:
- wake_word: Wake word model name. Default "viola".
- stt_model: Whisper model size. Default "base".
- tts_voice: TTS voice name. Default "af_heart".
- threshold: Wake word detection threshold. Default 0.80.
- vad_backend: VAD backend. Default "auto".
- vad_threshold: VAD speech detection threshold. Default 0.4.
- enable_tts: If True, speak responses via TTS. If False, skip TTS.
- device_index: Microphone device index. None = system default.
- on_wake: Optional callback fired after wake-word detection and before command transcription begins.
- streaming_stt: If True, use STTEngine.transcribe_streaming() to yield segments incrementally instead of waiting for the full transcription. The command text passed to handlers is the concatenation of all yielded segment texts (identical to non-streaming behaviour), but each segment is logged as it arrives. Default False.
def on_command(self, handler: CommandHandler) -> CommandHandler:
    """Register *handler* as a command handler and hand it back unchanged.

    Intended for use as a decorator. The handler is called with the
    transcribed text; a string return value is spoken via TTS, while
    ``None`` means no spoken response.

    Example::

        @pipeline.on_command
        def handle(text: str) -> str | None:
            if "weather" in text:
                return "It's sunny!"
            return None
    """
    registered = self._command_handlers
    registered.append(handler)
    # Returning the handler keeps the decorated name bound to the function.
    return handler
Decorator to register a command handler.
The handler receives the transcribed text and may return a string response (which is spoken via TTS) or None (no TTS response).
Example::
@pipeline.on_command
def handle(text: str) -> str | None:
if "weather" in text:
return "It's sunny!"
return None
def run(self) -> None:
    """Run the pipeline loop in the calling thread.

    Blocks until ``stop()`` is called or Ctrl+C. A ``KeyboardInterrupt``
    is logged and swallowed. On every exit path the pipeline is stopped
    and its state is reset to idle.

    Raises:
        PipelineError: If the loop raises any other exception; the
            original exception is chained as the cause.
    """
    logger.info("VoicePipeline started. Say the wake word to begin.")
    # Reset the stop flag left over from a previous stop() call.
    self._stop_event.clear()

    try:
        self._run_loop()
    except KeyboardInterrupt:
        logger.info("Pipeline interrupted by user.")
    except Exception as exc:
        message = f"Pipeline error: {exc}"
        raise PipelineError(message) from exc
    finally:
        # Cleanup runs on normal exit, on Ctrl+C, and on errors.
        self.stop()
        with self._state_lock:
            self._state = _STATE_IDLE
        logger.info("VoicePipeline stopped.")
Run the pipeline. Blocks until stop() is called or Ctrl+C.
Raises:
- PipelineError: If the pipeline encounters an unrecoverable error.
def stop(self, timeout: float = 5.0) -> None:
    """Set the stop event and wait up to *timeout* seconds for the worker.

    Args:
        timeout: Maximum number of seconds to wait for the worker thread
            to finish. Default 5.0.
    """
    self._stop_event.set()

    thread = self._get_worker_thread()
    if thread is None:
        return
    if thread is threading.current_thread():
        # Called from the worker itself: a thread cannot join itself.
        return

    thread.join(timeout=timeout)
    if not thread.is_alive():
        # Drop the reference only after the worker has fully exited, and
        # only if it has not been replaced in the meantime.
        with self._worker_lock:
            if self._worker_thread is thread:
                self._worker_thread = None
        return

    logger.warning("VoicePipeline worker thread did not exit within %.1f s", timeout)
Signal the pipeline to stop and wait briefly for worker cleanup.
def close(self) -> None:
    """Shut the pipeline down and let go of every engine it holds.

    Stops the pipeline, closes the wake detector, then drops the
    references to the lazily created STT and TTS engines.
    """
    self.stop()
    self._wake_detector.close()
    # Clear both lazy engine slots (assigned left to right).
    self._stt = self._tts = None
Stop the pipeline and release all engine resources.
def speak(self, text: str) -> None:
    """Synthesize *text* and play it via TTS (callable from command handlers).

    Does nothing when TTS is disabled or the stop event is set. If no TTS
    engine can be obtained, a warning is logged. Synthesis and playback
    failures are logged and never propagated to the caller.

    Args:
        text: The text to speak.
    """
    if not self._enable_tts:
        return
    if self._stop_event.is_set():
        return

    engine = self._get_tts()
    if engine is None:
        logger.warning("TTS not available — install 'violawake[tts]'")
        return

    try:
        waveform = engine.synthesize(text)
        engine.play(waveform)
    except Exception as e:
        # Best effort: a TTS failure must not take down the caller.
        logger.exception("TTS playback failed for text '%.50s': %s", text, e)
Synthesize and play text via TTS (called from within command handlers).
Base exception for all ViolaWake SDK errors.
class ModelNotFoundError(ViolaWakeError):
    """Signal that a model file is missing from the cache or the given path.

    To resolve, download the model with
    ``violawake-download --model <model_name>``.
    """
Raised when a model file is not found in the cache or at the given path.
Resolution: run violawake-download --model <model_name> to download.
class AudioCaptureError(ViolaWakeError):
    """Signal that microphone capture could not be initialized or read.

    Typical causes are a missing audio input device, a device that is
    already in use, or PortAudio not being installed.
    """
Raised when microphone capture fails to initialize or read frames.
Common causes: no audio input device, device already in use, PortAudio not installed.
class ModelLoadError(ViolaWakeError):
    """Signal that an existing model file could not be loaded by ONNX Runtime.

    Likely causes are a corrupted file or an ONNX opset version mismatch.
    """
Raised when a model file exists but cannot be loaded by ONNX Runtime.
Possible causes: corrupted file, ONNX opset version mismatch.
class PipelineError(ViolaWakeError):
    """Signal an unrecoverable error inside the VoicePipeline."""
Raised when the VoicePipeline encounters an unrecoverable error.
class VADBackendError(ViolaWakeError):
    """Signal that the requested VAD backend is not available.

    When webrtcvad/silero are not installed, the RMS heuristic is used
    as the fallback.
    """
Raised when the requested VAD backend is unavailable.
Falls back to RMS heuristic if webrtcvad/silero not installed.
def list_models() -> list[dict[str, str]]:
    """List the wake word models that are available, with descriptions.

    Every returned dict has the keys ``name``, ``description`` and
    ``version``. Registry entries that resolve to the same underlying
    model are reported once, under the first registry key encountered.
    Entries whose description contains "DEPRECATED" and a fixed set of
    non-wake-word models are left out.

    Example::

        >>> from violawake_sdk import list_models
        >>> for m in list_models():
        ...     print(f"{m['name']:20s} {m['description']}")
    """
    from violawake_sdk.models import MODEL_REGISTRY

    # Package-managed and non-wake-word models that are never listed.
    hidden = ("oww_backbone", "kokoro_v1_0", "kokoro_voices_v1_0")

    already_listed: set[str] = set()
    catalog: list[dict[str, str]] = []
    for key, entry in MODEL_REGISTRY.items():
        skip = (
            entry.name in already_listed  # alias of a model already listed
            or "DEPRECATED" in entry.description
            or entry.name in hidden
        )
        if skip:
            continue
        already_listed.add(entry.name)
        catalog.append(
            {
                "name": key,
                "description": entry.description,
                "version": entry.version,
            }
        )
    return catalog
Return available wake word models with their descriptions.
Each entry is a dict with keys: name, description, version.
Example::
>>> from violawake_sdk import list_models
>>> for m in list_models():
... print(f"{m['name']:20s} {m['description']}")
def list_voices() -> list[str]:
    """List the TTS voice names that can be passed to ``TTSEngine``.

    The ``[tts]`` extra is needed for actual synthesis only; this
    function is meant to work for discovery regardless. The result is a
    new list built from ``violawake_sdk.tts.AVAILABLE_VOICES``.

    Example::

        >>> from violawake_sdk import list_voices
        >>> list_voices()
        ['af_heart', 'af_bella', 'af_sarah', ...]
    """
    from violawake_sdk.tts import AVAILABLE_VOICES

    # Copy so callers cannot mutate the module-level collection.
    return [*AVAILABLE_VOICES]
Return available TTS voice names for use with TTSEngine.
Requires the [tts] extra to be installed for actual synthesis,
but this function always works for discovery.
Example::
>>> from violawake_sdk import list_voices
>>> list_voices()
['af_heart', 'af_bella', 'af_sarah', ...]