violawake_sdk

ViolaWake SDK — Open-source wake word detection + voice pipeline.

Public API surface:

  • WakeDetector — detect a wake word in an audio stream
  • AsyncWakeDetector — async wrapper for asyncio-based applications
  • DetectorConfig — advanced configuration (ensemble, adaptive, speaker, power)
  • VADEngine — voice activity detection (WebRTC, Silero, RMS)
  • TTSEngine — on-device text-to-speech (Kokoro-82M)
  • STTEngine — speech-to-text (faster-whisper)
  • VoicePipeline — bundled Wake→VAD→STT→TTS orchestration
  • NoiseProfiler — noise-adaptive threshold adjustment
  • PowerManager — battery-aware power management
  • FusionStrategy — multi-model ensemble scoring
  • list_models() — discover available wake word models
  • list_voices() — discover available TTS voices

Quick start::

from violawake_sdk import WakeDetector

with WakeDetector(threshold=0.80) as detector:
    for chunk in detector.stream_mic():
        if detector.detect(chunk):
            print("Wake word detected!")
            break

See README.md or https://github.com/GeeIHadAGoodTime/ViolaWake for full documentation.

  1"""
  2ViolaWake SDK — Open-source wake word detection + voice pipeline.
  3
  4Public API surface:
  5    WakeDetector      — detect a wake word in an audio stream
  6    AsyncWakeDetector — async wrapper for asyncio-based applications
  7    DetectorConfig    — advanced configuration (ensemble, adaptive, speaker, power)
  8    VADEngine         — voice activity detection (WebRTC, Silero, RMS)
  9    TTSEngine         — on-device text-to-speech (Kokoro-82M)
 10    STTEngine         — speech-to-text (faster-whisper)
 11    VoicePipeline     — bundled Wake→VAD→STT→TTS orchestration
 12    NoiseProfiler     — noise-adaptive threshold adjustment
 13    PowerManager      — battery-aware power management
 14    FusionStrategy    — multi-model ensemble scoring
 15    list_models()     — discover available wake word models
 16    list_voices()     — discover available TTS voices
 17
 18Quick start::
 19
 20    from violawake_sdk import WakeDetector
 21
 22    with WakeDetector(threshold=0.80) as detector:
 23        for chunk in detector.stream_mic():
 24            if detector.detect(chunk):
 25                print("Wake word detected!")
 26                break
 27
 28See README.md or https://github.com/GeeIHadAGoodTime/ViolaWake for full documentation.
 29"""
 30
 31from __future__ import annotations
 32
 33__version__ = "0.2.2"
 34__author__ = "ViolaWake Contributors"
 35__license__ = "Apache-2.0"
 36
 37from violawake_sdk._exceptions import (
 38    AudioCaptureError,
 39    ModelLoadError,
 40    ModelNotFoundError,
 41    PipelineError,
 42    VADBackendError,
 43    ViolaWakeError,
 44)
 45from violawake_sdk.async_detector import AsyncWakeDetector
 46from violawake_sdk.confidence import ConfidenceLevel, ConfidenceResult
 47from violawake_sdk.ensemble import FusionStrategy
 48from violawake_sdk.noise_profiler import NoiseProfiler
 49from violawake_sdk.pipeline import VoicePipeline
 50from violawake_sdk.power_manager import PowerManager
 51from violawake_sdk.vad import VADEngine
 52from violawake_sdk.wake_detector import (
 53    DetectorConfig,
 54    WakeDecisionPolicy,
 55    WakeDetector,
 56    WakewordDetector,  # noqa: F401 — backward compat
 57    validate_audio_chunk,
 58)
 59
 60# Conditional imports for optional extras
 61try:
 62    from violawake_sdk.tts import TTSEngine
 63except ImportError:
 64    TTSEngine = None  # type: ignore[assignment,misc]
 65
 66try:
 67    from violawake_sdk.stt import STTEngine, StreamingSTTEngine
 68except ImportError:
 69    STTEngine = None  # type: ignore[assignment,misc]
 70    StreamingSTTEngine = None  # type: ignore[assignment,misc]
 71
 72
 73def list_models() -> list[dict[str, str]]:
 74    """Return available wake word models with their descriptions.
 75
 76    Each entry is a dict with keys: ``name``, ``description``, ``version``.
 77
 78    Example::
 79
 80        >>> from violawake_sdk import list_models
 81        >>> for m in list_models():
 82        ...     print(f"{m['name']:20s} {m['description']}")
 83    """
 84    from violawake_sdk.models import MODEL_REGISTRY
 85
 86    seen: set[str] = set()
 87    result: list[dict[str, str]] = []
 88    for name, spec in MODEL_REGISTRY.items():
 89        # Deduplicate aliases (e.g. "viola" -> "temporal_cnn")
 90        if spec.name in seen:
 91            continue
 92        # Hide deprecated, package-managed, and non-wake-word models
 93        if "DEPRECATED" in spec.description:
 94            continue
 95        if spec.name in ("oww_backbone", "kokoro_v1_0", "kokoro_voices_v1_0"):
 96            continue
 97        seen.add(spec.name)
 98        result.append(
 99            {
100                "name": name,
101                "description": spec.description,
102                "version": spec.version,
103            }
104        )
105    return result
106
107
def list_voices() -> list[str]:
    """Return the TTS voice names that can be passed to ``TTSEngine``.

    The names are copied out of ``violawake_sdk.tts.AVAILABLE_VOICES`` into a
    new list, so callers may mutate the result freely.

    NOTE(review): the package guards its own ``violawake_sdk.tts`` import
    with ``except ImportError``; confirm that importing the module here
    cannot fail when the ``[tts]`` extra is not installed.

    Example::

        >>> from violawake_sdk import list_voices
        >>> list_voices()
        ['af_heart', 'af_bella', 'af_sarah', ...]
    """
    from violawake_sdk.tts import AVAILABLE_VOICES

    return [*AVAILABLE_VOICES]
123
124
# Names exported by ``from violawake_sdk import *``.
# ``WakewordDetector`` is imported above for backward compatibility but is
# not listed here -- NOTE(review): presumably deliberate; confirm.
__all__ = [
    # Core detection
    "DetectorConfig",
    "WakeDetector",
    "AsyncWakeDetector",
    "WakeDecisionPolicy",
    "validate_audio_chunk",
    # Confidence & scoring
    "ConfidenceResult",
    "ConfidenceLevel",
    "FusionStrategy",
    # Advanced features
    "NoiseProfiler",
    "PowerManager",
    # Pipeline components
    # (TTSEngine, STTEngine and StreamingSTTEngine are bound to None when
    # their optional import above raises ImportError.)
    "VADEngine",
    "TTSEngine",
    "STTEngine",
    "StreamingSTTEngine",
    "VoicePipeline",
    # Exceptions
    "ViolaWakeError",
    "ModelNotFoundError",
    "AudioCaptureError",
    "ModelLoadError",
    "PipelineError",
    "VADBackendError",
    # Discovery
    "list_models",
    "list_voices",
    "__version__",
]
@dataclass
class DetectorConfig:
@dataclass
class DetectorConfig:
    """Advanced configuration for WakeDetector.

    Basic usage needs no config -- just use ``WakeDetector(threshold=0.80)``.
    Use ``DetectorConfig`` to opt-in to advanced features without cluttering
    the constructor. A config is passed as ``WakeDetector(config=...)`` and
    cannot be combined with the equivalent individual keyword arguments
    (``WakeDetector`` raises ``ValueError`` in that case).

    Example::

        # Simple (80% of users):
        det = WakeDetector(model="temporal_cnn", threshold=0.80)

        # Advanced (adaptive threshold + multi-frame confirmation):
        det = WakeDetector(
            model="temporal_cnn",
            config=DetectorConfig(
                adaptive_threshold=True,
                confirm_count=3,
            ),
        )

    Attributes:
        models: Additional model paths for multi-model ensemble (K3).
        fusion_strategy: Score fusion strategy for ensemble (K3); a
            ``FusionStrategy`` member or its string value.
        fusion_weights: Per-model weights for weighted_average fusion (K3).
        adaptive_threshold: Enable dynamic threshold based on noise (K4).
        noise_profiler: Custom NoiseProfiler instance (K4).
        speaker_verify_fn: Post-detection speaker verification callback (K5).
        power_manager: Power management controller for duty cycling (K7).
        confirm_count: Consecutive above-threshold scores required (K2).
            ``WakeDetector`` rejects values below 1.
        score_history_size: Number of recent scores to retain (K2).
    """

    # The field order below is part of the generated ``__init__`` signature;
    # keep it stable for positional callers.

    # K3: Multi-model ensemble
    models: list[str] | None = None
    fusion_strategy: FusionStrategy | str = FusionStrategy.AVERAGE
    fusion_weights: list[float] | None = None

    # K4: Adaptive threshold
    adaptive_threshold: bool = False
    noise_profiler: NoiseProfiler | None = None

    # K5: Speaker verification
    speaker_verify_fn: Callable[..., bool] | None = None

    # K7: Power management
    power_manager: PowerManager | None = None

    # K2: Confidence tracking
    confirm_count: int = 1
    score_history_size: int = 50

    def build(self, model: str = "temporal_cnn", **kwargs: Any) -> WakeDetector:
        """Build a WakeDetector from this config.

        Convenience method that passes ``self`` as the ``config=`` argument.

        Args:
            model: Model name or path (default: ``"temporal_cnn"``).
            **kwargs: Additional WakeDetector constructor arguments
                (threshold, cooldown_s, etc.). Must not contain ``config``
                (Python rejects the duplicate keyword with ``TypeError``).

        Returns:
            Configured WakeDetector instance.
        """
        return WakeDetector(model=model, config=self, **kwargs)

Advanced configuration for WakeDetector.

Basic usage needs no config -- just use WakeDetector(threshold=0.80). Use DetectorConfig to opt-in to advanced features without cluttering the constructor.

Example::

# Simple (80% of users):
det = WakeDetector(model="temporal_cnn", threshold=0.80)

# Advanced (adaptive threshold + multi-frame confirmation):
det = WakeDetector(
    model="temporal_cnn",
    config=DetectorConfig(
        adaptive_threshold=True,
        confirm_count=3,
    ),
)
Attributes:
  • models: Additional model paths for multi-model ensemble (K3).
  • fusion_strategy: Score fusion strategy for ensemble (K3).
  • fusion_weights: Per-model weights for weighted_average fusion (K3).
  • adaptive_threshold: Enable dynamic threshold based on noise (K4).
  • noise_profiler: Custom NoiseProfiler instance (K4).
  • speaker_verify_fn: Post-detection speaker verification callback (K5).
  • power_manager: Power management controller for duty cycling (K7).
  • confirm_count: Consecutive above-threshold scores required (K2).
  • score_history_size: Number of recent scores to retain (K2).
DetectorConfig( models: list[str] | None = None, fusion_strategy: FusionStrategy | str = <FusionStrategy.AVERAGE: 'average'>, fusion_weights: list[float] | None = None, adaptive_threshold: bool = False, noise_profiler: NoiseProfiler | None = None, speaker_verify_fn: Callable[..., bool] | None = None, power_manager: PowerManager | None = None, confirm_count: int = 1, score_history_size: int = 50)
models: list[str] | None = None
fusion_strategy: FusionStrategy | str = <FusionStrategy.AVERAGE: 'average'>
fusion_weights: list[float] | None = None
adaptive_threshold: bool = False
noise_profiler: NoiseProfiler | None = None
speaker_verify_fn: Callable[..., bool] | None = None
power_manager: PowerManager | None = None
confirm_count: int = 1
score_history_size: int = 50
def build( self, model: str = 'temporal_cnn', **kwargs: Any) -> WakeDetector:
    def build(self, model: str = "temporal_cnn", **kwargs: Any) -> WakeDetector:
        """Build a WakeDetector from this config.

        Convenience method that passes ``self`` as the ``config=`` argument.

        Args:
            model: Model name or path (default: ``"temporal_cnn"``).
            **kwargs: Additional WakeDetector constructor arguments
                (threshold, cooldown_s, etc.). Must not contain ``config``
                (Python rejects the duplicate keyword with ``TypeError``).

        Returns:
            Configured WakeDetector instance.
        """
        return WakeDetector(model=model, config=self, **kwargs)

Build a WakeDetector from this config.

Convenience method that passes self as the config= argument.

Arguments:
  • model: Model name or path (default: "temporal_cnn").
  • **kwargs: Additional WakeDetector constructor arguments (threshold, cooldown_s, etc.).
Returns:

Configured WakeDetector instance.

class WakeDetector:
 272class WakeDetector:
 273    """Wake word detector using ViolaWake MLP on OpenWakeWord embeddings.
 274
 275    Supports pluggable inference backends (ONNX Runtime, TFLite) via the
 276    ``backend`` parameter.  The default ``"auto"`` mode tries ONNX Runtime
 277    first, then falls back to TFLite, so users on edge devices can run
 278    without installing ``onnxruntime``.
 279
 280    Also supports optional competitive features (all opt-in, backward compatible):
 281
 282    - **K2 Confidence API**: ``get_confidence()`` and ``last_scores`` property.
 283    - **K3 Multi-model ensemble**: ``models`` parameter with fusion strategies.
 284    - **K4 Adaptive threshold**: ``adaptive_threshold`` parameter with noise profiling.
 285    - **K5 Speaker verification**: ``speaker_verify_fn`` callback for post-detection.
 286    - **K6 Audio source abstraction**: ``from_source()`` class method factory.
 287    - **K7 Power management**: ``power_manager`` parameter for duty cycling.
 288
 289    **Threshold tuning guide:**
 290
 291    - 0.70 = sensitive (more detections, more false positives)
 292    - 0.80 = balanced (default, recommended starting point)
 293    - 0.85 = conservative (fewer false positives, may miss some)
 294    - 0.90+ = very conservative (for noisy environments)
 295
 296    Start at 0.80 and adjust based on your false accept rate.
 297
 298    Args:
 299        model: Model name from the registry, or a path to a model file.
 300        threshold: Detection confidence threshold in [0.0, 1.0].
 301        cooldown_s: Minimum seconds between consecutive detections.
 302        providers: ONNX Runtime execution providers (ignored for TFLite).
 303        backend: Inference backend selector (``"onnx"``, ``"tflite"``, ``"auto"``).
 304        config: A ``DetectorConfig`` instance bundling all advanced options.
 305            Mutually exclusive with the individual advanced kwargs below.
 306        models: Additional model paths for ensemble scoring (K3).
 307        fusion_strategy: Score fusion strategy for ensemble (K3).
 308        fusion_weights: Per-model weights for weighted_average fusion (K3).
 309        adaptive_threshold: Enable dynamic threshold based on noise (K4).
 310        noise_profiler: Custom NoiseProfiler instance (K4).
 311        speaker_verify_fn: Post-detection speaker verification callback (K5).
 312        power_manager: Power management controller for duty cycling (K7).
 313        confirm_count: Consecutive above-threshold scores required for detection (K2).
 314        score_history_size: Number of recent scores to retain (K2).
 315    """
 316
 317    _VALID_BACKENDS = ("onnx", "tflite", "auto")
 318
    def __init__(
        self,
        model: str = "temporal_cnn",
        threshold: float = DEFAULT_THRESHOLD,
        cooldown_s: float = DEFAULT_COOLDOWN_S,
        providers: list[str] | None = None,
        backend: str = "auto",
        *,
        config: DetectorConfig | None = None,
        # K3: Multi-model ensemble (individual kwargs, backwards compat)
        models: list[str] | None = _UNSET,
        fusion_strategy: FusionStrategy | str = _UNSET,
        fusion_weights: list[float] | None = _UNSET,
        # K4: Adaptive threshold
        adaptive_threshold: bool = _UNSET,
        noise_profiler: NoiseProfiler | None = _UNSET,
        # K5: Speaker verification
        speaker_verify_fn: Callable[[np.ndarray], bool] | None = _UNSET,
        # K7: Power management
        power_manager: PowerManager | None = _UNSET,
        # K2: Confidence tracking
        confirm_count: int = _UNSET,
        score_history_size: int = _UNSET,
    ) -> None:
        """Validate the arguments, load the model(s) and set up optional features.

        The advanced options come either from ``config`` or from the
        individual keyword arguments, never both. The ``_UNSET`` sentinel
        default lets the constructor tell "not passed" apart from an
        explicit ``None``/``False``.

        Raises:
            ValueError: If ``config`` is combined with any individual advanced
                kwarg, or if ``threshold`` is outside [0.0, 1.0],
                ``cooldown_s`` is negative, ``backend`` is not one of
                ``_VALID_BACKENDS``, or ``confirm_count`` is below 1.
            TypeError: If ``threshold`` or ``cooldown_s`` is not an int/float.
            ModelNotFoundError: Propagated from model path resolution.
            ModelLoadError: Propagated from ``_load_session``.
        """
        # --- Resolve config vs individual kwargs -------------------------
        # Detect if any advanced kwarg was explicitly passed (not _UNSET)
        _locals = {
            "models": models,
            "fusion_strategy": fusion_strategy,
            "fusion_weights": fusion_weights,
            "adaptive_threshold": adaptive_threshold,
            "noise_profiler": noise_profiler,
            "speaker_verify_fn": speaker_verify_fn,
            "power_manager": power_manager,
            "confirm_count": confirm_count,
            "score_history_size": score_history_size,
        }
        explicit_kwargs = {name for name, val in _locals.items() if val is not _UNSET}
        if config is not None and explicit_kwargs:
            raise ValueError(
                f"Cannot specify both config= and individual advanced kwargs. "
                f"Conflicting kwargs: {sorted(explicit_kwargs)}. "
                f"Either use config=DetectorConfig(...) or pass kwargs directly, not both."
            )

        if config is not None:
            # Unpack from DetectorConfig
            models = config.models
            fusion_strategy = config.fusion_strategy
            fusion_weights = config.fusion_weights
            adaptive_threshold = config.adaptive_threshold
            noise_profiler = config.noise_profiler
            speaker_verify_fn = config.speaker_verify_fn
            power_manager = config.power_manager
            confirm_count = config.confirm_count
            score_history_size = config.score_history_size
        else:
            # Apply defaults for any _UNSET values (backwards compat path).
            # These mirror the DetectorConfig field defaults.
            if models is _UNSET:
                models = None
            if fusion_strategy is _UNSET:
                fusion_strategy = FusionStrategy.AVERAGE
            if fusion_weights is _UNSET:
                fusion_weights = None
            if adaptive_threshold is _UNSET:
                adaptive_threshold = False
            if noise_profiler is _UNSET:
                noise_profiler = None
            if speaker_verify_fn is _UNSET:
                speaker_verify_fn = None
            if power_manager is _UNSET:
                power_manager = None
            if confirm_count is _UNSET:
                confirm_count = 1
            if score_history_size is _UNSET:
                score_history_size = 50

        # G1: Input validation for public constructor parameters.
        # NOTE: bool is a subclass of int, so True/False pass the number checks.
        if not isinstance(threshold, (int, float)):
            raise TypeError(f"threshold must be a number, got {type(threshold).__name__}")
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}")
        if not isinstance(cooldown_s, (int, float)):
            raise TypeError(f"cooldown_s must be a number, got {type(cooldown_s).__name__}")
        if cooldown_s < 0:
            raise ValueError(f"cooldown_s must be >= 0, got {cooldown_s!r}")
        if backend not in self._VALID_BACKENDS:
            raise ValueError(f"backend must be one of {self._VALID_BACKENDS}, got {backend!r}")
        if confirm_count < 1:
            raise ValueError(f"confirm_count must be >= 1, got {confirm_count}")

        self.threshold = threshold
        self._lock = threading.Lock()
        # Separate lock guarding access to the OWW backbone.
        self._backbone_lock = threading.Lock()
        self._policy = WakeDecisionPolicy(threshold=threshold, cooldown_s=cooldown_s)
        # None or an empty list both fall back to the CPU provider.
        self._providers = providers or ["CPUExecutionProvider"]
        self._backend: InferenceBackend = get_backend(backend, providers=self._providers)

        # K2: Confidence tracking
        self._score_tracker = ScoreTracker(
            threshold=threshold,
            history_size=score_history_size,
        )
        self._confirm_required = confirm_count
        self._confirm_counter = 0

        # K3: Ensemble support -- the scorer is only created when extra
        # models were requested; its sessions are attached further below,
        # after the primary model has been loaded.
        self._ensemble: EnsembleScorer | None = None
        if models and len(models) > 0:
            if isinstance(fusion_strategy, str):
                fusion_strategy = FusionStrategy(fusion_strategy)
            self._ensemble = EnsembleScorer(
                strategy=fusion_strategy,
                weights=fusion_weights,
            )

        # K4: Noise profiler / adaptive threshold.
        # An explicitly supplied profiler is kept even when
        # adaptive_threshold is False; otherwise one is created on demand.
        self._adaptive_threshold = adaptive_threshold
        if noise_profiler is not None:
            self._noise_profiler: NoiseProfiler | None = noise_profiler
        elif adaptive_threshold:
            self._noise_profiler = NoiseProfiler(base_threshold=threshold)
        else:
            self._noise_profiler = None

        # K5: Speaker verification
        self._speaker_verify_fn = speaker_verify_fn

        # K7: Power manager
        self._power_manager = power_manager

        # Warn on deprecated models (only registry names are checked;
        # explicit file paths are not in the registry).
        if model in MODEL_REGISTRY and "DEPRECATED" in MODEL_REGISTRY[model].description:
            import warnings

            warnings.warn(
                f"Model '{model}' is deprecated: {MODEL_REGISTRY[model].description}. "
                f"Use model='temporal_cnn' instead.",
                DeprecationWarning,
                stacklevel=2,
            )

        # Load models
        self._oww_backbone = self._create_oww_backbone()
        self._mlp_session = self._load_session(model)
        self._mlp_input_name = self._mlp_session.get_inputs()[0].name
        self._last_score = 0.0

        # Detect temporal vs MLP model from input shape
        mlp_input_shape = self._mlp_session.get_inputs()[0].shape
        if len(mlp_input_shape) == 3:
            # Temporal model: input is (batch, seq_len, embedding_dim)
            self._is_temporal = True
            # A non-int (symbolic/dynamic) sequence dimension falls back to
            # the module default.
            self._temporal_seq_len = (
                mlp_input_shape[1]
                if isinstance(mlp_input_shape[1], int)
                else _TEMPORAL_SEQ_LEN_DEFAULT
            )
            # Rolling window of the most recent embeddings; note that this
            # attribute is only created for temporal models.
            self._embedding_buffer: collections.deque[np.ndarray] = collections.deque(
                maxlen=self._temporal_seq_len,
            )
            logger.info(
                "Temporal model detected: seq_len=%d",
                self._temporal_seq_len,
            )
        else:
            self._is_temporal = False
            self._temporal_seq_len = 0

        # K3: Load additional ensemble models
        if models and self._ensemble is not None:
            # Add primary model to ensemble first, then the extras in order.
            self._ensemble.add_session(self._mlp_session, self._mlp_input_name)
            for extra_model in models:
                extra_session = self._load_session(extra_model)
                extra_input_name = extra_session.get_inputs()[0].name
                self._ensemble.add_session(extra_session, extra_input_name)

        logger.info(
            "WakeDetector initialized: model=%s, threshold=%.2f, backend=%s",
            model,
            threshold,
            self._backend.name,
        )
        # Resolves the model path a second time (it was already resolved
        # inside _load_session) to locate the optional training config.
        self._warn_on_oww_backbone_change(self._resolve_model_path(model))
 504
 505    # ------------------------------------------------------------------
 506    # Context manager support
 507    # ------------------------------------------------------------------
 508
    def __enter__(self) -> WakeDetector:
        """Enter sync context manager. Returns self.

        No resources are acquired here; the instance itself is returned for
        use as the ``with ... as`` target.
        """
        return self
 512
    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager by calling :meth:`close`.

        The exception arguments are ignored and ``None`` is returned, so an
        exception raised inside the ``with`` block is not suppressed.
        """
        self.close()
 521
 522    def close(self) -> None:
 523        """Release inference sessions and reset internal state.
 524
 525        After calling close(), the detector should not be used for inference.
 526        This is called automatically when using WakeDetector as a context
 527        manager.
 528        """
 529        self.reset()
 530        # Release inference session references so the underlying runtime
 531        # (ONNX / TFLite) can free memory immediately rather than waiting
 532        # for garbage collection.
 533        self._mlp_session = None  # type: ignore[assignment]
 534        if self._ensemble is not None:
 535            self._ensemble.clear()
 536        self._oww_backbone = None  # type: ignore[assignment]
 537
    def _create_oww_backbone(self) -> OpenWakeWordBackbone:
        """Create the shared OpenWakeWord backbone.

        The backbone is built on ``self._backend``, so the backend must have
        been assigned before this is called.
        """
        return OpenWakeWordBackbone(self._backend)
 541
 542    def _warn_on_oww_backbone_change(self, model_path: Path) -> None:
 543        """Warn when the installed OWW backbone differs from the training config."""
 544        config_path = model_path.with_suffix(".config.json")
 545        if not config_path.exists():
 546            return
 547
 548        try:
 549            with config_path.open(encoding="utf-8") as f:
 550                config = json.load(f)
 551        except (OSError, json.JSONDecodeError):
 552            return
 553
 554        expected_mel = config.get("oww_mel_sha256")
 555        expected_emb = config.get("oww_emb_sha256")
 556        if not isinstance(expected_mel, str) or not isinstance(expected_emb, str):
 557            return
 558
 559        try:
 560            current_hashes = get_openwakeword_backbone_hashes("onnx")
 561        except Exception:
 562            return
 563
 564        if (
 565            current_hashes["oww_mel_sha256"] != expected_mel
 566            or current_hashes["oww_emb_sha256"] != expected_emb
 567        ):
 568            logger.warning(
 569                "OWW backbone version changed since training. Model may produce degraded results."
 570            )
 571
 572    def _load_session(self, model: str) -> BackendSession:
 573        """Load a model file via the configured backend.
 574
 575        Resolves *model* to a file path (direct path, .onnx/.tflite suffix,
 576        or registry lookup), then delegates to ``self._backend.load()``.
 577
 578        For TFLite backends, if only a ``.onnx`` file exists in the cache
 579        the method looks for a sibling ``.tflite`` file with the same stem.
 580        """
 581        model_path = self._resolve_model_path(model)
 582
 583        # When using the TFLite backend, prefer a .tflite sibling if the
 584        # resolved path is an .onnx file.
 585        if self._backend.name == "tflite" and model_path.suffix == ".onnx":
 586            tflite_sibling = model_path.with_suffix(".tflite")
 587            if tflite_sibling.exists():
 588                model_path = tflite_sibling
 589                logger.debug("TFLite backend: using .tflite sibling %s", model_path)
 590            else:
 591                logger.warning(
 592                    "TFLite backend selected but only .onnx file found at %s. "
 593                    "Convert with: python -c "
 594                    '"from violawake_sdk.backends.tflite_backend import '
 595                    "convert_onnx_to_tflite; convert_onnx_to_tflite('%s')\"",
 596                    model_path,
 597                    model_path,
 598                )
 599
 600        try:
 601            session = self._backend.load(model_path)
 602        except Exception as e:
 603            raise ModelLoadError(f"Failed to load model {model_path}: {e}") from e
 604        logger.debug("Loaded model via %s backend: %s", self._backend.name, model_path)
 605        return session
 606
 607    @staticmethod
 608    def _resolve_model_path(model: str) -> Path:
 609        """Resolve a model name or path string to a concrete file path.
 610
 611        Resolution order:
 612        1. If *model* is an existing file path, use it directly.
 613        2. If *model* ends with ``.onnx`` or ``.tflite``, treat as a path
 614           (raise if not found).
 615        3. Otherwise, look up *model* in the model registry / cache.
 616        """
 617        if Path(model).is_file():
 618            return Path(model)
 619
 620        if model.endswith((".onnx", ".tflite")):
 621            path = Path(model)
 622            if not path.exists():
 623                raise ModelNotFoundError(
 624                    f"Model file not found: {model}. "
 625                    f"If this is a named model, omit the file extension."
 626                )
 627            return path
 628
 629        try:
 630            return get_model_path(model)
 631        except FileNotFoundError as e:
 632            raise ModelNotFoundError(
 633                f"Model '{model}' not found in cache and auto-download failed or is disabled. "
 634                f"Run: violawake-download --model {model}"
 635            ) from e
 636
 637    def _get_embedding(self, audio_frame: bytes | np.ndarray) -> np.ndarray:
 638        """Extract the OWW embedding from an audio frame.
 639
 640        Returns the raw embedding vector before MLP scoring.
 641        Used internally for speaker verification (K5).
 642        """
 643        with self._backbone_lock:
 644            embedding = self._oww_backbone.last_embedding
 645            if embedding is None:
 646                _, embedding = self._oww_backbone.push_audio(audio_frame)
 647        if embedding is None:
 648            return np.zeros(EMBEDDING_DIM, dtype=np.float32)
 649        return embedding
 650
 651    @staticmethod
 652    def _needs_int16_normalization(audio_frame: bytes | np.ndarray) -> bool:
 653        """Check whether audio_frame requires int16-to-float normalization."""
 654        return isinstance(audio_frame, bytes) or (
 655            isinstance(audio_frame, np.ndarray) and audio_frame.dtype == np.int16
 656        )
 657
 658    @staticmethod
 659    def _prepare_model_audio(audio_frame: bytes | np.ndarray) -> np.ndarray:
 660        """Validate an audio frame and normalize it for model inference."""
 661        pcm = validate_audio_chunk(audio_frame)
 662        if WakeDetector._needs_int16_normalization(audio_frame):
 663            return pcm / 32768.0
 664        return pcm
 665
 666    def process(self, audio_frame: bytes | np.ndarray) -> float:
 667        """Process a 20ms audio frame and return the wake word detection score.
 668
 669        If ensemble mode is active (K3), returns the fused score.
 670        The score is recorded for confidence tracking (K2) and reported
 671        to the power manager (K7) if configured.
 672
 673        Thread-safe: protects internal state mutation with a lock.
 674
 675        Raises:
 676            TypeError: If audio_frame is not bytes or ndarray.
 677            ValueError: If audio_frame is empty, malformed, or drastically
 678                larger than the supported streaming frame size.
 679        """
 680        return self._process_core(self._prepare_model_audio(audio_frame), audio_frame)
 681
 682    def _process_core(self, pcm: np.ndarray, raw_audio_frame: bytes | np.ndarray) -> float:
 683        """Internal scoring engine operating on pre-validated, normalized PCM.
 684
 685        Args:
 686            pcm: Float32 array, already validated and normalized by _prepare_model_audio.
 687            raw_audio_frame: Original audio frame passed through to OWW backbone.
 688        """
 689        if pcm.shape[0] != FRAME_SAMPLES:
 690            # Reject pathologically large or empty frames first
 691            if pcm.shape[0] == 0 or pcm.shape[0] > _MAX_PROCESS_FRAME_SAMPLES:
 692                raise ValueError(
 693                    "Audio frame length is too far from the expected streaming size: "
 694                    f"expected {FRAME_SAMPLES} samples, got {pcm.shape[0]} "
 695                    f"(maximum non-pathological size is {_MAX_PROCESS_FRAME_SAMPLES})"
 696                )
 697            # Non-multiples of 320 indicate wrong sample rate — return 0.0
 698            if pcm.shape[0] % FRAME_SAMPLES != 0:
 699                logger.warning(
 700                    "Audio frame has %d samples (not a multiple of %d). "
 701                    "Expected 16kHz, 20ms frames. Returning score 0.0.",
 702                    pcm.shape[0],
 703                    FRAME_SAMPLES,
 704                )
 705                return 0.0
 706        with self._backbone_lock:
 707            produced_embedding, embedding = self._oww_backbone.push_audio(raw_audio_frame)
 708            if embedding is None:
 709                score = self._last_score
 710            elif self._ensemble is not None and self._ensemble.model_count > 0:
 711                score = (
 712                    self._ensemble.score(embedding.flatten())
 713                    if produced_embedding
 714                    else self._last_score
 715                )
 716            elif self._is_temporal:
 717                if produced_embedding:
 718                    self._embedding_buffer.append(embedding.flatten())
 719                    if len(self._embedding_buffer) >= self._temporal_seq_len:
 720                        temporal_input = np.stack(list(self._embedding_buffer))
 721                        temporal_input = temporal_input.reshape(
 722                            1,
 723                            self._temporal_seq_len,
 724                            EMBEDDING_DIM,
 725                        ).astype(np.float32)
 726                        score = float(
 727                            self._mlp_session.run(None, {self._mlp_input_name: temporal_input})[
 728                                0
 729                            ].flatten()[0]
 730                        )
 731                    else:
 732                        score = 0.0
 733                else:
 734                    score = self._last_score
 735            else:
 736                if produced_embedding:
 737                    mlp_input = embedding.reshape(1, EMBEDDING_DIM).astype(np.float32)
 738                    score_output = self._mlp_session.run(None, {self._mlp_input_name: mlp_input})[0]
 739                    score = float(np.asarray(score_output).reshape(-1)[0])
 740                else:
 741                    score = self._last_score
 742
 743        with self._lock:
 744            self._last_score = score
 745            # K2: Record score for confidence tracking
 746            self._score_tracker.record(score)
 747
 748        # K7: Report score to power manager for activity detection
 749        if self._power_manager is not None:
 750            self._power_manager.report_score(score)
 751
 752        return score
 753
 754    def detect(self, audio_frame: bytes | np.ndarray, is_playing: bool = False) -> bool:
 755        """Process a frame and apply the full decision policy.
 756
 757        Integrates adaptive threshold (K4), multi-window confirmation (K2),
 758        speaker verification (K5), and power management (K7) when configured.
 759
 760        Thread-safe: protects internal state mutation with a lock.
 761
 762        Raises:
 763            TypeError: If audio_frame is not bytes or ndarray.
 764            ValueError: If audio_frame is empty or has invalid format.
 765        """
 766        # G1: Input validation (single pass — process_core skips re-validation)
 767        pcm = validate_audio_chunk(audio_frame)
 768
 769        # Compute RMS on int16-scale PCM for the rms_floor comparison.
 770        # rms_floor=1.0 is calibrated for int16 scale (speech ≈ 500–5000,
 771        # silence ≈ 0–5).  Float32 input in [-1, 1] is scaled up so the
 772        # same rms_floor works regardless of input format.
 773        rms = float(np.sqrt(np.mean(pcm**2)))
 774        if not self._needs_int16_normalization(audio_frame):
 775            # Float32/float64 input: RMS is in [0, ~0.7] — scale to int16 range
 776            rms *= 32768.0
 777
 778        # Normalize for model inference
 779        model_pcm = pcm / 32768.0 if self._needs_int16_normalization(audio_frame) else pcm
 780
 781        # K7: Power management -- skip frame if power manager says so
 782        if self._power_manager is not None and not self._power_manager.should_process(pcm):
 783            return False
 784
 785        # K4: Update noise profiler and get adaptive threshold
 786        if self._noise_profiler is not None and self._adaptive_threshold:
 787            adapted = self._noise_profiler.update(pcm)
 788            self._policy.threshold = adapted
 789
 790        score = self._process_core(model_pcm, audio_frame)
 791
 792        # K5: Pre-fetch embedding outside _lock to avoid ABBA deadlock
 793        # (_process_core acquires _backbone_lock -> _lock; we must not
 794        # acquire _backbone_lock while holding _lock).
 795        #
 796        # Trade-off: Under concurrent access, _get_embedding reads the
 797        # backbone's last_embedding which may have been overwritten by
 798        # another thread's _process_core call since our score was computed.
 799        # This means the embedding used for speaker verification may not
 800        # correspond to the score just returned by _process_core.  This is
 801        # accepted for performance — taking _backbone_lock across both
 802        # _process_core and _get_embedding would serialize all detection,
 803        # and the mismatch is benign (embeddings from adjacent frames are
 804        # nearly identical in practice).
 805        speaker_embedding: np.ndarray | None = None
 806        if self._speaker_verify_fn is not None:
 807            speaker_embedding = self._get_embedding(audio_frame)
 808
 809        with self._lock:
 810            # K2: Multi-window confirmation
 811            if score >= self._policy.threshold:
 812                self._confirm_counter += 1
 813            else:
 814                self._confirm_counter = 0
 815
 816            effective_detected = self._confirm_counter >= self._confirm_required
 817
 818            if effective_detected:
 819                detected = self._policy.evaluate(score=score, rms=rms, is_playing=is_playing)
 820            else:
 821                detected = False
 822
 823            if detected:
 824                # K5: Speaker verification post-detection
 825                if self._speaker_verify_fn is not None and speaker_embedding is not None:  # noqa: SIM102
 826                    if not self._speaker_verify_fn(speaker_embedding.flatten()):
 827                        logger.debug("Speaker verification rejected detection")
 828                        return False
 829
 830                self._confirm_counter = 0
 831                return True
 832
 833        return False
 834
 835    def reset_cooldown(self) -> None:
 836        """Reset the cooldown window without clearing confirmation state or buffers."""
 837        with self._lock:
 838            self._policy.reset_cooldown()
 839
 840    def reset(self) -> None:
 841        """Reset cooldown, confirmation state, score history, and temporal buffers.
 842
 843        Lock ordering: _backbone_lock then _lock, matching _process_core
 844        to prevent ABBA deadlock.
 845        """
 846        with self._backbone_lock, self._lock:
 847            self._policy.reset_cooldown()
 848            self._confirm_counter = 0
 849            self._last_score = 0.0
 850            self._score_tracker.reset()
 851            self._oww_backbone.reset()
 852            if self._is_temporal:
 853                self._embedding_buffer.clear()
 854
 855    # ------------------------------------------------------------------
 856    # K2: Confidence API
 857    # ------------------------------------------------------------------
 858
 859    def get_confidence(self) -> ConfidenceResult:
 860        """Return a confidence assessment of the current detection state.
 861
 862        Includes the raw MLP score, multi-window confirmation count,
 863        and a classified confidence level (LOW/MEDIUM/HIGH/CERTAIN).
 864        """
 865        return self._score_tracker.classify(
 866            confirm_count=self._confirm_counter,
 867            confirm_required=self._confirm_required,
 868        )
 869
 870    @property
 871    def last_scores(self) -> tuple[float, ...]:
 872        """Return the recent score history (most recent last)."""
 873        return self._score_tracker.last_scores
 874
 875    # ------------------------------------------------------------------
 876    # K5: Speaker verification helpers
 877    # ------------------------------------------------------------------
 878
 879    def enroll_speaker(self, speaker_id: str, audio_frames: list[bytes | np.ndarray]) -> int:
 880        """Enroll a speaker by extracting embeddings from audio frames.
 881
 882        Requires a ``SpeakerVerificationHook`` as the ``speaker_verify_fn``.
 883
 884        Args:
 885            speaker_id: Unique identifier for the speaker.
 886            audio_frames: Audio frames to extract embeddings from.
 887
 888        Returns:
 889            Total enrollment count for this speaker.
 890
 891        Raises:
 892            RuntimeError: If no SpeakerVerificationHook is configured.
 893        """
 894        from violawake_sdk.speaker import SpeakerVerificationHook
 895
 896        hook = self._speaker_verify_fn
 897        if not isinstance(hook, SpeakerVerificationHook):
 898            raise RuntimeError(
 899                "enroll_speaker requires a SpeakerVerificationHook as speaker_verify_fn"
 900            )
 901
 902        embeddings = []
 903        for frame in audio_frames:
 904            emb = self._get_embedding(frame)
 905            embeddings.append(emb.flatten())
 906
 907        return hook.enroll_speaker(speaker_id, embeddings)
 908
 909    def verify_speaker(self, audio_frame: bytes | np.ndarray) -> SpeakerVerifyResult:
 910        """Verify the speaker in an audio frame against enrolled profiles.
 911
 912        Args:
 913            audio_frame: Audio frame to verify.
 914
 915        Returns:
 916            SpeakerVerifyResult with match details.
 917
 918        Raises:
 919            RuntimeError: If no SpeakerVerificationHook is configured.
 920        """
 921        from violawake_sdk.speaker import SpeakerVerificationHook
 922
 923        hook = self._speaker_verify_fn
 924        if not isinstance(hook, SpeakerVerificationHook):
 925            raise RuntimeError(
 926                "verify_speaker requires a SpeakerVerificationHook as speaker_verify_fn"
 927            )
 928
 929        embedding = self._get_embedding(audio_frame)
 930        return hook.verify_speaker(embedding.flatten())
 931
 932    # ------------------------------------------------------------------
 933    # K6: Audio source factory
 934    # ------------------------------------------------------------------
 935
 936    @classmethod
 937    def from_source(
 938        cls,
 939        source: AudioSource,
 940        model: str = "temporal_cnn",
 941        threshold: float = DEFAULT_THRESHOLD,
 942        cooldown_s: float = DEFAULT_COOLDOWN_S,
 943        **kwargs: Any,
 944    ) -> _SourceDetector:
 945        """Create a WakeDetector bound to an AudioSource.
 946
 947        The returned object wraps a WakeDetector and provides a ``run()``
 948        method that reads frames from the source and runs detection.
 949
 950        Args:
 951            source: Any object implementing the AudioSource protocol.
 952            model: Model name or path.
 953            threshold: Detection threshold.
 954            cooldown_s: Cooldown between detections.
 955            **kwargs: Additional WakeDetector keyword arguments.
 956
 957        Returns:
 958            A _SourceDetector wrapping both the source and detector.
 959        """
 960        detector = cls(
 961            model=model,
 962            threshold=threshold,
 963            cooldown_s=cooldown_s,
 964            **kwargs,
 965        )
 966        return _SourceDetector(detector=detector, source=source)
 967
 968    # ------------------------------------------------------------------
 969    # Original methods
 970    # ------------------------------------------------------------------
 971
 972    def stream_mic(self, device_index: int | None = None) -> Generator[bytes, None, None]:
 973        """Generator that yields 20ms audio frames from the default microphone."""
 974        try:
 975            import pyaudio
 976        except ImportError:
 977            raise ImportError(
 978                "pyaudio is required for microphone features. "
 979                "Install with: pip install violawake[audio]"
 980            ) from None
 981        pa = pyaudio.PyAudio()
 982        try:
 983            stream = pa.open(
 984                format=pyaudio.paInt16,
 985                channels=1,
 986                rate=SAMPLE_RATE,
 987                input=True,
 988                frames_per_buffer=FRAME_SAMPLES,
 989                input_device_index=device_index,
 990            )
 991        except Exception as e:
 992            pa.terminate()
 993            raise AudioCaptureError(
 994                f"Failed to open microphone: {e}. "
 995                f"Check that a microphone is connected and not in use by another application."
 996            ) from e
 997        logger.info("Microphone capture started (16kHz, mono, 20ms frames)")
 998        _MAX_CONSECUTIVE_ERRORS = 10
 999        consecutive_errors = 0
1000        try:
1001            while True:
1002                try:
1003                    yield stream.read(FRAME_SAMPLES, exception_on_overflow=False)
1004                    consecutive_errors = 0
1005                except Exception as e:
1006                    consecutive_errors += 1
1007                    logger.warning(
1008                        "Mic read error (%d/%d): %s", consecutive_errors, _MAX_CONSECUTIVE_ERRORS, e
1009                    )
1010                    if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
1011                        raise AudioCaptureError(
1012                            f"Microphone read failed {_MAX_CONSECUTIVE_ERRORS} consecutive times. "
1013                            f"Last error: {e}"
1014                        ) from e
1015                    continue
1016        finally:
1017            stream.stop_stream()
1018            stream.close()
1019            pa.terminate()
1020            logger.info("Microphone capture stopped")

Wake word detector using ViolaWake MLP on OpenWakeWord embeddings.

Supports pluggable inference backends (ONNX Runtime, TFLite) via the backend parameter. The default "auto" mode tries ONNX Runtime first, then falls back to TFLite, so users on edge devices can run without installing onnxruntime.

Also supports optional competitive features (all opt-in, backward compatible):

  • K2 Confidence API: get_confidence() and last_scores property.
  • K3 Multi-model ensemble: models parameter with fusion strategies.
  • K4 Adaptive threshold: adaptive_threshold parameter with noise profiling.
  • K5 Speaker verification: speaker_verify_fn callback for post-detection.
  • K6 Audio source abstraction: from_source() class method factory.
  • K7 Power management: power_manager parameter for duty cycling.

Threshold tuning guide:

  • 0.70 = sensitive (more detections, more false positives)
  • 0.80 = balanced (default, recommended starting point)
  • 0.85 = conservative (fewer false positives, may miss some)
  • 0.90+ = very conservative (for noisy environments)

Start at 0.80 and adjust based on your false accept rate.

Arguments:
  • model: Model name from the registry, or a path to a model file.
  • threshold: Detection confidence threshold in [0.0, 1.0].
  • cooldown_s: Minimum seconds between consecutive detections.
  • providers: ONNX Runtime execution providers (ignored for TFLite).
  • backend: Inference backend selector ("onnx", "tflite", "auto").
  • config: A DetectorConfig instance bundling all advanced options. Mutually exclusive with the individual advanced kwargs below.
  • models: Additional model paths for ensemble scoring (K3).
  • fusion_strategy: Score fusion strategy for ensemble (K3).
  • fusion_weights: Per-model weights for weighted_average fusion (K3).
  • adaptive_threshold: Enable dynamic threshold based on noise (K4).
  • noise_profiler: Custom NoiseProfiler instance (K4).
  • speaker_verify_fn: Post-detection speaker verification callback (K5).
  • power_manager: Power management controller for duty cycling (K7).
  • confirm_count: Consecutive above-threshold scores required for detection (K2).
  • score_history_size: Number of recent scores to retain (K2).
WakeDetector( model: str = 'temporal_cnn', threshold: float = 0.8, cooldown_s: float = 2.0, providers: list[str] | None = None, backend: str = 'auto', *, config: DetectorConfig | None = None, models: list[str] | None = <object object>, fusion_strategy: FusionStrategy | str = <object object>, fusion_weights: list[float] | None = <object object>, adaptive_threshold: bool = <object object>, noise_profiler: NoiseProfiler | None = <object object>, speaker_verify_fn: Callable[[numpy.ndarray], bool] | None = <object object>, power_manager: PowerManager | None = <object object>, confirm_count: int = <object object>, score_history_size: int = <object object>)
    def __init__(
        self,
        model: str = "temporal_cnn",
        threshold: float = DEFAULT_THRESHOLD,
        cooldown_s: float = DEFAULT_COOLDOWN_S,
        providers: list[str] | None = None,
        backend: str = "auto",
        *,
        config: DetectorConfig | None = None,
        # K3: Multi-model ensemble (individual kwargs, backwards compat)
        models: list[str] | None = _UNSET,
        fusion_strategy: FusionStrategy | str = _UNSET,
        fusion_weights: list[float] | None = _UNSET,
        # K4: Adaptive threshold
        adaptive_threshold: bool = _UNSET,
        noise_profiler: NoiseProfiler | None = _UNSET,
        # K5: Speaker verification
        speaker_verify_fn: Callable[[np.ndarray], bool] | None = _UNSET,
        # K7: Power management
        power_manager: PowerManager | None = _UNSET,
        # K2: Confidence tracking
        confirm_count: int = _UNSET,
        score_history_size: int = _UNSET,
    ) -> None:
        """Validate the arguments, create the backend and load the models.

        Advanced options come either from ``config`` or from the individual
        keyword arguments, never both. The ``_UNSET`` sentinel default lets
        the constructor tell "not passed" apart from an explicit value such
        as ``None``.

        Raises:
            TypeError: If threshold or cooldown_s is not a number.
            ValueError: If config is combined with individual advanced
                kwargs, threshold is outside [0.0, 1.0], cooldown_s is
                negative, backend is not a valid choice, or confirm_count
                is below 1.
        """
        # --- Resolve config vs individual kwargs -------------------------
        # Detect if any advanced kwarg was explicitly passed (not _UNSET)
        _locals = {
            "models": models,
            "fusion_strategy": fusion_strategy,
            "fusion_weights": fusion_weights,
            "adaptive_threshold": adaptive_threshold,
            "noise_profiler": noise_profiler,
            "speaker_verify_fn": speaker_verify_fn,
            "power_manager": power_manager,
            "confirm_count": confirm_count,
            "score_history_size": score_history_size,
        }
        explicit_kwargs = {name for name, val in _locals.items() if val is not _UNSET}
        if config is not None and explicit_kwargs:
            raise ValueError(
                f"Cannot specify both config= and individual advanced kwargs. "
                f"Conflicting kwargs: {sorted(explicit_kwargs)}. "
                f"Either use config=DetectorConfig(...) or pass kwargs directly, not both."
            )

        if config is not None:
            # Unpack from DetectorConfig
            models = config.models
            fusion_strategy = config.fusion_strategy
            fusion_weights = config.fusion_weights
            adaptive_threshold = config.adaptive_threshold
            noise_profiler = config.noise_profiler
            speaker_verify_fn = config.speaker_verify_fn
            power_manager = config.power_manager
            confirm_count = config.confirm_count
            score_history_size = config.score_history_size
        else:
            # Apply defaults for any _UNSET values (backwards compat path)
            if models is _UNSET:
                models = None
            if fusion_strategy is _UNSET:
                fusion_strategy = FusionStrategy.AVERAGE
            if fusion_weights is _UNSET:
                fusion_weights = None
            if adaptive_threshold is _UNSET:
                adaptive_threshold = False
            if noise_profiler is _UNSET:
                noise_profiler = None
            if speaker_verify_fn is _UNSET:
                speaker_verify_fn = None
            if power_manager is _UNSET:
                power_manager = None
            if confirm_count is _UNSET:
                confirm_count = 1
            if score_history_size is _UNSET:
                score_history_size = 50

        # G1: Input validation for public constructor parameters
        if not isinstance(threshold, (int, float)):
            raise TypeError(f"threshold must be a number, got {type(threshold).__name__}")
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}")
        if not isinstance(cooldown_s, (int, float)):
            raise TypeError(f"cooldown_s must be a number, got {type(cooldown_s).__name__}")
        if cooldown_s < 0:
            raise ValueError(f"cooldown_s must be >= 0, got {cooldown_s!r}")
        if backend not in self._VALID_BACKENDS:
            raise ValueError(f"backend must be one of {self._VALID_BACKENDS}, got {backend!r}")
        if confirm_count < 1:
            raise ValueError(f"confirm_count must be >= 1, got {confirm_count}")

        self.threshold = threshold
        # Two separate locks: _backbone_lock is held around backbone/model
        # scoring, _lock around score/confirmation state. reset() nests them
        # in the order _backbone_lock, then _lock.
        self._lock = threading.Lock()
        self._backbone_lock = threading.Lock()
        self._policy = WakeDecisionPolicy(threshold=threshold, cooldown_s=cooldown_s)
        # Fall back to the CPU execution provider when none is given.
        self._providers = providers or ["CPUExecutionProvider"]
        self._backend: InferenceBackend = get_backend(backend, providers=self._providers)

        # K2: Confidence tracking
        self._score_tracker = ScoreTracker(
            threshold=threshold,
            history_size=score_history_size,
        )
        self._confirm_required = confirm_count
        self._confirm_counter = 0

        # K3: Ensemble support
        # The scorer is only created when extra models were requested; the
        # sessions themselves are attached further down, after the primary
        # model has been loaded.
        self._ensemble: EnsembleScorer | None = None
        if models and len(models) > 0:
            if isinstance(fusion_strategy, str):
                fusion_strategy = FusionStrategy(fusion_strategy)
            self._ensemble = EnsembleScorer(
                strategy=fusion_strategy,
                weights=fusion_weights,
            )

        # K4: Noise profiler / adaptive threshold
        # An explicitly supplied profiler wins; otherwise one is created only
        # when adaptive thresholding is enabled.
        self._adaptive_threshold = adaptive_threshold
        if noise_profiler is not None:
            self._noise_profiler: NoiseProfiler | None = noise_profiler
        elif adaptive_threshold:
            self._noise_profiler = NoiseProfiler(base_threshold=threshold)
        else:
            self._noise_profiler = None

        # K5: Speaker verification
        self._speaker_verify_fn = speaker_verify_fn

        # K7: Power manager
        self._power_manager = power_manager

        # Warn on deprecated models
        if model in MODEL_REGISTRY and "DEPRECATED" in MODEL_REGISTRY[model].description:
            import warnings

            warnings.warn(
                f"Model '{model}' is deprecated: {MODEL_REGISTRY[model].description}. "
                f"Use model='temporal_cnn' instead.",
                DeprecationWarning,
                stacklevel=2,
            )

        # Load models
        self._oww_backbone = self._create_oww_backbone()
        self._mlp_session = self._load_session(model)
        self._mlp_input_name = self._mlp_session.get_inputs()[0].name
        self._last_score = 0.0

        # Detect temporal vs MLP model from input shape
        mlp_input_shape = self._mlp_session.get_inputs()[0].shape
        if len(mlp_input_shape) == 3:
            # Temporal model: input is (batch, seq_len, embedding_dim)
            self._is_temporal = True
            # Use the declared sequence length when it is a concrete int,
            # otherwise fall back to the module default.
            self._temporal_seq_len = (
                mlp_input_shape[1]
                if isinstance(mlp_input_shape[1], int)
                else _TEMPORAL_SEQ_LEN_DEFAULT
            )
            # Bounded deque: once full, appending drops the oldest embedding.
            # This attribute exists only for temporal models.
            self._embedding_buffer: collections.deque[np.ndarray] = collections.deque(
                maxlen=self._temporal_seq_len,
            )
            logger.info(
                "Temporal model detected: seq_len=%d",
                self._temporal_seq_len,
            )
        else:
            self._is_temporal = False
            self._temporal_seq_len = 0

        # K3: Load additional ensemble models
        if models and self._ensemble is not None:
            # Add primary model to ensemble
            self._ensemble.add_session(self._mlp_session, self._mlp_input_name)
            # Then one session per extra model, in the order given.
            for extra_model in models:
                extra_session = self._load_session(extra_model)
                extra_input_name = extra_session.get_inputs()[0].name
                self._ensemble.add_session(extra_session, extra_input_name)

        logger.info(
            "WakeDetector initialized: model=%s, threshold=%.2f, backend=%s",
            model,
            threshold,
            self._backend.name,
        )
        self._warn_on_oww_backbone_change(self._resolve_model_path(model))
threshold
def close(self) -> None:
522    def close(self) -> None:
523        """Release inference sessions and reset internal state.
524
525        After calling close(), the detector should not be used for inference.
526        This is called automatically when using WakeDetector as a context
527        manager.
528        """
529        self.reset()
530        # Release inference session references so the underlying runtime
531        # (ONNX / TFLite) can free memory immediately rather than waiting
532        # for garbage collection.
533        self._mlp_session = None  # type: ignore[assignment]
534        if self._ensemble is not None:
535            self._ensemble.clear()
536        self._oww_backbone = None  # type: ignore[assignment]

Release inference sessions and reset internal state.

After calling close(), the detector should not be used for inference. This is called automatically when using WakeDetector as a context manager.

def process(self, audio_frame: bytes | numpy.ndarray) -> float:
666    def process(self, audio_frame: bytes | np.ndarray) -> float:
667        """Process a 20ms audio frame and return the wake word detection score.
668
669        If ensemble mode is active (K3), returns the fused score.
670        The score is recorded for confidence tracking (K2) and reported
671        to the power manager (K7) if configured.
672
673        Thread-safe: protects internal state mutation with a lock.
674
675        Raises:
676            TypeError: If audio_frame is not bytes or ndarray.
677            ValueError: If audio_frame is empty, malformed, or drastically
678                larger than the supported streaming frame size.
679        """
680        return self._process_core(self._prepare_model_audio(audio_frame), audio_frame)

Process a 20ms audio frame and return the wake word detection score.

If ensemble mode is active (K3), returns the fused score. The score is recorded for confidence tracking (K2) and reported to the power manager (K7) if configured.

Thread-safe: protects internal state mutation with a lock.

Raises:
  • TypeError: If audio_frame is not bytes or ndarray.
  • ValueError: If audio_frame is empty, malformed, or drastically larger than the supported streaming frame size.
def detect( self, audio_frame: bytes | numpy.ndarray, is_playing: bool = False) -> bool:
754    def detect(self, audio_frame: bytes | np.ndarray, is_playing: bool = False) -> bool:
755        """Process a frame and apply the full decision policy.
756
757        Integrates adaptive threshold (K4), multi-window confirmation (K2),
758        speaker verification (K5), and power management (K7) when configured.
759
760        Thread-safe: protects internal state mutation with a lock.
761
762        Raises:
763            TypeError: If audio_frame is not bytes or ndarray.
764            ValueError: If audio_frame is empty or has invalid format.
765        """
766        # G1: Input validation (single pass — process_core skips re-validation)
767        pcm = validate_audio_chunk(audio_frame)
768
769        # Compute RMS on int16-scale PCM for the rms_floor comparison.
770        # rms_floor=1.0 is calibrated for int16 scale (speech ≈ 500–5000,
771        # silence ≈ 0–5).  Float32 input in [-1, 1] is scaled up so the
772        # same rms_floor works regardless of input format.
773        rms = float(np.sqrt(np.mean(pcm**2)))
774        if not self._needs_int16_normalization(audio_frame):
775            # Float32/float64 input: RMS is in [0, ~0.7] — scale to int16 range
776            rms *= 32768.0
777
778        # Normalize for model inference
779        model_pcm = pcm / 32768.0 if self._needs_int16_normalization(audio_frame) else pcm
780
781        # K7: Power management -- skip frame if power manager says so
782        if self._power_manager is not None and not self._power_manager.should_process(pcm):
783            return False
784
785        # K4: Update noise profiler and get adaptive threshold
786        if self._noise_profiler is not None and self._adaptive_threshold:
787            adapted = self._noise_profiler.update(pcm)
788            self._policy.threshold = adapted
789
790        score = self._process_core(model_pcm, audio_frame)
791
792        # K5: Pre-fetch embedding outside _lock to avoid ABBA deadlock
793        # (_process_core acquires _backbone_lock -> _lock; we must not
794        # acquire _backbone_lock while holding _lock).
795        #
796        # Trade-off: Under concurrent access, _get_embedding reads the
797        # backbone's last_embedding which may have been overwritten by
798        # another thread's _process_core call since our score was computed.
799        # This means the embedding used for speaker verification may not
800        # correspond to the score just returned by _process_core.  This is
801        # accepted for performance — taking _backbone_lock across both
802        # _process_core and _get_embedding would serialize all detection,
803        # and the mismatch is benign (embeddings from adjacent frames are
804        # nearly identical in practice).
805        speaker_embedding: np.ndarray | None = None
806        if self._speaker_verify_fn is not None:
807            speaker_embedding = self._get_embedding(audio_frame)
808
809        with self._lock:
810            # K2: Multi-window confirmation
811            if score >= self._policy.threshold:
812                self._confirm_counter += 1
813            else:
814                self._confirm_counter = 0
815
816            effective_detected = self._confirm_counter >= self._confirm_required
817
818            if effective_detected:
819                detected = self._policy.evaluate(score=score, rms=rms, is_playing=is_playing)
820            else:
821                detected = False
822
823            if detected:
824                # K5: Speaker verification post-detection
825                if self._speaker_verify_fn is not None and speaker_embedding is not None:  # noqa: SIM102
826                    if not self._speaker_verify_fn(speaker_embedding.flatten()):
827                        logger.debug("Speaker verification rejected detection")
828                        return False
829
830                self._confirm_counter = 0
831                return True
832
833        return False

Process a frame and apply the full decision policy.

Integrates adaptive threshold (K4), multi-window confirmation (K2), speaker verification (K5), and power management (K7) when configured.

Thread-safe: protects internal state mutation with a lock.

Raises:
  • TypeError: If audio_frame is not bytes or ndarray.
  • ValueError: If audio_frame is empty or has invalid format.
def reset_cooldown(self) -> None:
835    def reset_cooldown(self) -> None:
836        """Reset the cooldown window without clearing confirmation state or buffers."""
837        with self._lock:
838            self._policy.reset_cooldown()

Reset the cooldown window without clearing confirmation state or buffers.

def reset(self) -> None:
840    def reset(self) -> None:
841        """Reset cooldown, confirmation state, score history, and temporal buffers.
842
843        Lock ordering: _backbone_lock then _lock, matching _process_core
844        to prevent ABBA deadlock.
845        """
846        with self._backbone_lock, self._lock:
847            self._policy.reset_cooldown()
848            self._confirm_counter = 0
849            self._last_score = 0.0
850            self._score_tracker.reset()
851            self._oww_backbone.reset()
852            if self._is_temporal:
853                self._embedding_buffer.clear()

Reset cooldown, confirmation state, score history, and temporal buffers.

Lock ordering: _backbone_lock then _lock, matching _process_core to prevent ABBA deadlock.

def get_confidence(self) -> ConfidenceResult:
859    def get_confidence(self) -> ConfidenceResult:
860        """Return a confidence assessment of the current detection state.
861
862        Includes the raw MLP score, multi-window confirmation count,
863        and a classified confidence level (LOW/MEDIUM/HIGH/CERTAIN).
864        """
865        return self._score_tracker.classify(
866            confirm_count=self._confirm_counter,
867            confirm_required=self._confirm_required,
868        )

Return a confidence assessment of the current detection state.

Includes the raw MLP score, multi-window confirmation count, and a classified confidence level (LOW/MEDIUM/HIGH/CERTAIN).

last_scores: tuple[float, ...]
870    @property
871    def last_scores(self) -> tuple[float, ...]:
872        """Return the recent score history (most recent last)."""
873        return self._score_tracker.last_scores

Return the recent score history (most recent last).

def enroll_speaker(self, speaker_id: str, audio_frames: list[bytes | numpy.ndarray]) -> int:
879    def enroll_speaker(self, speaker_id: str, audio_frames: list[bytes | np.ndarray]) -> int:
880        """Enroll a speaker by extracting embeddings from audio frames.
881
882        Requires a ``SpeakerVerificationHook`` as the ``speaker_verify_fn``.
883
884        Args:
885            speaker_id: Unique identifier for the speaker.
886            audio_frames: Audio frames to extract embeddings from.
887
888        Returns:
889            Total enrollment count for this speaker.
890
891        Raises:
892            RuntimeError: If no SpeakerVerificationHook is configured.
893        """
894        from violawake_sdk.speaker import SpeakerVerificationHook
895
896        hook = self._speaker_verify_fn
897        if not isinstance(hook, SpeakerVerificationHook):
898            raise RuntimeError(
899                "enroll_speaker requires a SpeakerVerificationHook as speaker_verify_fn"
900            )
901
902        embeddings = []
903        for frame in audio_frames:
904            emb = self._get_embedding(frame)
905            embeddings.append(emb.flatten())
906
907        return hook.enroll_speaker(speaker_id, embeddings)

Enroll a speaker by extracting embeddings from audio frames.

Requires a SpeakerVerificationHook as the speaker_verify_fn.

Arguments:
  • speaker_id: Unique identifier for the speaker.
  • audio_frames: Audio frames to extract embeddings from.
Returns:

Total enrollment count for this speaker.

Raises:
  • RuntimeError: If no SpeakerVerificationHook is configured.
def verify_speaker( self, audio_frame: bytes | numpy.ndarray) -> violawake_sdk.speaker.SpeakerVerifyResult:
909    def verify_speaker(self, audio_frame: bytes | np.ndarray) -> SpeakerVerifyResult:
910        """Verify the speaker in an audio frame against enrolled profiles.
911
912        Args:
913            audio_frame: Audio frame to verify.
914
915        Returns:
916            SpeakerVerifyResult with match details.
917
918        Raises:
919            RuntimeError: If no SpeakerVerificationHook is configured.
920        """
921        from violawake_sdk.speaker import SpeakerVerificationHook
922
923        hook = self._speaker_verify_fn
924        if not isinstance(hook, SpeakerVerificationHook):
925            raise RuntimeError(
926                "verify_speaker requires a SpeakerVerificationHook as speaker_verify_fn"
927            )
928
929        embedding = self._get_embedding(audio_frame)
930        return hook.verify_speaker(embedding.flatten())

Verify the speaker in an audio frame against enrolled profiles.

Arguments:
  • audio_frame: Audio frame to verify.
Returns:

SpeakerVerifyResult with match details.

Raises:
  • RuntimeError: If no SpeakerVerificationHook is configured.
@classmethod
def from_source( cls, source: violawake_sdk.audio_source.AudioSource, model: str = 'temporal_cnn', threshold: float = 0.8, cooldown_s: float = 2.0, **kwargs: Any) -> violawake_sdk.wake_detector._SourceDetector:
936    @classmethod
937    def from_source(
938        cls,
939        source: AudioSource,
940        model: str = "temporal_cnn",
941        threshold: float = DEFAULT_THRESHOLD,
942        cooldown_s: float = DEFAULT_COOLDOWN_S,
943        **kwargs: Any,
944    ) -> _SourceDetector:
945        """Create a WakeDetector bound to an AudioSource.
946
947        The returned object wraps a WakeDetector and provides a ``run()``
948        method that reads frames from the source and runs detection.
949
950        Args:
951            source: Any object implementing the AudioSource protocol.
952            model: Model name or path.
953            threshold: Detection threshold.
954            cooldown_s: Cooldown between detections.
955            **kwargs: Additional WakeDetector keyword arguments.
956
957        Returns:
958            A _SourceDetector wrapping both the source and detector.
959        """
960        detector = cls(
961            model=model,
962            threshold=threshold,
963            cooldown_s=cooldown_s,
964            **kwargs,
965        )
966        return _SourceDetector(detector=detector, source=source)

Create a WakeDetector bound to an AudioSource.

The returned object wraps a WakeDetector and provides a run() method that reads frames from the source and runs detection.

Arguments:
  • source: Any object implementing the AudioSource protocol.
  • model: Model name or path.
  • threshold: Detection threshold.
  • cooldown_s: Cooldown between detections.
  • **kwargs: Additional WakeDetector keyword arguments.
Returns:

A _SourceDetector wrapping both the source and detector.

def stream_mic(self, device_index: int | None = None) -> Generator[bytes, None, None]:
 972    def stream_mic(self, device_index: int | None = None) -> Generator[bytes, None, None]:
 973        """Generator that yields 20ms audio frames from the default microphone."""
 974        try:
 975            import pyaudio
 976        except ImportError:
 977            raise ImportError(
 978                "pyaudio is required for microphone features. "
 979                "Install with: pip install violawake[audio]"
 980            ) from None
 981        pa = pyaudio.PyAudio()
 982        try:
 983            stream = pa.open(
 984                format=pyaudio.paInt16,
 985                channels=1,
 986                rate=SAMPLE_RATE,
 987                input=True,
 988                frames_per_buffer=FRAME_SAMPLES,
 989                input_device_index=device_index,
 990            )
 991        except Exception as e:
 992            pa.terminate()
 993            raise AudioCaptureError(
 994                f"Failed to open microphone: {e}. "
 995                f"Check that a microphone is connected and not in use by another application."
 996            ) from e
 997        logger.info("Microphone capture started (16kHz, mono, 20ms frames)")
 998        _MAX_CONSECUTIVE_ERRORS = 10
 999        consecutive_errors = 0
1000        try:
1001            while True:
1002                try:
1003                    yield stream.read(FRAME_SAMPLES, exception_on_overflow=False)
1004                    consecutive_errors = 0
1005                except Exception as e:
1006                    consecutive_errors += 1
1007                    logger.warning(
1008                        "Mic read error (%d/%d): %s", consecutive_errors, _MAX_CONSECUTIVE_ERRORS, e
1009                    )
1010                    if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
1011                        raise AudioCaptureError(
1012                            f"Microphone read failed {_MAX_CONSECUTIVE_ERRORS} consecutive times. "
1013                            f"Last error: {e}"
1014                        ) from e
1015                    continue
1016        finally:
1017            stream.stop_stream()
1018            stream.close()
1019            pa.terminate()
1020            logger.info("Microphone capture stopped")

Generator that yields 20ms audio frames from the microphone selected by device_index (the default input device when device_index is None).

class AsyncWakeDetector:
 38class AsyncWakeDetector:
 39    """Async wrapper around ``WakeDetector`` for asyncio-based applications.
 40
 41    All CPU-bound inference is dispatched to a background thread via
 42    ``loop.run_in_executor``. The wrapper is fully transparent -- it
 43    accepts the same constructor arguments and exposes the same methods
 44    as ``WakeDetector``, but with ``async`` signatures.
 45
 46    Args:
 47        **kwargs: Forwarded to ``WakeDetector.__init__``.
 48    """
 49
 50    def __init__(self, **kwargs: Any) -> None:
 51        self._detector = WakeDetector(**kwargs)
 52        self._executor: ThreadPoolExecutor | None = None
 53
 54    def _get_executor(self) -> ThreadPoolExecutor:
 55        if self._executor is None:
 56            self._executor = ThreadPoolExecutor(max_workers=1)
 57        return self._executor
 58
 59    async def __aenter__(self) -> AsyncWakeDetector:
 60        """Enter async context manager."""
 61        return self
 62
 63    async def __aexit__(
 64        self, exc_type: type | None, exc_val: BaseException | None, exc_tb: object
 65    ) -> None:
 66        """Exit async context manager, shutting down the executor."""
 67        self.close()
 68
 69    async def detect(
 70        self,
 71        audio_frame: bytes | np.ndarray,
 72        is_playing: bool = False,
 73    ) -> bool:
 74        """Async version of ``WakeDetector.detect``."""
 75        loop = asyncio.get_running_loop()
 76        return await loop.run_in_executor(
 77            self._get_executor(),
 78            lambda: self._detector.detect(audio_frame, is_playing),
 79        )
 80
 81    async def process(
 82        self,
 83        audio_frame: bytes | np.ndarray,
 84    ) -> float:
 85        """Async version of ``WakeDetector.process``."""
 86        loop = asyncio.get_running_loop()
 87        return await loop.run_in_executor(
 88            self._get_executor(),
 89            lambda: self._detector.process(audio_frame),
 90        )
 91
 92    async def stream(
 93        self,
 94        source: AsyncIterator[bytes | np.ndarray],
 95    ) -> AsyncIterator[bool]:
 96        """Async generator that yields detection results from an async audio source.
 97
 98        Usage::
 99
100            async for detected in detector.stream(audio_source):
101                if detected:
102                    print("Wake word!")
103
104        Args:
105            source: An async iterator yielding audio frames.
106
107        Yields:
108            Boolean detection result for each frame.
109        """
110        async for frame in source:
111            yield await self.detect(frame)
112
113    def reset_cooldown(self) -> None:
114        """Reset the cooldown window (delegates to WakeDetector public API)."""
115        self._detector.reset_cooldown()
116
117    @property
118    def threshold(self) -> float:
119        """Current detection threshold."""
120        return self._detector.threshold
121
122    def get_confidence(self) -> ConfidenceResult:
123        """Return confidence assessment of the current detection state (K2)."""
124        return self._detector.get_confidence()
125
126    @property
127    def last_scores(self) -> tuple[float, ...]:
128        """Return the recent score history (most recent last)."""
129        return self._detector.last_scores
130
131    def close(self) -> None:
132        """Shut down the background executor and release detector resources.
133
134        Safe to call multiple times.
135        """
136        if self._executor is not None:
137            self._executor.shutdown(wait=True)
138            self._executor = None
139        self._detector.close()
140
141    def __del__(self) -> None:
142        # Use wait=False in __del__ to avoid blocking the GC/finalizer thread.
143        # The explicit close() method still uses wait=True for graceful shutdown.
144        if self._executor is not None:
145            self._executor.shutdown(wait=False)
146            self._executor = None

Async wrapper around WakeDetector for asyncio-based applications.

All CPU-bound inference is dispatched to a background thread via loop.run_in_executor. The wrapper accepts the same constructor arguments as WakeDetector and provides async versions of detect and process, plus a stream helper; reset_cooldown, threshold, get_confidence, last_scores and close are synchronous delegations.

Arguments:
  • **kwargs: Forwarded to WakeDetector.__init__.
AsyncWakeDetector(**kwargs: Any)
50    def __init__(self, **kwargs: Any) -> None:
51        self._detector = WakeDetector(**kwargs)
52        self._executor: ThreadPoolExecutor | None = None
async def detect( self, audio_frame: bytes | numpy.ndarray, is_playing: bool = False) -> bool:
69    async def detect(
70        self,
71        audio_frame: bytes | np.ndarray,
72        is_playing: bool = False,
73    ) -> bool:
74        """Async version of ``WakeDetector.detect``."""
75        loop = asyncio.get_running_loop()
76        return await loop.run_in_executor(
77            self._get_executor(),
78            lambda: self._detector.detect(audio_frame, is_playing),
79        )

Async version of WakeDetector.detect.

async def process(self, audio_frame: bytes | numpy.ndarray) -> float:
81    async def process(
82        self,
83        audio_frame: bytes | np.ndarray,
84    ) -> float:
85        """Async version of ``WakeDetector.process``."""
86        loop = asyncio.get_running_loop()
87        return await loop.run_in_executor(
88            self._get_executor(),
89            lambda: self._detector.process(audio_frame),
90        )

Async version of WakeDetector.process.

async def stream( self, source: AsyncIterator[bytes | numpy.ndarray]) -> AsyncIterator[bool]:
 92    async def stream(
 93        self,
 94        source: AsyncIterator[bytes | np.ndarray],
 95    ) -> AsyncIterator[bool]:
 96        """Async generator that yields detection results from an async audio source.
 97
 98        Usage::
 99
100            async for detected in detector.stream(audio_source):
101                if detected:
102                    print("Wake word!")
103
104        Args:
105            source: An async iterator yielding audio frames.
106
107        Yields:
108            Boolean detection result for each frame.
109        """
110        async for frame in source:
111            yield await self.detect(frame)

Async generator that yields detection results from an async audio source.

Usage::

async for detected in detector.stream(audio_source):
    if detected:
        print("Wake word!")
Arguments:
  • source: An async iterator yielding audio frames.
Yields:

Boolean detection result for each frame.

def reset_cooldown(self) -> None:
113    def reset_cooldown(self) -> None:
114        """Reset the cooldown window (delegates to WakeDetector public API)."""
115        self._detector.reset_cooldown()

Reset the cooldown window (delegates to WakeDetector public API).

threshold: float
117    @property
118    def threshold(self) -> float:
119        """Current detection threshold."""
120        return self._detector.threshold

Current detection threshold.

def get_confidence(self) -> ConfidenceResult:
122    def get_confidence(self) -> ConfidenceResult:
123        """Return confidence assessment of the current detection state (K2)."""
124        return self._detector.get_confidence()

Return confidence assessment of the current detection state (K2).

last_scores: tuple[float, ...]
126    @property
127    def last_scores(self) -> tuple[float, ...]:
128        """Return the recent score history (most recent last)."""
129        return self._detector.last_scores

Return the recent score history (most recent last).

def close(self) -> None:
131    def close(self) -> None:
132        """Shut down the background executor and release detector resources.
133
134        Safe to call multiple times.
135        """
136        if self._executor is not None:
137            self._executor.shutdown(wait=True)
138            self._executor = None
139        self._detector.close()

Shut down the background executor and release detector resources.

Safe to call multiple times.

class WakeDecisionPolicy:
class WakeDecisionPolicy:
    """4-gate core decision pipeline (RMS floor, threshold, cooldown, playback suppression).

    Extended by WakeDetector with optional confirmation (K2), adaptive
    threshold (K4), and speaker verification (K5).

    Gate 1: Zero-input guard -- skip if RMS < rms_floor (default 1.0)
    Gate 2: Score threshold -- skip if model score < threshold (or is NaN)
    Gate 3: Cooldown -- ignore events within cooldown_s of last detection
    Gate 4: Listening gate -- suppress during active playback (optional)

    Args:
        threshold: Minimum score for a detection; must lie in [0.0, 1.0].
        cooldown_s: Seconds after a detection during which further
            detections are rejected.
        rms_floor: Frames with an RMS below this value are rejected.

    Raises:
        ValueError: If ``threshold`` is outside [0.0, 1.0].
    """

    def __init__(
        self,
        threshold: float = DEFAULT_THRESHOLD,
        cooldown_s: float = DEFAULT_COOLDOWN_S,
        rms_floor: float = 1.0,
    ) -> None:
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}")

        self.threshold = threshold
        self.cooldown_s = cooldown_s
        self.rms_floor = rms_floor
        # -inf means "no detection yet". time.monotonic() has an undefined
        # reference point, so a 0.0 sentinel could suppress the first
        # detection whenever the clock reads less than cooldown_s.
        self._last_detection: float = float("-inf")

    def evaluate(
        self,
        score: float,
        rms: float = 100.0,
        is_playing: bool = False,
    ) -> bool:
        """Evaluate whether a wake word event should be triggered.

        The gates are checked in order; the cooldown timestamp is updated only
        when every gate passes.

        Args:
            score: Model score for the current frame.
            rms: RMS energy of the current frame.
            is_playing: True while playback is active.

        Returns:
            True if all four gates pass, False otherwise.
        """
        if rms < self.rms_floor:
            logger.debug("Gate 1 reject: RMS %.1f below floor %.1f", rms, self.rms_floor)
            return False
        # Written as "not >=" so that a NaN score is rejected rather than
        # slipping past a "<" comparison.
        if not score >= self.threshold:
            return False
        now = time.monotonic()
        elapsed = now - self._last_detection
        if elapsed < self.cooldown_s:
            logger.debug(
                "Gate 3 reject: cooldown active (%.1fs remaining)",
                self.cooldown_s - elapsed,
            )
            return False
        if is_playing:
            logger.debug("Gate 4 reject: playback active")
            return False
        self._last_detection = now
        logger.info("Wake word detected! score=%.3f", score)
        return True

    def reset_cooldown(self) -> None:
        """Reset the cooldown window (useful for testing)."""
        self._last_detection = float("-inf")

4-gate core decision pipeline (RMS floor, threshold, cooldown, playback suppression).

Extended by WakeDetector with optional confirmation (K2), adaptive threshold (K4), and speaker verification (K5).

Gate 1: Zero-input guard -- skip if RMS < rms_floor (default 1.0; silence / DC offset artifact)
Gate 2: Score threshold -- skip if model score < threshold
Gate 3: Cooldown -- ignore events within cooldown_s of last detection
Gate 4: Listening gate -- suppress during active playback (optional)

WakeDecisionPolicy( threshold: float = 0.8, cooldown_s: float = 2.0, rms_floor: float = 1.0)
227    def __init__(
228        self,
229        threshold: float = DEFAULT_THRESHOLD,
230        cooldown_s: float = DEFAULT_COOLDOWN_S,
231        rms_floor: float = 1.0,
232    ) -> None:
233        if not 0.0 <= threshold <= 1.0:
234            raise ValueError(f"threshold must be in [0.0, 1.0], got {threshold!r}")
235
236        self.threshold = threshold
237        self.cooldown_s = cooldown_s
238        self.rms_floor = rms_floor
239        self._last_detection: float = 0.0
threshold
cooldown_s
rms_floor
def evaluate(self, score: float, rms: float = 100.0, is_playing: bool = False) -> bool:
241    def evaluate(
242        self,
243        score: float,
244        rms: float = 100.0,
245        is_playing: bool = False,
246    ) -> bool:
247        """Evaluate whether a wake word event should be triggered."""
248        if rms < self.rms_floor:
249            logger.debug("Gate 1 reject: RMS %.1f below floor %.1f", rms, self.rms_floor)
250            return False
251        if score < self.threshold:
252            return False
253        now = time.monotonic()
254        if now - self._last_detection < self.cooldown_s:
255            logger.debug(
256                "Gate 3 reject: cooldown active (%.1fs remaining)",
257                self.cooldown_s - (now - self._last_detection),
258            )
259            return False
260        if is_playing:
261            logger.debug("Gate 4 reject: playback active")
262            return False
263        self._last_detection = now
264        logger.info("Wake word detected! score=%.3f", score)
265        return True

Evaluate whether a wake word event should be triggered.

def reset_cooldown(self) -> None:
267    def reset_cooldown(self) -> None:
268        """Reset the cooldown window (useful for testing)."""
269        self._last_detection = 0.0

Reset the cooldown window (useful for testing).

def validate_audio_chunk(data: bytes | numpy.ndarray) -> numpy.ndarray:
def validate_audio_chunk(data: bytes | np.ndarray) -> np.ndarray:
    """Validate and normalize an audio chunk for use with WakeDetector.

    Accepts bytes (int16 little-endian PCM) or 1-D numpy arrays (int16,
    float32, float64) and returns a float32 copy. Sample values are not
    rescaled. Non-finite float samples (NaN/inf) are replaced with 0 and a
    warning is logged.

    Args:
        data: Audio chunk as bytes (int16 little-endian PCM) or numpy array.

    Returns:
        Validated float32 numpy array.

    Raises:
        TypeError: If data is not bytes or ndarray.
        ValueError: If data is empty, has an odd byte length, is not 1-D,
            has an unsupported dtype, or exceeds the maximum chunk size.
    """
    if isinstance(data, bytes):
        if len(data) == 0:
            raise ValueError("Audio chunk is empty (0 bytes)")
        if len(data) % 2 != 0:
            raise ValueError(
                f"Audio bytes length must be even (int16 = 2 bytes/sample), got {len(data)}"
            )
        n_samples = len(data) // 2
    elif isinstance(data, np.ndarray):
        if data.size == 0:
            raise ValueError("Audio chunk is empty (0 samples)")
        if data.ndim != 1:
            raise ValueError(
                f"Audio chunk must be 1-D, got {data.ndim}-D array with shape {data.shape}"
            )
        _ALLOWED_DTYPES = (np.int16, np.float32, np.float64)
        if data.dtype not in _ALLOWED_DTYPES:
            raise ValueError(
                f"Audio chunk dtype must be one of {[str(d) for d in _ALLOWED_DTYPES]}, "
                f"got {data.dtype}"
            )
        n_samples = data.size
    else:
        raise TypeError(f"Audio chunk must be bytes or numpy ndarray, got {type(data).__name__}")

    # Reject oversized input before allocating the float32 copy.
    if n_samples > _MAX_CHUNK_SAMPLES:
        raise ValueError(
            f"Audio chunk too large: {n_samples} samples "
            f"(max {_MAX_CHUNK_SAMPLES} = {_MAX_CHUNK_SAMPLES // SAMPLE_RATE}s at {SAMPLE_RATE}Hz)"
        )

    if isinstance(data, bytes):
        # "<i2" pins little-endian int16 so decoding does not depend on the
        # host byte order.
        return np.frombuffer(data, dtype="<i2").astype(np.float32)

    if np.issubdtype(data.dtype, np.floating) and not np.all(np.isfinite(data)):
        data = np.where(np.isfinite(data), data, 0.0)
        logger.warning("Audio chunk contained non-finite values (NaN/inf); replaced with 0")
    return data.astype(np.float32)

Validate and normalize an audio chunk for use with WakeDetector.

Accepts bytes (int16 PCM) or numpy arrays (int16, float32, float64). Returns a float32 numpy array suitable for processing.

Arguments:
  • data: Audio chunk as bytes (int16 little-endian PCM) or numpy array.
Returns:

Validated float32 numpy array.

Raises:
  • TypeError: If data is not bytes or ndarray.
  • ValueError: If data is empty, has an odd byte length, is not 1-D, has an unsupported dtype, or exceeds the maximum chunk size. Non-finite float samples do not raise; they are replaced with 0 and a warning is logged.
@dataclass(frozen=True)
class ConfidenceResult:
@dataclass(frozen=True)
class ConfidenceResult:
    """Immutable snapshot returned by ``get_confidence()``.

    Attributes:
        raw_score: The most recent MLP/CNN output score in [0.0, 1.0].
        confirm_count: How many consecutive above-threshold scores the
            current multi-window confirmation sequence has seen.
        confirm_required: How many consecutive scores a detection needs.
        confidence: Classified confidence level.
        score_history: Recent scores, oldest first.
    """

    # Latest model output.
    raw_score: float
    # Confirmation progress: hits so far / hits needed.
    confirm_count: int
    confirm_required: int
    # Classification of the current state.
    confidence: ConfidenceLevel
    # Recent scores, most recent last.
    score_history: tuple[float, ...]

Result from get_confidence() with full detection context.

Attributes:
  • raw_score: The most recent MLP/CNN output score in [0.0, 1.0].
  • confirm_count: Number of consecutive above-threshold scores in the current multi-window confirmation sequence.
  • confirm_required: Total consecutive scores required for detection.
  • confidence: Classified confidence level.
  • score_history: Recent score history (most recent last).
ConfidenceResult( raw_score: float, confirm_count: int, confirm_required: int, confidence: ConfidenceLevel, score_history: tuple[float, ...])
raw_score: float
confirm_count: int
confirm_required: int
confidence: ConfidenceLevel
score_history: tuple[float, ...]
class ConfidenceLevel(builtins.str, enum.Enum):
class ConfidenceLevel(str, Enum):
    """Confidence classes reported for wake word detection.

    Each member's value is its own name. Because the enum also subclasses
    ``str``, members compare equal to those plain strings.
    """

    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CERTAIN = "CERTAIN"

Confidence classification for wake word detection.

LOW = <ConfidenceLevel.LOW: 'LOW'>
MEDIUM = <ConfidenceLevel.MEDIUM: 'MEDIUM'>
HIGH = <ConfidenceLevel.HIGH: 'HIGH'>
CERTAIN = <ConfidenceLevel.CERTAIN: 'CERTAIN'>
class FusionStrategy(builtins.str, enum.Enum):
class FusionStrategy(str, Enum):
    """Names of the score fusion strategies for a multi-model ensemble.

    Values are lowercase strings. Because the enum also subclasses ``str``,
    members compare equal to those plain strings.
    """

    AVERAGE = "average"
    MAX = "max"
    VOTING = "voting"
    WEIGHTED_AVERAGE = "weighted_average"

Score fusion strategy for multi-model ensemble.

AVERAGE = <FusionStrategy.AVERAGE: 'average'>
MAX = <FusionStrategy.MAX: 'max'>
VOTING = <FusionStrategy.VOTING: 'voting'>
WEIGHTED_AVERAGE = <FusionStrategy.WEIGHTED_AVERAGE: 'weighted_average'>
class NoiseProfiler:
 47class NoiseProfiler:
 48    """Estimates ambient noise and adjusts detection threshold.
 49
 50    The profiler maintains a rolling window of RMS energy measurements.
 51    The noise floor is estimated as the 10th percentile of recent RMS values
 52    (capturing the quietest frames, which are likely ambient noise).
 53
 54    Threshold adjustment logic:
 55    - High SNR (signal clearly above noise): lower threshold slightly to
 56      improve sensitivity.
 57    - Low SNR (signal barely above noise): raise threshold to reduce
 58      false alarms from noise bursts.
 59    - The adjusted threshold is always clamped to
 60      ``[min_threshold, max_threshold]``.
 61
 62    Adaptive threshold bounds:
 63    - ``min_threshold`` defaults to ``0.60``. In very quiet environments, or
 64      when the current frame is far above the estimated noise floor, the
 65      threshold may be lowered to improve sensitivity, but it will never be
 66      reduced below this floor.
 67    - ``max_threshold`` defaults to ``0.95``. In very noisy environments, or
 68      when the current frame is close to the estimated noise floor, the
 69      threshold may be raised to suppress false accepts, but it will never be
 70      increased above this ceiling.
 71    - Before enough history is collected (fewer than 10 RMS frames), the
 72      profiler returns ``base_threshold`` with no adaptation.
 73
 74    Args:
 75        base_threshold: The default detection threshold (e.g. 0.80).
 76        noise_window_s: Seconds of audio history for noise estimation.
 77        min_threshold: Floor for adaptive threshold. Default 0.60.
 78        max_threshold: Ceiling for adaptive threshold. Default 0.95.
 79        snr_boost_db: SNR above this value enables threshold lowering.
 80        snr_penalty_db: SNR below this value enables threshold raising.
 81        frames_per_second: Expected audio frames per second (default 50 for 20ms).
 82    """
 83
 84    def __init__(
 85        self,
 86        base_threshold: float = 0.80,
 87        noise_window_s: float = DEFAULT_NOISE_WINDOW_S,
 88        min_threshold: float = DEFAULT_MIN_THRESHOLD,
 89        max_threshold: float = DEFAULT_MAX_THRESHOLD,
 90        snr_boost_db: float = DEFAULT_SNR_BOOST_DB,
 91        snr_penalty_db: float = DEFAULT_SNR_PENALTY_DB,
 92        frames_per_second: float = 50.0,
 93    ) -> None:
 94        self._base_threshold = base_threshold
 95        self._min_threshold = min_threshold
 96        self._max_threshold = max_threshold
 97        self._snr_boost_db = snr_boost_db
 98        self._snr_penalty_db = snr_penalty_db
 99
100        window_frames = max(1, int(noise_window_s * frames_per_second))
101        self._rms_history: deque[float] = deque(maxlen=window_frames)
102        self._current_rms: float = 0.0
103        self._noise_floor_rms: float = 0.0
104
105    @property
106    def base_threshold(self) -> float:
107        """The unadjusted detection threshold."""
108        return self._base_threshold
109
110    @property
111    def noise_floor(self) -> float:
112        """Current estimated noise floor RMS."""
113        return self._noise_floor_rms
114
115    def update(self, audio_frame: np.ndarray) -> float:
116        """Update noise estimate with a new audio frame and return adjusted threshold.
117
118        The audio should be float32 values. Both normalized [-1,1] and int16-range
119        float32 are accepted; the profiler works on relative ratios so absolute
120        scale doesn't matter.
121
122        Args:
123            audio_frame: 1-D float32 audio samples (any length).
124
125        Returns:
126            The adaptively adjusted detection threshold.
127        """
128        rms = float(np.sqrt(np.mean(audio_frame.astype(np.float64) ** 2)))
129        self._current_rms = rms
130        self._rms_history.append(rms)
131
132        # Estimate noise floor as 10th percentile of recent RMS values
133        if len(self._rms_history) >= 10:
134            sorted_rms = sorted(self._rms_history)
135            idx = max(0, int(len(sorted_rms) * 0.10))
136            self._noise_floor_rms = sorted_rms[idx]
137        elif self._rms_history:
138            self._noise_floor_rms = min(self._rms_history)
139        else:
140            self._noise_floor_rms = 0.0
141
142        return self._compute_adjusted_threshold()
143
144    def _compute_adjusted_threshold(self) -> float:
145        """Compute threshold adjustment based on current SNR estimate."""
146        snr_db = self._estimate_snr_db()
147
148        # No adjustment if we don't have enough data
149        if len(self._rms_history) < 10:
150            return self._base_threshold
151
152        if snr_db > self._snr_boost_db:
153            # High SNR: lower threshold proportionally (max 0.10 reduction)
154            excess = snr_db - self._snr_boost_db
155            reduction = min(0.10, excess * 0.01)
156            adjusted = self._base_threshold - reduction
157        elif snr_db < self._snr_penalty_db:
158            # Low SNR: raise threshold proportionally (max 0.10 increase)
159            deficit = self._snr_penalty_db - snr_db
160            increase = min(0.10, deficit * 0.02)
161            adjusted = self._base_threshold + increase
162        else:
163            adjusted = self._base_threshold
164
165        return max(self._min_threshold, min(self._max_threshold, adjusted))
166
167    def _estimate_snr_db(self) -> float:
168        """Estimate signal-to-noise ratio in decibels.
169
170        Uses the current frame RMS as signal and the noise floor as noise.
171        Returns 0.0 if noise floor is effectively zero.
172        """
173        if self._noise_floor_rms < 1e-10:
174            # No noise estimate yet — return a neutral value
175            return self._snr_boost_db  # neutral, no adjustment
176        if self._current_rms < 1e-10:
177            return 0.0
178
179        ratio = self._current_rms / self._noise_floor_rms
180        return 20.0 * math.log10(max(ratio, 1e-10))
181
182    def get_profile(self) -> NoiseProfile:
183        """Return a snapshot of the current noise state.
184
185        Returns:
186            NoiseProfile with noise floor, signal RMS, SNR, and adjusted threshold.
187        """
188        snr_db = self._estimate_snr_db()
189        adjusted = self._compute_adjusted_threshold()
190
191        return NoiseProfile(
192            noise_rms=self._noise_floor_rms,
193            signal_rms=self._current_rms,
194            snr_db=snr_db,
195            adjusted_threshold=adjusted,
196            base_threshold=self._base_threshold,
197        )
198
199    def reset(self) -> None:
200        """Clear noise history and reset estimates."""
201        self._rms_history.clear()
202        self._current_rms = 0.0
203        self._noise_floor_rms = 0.0

Estimates ambient noise and adjusts detection threshold.

The profiler maintains a rolling window of RMS energy measurements. The noise floor is estimated as the 10th percentile of recent RMS values (capturing the quietest frames, which are likely ambient noise).

Threshold adjustment logic:

  • High SNR (signal clearly above noise): lower threshold slightly to improve sensitivity.
  • Low SNR (signal barely above noise): raise threshold to reduce false alarms from noise bursts.
  • The adjusted threshold is always clamped to [min_threshold, max_threshold].

Adaptive threshold bounds:

  • min_threshold defaults to 0.60. In very quiet environments, or when the current frame is far above the estimated noise floor, the threshold may be lowered to improve sensitivity, but it will never be reduced below this floor.
  • max_threshold defaults to 0.95. In very noisy environments, or when the current frame is close to the estimated noise floor, the threshold may be raised to suppress false accepts, but it will never be increased above this ceiling.
  • Before enough history is collected (fewer than 10 RMS frames), the profiler returns base_threshold with no adaptation.
Arguments:
  • base_threshold: The default detection threshold (e.g. 0.80).
  • noise_window_s: Seconds of audio history for noise estimation.
  • min_threshold: Floor for adaptive threshold. Default 0.60.
  • max_threshold: Ceiling for adaptive threshold. Default 0.95.
  • snr_boost_db: SNR above this value enables threshold lowering.
  • snr_penalty_db: SNR below this value enables threshold raising.
  • frames_per_second: Expected audio frames per second (default 50 for 20ms).
NoiseProfiler( base_threshold: float = 0.8, noise_window_s: float = 5.0, min_threshold: float = 0.6, max_threshold: float = 0.95, snr_boost_db: float = 6.0, snr_penalty_db: float = 3.0, frames_per_second: float = 50.0)
 84    def __init__(
 85        self,
 86        base_threshold: float = 0.80,
 87        noise_window_s: float = DEFAULT_NOISE_WINDOW_S,
 88        min_threshold: float = DEFAULT_MIN_THRESHOLD,
 89        max_threshold: float = DEFAULT_MAX_THRESHOLD,
 90        snr_boost_db: float = DEFAULT_SNR_BOOST_DB,
 91        snr_penalty_db: float = DEFAULT_SNR_PENALTY_DB,
 92        frames_per_second: float = 50.0,
 93    ) -> None:
 94        self._base_threshold = base_threshold
 95        self._min_threshold = min_threshold
 96        self._max_threshold = max_threshold
 97        self._snr_boost_db = snr_boost_db
 98        self._snr_penalty_db = snr_penalty_db
 99
100        window_frames = max(1, int(noise_window_s * frames_per_second))
101        self._rms_history: deque[float] = deque(maxlen=window_frames)
102        self._current_rms: float = 0.0
103        self._noise_floor_rms: float = 0.0
base_threshold: float
105    @property
106    def base_threshold(self) -> float:
107        """The unadjusted detection threshold."""
108        return self._base_threshold

The unadjusted detection threshold.

noise_floor: float
110    @property
111    def noise_floor(self) -> float:
112        """Current estimated noise floor RMS."""
113        return self._noise_floor_rms

Current estimated noise floor RMS.

def update(self, audio_frame: numpy.ndarray) -> float:
115    def update(self, audio_frame: np.ndarray) -> float:
116        """Update noise estimate with a new audio frame and return adjusted threshold.
117
118        The audio should be float32 values. Both normalized [-1,1] and int16-range
119        float32 are accepted; the profiler works on relative ratios so absolute
120        scale doesn't matter.
121
122        Args:
123            audio_frame: 1-D float32 audio samples (any length).
124
125        Returns:
126            The adaptively adjusted detection threshold.
127        """
128        rms = float(np.sqrt(np.mean(audio_frame.astype(np.float64) ** 2)))
129        self._current_rms = rms
130        self._rms_history.append(rms)
131
132        # Estimate noise floor as 10th percentile of recent RMS values
133        if len(self._rms_history) >= 10:
134            sorted_rms = sorted(self._rms_history)
135            idx = max(0, int(len(sorted_rms) * 0.10))
136            self._noise_floor_rms = sorted_rms[idx]
137        elif self._rms_history:
138            self._noise_floor_rms = min(self._rms_history)
139        else:
140            self._noise_floor_rms = 0.0
141
142        return self._compute_adjusted_threshold()

Update noise estimate with a new audio frame and return adjusted threshold.

The audio should be float32 values. Both normalized [-1,1] and int16-range float32 are accepted; the profiler works on relative ratios so absolute scale doesn't matter.

Arguments:
  • audio_frame: 1-D float32 audio samples (any length).
Returns:

The adaptively adjusted detection threshold.

def get_profile(self) -> violawake_sdk.noise_profiler.NoiseProfile:
182    def get_profile(self) -> NoiseProfile:
183        """Return a snapshot of the current noise state.
184
185        Returns:
186            NoiseProfile with noise floor, signal RMS, SNR, and adjusted threshold.
187        """
188        snr_db = self._estimate_snr_db()
189        adjusted = self._compute_adjusted_threshold()
190
191        return NoiseProfile(
192            noise_rms=self._noise_floor_rms,
193            signal_rms=self._current_rms,
194            snr_db=snr_db,
195            adjusted_threshold=adjusted,
196            base_threshold=self._base_threshold,
197        )

Return a snapshot of the current noise state.

Returns:

NoiseProfile with noise floor, signal RMS, SNR, and adjusted threshold.

def reset(self) -> None:
199    def reset(self) -> None:
200        """Clear noise history and reset estimates."""
201        self._rms_history.clear()
202        self._current_rms = 0.0
203        self._noise_floor_rms = 0.0

Clear noise history and reset estimates.

class PowerManager:
 99class PowerManager:
100    """Energy-aware inference controller.
101
102    Reduces inference frequency based on battery level, silence detection,
103    and explicit duty cycling configuration.
104
105    Modes of power saving:
106    1. **Duty cycling**: Process every Nth frame when idle (no recent detections).
107       When a score above ``activity_threshold`` is detected, switches to
108       full-rate processing for ``active_window_s`` seconds.
109    2. **Silence skipping**: Skip inference when audio RMS is below
110       ``silence_rms`` (no speech possible).
111    3. **Battery-aware**: When on battery and below ``battery_low_pct``,
112       increase the duty cycle factor by ``battery_multiplier``.
113
114    Args:
115        duty_cycle_n: Base duty cycle (process every Nth frame). Default 1 (no skipping).
116        silence_rms: RMS threshold in int16 scale (typical range 0-32768) below which
117            frames are skipped. Default 10.0 filters near-silence.
118        activity_threshold: Score above which the system enters "active" mode. Default 0.3.
119        active_window_s: Seconds to stay in full-rate mode after activity. Default 3.0.
120        battery_low_pct: Battery percent below which power saving kicks in. Default 20.
121        battery_multiplier: Multiply duty_cycle_n by this when on low battery. Default 3.
122        check_battery_interval_s: How often to re-check battery. Default 60.
123    """
124
125    def __init__(
126        self,
127        duty_cycle_n: int = 1,
128        silence_rms: float = 10.0,
129        activity_threshold: float = 0.3,
130        active_window_s: float = 3.0,
131        battery_low_pct: int = 20,
132        battery_multiplier: int = 3,
133        check_battery_interval_s: float = 60.0,
134    ) -> None:
135        if duty_cycle_n < 1:
136            raise ValueError(f"duty_cycle_n must be >= 1, got {duty_cycle_n}")
137
138        self._base_duty = duty_cycle_n
139        self._silence_rms = silence_rms
140        self._activity_threshold = activity_threshold
141        self._active_window_s = active_window_s
142        self._battery_low_pct = battery_low_pct
143        self._battery_multiplier = battery_multiplier
144        self._check_interval = check_battery_interval_s
145
146        # Lock protects all mutable state below
147        self._lock = threading.Lock()
148
149        # State
150        self._frame_counter = 0
151        self._frames_processed = 0
152        self._frames_skipped = 0
153        self._silence_skipped = 0
154        self._last_activity_time = 0.0
155        self._is_active = False
156
157        # Battery state (cached)
158        self._battery_pct = -1
159        self._is_on_battery = False
160        self._last_battery_check = 0.0
161
162    @property
163    def effective_duty_cycle(self) -> int:
164        """Current effective duty cycle considering battery and activity state."""
165        with self._lock:
166            return self._effective_duty_cycle_unlocked()
167
168    def _effective_duty_cycle_unlocked(self) -> int:
169        """Compute duty cycle without acquiring the lock (caller must hold it)."""
170        if self._is_active:
171            return 1  # Full rate when active
172
173        base = self._base_duty
174
175        # Battery scaling
176        if self._is_on_battery and 0 <= self._battery_pct < self._battery_low_pct:
177            base = base * self._battery_multiplier
178
179        return max(1, base)
180
181    def should_process(self, audio_frame: np.ndarray) -> bool:
182        """Decide whether this frame should be processed or skipped.
183
184        Call this before running inference. If it returns False, skip the
185        frame to save CPU/power.
186
187        Args:
188            audio_frame: 1-D audio samples (int16-range float32 or actual int16).
189
190        Returns:
191            True if inference should run on this frame.
192        """
193        # Compute RMS outside the lock (pure computation on immutable input)
194        rms = float(np.sqrt(np.mean(audio_frame.astype(np.float32) ** 2)))
195
196        with self._lock:
197            self._frame_counter += 1
198
199            # Periodically check battery
200            now = time.monotonic()
201            if now - self._last_battery_check > self._check_interval:
202                self._battery_pct, self._is_on_battery = _get_battery_info()
203                self._last_battery_check = now
204
205            # Check if active window has expired
206            if self._is_active and (now - self._last_activity_time > self._active_window_s):
207                self._is_active = False
208
209            # Silence gate: skip if audio is very quiet
210            if rms < self._silence_rms:
211                self._frames_skipped += 1
212                self._silence_skipped += 1
213                return False
214
215            # Duty cycling: process every Nth frame
216            duty = self._effective_duty_cycle_unlocked()
217            if duty > 1 and (self._frame_counter % duty) != 0:
218                self._frames_skipped += 1
219                return False
220
221            self._frames_processed += 1
222            return True
223
224    def report_score(self, score: float) -> None:
225        """Report a detection score to the power manager.
226
227        If the score is above the activity threshold, the manager switches
228        to full-rate processing mode for ``active_window_s`` seconds.
229
230        Args:
231            score: Detection score from the model.
232        """
233        if score >= self._activity_threshold:
234            with self._lock:
235                self._is_active = True
236                self._last_activity_time = time.monotonic()
237
238    def get_state(self) -> PowerState:
239        """Return current power management state snapshot."""
240        with self._lock:
241            total = self._frames_processed + self._frames_skipped
242            # Returns 0.0 if no frames received (no data yet to measure).
243            rate = self._frames_processed / total if total > 0 else 0.0
244
245            return PowerState(
246                battery_percent=self._battery_pct,
247                is_on_battery=self._is_on_battery,
248                duty_cycle_n=self._effective_duty_cycle_unlocked(),
249                frames_processed=self._frames_processed,
250                frames_skipped=self._frames_skipped,
251                silence_skipped=self._silence_skipped,
252                effective_rate=rate,
253            )
254
255    def reset(self) -> None:
256        """Reset all counters and state."""
257        with self._lock:
258            self._frame_counter = 0
259            self._frames_processed = 0
260            self._frames_skipped = 0
261            self._silence_skipped = 0
262            self._last_activity_time = 0.0
263            self._is_active = False

Energy-aware inference controller.

Reduces inference frequency based on battery level, silence detection, and explicit duty cycling configuration.

Modes of power saving:

  1. Duty cycling: Process every Nth frame when idle (no recent detections). When a score at or above activity_threshold is reported, switches to full-rate processing for active_window_s seconds.
  2. Silence skipping: Skip inference when audio RMS is below silence_rms (no speech possible).
  3. Battery-aware: When on battery and below battery_low_pct, increase the duty cycle factor by battery_multiplier.
Arguments:
  • duty_cycle_n: Base duty cycle (process every Nth frame). Default 1 (no skipping).
  • silence_rms: RMS threshold in int16 scale (typical range 0-32768) below which frames are skipped. Default 10.0 filters near-silence.
  • activity_threshold: Score above which the system enters "active" mode. Default 0.3.
  • active_window_s: Seconds to stay in full-rate mode after activity. Default 3.0.
  • battery_low_pct: Battery percent below which power saving kicks in. Default 20.
  • battery_multiplier: Multiply duty_cycle_n by this when on low battery. Default 3.
  • check_battery_interval_s: How often to re-check battery. Default 60.
PowerManager( duty_cycle_n: int = 1, silence_rms: float = 10.0, activity_threshold: float = 0.3, active_window_s: float = 3.0, battery_low_pct: int = 20, battery_multiplier: int = 3, check_battery_interval_s: float = 60.0)
125    def __init__(
126        self,
127        duty_cycle_n: int = 1,
128        silence_rms: float = 10.0,
129        activity_threshold: float = 0.3,
130        active_window_s: float = 3.0,
131        battery_low_pct: int = 20,
132        battery_multiplier: int = 3,
133        check_battery_interval_s: float = 60.0,
134    ) -> None:
135        if duty_cycle_n < 1:
136            raise ValueError(f"duty_cycle_n must be >= 1, got {duty_cycle_n}")
137
138        self._base_duty = duty_cycle_n
139        self._silence_rms = silence_rms
140        self._activity_threshold = activity_threshold
141        self._active_window_s = active_window_s
142        self._battery_low_pct = battery_low_pct
143        self._battery_multiplier = battery_multiplier
144        self._check_interval = check_battery_interval_s
145
146        # Lock protects all mutable state below
147        self._lock = threading.Lock()
148
149        # State
150        self._frame_counter = 0
151        self._frames_processed = 0
152        self._frames_skipped = 0
153        self._silence_skipped = 0
154        self._last_activity_time = 0.0
155        self._is_active = False
156
157        # Battery state (cached)
158        self._battery_pct = -1
159        self._is_on_battery = False
160        self._last_battery_check = 0.0
effective_duty_cycle: int
162    @property
163    def effective_duty_cycle(self) -> int:
164        """Current effective duty cycle considering battery and activity state."""
165        with self._lock:
166            return self._effective_duty_cycle_unlocked()

Current effective duty cycle considering battery and activity state.

def should_process(self, audio_frame: numpy.ndarray) -> bool:
181    def should_process(self, audio_frame: np.ndarray) -> bool:
182        """Decide whether this frame should be processed or skipped.
183
184        Call this before running inference. If it returns False, skip the
185        frame to save CPU/power.
186
187        Args:
188            audio_frame: 1-D audio samples (int16-range float32 or actual int16).
189
190        Returns:
191            True if inference should run on this frame.
192        """
193        # Compute RMS outside the lock (pure computation on immutable input)
194        rms = float(np.sqrt(np.mean(audio_frame.astype(np.float32) ** 2)))
195
196        with self._lock:
197            self._frame_counter += 1
198
199            # Periodically check battery
200            now = time.monotonic()
201            if now - self._last_battery_check > self._check_interval:
202                self._battery_pct, self._is_on_battery = _get_battery_info()
203                self._last_battery_check = now
204
205            # Check if active window has expired
206            if self._is_active and (now - self._last_activity_time > self._active_window_s):
207                self._is_active = False
208
209            # Silence gate: skip if audio is very quiet
210            if rms < self._silence_rms:
211                self._frames_skipped += 1
212                self._silence_skipped += 1
213                return False
214
215            # Duty cycling: process every Nth frame
216            duty = self._effective_duty_cycle_unlocked()
217            if duty > 1 and (self._frame_counter % duty) != 0:
218                self._frames_skipped += 1
219                return False
220
221            self._frames_processed += 1
222            return True

Decide whether this frame should be processed or skipped.

Call this before running inference. If it returns False, skip the frame to save CPU/power.

Arguments:
  • audio_frame: 1-D audio samples (int16-range float32 or actual int16).
Returns:

True if inference should run on this frame.

def report_score(self, score: float) -> None:
224    def report_score(self, score: float) -> None:
225        """Report a detection score to the power manager.
226
227        If the score is above the activity threshold, the manager switches
228        to full-rate processing mode for ``active_window_s`` seconds.
229
230        Args:
231            score: Detection score from the model.
232        """
233        if score >= self._activity_threshold:
234            with self._lock:
235                self._is_active = True
236                self._last_activity_time = time.monotonic()

Report a detection score to the power manager.

If the score is at or above the activity threshold, the manager switches to full-rate processing mode for active_window_s seconds.

Arguments:
  • score: Detection score from the model.
def get_state(self) -> violawake_sdk.power_manager.PowerState:
238    def get_state(self) -> PowerState:
239        """Return current power management state snapshot."""
240        with self._lock:
241            total = self._frames_processed + self._frames_skipped
242            # Returns 0.0 if no frames received (no data yet to measure).
243            rate = self._frames_processed / total if total > 0 else 0.0
244
245            return PowerState(
246                battery_percent=self._battery_pct,
247                is_on_battery=self._is_on_battery,
248                duty_cycle_n=self._effective_duty_cycle_unlocked(),
249                frames_processed=self._frames_processed,
250                frames_skipped=self._frames_skipped,
251                silence_skipped=self._silence_skipped,
252                effective_rate=rate,
253            )

Return current power management state snapshot.

def reset(self) -> None:
255    def reset(self) -> None:
256        """Reset all counters and state."""
257        with self._lock:
258            self._frame_counter = 0
259            self._frames_processed = 0
260            self._frames_skipped = 0
261            self._silence_skipped = 0
262            self._last_activity_time = 0.0
263            self._is_active = False

Reset all counters and state.

class VADEngine:
class VADEngine:
    """Voice Activity Detection engine.

    Auto-selects the best available backend unless explicitly specified.

    Example::

        vad = VADEngine(backend="webrtc")  # or "silero", "rms", "auto"
        prob = vad.process_frame(audio_20ms_bytes)
        is_speech = prob > 0.5

    The engine is also a context manager; leaving the ``with`` block calls
    :meth:`close`. Once closed, :meth:`process_frame` (and therefore
    :meth:`is_speech`) raises ``RuntimeError``, while :meth:`reset` and a
    repeated :meth:`close` do nothing.
    """

    def __init__(
        self,
        backend: str | VADBackend = VADBackend.AUTO,
        **backend_kwargs: object,
    ) -> None:
        """Initialize the VAD engine.

        Args:
            backend: One of "auto", "webrtc", "silero", "rms".
                     "auto" selects the best available backend.
            **backend_kwargs: Backend-specific arguments.
                For "webrtc": aggressiveness (0–3, default 2)
                For "rms": speech_threshold, silence_threshold

        Raises:
            ValueError: If ``backend`` is a string that is not a valid
                ``VADBackend`` value.
        """
        if isinstance(backend, str):
            backend = VADBackend(backend)

        # _create_backend reports which backend was actually chosen.
        self._backend_name, self._backend = _create_backend(backend, **backend_kwargs)

    @property
    def backend_name(self) -> str:
        """Name of the active backend."""
        return self._backend_name.value

    def process_frame(self, audio: bytes | np.ndarray) -> float:
        """Process a 20ms audio frame.

        Args:
            audio: 320 samples of 16kHz mono audio. Accepted formats:
                - bytes/bytearray: int16 PCM (640 bytes for 20ms)
                - np.ndarray float32/float64: assumed normalized to [-1.0, 1.0],
                  scaled by 32768 to int16. Use int16 dtype for int16-range data.
                - np.ndarray int16: converted to bytes directly

        Returns:
            Speech probability in [0.0, 1.0].
            1.0 = definitely speech, 0.0 = definitely silence.

        Raises:
            RuntimeError: If the engine has been closed.
        """
        # close() clears the backend; fail with a clear message instead of an
        # AttributeError on None.
        if self._backend is None:
            raise RuntimeError("VADEngine is closed; create a new engine to process audio")
        audio_bytes = _coerce_to_bytes(audio)
        return self._backend.process_frame(audio_bytes)

    def is_speech(self, audio: bytes | np.ndarray, threshold: float = 0.5) -> bool:
        """Convenience method: True if speech probability is at least ``threshold``."""
        return self.process_frame(audio) >= threshold

    def reset(self) -> None:
        """Reset internal state (useful between utterances).

        Does nothing once the engine has been closed.
        """
        if self._backend is not None:
            self._backend.reset()

    def close(self) -> None:
        """Release backend resources by dropping the backend reference.

        Safe to call more than once.
        """
        self._backend = None  # type: ignore[assignment]

    def __enter__(self) -> VADEngine:
        """Enter sync context manager. Returns self."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager. Releases backend resources."""
        self.close()

Voice Activity Detection engine.

Auto-selects the best available backend unless explicitly specified.

Example::

vad = VADEngine(backend="webrtc")  # or "silero", "rms", "auto"
prob = vad.process_frame(audio_20ms_bytes)
is_speech = prob > 0.5
VADEngine( backend: str | violawake_sdk.vad.VADBackend = <VADBackend.AUTO: 'auto'>, **backend_kwargs: object)
315    def __init__(
316        self,
317        backend: str | VADBackend = VADBackend.AUTO,
318        **backend_kwargs: object,
319    ) -> None:
320        """Initialize the VAD engine.
321
322        Args:
323            backend: One of "auto", "webrtc", "silero", "rms".
324                     "auto" selects the best available backend.
325            **backend_kwargs: Backend-specific arguments.
326                For "webrtc": aggressiveness (0–3, default 2)
327                For "rms": speech_threshold, silence_threshold
328        """
329        if isinstance(backend, str):
330            backend = VADBackend(backend)
331
332        self._backend_name, self._backend = _create_backend(backend, **backend_kwargs)

Initialize the VAD engine.

Arguments:
  • backend: One of "auto", "webrtc", "silero", "rms". "auto" selects the best available backend.
  • **backend_kwargs: Backend-specific arguments. For "webrtc": aggressiveness (0–3, default 2) For "rms": speech_threshold, silence_threshold
backend_name: str
334    @property
335    def backend_name(self) -> str:
336        """Name of the active backend."""
337        return self._backend_name.value

Name of the active backend.

def process_frame(self, audio: bytes | numpy.ndarray) -> float:
339    def process_frame(self, audio: bytes | np.ndarray) -> float:
340        """Process a 20ms audio frame.
341
342        Args:
343            audio: 320 samples of 16kHz mono audio. Accepted formats:
344                - bytes/bytearray: int16 PCM (640 bytes for 20ms)
345                - np.ndarray float32/float64: assumed normalized to [-1.0, 1.0],
346                  scaled by 32768 to int16. Use int16 dtype for int16-range data.
347                - np.ndarray int16: converted to bytes directly
348
349        Returns:
350            Speech probability in [0.0, 1.0].
351            1.0 = definitely speech, 0.0 = definitely silence.
352        """
353        audio_bytes = _coerce_to_bytes(audio)
354        return self._backend.process_frame(audio_bytes)

Process a 20ms audio frame.

Arguments:
  • audio: 320 samples of 16kHz mono audio. Accepted formats:
    • bytes/bytearray: int16 PCM (640 bytes for 20ms)
    • np.ndarray float32/float64: assumed normalized to [-1.0, 1.0], scaled by 32768 to int16. Use int16 dtype for int16-range data.
    • np.ndarray int16: converted to bytes directly
Returns:

Speech probability in [0.0, 1.0]. 1.0 = definitely speech, 0.0 = definitely silence.

def is_speech(self, audio: bytes | numpy.ndarray, threshold: float = 0.5) -> bool:
    def is_speech(self, audio: bytes | np.ndarray, threshold: float = 0.5) -> bool:
        """Convenience method: returns True if speech probability is at or above threshold.

        Args:
            audio: One audio frame, passed unchanged to ``process_frame``.
            threshold: Minimum probability that counts as speech. The
                comparison is inclusive (``>=``).

        Returns:
            True when ``process_frame(audio) >= threshold``.
        """
        return self.process_frame(audio) >= threshold

Convenience method: returns True if speech probability is at or above threshold.

def reset(self) -> None:
360    def reset(self) -> None:
361        """Reset internal state (useful between utterances)."""
362        self._backend.reset()

Reset internal state (useful between utterances).

def close(self) -> None:
    def close(self) -> None:
        """Release backend resources by dropping the engine's backend reference."""
        # No backend-side cleanup call is made here; the reference is only cleared.
        self._backend = None  # type: ignore[assignment]

Release backend resources.

class TTSEngine:
 56class TTSEngine:
 57    """On-device TTS using Kokoro-82M (Apache 2.0 model).
 58
 59    Thread-safe: multiple threads can call ``synthesize()`` concurrently.
 60    Calls are serialized via ``_synthesis_lock`` since kokoro-onnx is not
 61    guaranteed to be thread-safe. Model initialization is separately guarded
 62    by ``_lock`` (lazy load on first use).
 63
 64    Model files required (auto-downloaded on first use):
 65        - ``kokoro_v1_0.onnx`` — Kokoro-82M model (~326MB)
 66        - ``kokoro_voices_v1_0.bin`` — Voice embeddings (~28MB)
 67
 68    Example::
 69
 70        tts = TTSEngine(voice="af_heart")
 71        audio = tts.synthesize("Hello, world!")  # returns np.ndarray
 72        tts.play(audio)  # blocking by default
 73        tts.play_async(audio)  # optional non-blocking playback
 74    """
 75
 76    def __init__(
 77        self,
 78        voice: str = DEFAULT_VOICE,
 79        speed: float = 1.0,
 80        sample_rate: int = TARGET_SAMPLE_RATE,
 81    ) -> None:
 82        """Initialize the TTS engine.
 83
 84        Args:
 85            voice: Kokoro voice name. Default "af_heart".
 86                   See ``AVAILABLE_VOICES`` for full list.
 87            speed: Speech speed multiplier. 1.0 = normal, 1.2 = 20% faster.
 88            sample_rate: Output sample rate. Default 16kHz (pipeline standard).
 89                         Kokoro outputs 24kHz; resampled if different.
 90        """
 91        if voice not in AVAILABLE_VOICES:
 92            raise ValueError(f"Unknown voice '{voice}'. Available: {', '.join(AVAILABLE_VOICES)}")
 93
 94        if not (0.1 <= speed <= 3.0):
 95            raise ValueError(f"Speed must be between 0.1 and 3.0, got {speed}")
 96
 97        self.voice = voice
 98        self.speed = speed
 99        self.sample_rate = sample_rate
100        self._lock = threading.Lock()
101        self._synthesis_lock = threading.Lock()
102        self._kokoro: object | None = None
103
104        # Lazy initialization — load model on first use
105        logger.info("TTSEngine created: voice=%s, speed=%.1f", voice, speed)
106
107    def _get_kokoro(self) -> object:
108        """Lazy-load the Kokoro model (thread-safe)."""
109        with self._lock:
110            if self._kokoro is None:
111                self._kokoro = self._load_kokoro()
112        return self._kokoro
113
114    def _load_kokoro(self) -> object:
115        """Load the Kokoro ONNX model."""
116        try:
117            import kokoro_onnx
118        except ImportError as e:
119            raise ImportError(
120                "kokoro-onnx is not installed. Install with: pip install 'violawake[tts]'"
121            ) from e
122
123        try:
124            model_path = get_model_path("kokoro_v1_0")
125            voices_path = get_model_path("kokoro_voices_v1_0")
126        except FileNotFoundError as e:
127            raise ModelNotFoundError(
128                "Kokoro models not found. Run:\n"
129                "  violawake-download --model kokoro_v1_0\n"
130                "  violawake-download --model kokoro_voices_v1_0"
131            ) from e
132
133        try:
134            kokoro = kokoro_onnx.Kokoro(str(model_path), str(voices_path))
135        except Exception as e:
136            raise ModelLoadError(f"Failed to load Kokoro model: {e}") from e
137
138        logger.info("Kokoro-82M loaded: %s", model_path)
139        return kokoro
140
141    def synthesize(self, text: str) -> np.ndarray:
142        """Synthesize text to audio.
143
144        Args:
145            text: Text to synthesize. May be multi-sentence.
146                  Long text is processed as a single batch call.
147
148        Returns:
149            Audio samples as float32 numpy array at ``self.sample_rate``.
150        """
151        if not text.strip():
152            return np.zeros(0, dtype=np.float32)
153
154        kokoro = self._get_kokoro()
155
156        # Hold synthesis lock to serialize access to the kokoro model,
157        # which is not guaranteed to be thread-safe by kokoro-onnx.
158        with self._synthesis_lock:
159            try:
160                # kokoro-onnx API: returns (samples, sample_rate)
161                audio, sr = kokoro.create(  # type: ignore[attr-defined]
162                    text,
163                    voice=self.voice,
164                    speed=self.speed,
165                    lang="en-us",
166                )
167            except Exception as e:
168                logger.exception("TTS synthesis failed for text: %.50s...", text)
169                raise RuntimeError(f"TTS synthesis failed: {e}") from e
170
171        audio = np.asarray(audio, dtype=np.float32)
172
173        # Resample if needed
174        if sr != self.sample_rate:
175            audio = self._resample(audio, sr, self.sample_rate)
176
177        return audio
178
179    def synthesize_chunked(self, text: str) -> Generator[np.ndarray, None, None]:
180        """Synthesize text sentence-by-sentence for lower latency.
181
182        Splits text at sentence boundaries and yields audio for each sentence
183        as soon as it's synthesized. This allows playback to begin before
184        the full text is processed — matching the pattern from production Viola.
185
186        Args:
187            text: Text to synthesize. May be multi-sentence.
188
189        Yields:
190            Audio chunks (one per sentence) as float32 numpy arrays.
191        """
192        sentences = self._split_sentences(text)
193        for sentence in sentences:
194            if sentence.strip():
195                audio = self.synthesize(sentence)
196                if audio.size > 0:
197                    yield audio
198
199    def play(self, audio: np.ndarray, *, blocking: bool = True) -> None:
200        """Play audio through the default output device.
201
202        Args:
203            audio: Float32 numpy array of audio samples.
204            blocking: If True, wait for playback to finish. If False, return
205                      immediately after starting playback.
206        """
207        try:
208            import sounddevice as sd
209        except ImportError as sd_err:
210            logger.debug("sounddevice not available (%s), falling back to pyaudio", sd_err)
211            try:
212                self._play_pyaudio(audio, blocking=blocking)
213            except ImportError as e:
214                raise ImportError(
215                    "No audio playback backend is installed. "
216                    "Install sounddevice with: pip install sounddevice "
217                    "or install violawake[audio] for PyAudio playback."
218                ) from e
219            return
220
221        # Copy to prevent mutation of caller's array during async playback
222        sd.play(audio.copy(), samplerate=self.sample_rate, blocking=blocking)
223
224    def play_async(self, audio: np.ndarray) -> None:
225        """Play audio without blocking the calling thread."""
226        self.play(audio, blocking=False)
227
228    def _play_pyaudio(self, audio: np.ndarray, *, blocking: bool = True) -> None:
229        """Play audio using pyaudio as fallback."""
230        try:
231            import pyaudio
232        except ImportError:
233            raise ImportError(
234                "pyaudio is required for audio playback. Install with: pip install violawake[audio]"
235            ) from None
236
237        if not blocking:
238            thread = threading.Thread(
239                target=self._play_pyaudio,
240                args=(audio.copy(),),
241                kwargs={"blocking": True},
242                daemon=True,
243            )
244            thread.start()
245            return
246
247        clipped = np.clip(audio, -1.0, 1.0)
248        pcm = (clipped * 32767).astype(np.int16)
249        pa = pyaudio.PyAudio()
250        stream = pa.open(
251            format=pyaudio.paInt16,
252            channels=1,
253            rate=self.sample_rate,
254            output=True,
255        )
256        try:
257            stream.write(pcm.tobytes())
258        finally:
259            stream.stop_stream()
260            stream.close()
261            pa.terminate()
262
263    @staticmethod
264    def _resample(audio: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
265        """Resample audio using scipy."""
266        import math
267
268        try:
269            from scipy.signal import resample_poly
270        except ImportError as e:
271            raise ImportError(
272                "scipy is required for audio resampling. Install with: pip install scipy"
273            ) from e
274        gcd = math.gcd(src_rate, dst_rate)
275        return resample_poly(audio, dst_rate // gcd, src_rate // gcd).astype(np.float32)
276
277    @staticmethod
278    def _split_sentences(text: str) -> list[str]:
279        """Split text at sentence boundaries for chunked synthesis.
280
281        Uses a regex that splits on sentence-ending punctuation followed by
282        whitespace and an uppercase letter (or end of string). This avoids
283        false splits on abbreviations ("Dr. Smith"), decimals ("3.14"),
284        and URLs.
285        """
286        import re
287
288        # Split on sentence-ending punctuation followed by space+uppercase or end of string
289        pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])\s*$"
290        parts = re.split(pattern, text)
291        return [s.strip() for s in parts if s and s.strip()]
292
293    def close(self) -> None:
294        """Release model resources."""
295        self._kokoro = None
296
297    def __enter__(self) -> TTSEngine:
298        """Enter sync context manager. Returns self."""
299        return self
300
301    def __exit__(
302        self,
303        exc_type: type[BaseException] | None,
304        exc_val: BaseException | None,
305        exc_tb: object,
306    ) -> None:
307        """Exit sync context manager. Releases model resources."""
308        self.close()

On-device TTS using Kokoro-82M (Apache 2.0 model).

Thread-safe: multiple threads can call synthesize() concurrently. Calls are serialized via _synthesis_lock since kokoro-onnx is not guaranteed to be thread-safe. Model initialization is separately guarded by _lock (lazy load on first use).

Model files required (auto-downloaded on first use):
  • kokoro_v1_0.onnx — Kokoro-82M model (~326MB)
  • kokoro_voices_v1_0.bin — Voice embeddings (~28MB)

Example::

tts = TTSEngine(voice="af_heart")
audio = tts.synthesize("Hello, world!")  # returns np.ndarray
tts.play(audio)  # blocking by default
tts.play_async(audio)  # optional non-blocking playback
TTSEngine( voice: str = 'af_heart', speed: float = 1.0, sample_rate: int = 16000)
 76    def __init__(
 77        self,
 78        voice: str = DEFAULT_VOICE,
 79        speed: float = 1.0,
 80        sample_rate: int = TARGET_SAMPLE_RATE,
 81    ) -> None:
 82        """Initialize the TTS engine.
 83
 84        Args:
 85            voice: Kokoro voice name. Default "af_heart".
 86                   See ``AVAILABLE_VOICES`` for full list.
 87            speed: Speech speed multiplier. 1.0 = normal, 1.2 = 20% faster.
 88            sample_rate: Output sample rate. Default 16kHz (pipeline standard).
 89                         Kokoro outputs 24kHz; resampled if different.
 90        """
 91        if voice not in AVAILABLE_VOICES:
 92            raise ValueError(f"Unknown voice '{voice}'. Available: {', '.join(AVAILABLE_VOICES)}")
 93
 94        if not (0.1 <= speed <= 3.0):
 95            raise ValueError(f"Speed must be between 0.1 and 3.0, got {speed}")
 96
 97        self.voice = voice
 98        self.speed = speed
 99        self.sample_rate = sample_rate
100        self._lock = threading.Lock()
101        self._synthesis_lock = threading.Lock()
102        self._kokoro: object | None = None
103
104        # Lazy initialization — load model on first use
105        logger.info("TTSEngine created: voice=%s, speed=%.1f", voice, speed)

Initialize the TTS engine.

Arguments:
  • voice: Kokoro voice name. Default "af_heart". See AVAILABLE_VOICES for full list.
  • speed: Speech speed multiplier. 1.0 = normal, 1.2 = 20% faster.
  • sample_rate: Output sample rate. Default 16kHz (pipeline standard). Kokoro outputs 24kHz; resampled if different.
voice
speed
sample_rate
def synthesize(self, text: str) -> numpy.ndarray:
141    def synthesize(self, text: str) -> np.ndarray:
142        """Synthesize text to audio.
143
144        Args:
145            text: Text to synthesize. May be multi-sentence.
146                  Long text is processed as a single batch call.
147
148        Returns:
149            Audio samples as float32 numpy array at ``self.sample_rate``.
150        """
151        if not text.strip():
152            return np.zeros(0, dtype=np.float32)
153
154        kokoro = self._get_kokoro()
155
156        # Hold synthesis lock to serialize access to the kokoro model,
157        # which is not guaranteed to be thread-safe by kokoro-onnx.
158        with self._synthesis_lock:
159            try:
160                # kokoro-onnx API: returns (samples, sample_rate)
161                audio, sr = kokoro.create(  # type: ignore[attr-defined]
162                    text,
163                    voice=self.voice,
164                    speed=self.speed,
165                    lang="en-us",
166                )
167            except Exception as e:
168                logger.exception("TTS synthesis failed for text: %.50s...", text)
169                raise RuntimeError(f"TTS synthesis failed: {e}") from e
170
171        audio = np.asarray(audio, dtype=np.float32)
172
173        # Resample if needed
174        if sr != self.sample_rate:
175            audio = self._resample(audio, sr, self.sample_rate)
176
177        return audio

Synthesize text to audio.

Arguments:
  • text: Text to synthesize. May be multi-sentence. Long text is processed as a single batch call.
Returns:

Audio samples as float32 numpy array at self.sample_rate.

def synthesize_chunked(self, text: str) -> Generator[numpy.ndarray, None, None]:
179    def synthesize_chunked(self, text: str) -> Generator[np.ndarray, None, None]:
180        """Synthesize text sentence-by-sentence for lower latency.
181
182        Splits text at sentence boundaries and yields audio for each sentence
183        as soon as it's synthesized. This allows playback to begin before
184        the full text is processed — matching the pattern from production Viola.
185
186        Args:
187            text: Text to synthesize. May be multi-sentence.
188
189        Yields:
190            Audio chunks (one per sentence) as float32 numpy arrays.
191        """
192        sentences = self._split_sentences(text)
193        for sentence in sentences:
194            if sentence.strip():
195                audio = self.synthesize(sentence)
196                if audio.size > 0:
197                    yield audio

Synthesize text sentence-by-sentence for lower latency.

Splits text at sentence boundaries and yields audio for each sentence as soon as it's synthesized. This allows playback to begin before the full text is processed — matching the pattern from production Viola.

Arguments:
  • text: Text to synthesize. May be multi-sentence.
Yields:

Audio chunks (one per sentence) as float32 numpy arrays.

def play(self, audio: numpy.ndarray, *, blocking: bool = True) -> None:
199    def play(self, audio: np.ndarray, *, blocking: bool = True) -> None:
200        """Play audio through the default output device.
201
202        Args:
203            audio: Float32 numpy array of audio samples.
204            blocking: If True, wait for playback to finish. If False, return
205                      immediately after starting playback.
206        """
207        try:
208            import sounddevice as sd
209        except ImportError as sd_err:
210            logger.debug("sounddevice not available (%s), falling back to pyaudio", sd_err)
211            try:
212                self._play_pyaudio(audio, blocking=blocking)
213            except ImportError as e:
214                raise ImportError(
215                    "No audio playback backend is installed. "
216                    "Install sounddevice with: pip install sounddevice "
217                    "or install violawake[audio] for PyAudio playback."
218                ) from e
219            return
220
221        # Copy to prevent mutation of caller's array during async playback
222        sd.play(audio.copy(), samplerate=self.sample_rate, blocking=blocking)

Play audio through the default output device.

Arguments:
  • audio: Float32 numpy array of audio samples.
  • blocking: If True, wait for playback to finish. If False, return immediately after starting playback.
def play_async(self, audio: numpy.ndarray) -> None:
    def play_async(self, audio: np.ndarray) -> None:
        """Play audio without blocking the calling thread.

        Equivalent to ``play(audio, blocking=False)``.

        Args:
            audio: Audio samples, passed unchanged to ``play``.
        """
        self.play(audio, blocking=False)

Play audio without blocking the calling thread.

def close(self) -> None:
    def close(self) -> None:
        """Release model resources by dropping the reference to the loaded model."""
        # Only the reference is cleared; no explicit teardown call is made on the model.
        self._kokoro = None

Release model resources.

class STTEngine:
 84class STTEngine:
 85    """Speech-to-text transcription via faster-whisper.
 86
 87    Thread-safe: ``WhisperModel`` is thread-safe for concurrent ``transcribe()`` calls.
 88
 89    Model is loaded once and reused. First call includes model load time
 90    (~1-3s). Subsequent calls are ~380ms (base model, CPU, 3s audio).
 91
 92    Example::
 93
 94        stt = STTEngine(model="base")
 95        text = stt.transcribe(audio_np_float32)
 96        print(text)  # "what's the weather today"
 97    """
 98
 99    def __init__(
100        self,
101        model: str = DEFAULT_MODEL,
102        device: str = "cpu",
103        compute_type: str = "int8",
104        language: str | None = None,
105        language_cache_ttl_s: float = 60.0,
106    ) -> None:
107        """Initialize the STT engine.
108
109        Args:
110            model: Whisper model size. One of: tiny, base, small, medium, large-v3.
111                   Default "base" — good accuracy/speed balance (WER ~9%).
112            device: "cpu" or "cuda". Default "cpu".
113            compute_type: CTranslate2 compute type. "int8" (default), "float16", "float32".
114                          "int8" is fastest on CPU with minimal accuracy loss.
115            language: Force a specific language (e.g., "en"). None = auto-detect.
116            language_cache_ttl_s: Cache detected language for N seconds to avoid
117                                   per-call language detection overhead.
118        """
119        if model not in MODEL_PROFILES:
120            available = ", ".join(MODEL_PROFILES.keys())
121            raise ValueError(f"Unknown model '{model}'. Available: {available}")
122
123        self.model_name = model
124        self.device = device
125        self.compute_type = compute_type
126        self.forced_language = language
127        self._language_cache: tuple[str, float] | None = None  # (lang, cached_at)
128        self._language_cache_ttl = language_cache_ttl_s
129        self._model: WhisperModel | None = None
130        self._model_lock = threading.Lock()
131
132        profile = MODEL_PROFILES[model]
133        logger.info(
134            "STTEngine created: model=%s, device=%s (WER~%.0f%%, %dMB)",
135            model,
136            device,
137            profile["wer"],
138            profile["vram_mb"],
139        )
140
141    def _get_model(self) -> WhisperModel:
142        """Lazy-load the Whisper model on first use (thread-safe)."""
143        if self._model is not None:
144            return self._model
145
146        with self._model_lock:
147            # Double-checked locking: another thread may have loaded
148            # the model while we waited for the lock.
149            if self._model is not None:
150                return self._model
151
152            try:
153                from faster_whisper import WhisperModel  # type: ignore[import]
154            except ImportError as e:
155                raise ImportError(
156                    "faster-whisper is not installed. Install with: pip install 'violawake[stt]'"
157                ) from e
158
159            logger.info("Loading Whisper model '%s'...", self.model_name)
160            t0 = time.perf_counter()
161            self._model = WhisperModel(
162                self.model_name,
163                device=self.device,
164                compute_type=self.compute_type,
165            )
166            elapsed_ms = (time.perf_counter() - t0) * 1000
167            logger.info("Whisper model loaded in %.0f ms", elapsed_ms)
168
169        return self._model
170
171    def transcribe(self, audio: np.ndarray) -> str:
172        """Transcribe audio to text.
173
174        Note:
175            This engine uses a progressive temperature fallback of
176            ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]`` during decoding, which can
177            trigger up to 6 decoding passes and increase latency. For
178            low-latency use cases, prefer a single-pass configuration such as
179            ``temperature_fallback=[0.0]``.
180
181        Args:
182            audio: Float32 numpy array at 16kHz mono. Values should be in [-1.0, 1.0].
183
184        Returns:
185            Transcribed text as string. Empty string if no speech detected.
186        """
187        result = self.transcribe_full(audio)
188        return result.text
189
190    def transcribe_streaming(
191        self,
192        audio: np.ndarray,
193        channels_first: bool | None = None,
194        beam_size: int = 5,
195        best_of: int = 5,
196        temperature: list[float] | None = None,
197    ) -> Iterator[TranscriptSegment]:
198        """Stream transcription segments as they become available.
199
200        Uses faster-whisper's generator mode: ``model.transcribe()`` returns a
201        ``(segments_iterator, info)`` tuple.  This method yields each
202        ``TranscriptSegment`` one at a time as faster-whisper decodes it,
203        instead of collecting all segments first.
204
205        This is useful when:
206        - You want to display partial results before full transcription completes.
207        - You need to pipe segments to a downstream consumer (TTS, logging, etc.)
208          without waiting for the full buffer to finish.
209
210        Note:
211            Segments with ``no_speech_prob`` above ``NO_SPEECH_THRESHOLD`` are
212            silently skipped (not yielded).
213
214        Args:
215            audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
216            channels_first: Layout hint for 2-D stereo audio (same semantics as
217                ``transcribe_full``).
218            beam_size: Beam search width. Default 5.
219            best_of: Number of candidates when sampling. Default 5.
220            temperature: Temperature schedule. Default ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]``.
221
222        Yields:
223            TranscriptSegment — one per decoded segment, in time order.
224
225        Example::
226
227            stt = STTEngine(model="base")
228            for seg in stt.transcribe_streaming(audio_np):
229                print(f"[{seg.start:.1f}s] {seg.text}")
230        """
231        if temperature is None:
232            temperature = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
233
234        audio = np.asarray(audio, dtype=np.float32)
235        if audio.ndim > 1:
236            if channels_first is True:
237                audio = audio.mean(axis=0)
238            elif channels_first is False:
239                audio = audio.mean(axis=1)
240            else:
241                if audio.shape[0] < audio.shape[1]:
242                    audio = audio.mean(axis=0)
243                else:
244                    audio = audio.mean(axis=1)
245
246        language = self._get_language()
247        model = self._get_model()
248
249        logger.debug("transcribe_streaming: starting generator on %d samples", len(audio))
250
251        segments_gen, info = model.transcribe(
252            audio,
253            language=language,
254            vad_filter=True,
255            vad_parameters={"min_silence_duration_ms": 500},
256            word_timestamps=False,
257            beam_size=beam_size,
258            best_of=best_of,
259            temperature=temperature,
260        )
261
262        # Update language cache after model.transcribe() returns info — same
263        # logic as transcribe_full, but we must do it before consuming the
264        # generator so the cache is primed for subsequent calls.
265        if language is None and info.language_probability > 0.5:
266            with self._model_lock:
267                self._language_cache = (info.language, time.monotonic())
268
269        for seg in segments_gen:
270            if seg.no_speech_prob > NO_SPEECH_THRESHOLD:
271                logger.debug(
272                    "Skipping silent segment [%.1f-%.1f] no_speech_prob=%.2f",
273                    seg.start,
274                    seg.end,
275                    seg.no_speech_prob,
276                )
277                continue
278
279            text = seg.text.strip()
280            logger.debug("Streaming segment [%.1f-%.1f]: '%s'", seg.start, seg.end, text)
281            yield TranscriptSegment(
282                text=text,
283                start=seg.start,
284                end=seg.end,
285                no_speech_prob=seg.no_speech_prob,
286            )
287
288    def transcribe_full(
289        self,
290        audio: np.ndarray,
291        channels_first: bool | None = None,
292    ) -> TranscriptResult:
293        """Transcribe audio and return full result with segments, timing, and metadata.
294
295        Args:
296            audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
297            channels_first: Layout hint for 2-D stereo audio.
298                ``True``  = (channels, samples)  e.g. shape (2, 48000).
299                ``False`` = (samples, channels)  e.g. shape (48000, 2) — the
300                standard layout.
301                ``None`` (default) = fall back to a shape heuristic (smaller
302                dimension is assumed to be channels).  Prefer passing an
303                explicit value to avoid ambiguity with short audio clips.
304
305        Returns:
306            TranscriptResult with text, segments, language, and no_speech_prob.
307        """
308        audio = np.asarray(audio, dtype=np.float32)
309        if audio.ndim > 1:
310            if channels_first is True:
311                # Explicit: (channels, samples) — e.g. shape (2, 48000)
312                audio = audio.mean(axis=0)
313            elif channels_first is False:
314                # Explicit: (samples, channels) — e.g. shape (48000, 2)
315                audio = audio.mean(axis=1)
316            else:
317                # Legacy heuristic: channels axis is the smaller dimension.
318                if audio.shape[0] < audio.shape[1]:
319                    audio = audio.mean(axis=0)
320                else:
321                    audio = audio.mean(axis=1)
322
323        # Determine language (use cache if available)
324        language = self._get_language()
325
326        model = self._get_model()
327        t0 = time.perf_counter()
328
329        segments_gen, info = model.transcribe(
330            audio,
331            language=language,
332            vad_filter=True,  # Use Silero VAD for silence removal
333            vad_parameters={"min_silence_duration_ms": 500},
334            word_timestamps=False,
335            beam_size=5,
336            best_of=5,
337            temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],  # Progressive fallback
338        )
339
340        # Consume the generator (transcription happens here)
341        segments = list(segments_gen)
342        elapsed_ms = (time.perf_counter() - t0) * 1000
343
344        # Update language cache (protected by _model_lock for thread safety)
345        if language is None and info.language_probability > 0.5:
346            with self._model_lock:
347                self._language_cache = (info.language, time.monotonic())
348
349        transcript_segments = [
350            TranscriptSegment(
351                text=s.text.strip(),
352                start=s.start,
353                end=s.end,
354                no_speech_prob=s.no_speech_prob,
355            )
356            for s in segments
357        ]
358
359        full_text = " ".join(s.text for s in transcript_segments).strip()
360        overall_no_speech = max((s.no_speech_prob for s in transcript_segments), default=0.0)
361
362        if overall_no_speech > NO_SPEECH_THRESHOLD:
363            logger.debug(
364                "No speech detected (no_speech_prob=%.2f) — returning empty",
365                overall_no_speech,
366            )
367            full_text = ""
368
369        logger.debug(
370            "Transcribed in %.0f ms: '%s'",
371            elapsed_ms,
372            full_text[:60] + "..." if len(full_text) > 60 else full_text,
373        )
374
375        return TranscriptResult(
376            text=full_text,
377            segments=transcript_segments,
378            language=info.language,
379            language_prob=info.language_probability,
380            duration_s=info.duration,
381            no_speech_prob=overall_no_speech,
382        )
383
384    def _get_language(self) -> str | None:
385        """Return cached language or None for auto-detection.
386
387        Thread-safe: reads ``_language_cache`` under ``_model_lock``.
388        """
389        if self.forced_language:
390            return self.forced_language
391
392        with self._model_lock:
393            if self._language_cache is not None:
394                lang, cached_at = self._language_cache
395                if time.monotonic() - cached_at < self._language_cache_ttl:
396                    return lang
397
398        return None  # auto-detect
399
400    def prewarm(self) -> None:
401        """Load the model eagerly (avoids cold-start latency on first transcription)."""
402        self._get_model()
403        logger.info("STTEngine prewarmed: model '%s' loaded", self.model_name)
404
405    def close(self) -> None:
406        """Release model resources."""
407        with self._model_lock:
408            self._model = None
409
410    def __enter__(self) -> STTEngine:
411        """Enter sync context manager. Returns self."""
412        return self
413
414    def __exit__(
415        self,
416        exc_type: type[BaseException] | None,
417        exc_val: BaseException | None,
418        exc_tb: object,
419    ) -> None:
420        """Exit sync context manager. Releases model resources."""
421        self.close()

Speech-to-text transcription via faster-whisper.

Thread-safe: WhisperModel is thread-safe for concurrent transcribe() calls.

Model is loaded once and reused. First call includes model load time (~1-3s). Subsequent calls are ~380ms (base model, CPU, 3s audio).

Example::

stt = STTEngine(model="base")
text = stt.transcribe(audio_np_float32)
print(text)  # "what's the weather today"
STTEngine( model: str = 'base', device: str = 'cpu', compute_type: str = 'int8', language: str | None = None, language_cache_ttl_s: float = 60.0)
 99    def __init__(
100        self,
101        model: str = DEFAULT_MODEL,
102        device: str = "cpu",
103        compute_type: str = "int8",
104        language: str | None = None,
105        language_cache_ttl_s: float = 60.0,
106    ) -> None:
107        """Initialize the STT engine.
108
109        Args:
110            model: Whisper model size. One of: tiny, base, small, medium, large-v3.
111                   Default "base" — good accuracy/speed balance (WER ~9%).
112            device: "cpu" or "cuda". Default "cpu".
113            compute_type: CTranslate2 compute type. "int8" (default), "float16", "float32".
114                          "int8" is fastest on CPU with minimal accuracy loss.
115            language: Force a specific language (e.g., "en"). None = auto-detect.
116            language_cache_ttl_s: Cache detected language for N seconds to avoid
117                                   per-call language detection overhead.
118        """
119        if model not in MODEL_PROFILES:
120            available = ", ".join(MODEL_PROFILES.keys())
121            raise ValueError(f"Unknown model '{model}'. Available: {available}")
122
123        self.model_name = model
124        self.device = device
125        self.compute_type = compute_type
126        self.forced_language = language
127        self._language_cache: tuple[str, float] | None = None  # (lang, cached_at)
128        self._language_cache_ttl = language_cache_ttl_s
129        self._model: WhisperModel | None = None
130        self._model_lock = threading.Lock()
131
132        profile = MODEL_PROFILES[model]
133        logger.info(
134            "STTEngine created: model=%s, device=%s (WER~%.0f%%, %dMB)",
135            model,
136            device,
137            profile["wer"],
138            profile["vram_mb"],
139        )

Initialize the STT engine.

Arguments:
  • model: Whisper model size. One of: tiny, base, small, medium, large-v3. Default "base" — good accuracy/speed balance (WER ~9%).
  • device: "cpu" or "cuda". Default "cpu".
  • compute_type: CTranslate2 compute type. "int8" (default), "float16", "float32". "int8" is fastest on CPU with minimal accuracy loss.
  • language: Force a specific language (e.g., "en"). None = auto-detect.
  • language_cache_ttl_s: Cache detected language for N seconds to avoid per-call language detection overhead.
model_name
device
compute_type
forced_language
def transcribe(self, audio: numpy.ndarray) -> str:
171    def transcribe(self, audio: np.ndarray) -> str:
172        """Transcribe audio to text.
173
174        Note:
175            This engine uses a progressive temperature fallback of
176            ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]`` during decoding, which can
177            trigger up to 6 decoding passes and increase latency. For
178            low-latency use cases, prefer a single-pass configuration such as
179            ``temperature_fallback=[0.0]``.
180
181        Args:
182            audio: Float32 numpy array at 16kHz mono. Values should be in [-1.0, 1.0].
183
184        Returns:
185            Transcribed text as string. Empty string if no speech detected.
186        """
187        result = self.transcribe_full(audio)
188        return result.text

Transcribe audio to text.

Note:

This engine uses a progressive temperature fallback of [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] during decoding, which can trigger up to 6 decoding passes and increase latency. For low-latency use cases, prefer a single-pass schedule, e.g. transcribe_streaming(audio, temperature=[0.0]).

Arguments:
  • audio: Float32 numpy array at 16kHz mono. Values should be in [-1.0, 1.0].
Returns:

Transcribed text as string. Empty string if no speech detected.

def transcribe_streaming( self, audio: numpy.ndarray, channels_first: bool | None = None, beam_size: int = 5, best_of: int = 5, temperature: list[float] | None = None) -> Iterator[violawake_sdk.stt.TranscriptSegment]:
190    def transcribe_streaming(
191        self,
192        audio: np.ndarray,
193        channels_first: bool | None = None,
194        beam_size: int = 5,
195        best_of: int = 5,
196        temperature: list[float] | None = None,
197    ) -> Iterator[TranscriptSegment]:
198        """Stream transcription segments as they become available.
199
200        Uses faster-whisper's generator mode: ``model.transcribe()`` returns a
201        ``(segments_iterator, info)`` tuple.  This method yields each
202        ``TranscriptSegment`` one at a time as faster-whisper decodes it,
203        instead of collecting all segments first.
204
205        This is useful when:
206        - You want to display partial results before full transcription completes.
207        - You need to pipe segments to a downstream consumer (TTS, logging, etc.)
208          without waiting for the full buffer to finish.
209
210        Note:
211            Segments with ``no_speech_prob`` above ``NO_SPEECH_THRESHOLD`` are
212            silently skipped (not yielded).
213
214        Args:
215            audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
216            channels_first: Layout hint for 2-D stereo audio (same semantics as
217                ``transcribe_full``).
218            beam_size: Beam search width. Default 5.
219            best_of: Number of candidates when sampling. Default 5.
220            temperature: Temperature schedule. Default ``[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]``.
221
222        Yields:
223            TranscriptSegment — one per decoded segment, in time order.
224
225        Example::
226
227            stt = STTEngine(model="base")
228            for seg in stt.transcribe_streaming(audio_np):
229                print(f"[{seg.start:.1f}s] {seg.text}")
230        """
231        if temperature is None:
232            temperature = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
233
234        audio = np.asarray(audio, dtype=np.float32)
235        if audio.ndim > 1:
236            if channels_first is True:
237                audio = audio.mean(axis=0)
238            elif channels_first is False:
239                audio = audio.mean(axis=1)
240            else:
241                if audio.shape[0] < audio.shape[1]:
242                    audio = audio.mean(axis=0)
243                else:
244                    audio = audio.mean(axis=1)
245
246        language = self._get_language()
247        model = self._get_model()
248
249        logger.debug("transcribe_streaming: starting generator on %d samples", len(audio))
250
251        segments_gen, info = model.transcribe(
252            audio,
253            language=language,
254            vad_filter=True,
255            vad_parameters={"min_silence_duration_ms": 500},
256            word_timestamps=False,
257            beam_size=beam_size,
258            best_of=best_of,
259            temperature=temperature,
260        )
261
262        # Update language cache after model.transcribe() returns info — same
263        # logic as transcribe_full, but we must do it before consuming the
264        # generator so the cache is primed for subsequent calls.
265        if language is None and info.language_probability > 0.5:
266            with self._model_lock:
267                self._language_cache = (info.language, time.monotonic())
268
269        for seg in segments_gen:
270            if seg.no_speech_prob > NO_SPEECH_THRESHOLD:
271                logger.debug(
272                    "Skipping silent segment [%.1f-%.1f] no_speech_prob=%.2f",
273                    seg.start,
274                    seg.end,
275                    seg.no_speech_prob,
276                )
277                continue
278
279            text = seg.text.strip()
280            logger.debug("Streaming segment [%.1f-%.1f]: '%s'", seg.start, seg.end, text)
281            yield TranscriptSegment(
282                text=text,
283                start=seg.start,
284                end=seg.end,
285                no_speech_prob=seg.no_speech_prob,
286            )

Stream transcription segments as they become available.

Uses faster-whisper's generator mode: model.transcribe() returns a (segments_iterator, info) tuple. This method yields each TranscriptSegment one at a time as faster-whisper decodes it, instead of collecting all segments first.

This is useful when:

  • You want to display partial results before full transcription completes.
  • You need to pipe segments to a downstream consumer (TTS, logging, etc.) without waiting for the full buffer to finish.
Note:

Segments with no_speech_prob above NO_SPEECH_THRESHOLD are silently skipped (not yielded).

Arguments:
  • audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
  • channels_first: Layout hint for 2-D stereo audio (same semantics as transcribe_full).
  • beam_size: Beam search width. Default 5.
  • best_of: Number of candidates when sampling. Default 5.
  • temperature: Temperature schedule. Default [0.0, 0.2, 0.4, 0.6, 0.8, 1.0].
Yields:

TranscriptSegment — one per decoded segment, in time order.

Example::

stt = STTEngine(model="base")
for seg in stt.transcribe_streaming(audio_np):
    print(f"[{seg.start:.1f}s] {seg.text}")
def transcribe_full( self, audio: numpy.ndarray, channels_first: bool | None = None) -> violawake_sdk.stt.TranscriptResult:
288    def transcribe_full(
289        self,
290        audio: np.ndarray,
291        channels_first: bool | None = None,
292    ) -> TranscriptResult:
293        """Transcribe audio and return full result with segments, timing, and metadata.
294
295        Args:
296            audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
297            channels_first: Layout hint for 2-D stereo audio.
298                ``True``  = (channels, samples)  e.g. shape (2, 48000).
299                ``False`` = (samples, channels)  e.g. shape (48000, 2) — the
300                standard layout.
301                ``None`` (default) = fall back to a shape heuristic (smaller
302                dimension is assumed to be channels).  Prefer passing an
303                explicit value to avoid ambiguity with short audio clips.
304
305        Returns:
306            TranscriptResult with text, segments, language, and no_speech_prob.
307        """
308        audio = np.asarray(audio, dtype=np.float32)
309        if audio.ndim > 1:
310            if channels_first is True:
311                # Explicit: (channels, samples) — e.g. shape (2, 48000)
312                audio = audio.mean(axis=0)
313            elif channels_first is False:
314                # Explicit: (samples, channels) — e.g. shape (48000, 2)
315                audio = audio.mean(axis=1)
316            else:
317                # Legacy heuristic: channels axis is the smaller dimension.
318                if audio.shape[0] < audio.shape[1]:
319                    audio = audio.mean(axis=0)
320                else:
321                    audio = audio.mean(axis=1)
322
323        # Determine language (use cache if available)
324        language = self._get_language()
325
326        model = self._get_model()
327        t0 = time.perf_counter()
328
329        segments_gen, info = model.transcribe(
330            audio,
331            language=language,
332            vad_filter=True,  # Use Silero VAD for silence removal
333            vad_parameters={"min_silence_duration_ms": 500},
334            word_timestamps=False,
335            beam_size=5,
336            best_of=5,
337            temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],  # Progressive fallback
338        )
339
340        # Consume the generator (transcription happens here)
341        segments = list(segments_gen)
342        elapsed_ms = (time.perf_counter() - t0) * 1000
343
344        # Update language cache (protected by _model_lock for thread safety)
345        if language is None and info.language_probability > 0.5:
346            with self._model_lock:
347                self._language_cache = (info.language, time.monotonic())
348
349        transcript_segments = [
350            TranscriptSegment(
351                text=s.text.strip(),
352                start=s.start,
353                end=s.end,
354                no_speech_prob=s.no_speech_prob,
355            )
356            for s in segments
357        ]
358
359        full_text = " ".join(s.text for s in transcript_segments).strip()
360        overall_no_speech = max((s.no_speech_prob for s in transcript_segments), default=0.0)
361
362        if overall_no_speech > NO_SPEECH_THRESHOLD:
363            logger.debug(
364                "No speech detected (no_speech_prob=%.2f) — returning empty",
365                overall_no_speech,
366            )
367            full_text = ""
368
369        logger.debug(
370            "Transcribed in %.0f ms: '%s'",
371            elapsed_ms,
372            full_text[:60] + "..." if len(full_text) > 60 else full_text,
373        )
374
375        return TranscriptResult(
376            text=full_text,
377            segments=transcript_segments,
378            language=info.language,
379            language_prob=info.language_probability,
380            duration_s=info.duration,
381            no_speech_prob=overall_no_speech,
382        )

Transcribe audio and return full result with segments, timing, and metadata.

Arguments:
  • audio: Float32 numpy array at 16kHz mono, or 2-D stereo.
  • channels_first: Layout hint for 2-D stereo audio. True = (channels, samples) e.g. shape (2, 48000). False = (samples, channels) e.g. shape (48000, 2) — the standard layout. None (default) = fall back to a shape heuristic (smaller dimension is assumed to be channels). Prefer passing an explicit value to avoid ambiguity with short audio clips.
Returns:

TranscriptResult with text, segments, language, and no_speech_prob.

def prewarm(self) -> None:
400    def prewarm(self) -> None:
401        """Load the model eagerly (avoids cold-start latency on first transcription)."""
402        self._get_model()
403        logger.info("STTEngine prewarmed: model '%s' loaded", self.model_name)

Load the model eagerly (avoids cold-start latency on first transcription).

def close(self) -> None:
405    def close(self) -> None:
406        """Release model resources."""
407        with self._model_lock:
408            self._model = None

Release model resources.

@dataclass
class StreamingSTTEngine:
@dataclass
class StreamingSTTEngine:
    """Incremental streaming STT: accepts audio chunks, yields segments.

    Audio chunks are pushed one at a time via :meth:`push_chunk`.  When the
    accumulated buffer reaches ``min_buffer_seconds``, :meth:`push_chunk`
    returns an iterator that transcribes the buffer and yields any new
    segments.  You can also force a transcription at any time with
    :meth:`flush`.

    A sliding-window approach is supported via ``stride_seconds``: after each
    regular pass the engine retains the last ``stride_seconds`` of audio so
    that words near the boundary are not lost on the next pass.  Set
    ``stride_seconds=0.0`` (default) to discard all audio after each pass.
    :meth:`flush` never retains a tail.

    Thread safety: **not** thread-safe.  Call from a single thread or protect
    externally with a lock.

    Args:
        model: Whisper model size. One of ``tiny``, ``base``, ``small``,
               ``medium``, ``large-v3``. Default ``"base"``.
        device: ``"cpu"`` or ``"cuda"``. Default ``"cpu"``.
        compute_type: CTranslate2 compute type. Default ``"int8"``.
        language: Force a specific language code (e.g. ``"en"``). ``None``
                  for auto-detect.
        min_buffer_seconds: Minimum seconds of audio to accumulate before
                            attempting a transcription pass.  Shorter values
                            mean lower latency but more frequent (and
                            potentially noisier) passes.  Default ``2.0``.
        stride_seconds: Seconds of audio overlap to retain between passes
                        (sliding-window).  Default ``0.0`` (no overlap).
        sample_rate: Sample rate of incoming audio chunks. Default ``16000``.

    Example::

        streaming = StreamingSTTEngine(model="base", min_buffer_seconds=2.0)
        for chunk in mic_chunks:
            for segment in streaming.push_chunk(chunk):
                print(f"[{segment.start:.1f}s] {segment.text}")

        # Force final transcription when done
        for segment in streaming.flush():
            print(f"[{segment.start:.1f}s] {segment.text}")
    """

    model: str = "base"
    device: str = "cpu"
    compute_type: str = "int8"
    language: str | None = None
    min_buffer_seconds: float = 2.0
    stride_seconds: float = _DEFAULT_STRIDE_S
    sample_rate: int = 16_000

    # Internal state — populated post-init; not part of the public constructor.
    _engine: STTEngine = field(init=False, repr=False)
    _buffer: list[np.ndarray] = field(init=False, repr=False, default_factory=list)
    _buffer_samples: int = field(init=False, repr=False, default=0)

    def __post_init__(self) -> None:
        """Build the wrapped ``STTEngine`` and start with an empty buffer."""
        self._engine = STTEngine(
            model=self.model,
            device=self.device,
            compute_type=self.compute_type,
            language=self.language,
        )
        self._buffer = []
        self._buffer_samples = 0
        logger.info(
            "StreamingSTTEngine created: model=%s, min_buffer=%.1fs, stride=%.1fs",
            self.model,
            self.min_buffer_seconds,
            self.stride_seconds,
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    @property
    def buffer_duration_s(self) -> float:
        """Current accumulated audio duration in seconds."""
        return self._buffer_samples / self.sample_rate

    def push_chunk(self, chunk: np.ndarray | bytes) -> Iterator[TranscriptSegment]:
        """Push an audio chunk into the buffer.

        The chunk is buffered immediately, whether or not the returned
        iterator is consumed.  If the buffer then holds at least
        ``min_buffer_seconds`` of audio, the returned iterator runs a
        transcription pass when iterated; otherwise it is empty.

        Args:
            chunk: Float32 numpy array (16kHz mono) **or** raw ``int16`` PCM
                   bytes.  Bytes are automatically converted to float32.

        Returns:
            Iterator of TranscriptSegment — segments decoded in this pass
            (may be empty).
        """
        # Deliberately NOT a generator function: a generator body does not run
        # until first iterated, which would drop the chunk for callers that
        # ignore the return value.
        arr = self._coerce_chunk(chunk)
        self._buffer.append(arr)
        self._buffer_samples += len(arr)

        min_samples = int(self.min_buffer_seconds * self.sample_rate)
        if self._buffer_samples >= min_samples:
            return self._run_pass()
        return iter(())

    def flush(self) -> Iterator[TranscriptSegment]:
        """Transcribe whatever remains in the buffer and clear it.

        Call this when the audio stream ends to ensure trailing audio is
        transcribed.  No sliding-window tail is retained.

        Yields:
            TranscriptSegment — segments from the remaining buffer.
        """
        if self._buffer_samples == 0:
            return

        logger.debug("StreamingSTTEngine.flush: %.2f s buffered", self.buffer_duration_s)
        yield from self._run_pass(force=True)

    def reset(self) -> None:
        """Discard the current buffer without transcribing."""
        self._buffer = []
        self._buffer_samples = 0
        logger.debug("StreamingSTTEngine buffer reset")

    def prewarm(self) -> None:
        """Eagerly load the underlying Whisper model."""
        self._engine.prewarm()

    def close(self) -> None:
        """Release model resources and discard the buffer."""
        self.reset()
        self._engine.close()

    def __enter__(self) -> StreamingSTTEngine:
        """Enter sync context manager."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit sync context manager. Releases engine resources."""
        self.close()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _coerce_chunk(self, chunk: np.ndarray | bytes) -> np.ndarray:
        """Convert raw int16 bytes or ensure float32 array."""
        if isinstance(chunk, (bytes, bytearray)):
            # int16 PCM -> float32 scaled by 1/32768
            arr = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32768.0
            return arr
        arr = np.asarray(chunk, dtype=np.float32)
        if arr.ndim > 1:
            # Best-effort stereo → mono using the shape heuristic from STTEngine
            if arr.shape[0] < arr.shape[1]:
                arr = arr.mean(axis=0)
            else:
                arr = arr.mean(axis=1)
        return arr

    def _run_pass(self, *, force: bool = False) -> Iterator[TranscriptSegment]:
        """Concatenate buffer, apply sliding window, transcribe, yield segments.

        Args:
            force: ``True`` when called from :meth:`flush`; the buffer is then
                cleared completely instead of keeping a stride-sized tail.

        Yields:
            TranscriptSegment — segments produced by the wrapped engine.
        """
        if not self._buffer:
            # An earlier, still-pending pass already consumed the buffer.
            return

        audio = np.concatenate(self._buffer)

        logger.debug(
            "StreamingSTTEngine pass: %.2f s (force=%s)",
            len(audio) / self.sample_rate,
            force,
        )

        # Trim the buffer BEFORE decoding.  Doing it after the ``yield from``
        # would overwrite chunks the consumer pushed while iterating, and
        # would be skipped entirely if the consumer stopped iterating early.
        # Trade-off: if decoding raises, this pass's audio is not retried.
        stride_samples = int(self.stride_seconds * self.sample_rate)
        if not force and 0 < stride_samples < len(audio):
            # Sliding window: retain the last stride_seconds of audio so that
            # words near the boundary are not cut off on the next pass.
            retained = audio[-stride_samples:]
            self._buffer = [retained]
            self._buffer_samples = len(retained)
        else:
            self._buffer = []
            self._buffer_samples = 0

        yield from self._engine.transcribe_streaming(audio)

Incremental streaming STT: accepts audio chunks, yields segments.

Audio chunks are pushed one at a time via push_chunk(). When the accumulated buffer reaches min_buffer_seconds, push_chunk() transparently transcribes the buffer and yields any new segments. You can also force a transcription at any time with flush().

A sliding-window approach is supported via stride_seconds: after each transcription pass the engine retains the last stride_seconds of audio so that words near the boundary are not lost on the next pass. Set stride_seconds=0.0 (default) to discard all audio after each pass.

Thread safety: not thread-safe. Call from a single thread or protect externally with a lock.

Arguments:
  • model: Whisper model size. One of tiny, base, small, medium, large-v3. Default "base".
  • device: "cpu" or "cuda". Default "cpu".
  • compute_type: CTranslate2 compute type. Default "int8".
  • language: Force a specific language code (e.g. "en"). None for auto-detect.
  • min_buffer_seconds: Minimum seconds of audio to accumulate before attempting a transcription pass. Shorter values mean lower latency but more frequent (and potentially noisier) passes. Default 2.0.
  • stride_seconds: Seconds of audio overlap to retain between passes (sliding-window). Default 0.0 (no overlap).
  • sample_rate: Sample rate of incoming audio chunks. Default 16000.

Example::

streaming = StreamingSTTEngine(model="base", min_buffer_seconds=2.0)
for chunk in mic_chunks:
    for segment in streaming.push_chunk(chunk):
        print(f"[{segment.start:.1f}s] {segment.text}")

# Force final transcription when done
for segment in streaming.flush():
    print(f"[{segment.start:.1f}s] {segment.text}")
StreamingSTTEngine( model: str = 'base', device: str = 'cpu', compute_type: str = 'int8', language: str | None = None, min_buffer_seconds: float = 2.0, stride_seconds: float = 0.0, sample_rate: int = 16000)
model: str = 'base'
device: str = 'cpu'
compute_type: str = 'int8'
language: str | None = None
min_buffer_seconds: float = 2.0
stride_seconds: float = 0.0
sample_rate: int = 16000
buffer_duration_s: float
510    @property
511    def buffer_duration_s(self) -> float:
512        """Current accumulated audio duration in seconds."""
513        return self._buffer_samples / self.sample_rate

Current accumulated audio duration in seconds.

def push_chunk( self, chunk: numpy.ndarray | bytes) -> Iterator[violawake_sdk.stt.TranscriptSegment]:
515    def push_chunk(self, chunk: np.ndarray | bytes) -> Iterator[TranscriptSegment]:
516        """Push an audio chunk into the buffer.
517
518        If the buffer has accumulated at least ``min_buffer_seconds`` of audio,
519        a transcription pass is run and any yielded segments are returned.
520        Otherwise, no segments are yielded and the chunk is silently buffered.
521
522        Args:
523            chunk: Float32 numpy array (16kHz mono) **or** raw ``int16`` PCM
524                   bytes.  Bytes are automatically converted to float32.
525
526        Yields:
527            TranscriptSegment — segments decoded in this pass (may be empty).
528        """
529        arr = self._coerce_chunk(chunk)
530        self._buffer.append(arr)
531        self._buffer_samples += len(arr)
532
533        min_samples = int(self.min_buffer_seconds * self.sample_rate)
534        if self._buffer_samples >= min_samples:
535            yield from self._run_pass()

Push an audio chunk into the buffer.

If the buffer has accumulated at least min_buffer_seconds of audio, a transcription pass is run and any yielded segments are returned. Otherwise, no segments are yielded and the chunk is silently buffered.

Arguments:
  • chunk: Float32 numpy array (16kHz mono) or raw int16 PCM bytes. Bytes are automatically converted to float32.
Yields:

TranscriptSegment — segments decoded in this pass (may be empty).

def flush(self) -> Iterator[violawake_sdk.stt.TranscriptSegment]:
537    def flush(self) -> Iterator[TranscriptSegment]:
538        """Transcribe whatever remains in the buffer and clear it.
539
540        Call this when the audio stream ends to ensure trailing audio is
541        transcribed.
542
543        Yields:
544            TranscriptSegment — segments from the remaining buffer.
545        """
546        if self._buffer_samples == 0:
547            return
548
549        logger.debug("StreamingSTTEngine.flush: %.2f s buffered", self.buffer_duration_s)
550        yield from self._run_pass(force=True)

Transcribe whatever remains in the buffer and clear it.

Call this when the audio stream ends to ensure trailing audio is transcribed.

Yields:

TranscriptSegment — segments from the remaining buffer.

def reset(self) -> None:
552    def reset(self) -> None:
553        """Discard the current buffer without transcribing."""
554        self._buffer = []
555        self._buffer_samples = 0
556        logger.debug("StreamingSTTEngine buffer reset")

Discard the current buffer without transcribing.

def prewarm(self) -> None:
558    def prewarm(self) -> None:
559        """Eagerly load the underlying Whisper model."""
560        self._engine.prewarm()

Eagerly load the underlying Whisper model.

def close(self) -> None:
562    def close(self) -> None:
563        """Release model resources and discard the buffer."""
564        self.reset()
565        self._engine.close()

Release model resources and discard the buffer.

class VoicePipeline:
 56class VoicePipeline:
 57    """Orchestrated Wake→VAD→STT→TTS voice pipeline.
 58
 59    State Machine
 60    =============
 61    Four states with the following transitions::
 62
 63        IDLE ──(wake detected)──→ LISTENING
 64        LISTENING ──(silence/timeout)──→ TRANSCRIBING
 65        TRANSCRIBING ──(text ready)──→ RESPONDING
 66        RESPONDING ──(handlers done)──→ IDLE
 67
 68    Thread ownership of transitions:
 69        - **Main thread** (``_run_loop``): IDLE→LISTENING, LISTENING→TRANSCRIBING.
 70          Owns the mic capture loop and wake/VAD detection.
 71        - **Worker thread** (``_transcribe_and_respond``): TRANSCRIBING→RESPONDING→IDLE.
 72          Spawned by ``_start_worker()`` as a daemon thread. Runs STT, dispatches
 73          command handlers, and triggers TTS playback. Only one worker thread is
 74          active at a time (guarded by ``_worker_lock``).
 75
 76    Threading Model
 77    ===============
 78    - The main loop runs in the calling thread via ``run()`` (blocking).
 79    - When a command recording ends, ``_start_worker()`` spawns a single daemon
 80      thread for STT transcription + handler dispatch + TTS playback.
 81    - ``_state_lock`` guards all reads/writes of ``_state``.
 82    - ``_worker_lock`` guards ``_worker_thread`` reference management.
 83
 84    Known Limitation
 85    ================
 86    There is a brief window between the worker thread completing (setting state
 87    back to IDLE) and the next wake detection where the pipeline is technically
 88    idle but the worker thread reference may not yet be cleared. During this
 89    window, a new wake detection will proceed normally since the state is IDLE.
 90
 91    Usage::
 92
 93        pipeline = VoicePipeline()
 94
 95        @pipeline.on_command
 96        def handle(text: str) -> str | None:
 97            return f"You said: {text}"
 98
 99        pipeline.run()
100    """
101
102    def __init__(
103        self,
104        wake_word: str = "viola",
105        stt_model: str = "base",
106        tts_voice: str = "af_heart",
107        threshold: float = DEFAULT_THRESHOLD,
108        vad_backend: str = "auto",
109        vad_threshold: float = 0.4,
110        enable_tts: bool = True,
111        device_index: int | None = None,
112        on_wake: WakeCallback | None = None,
113        streaming_stt: bool = False,
114    ) -> None:
115        """Initialize the voice pipeline.
116
117        Args:
118            wake_word: Wake word model name. Default "viola".
119            stt_model: Whisper model size. Default "base".
120            tts_voice: TTS voice name. Default "af_heart".
121            threshold: Wake word detection threshold. Default 0.80.
122            vad_backend: VAD backend. Default "auto".
123            vad_threshold: VAD speech detection threshold. Default 0.4.
124            enable_tts: If True, speak responses via TTS. If False, skip TTS.
125            device_index: Microphone device index. None = system default.
126            on_wake: Optional callback fired after wake-word detection and
127                     before command transcription begins.
128            streaming_stt: If True, use ``STTEngine.transcribe_streaming()``
129                           to yield segments incrementally instead of waiting
130                           for the full transcription.  The command text passed
131                           to handlers is the concatenation of all yielded
132                           segment texts (identical to non-streaming behaviour),
133                           but each segment is logged as it arrives.
134                           Default False.
135        """
136        self._wake_detector = WakeDetector(model=wake_word, threshold=threshold)
137        self._vad = VADEngine(backend=vad_backend)
138        self._vad_threshold = vad_threshold
139        self._enable_tts = enable_tts
140        self._device_index = device_index
141        self._stt_model = stt_model
142        self._tts_voice = tts_voice
143        self._on_wake = on_wake
144        self._streaming_stt = streaming_stt
145
146        # Typed as Any because STTEngine/TTSEngine are lazily imported to avoid
147        # hard dependencies on optional extras (violawake[stt], violawake[tts]).
148        self._stt: Any | None = None  # lazy — violawake_sdk.stt.STTEngine
149        self._tts: Any | None = None  # lazy — violawake_sdk.tts.TTSEngine
150
151        self._state = _STATE_IDLE
152        self._state_lock = threading.Lock()
153        self._stop_event = threading.Event()
154        self._worker_lock = threading.Lock()
155        self._worker_thread: threading.Thread | None = None
156
157        self._command_handlers: list[CommandHandler] = []
158
159        logger.info(
160            "VoicePipeline initialized: wake=%s, stt=%s, tts=%s, streaming_stt=%s",
161            wake_word,
162            stt_model,
163            tts_voice,
164            streaming_stt,
165        )
166
167    def on_command(self, handler: CommandHandler) -> CommandHandler:
168        """Decorator to register a command handler.
169
170        The handler receives the transcribed text and may return a string
171        response (which is spoken via TTS) or None (no TTS response).
172
173        Example::
174
175            @pipeline.on_command
176            def handle(text: str) -> str | None:
177                if "weather" in text:
178                    return "It's sunny!"
179                return None
180        """
181        self._command_handlers.append(handler)
182        return handler
183
184    def run(self) -> None:
185        """Run the pipeline. Blocks until ``stop()`` is called or Ctrl+C.
186
187        Raises:
188            PipelineError: If the pipeline encounters an unrecoverable error.
189        """
190        logger.info("VoicePipeline started. Say the wake word to begin.")
191        self._stop_event.clear()
192
193        try:
194            self._run_loop()
195        except KeyboardInterrupt:
196            logger.info("Pipeline interrupted by user.")
197        except Exception as e:
198            raise PipelineError(f"Pipeline error: {e}") from e
199        finally:
200            self.stop()
201            with self._state_lock:
202                self._state = _STATE_IDLE
203            logger.info("VoicePipeline stopped.")
204
205    def stop(self, timeout: float = 5.0) -> None:
206        """Signal the pipeline to stop and wait briefly for worker cleanup."""
207        self._stop_event.set()
208        worker = self._get_worker_thread()
209        if worker is None or worker is threading.current_thread():
210            return
211
212        worker.join(timeout=timeout)
213        if worker.is_alive():
214            logger.warning("VoicePipeline worker thread did not exit within %.1f s", timeout)
215        else:
216            # Clear reference only after the worker has fully exited
217            with self._worker_lock:
218                if self._worker_thread is worker:
219                    self._worker_thread = None
220
221    def close(self) -> None:
222        """Stop the pipeline and release all engine resources."""
223        self.stop()
224        self._wake_detector.close()
225        self._stt = None
226        self._tts = None
227
228    def __enter__(self) -> VoicePipeline:
229        """Enter sync context manager."""
230        return self
231
232    def __exit__(
233        self,
234        exc_type: type[BaseException] | None,
235        exc_val: BaseException | None,
236        exc_tb: object,
237    ) -> None:
238        """Exit sync context manager. Stops pipeline and releases resources."""
239        self.close()
240
241    def speak(self, text: str) -> None:
242        """Synthesize and play text via TTS (called from within command handlers)."""
243        if not self._enable_tts or self._stop_event.is_set():
244            return
245
246        tts = self._get_tts()
247        if tts is None:
248            logger.warning("TTS not available — install 'violawake[tts]'")
249            return
250
251        try:
252            audio = tts.synthesize(text)
253            tts.play(audio)
254        except Exception as e:
255            logger.exception("TTS playback failed for text '%.50s': %s", text, e)
256
257    def _run_loop(self) -> None:
258        """Main mic capture and detection loop."""
259        recording_buffer: list[bytes] = []
260        silence_count = 0
261
262        for frame in self._wake_detector.stream_mic(device_index=self._device_index):
263            if self._stop_event.is_set():
264                break
265
266            with self._state_lock:
267                state = self._state
268
269            if state == _STATE_IDLE:
270                # Listening for wake word
271                if self._wake_detector.detect(frame, is_playing=False):
272                    logger.info("Wake word detected → listening for command")
273                    recording_buffer.clear()
274                    silence_count = 0
275                    if self._on_wake is not None:
276                        try:
277                            self._on_wake()
278                        except Exception:
279                            logger.exception("on_wake callback failed")
280                    with self._state_lock:
281                        self._state = _STATE_LISTENING
282
283            elif state == _STATE_LISTENING:
284                # Recording command
285                recording_buffer.append(frame)
286                is_speech = self._vad.is_speech(frame, threshold=self._vad_threshold)
287
288                if not is_speech:
289                    silence_count += 1
290                else:
291                    silence_count = 0
292
293                # Check stop conditions
294                total_frames = len(recording_buffer)
295
296                if silence_count >= SILENCE_FRAMES_STOP or total_frames >= MAX_RECORDING_FRAMES:
297                    audio_bytes = b"".join(recording_buffer)
298                    with self._state_lock:
299                        self._state = _STATE_TRANSCRIBING
300
301                    # Transcribe in a worker thread to not block the mic loop
302                    self._start_worker(audio_bytes)
303
304            # _STATE_TRANSCRIBING and _STATE_RESPONDING: keep looping (don't detect wake)
305
306    def _transcribe_and_respond(self, audio_bytes: bytes) -> None:
307        """Transcribe audio and dispatch to command handlers."""
308        stt = self._get_stt()
309        if stt is None:
310            logger.warning("STT not available — install 'violawake[stt]'")
311            with self._state_lock:
312                self._state = _STATE_IDLE
313            return
314
315        try:
316            if len(audio_bytes) % 2 != 0:
317                logger.warning(
318                    "Audio buffer length %d is not a multiple of 2 bytes (int16); "
319                    "truncating to even boundary",
320                    len(audio_bytes),
321                )
322                audio_bytes = audio_bytes[: len(audio_bytes) & ~1]
323            if len(audio_bytes) == 0:
324                logger.warning("Empty audio buffer — skipping transcription")
325                return
326            pcm = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
327
328            if self._streaming_stt:
329                # Streaming mode: consume the generator and log each segment as it arrives.
330                segment_texts: list[str] = []
331                for seg in stt.transcribe_streaming(pcm):
332                    if self._stop_event.is_set():
333                        logger.debug("Pipeline stopping; aborting streaming transcription")
334                        return
335                    logger.debug("Streaming segment [%.1f-%.1f]: '%s'", seg.start, seg.end, seg.text)
336                    segment_texts.append(seg.text)
337                text = " ".join(segment_texts).strip()
338            else:
339                text = stt.transcribe(pcm)
340
341            if self._stop_event.is_set():
342                logger.debug("Pipeline stopping; dropping transcription result")
343                return
344
345            if text.strip():
346                logger.info("Command: '%s'", text)
347                self._dispatch_command(text)
348            else:
349                logger.debug("Empty transcription — returning to idle")
350
351        except Exception:
352            logger.exception("Transcription failed")
353        finally:
354            self._clear_worker_thread()
355            with self._state_lock:
356                self._state = _STATE_IDLE
357
358    def _dispatch_command(self, text: str) -> None:
359        """Call all registered command handlers."""
360        if self._stop_event.is_set():
361            return
362
363        with self._state_lock:
364            self._state = _STATE_RESPONDING
365
366        try:
367            for handler in self._command_handlers:
368                if self._stop_event.is_set():
369                    break
370                try:
371                    response = handler(text)
372                    if response and self._enable_tts and not self._stop_event.is_set():
373                        self.speak(response)
374                except Exception:
375                    logger.exception("Command handler '%s' failed", handler.__name__)
376        finally:
377            with self._state_lock:
378                self._state = _STATE_IDLE
379
380    def _start_worker(self, audio_bytes: bytes) -> None:
381        """Start the STT/TTS worker thread and retain it for shutdown.
382
383        If a previous worker is still alive, skip spawning to prevent
384        concurrent transcription.
385        """
386        with self._worker_lock:
387            if self._worker_thread is not None and self._worker_thread.is_alive():
388                logger.warning("Previous worker thread still alive — skipping new spawn")
389                with self._state_lock:
390                    self._state = _STATE_IDLE
391                return
392            worker = threading.Thread(
393                target=self._transcribe_and_respond,
394                args=(audio_bytes,),
395                daemon=True,
396            )
397            self._worker_thread = worker
398        worker.start()
399
400    def _get_worker_thread(self) -> threading.Thread | None:
401        """Return the active worker thread, if any."""
402        with self._worker_lock:
403            return self._worker_thread
404
405    def _clear_worker_thread(self) -> None:
406        """Clear the worker reference once the worker exits."""
407        current = threading.current_thread()
408        with self._worker_lock:
409            if self._worker_thread is current:
410                self._worker_thread = None
411
412    def _get_stt(self) -> Any | None:
413        """Lazy-load STT engine."""
414        if self._stt is None:
415            try:
416                from violawake_sdk.stt import STTEngine
417
418                self._stt = STTEngine(model=self._stt_model)
419                self._stt.prewarm()
420            except ImportError:
421                return None
422        return self._stt
423
424    def _get_tts(self) -> Any | None:
425        """Lazy-load TTS engine."""
426        if self._tts is None and self._enable_tts:
427            try:
428                from violawake_sdk.tts import TTSEngine
429
430                self._tts = TTSEngine(voice=self._tts_voice)
431            except ImportError:
432                return None
433        return self._tts

Orchestrated Wake→VAD→STT→TTS voice pipeline.

State Machine

Four states with the following transitions::

IDLE ──(wake detected)──→ LISTENING
LISTENING ──(silence/timeout)──→ TRANSCRIBING
TRANSCRIBING ──(text ready)──→ RESPONDING
RESPONDING ──(handlers done)──→ IDLE
Thread ownership of transitions:
  • Main thread (_run_loop): IDLE→LISTENING, LISTENING→TRANSCRIBING. Owns the mic capture loop and wake/VAD detection.
  • Worker thread (_transcribe_and_respond): TRANSCRIBING→RESPONDING→IDLE. Spawned by _start_worker() as a daemon thread. Runs STT, dispatches command handlers, and triggers TTS playback. Only one worker thread is active at a time (guarded by _worker_lock).

Threading Model

  • The main loop runs in the calling thread via run() (blocking).
  • When a command recording ends, _start_worker() spawns a single daemon thread for STT transcription + handler dispatch + TTS playback.
  • _state_lock guards all reads/writes of _state.
  • _worker_lock guards _worker_thread reference management.

Known Limitation

There is a brief window between the worker thread completing (setting state back to IDLE) and the next wake detection where the pipeline is technically idle but the worker thread reference may not yet be cleared. During this window, a new wake detection will proceed normally since the state is IDLE.

Usage::

pipeline = VoicePipeline()

@pipeline.on_command
def handle(text: str) -> str | None:
    return f"You said: {text}"

pipeline.run()
VoicePipeline( wake_word: str = 'viola', stt_model: str = 'base', tts_voice: str = 'af_heart', threshold: float = 0.8, vad_backend: str = 'auto', vad_threshold: float = 0.4, enable_tts: bool = True, device_index: int | None = None, on_wake: Callable[[], None] | None = None, streaming_stt: bool = False)
102    def __init__(
103        self,
104        wake_word: str = "viola",
105        stt_model: str = "base",
106        tts_voice: str = "af_heart",
107        threshold: float = DEFAULT_THRESHOLD,
108        vad_backend: str = "auto",
109        vad_threshold: float = 0.4,
110        enable_tts: bool = True,
111        device_index: int | None = None,
112        on_wake: WakeCallback | None = None,
113        streaming_stt: bool = False,
114    ) -> None:
115        """Initialize the voice pipeline.
116
117        Args:
118            wake_word: Wake word model name. Default "viola".
119            stt_model: Whisper model size. Default "base".
120            tts_voice: TTS voice name. Default "af_heart".
121            threshold: Wake word detection threshold. Default 0.80.
122            vad_backend: VAD backend. Default "auto".
123            vad_threshold: VAD speech detection threshold. Default 0.4.
124            enable_tts: If True, speak responses via TTS. If False, skip TTS.
125            device_index: Microphone device index. None = system default.
126            on_wake: Optional callback fired after wake-word detection and
127                     before command transcription begins.
128            streaming_stt: If True, use ``STTEngine.transcribe_streaming()``
129                           to yield segments incrementally instead of waiting
130                           for the full transcription.  The command text passed
131                           to handlers is the concatenation of all yielded
132                           segment texts (identical to non-streaming behaviour),
133                           but each segment is logged as it arrives.
134                           Default False.
135        """
136        self._wake_detector = WakeDetector(model=wake_word, threshold=threshold)
137        self._vad = VADEngine(backend=vad_backend)
138        self._vad_threshold = vad_threshold
139        self._enable_tts = enable_tts
140        self._device_index = device_index
141        self._stt_model = stt_model
142        self._tts_voice = tts_voice
143        self._on_wake = on_wake
144        self._streaming_stt = streaming_stt
145
146        # Typed as Any because STTEngine/TTSEngine are lazily imported to avoid
147        # hard dependencies on optional extras (violawake[stt], violawake[tts]).
148        self._stt: Any | None = None  # lazy — violawake_sdk.stt.STTEngine
149        self._tts: Any | None = None  # lazy — violawake_sdk.tts.TTSEngine
150
151        self._state = _STATE_IDLE
152        self._state_lock = threading.Lock()
153        self._stop_event = threading.Event()
154        self._worker_lock = threading.Lock()
155        self._worker_thread: threading.Thread | None = None
156
157        self._command_handlers: list[CommandHandler] = []
158
159        logger.info(
160            "VoicePipeline initialized: wake=%s, stt=%s, tts=%s, streaming_stt=%s",
161            wake_word,
162            stt_model,
163            tts_voice,
164            streaming_stt,
165        )

Initialize the voice pipeline.

Arguments:
  • wake_word: Wake word model name. Default "viola".
  • stt_model: Whisper model size. Default "base".
  • tts_voice: TTS voice name. Default "af_heart".
  • threshold: Wake word detection threshold. Default 0.80.
  • vad_backend: VAD backend. Default "auto".
  • vad_threshold: VAD speech detection threshold. Default 0.4.
  • enable_tts: If True, speak responses via TTS. If False, skip TTS.
  • device_index: Microphone device index. None = system default.
  • on_wake: Optional callback fired after wake-word detection and before command transcription begins.
  • streaming_stt: If True, use STTEngine.transcribe_streaming() to yield segments incrementally instead of waiting for the full transcription. The command text passed to handlers is the concatenation of all yielded segment texts (identical to non-streaming behaviour), but each segment is logged as it arrives. Default False.
def on_command( self, handler: Callable[[str], str | None]) -> Callable[[str], str | None]:
167    def on_command(self, handler: CommandHandler) -> CommandHandler:
168        """Decorator to register a command handler.
169
170        The handler receives the transcribed text and may return a string
171        response (which is spoken via TTS) or None (no TTS response).
172
173        Example::
174
175            @pipeline.on_command
176            def handle(text: str) -> str | None:
177                if "weather" in text:
178                    return "It's sunny!"
179                return None
180        """
181        self._command_handlers.append(handler)
182        return handler

Decorator to register a command handler.

The handler receives the transcribed text and may return a string response (which is spoken via TTS) or None (no TTS response).

Example::

@pipeline.on_command
def handle(text: str) -> str | None:
    if "weather" in text:
        return "It's sunny!"
    return None
def run(self) -> None:
184    def run(self) -> None:
185        """Run the pipeline. Blocks until ``stop()`` is called or Ctrl+C.
186
187        Raises:
188            PipelineError: If the pipeline encounters an unrecoverable error.
189        """
190        logger.info("VoicePipeline started. Say the wake word to begin.")
191        self._stop_event.clear()
192
193        try:
194            self._run_loop()
195        except KeyboardInterrupt:
196            logger.info("Pipeline interrupted by user.")
197        except Exception as e:
198            raise PipelineError(f"Pipeline error: {e}") from e
199        finally:
200            self.stop()
201            with self._state_lock:
202                self._state = _STATE_IDLE
203            logger.info("VoicePipeline stopped.")

Run the pipeline. Blocks until stop() is called or Ctrl+C.

Raises:
  • PipelineError: If the pipeline encounters an unrecoverable error.
def stop(self, timeout: float = 5.0) -> None:
205    def stop(self, timeout: float = 5.0) -> None:
206        """Signal the pipeline to stop and wait briefly for worker cleanup."""
207        self._stop_event.set()
208        worker = self._get_worker_thread()
209        if worker is None or worker is threading.current_thread():
210            return
211
212        worker.join(timeout=timeout)
213        if worker.is_alive():
214            logger.warning("VoicePipeline worker thread did not exit within %.1f s", timeout)
215        else:
216            # Clear reference only after the worker has fully exited
217            with self._worker_lock:
218                if self._worker_thread is worker:
219                    self._worker_thread = None

Signal the pipeline to stop and wait briefly for worker cleanup.

def close(self) -> None:
221    def close(self) -> None:
222        """Stop the pipeline and release all engine resources."""
223        self.stop()
224        self._wake_detector.close()
225        self._stt = None
226        self._tts = None

Stop the pipeline and release all engine resources.

def speak(self, text: str) -> None:
241    def speak(self, text: str) -> None:
242        """Synthesize and play text via TTS (called from within command handlers)."""
243        if not self._enable_tts or self._stop_event.is_set():
244            return
245
246        tts = self._get_tts()
247        if tts is None:
248            logger.warning("TTS not available — install 'violawake[tts]'")
249            return
250
251        try:
252            audio = tts.synthesize(text)
253            tts.play(audio)
254        except Exception as e:
255            logger.exception("TTS playback failed for text '%.50s': %s", text, e)

Synthesize and play text via TTS (called from within command handlers).

class ViolaWakeError(builtins.Exception):
class ViolaWakeError(Exception):
    """Base exception for all ViolaWake SDK errors.

    The SDK's specific exception types derive from this class, so
    ``except ViolaWakeError`` handles any of them in one place.
    """

Base exception for all ViolaWake SDK errors.

class ModelNotFoundError(violawake_sdk.ViolaWakeError):
class ModelNotFoundError(ViolaWakeError):
    """Raised when a model file is not found in the cache or at the given path.

    Resolution: run ``violawake-download --model <model_name>`` to download
    the missing model.
    """

Raised when a model file is not found in the cache or at the given path.

Resolution: run violawake-download --model <model_name> to download.

class AudioCaptureError(violawake_sdk.ViolaWakeError):
class AudioCaptureError(ViolaWakeError):
    """Raised when microphone capture fails to initialize or read frames.

    Common causes:
        - no audio input device is present,
        - the device is already in use,
        - PortAudio is not installed.
    """

Raised when microphone capture fails to initialize or read frames.

Common causes: no audio input device, device already in use, PortAudio not installed.

class ModelLoadError(violawake_sdk.ViolaWakeError):
class ModelLoadError(ViolaWakeError):
    """Raised when a model file exists but cannot be loaded by ONNX Runtime.

    Unlike ``ModelNotFoundError``, the file is present but unusable.
    Possible causes: corrupted file, ONNX opset version mismatch.
    """

Raised when a model file exists but cannot be loaded by ONNX Runtime.

Possible causes: corrupted file, ONNX opset version mismatch.

class PipelineError(violawake_sdk.ViolaWakeError):
class PipelineError(ViolaWakeError):
    """Raised when the VoicePipeline encounters an unrecoverable error.

    ``VoicePipeline.run()`` wraps any unexpected exception escaping its main
    loop in this type and chains the original exception as ``__cause__``.
    """

Raised when the VoicePipeline encounters an unrecoverable error.

class VADBackendError(violawake_sdk.ViolaWakeError):
class VADBackendError(ViolaWakeError):
    """Raised when the requested VAD backend is unavailable.

    The VAD falls back to an RMS heuristic if webrtcvad/silero are not
    installed.

    NOTE(review): it is not visible here when this error is raised versus
    when the RMS fallback is applied silently — confirm against the VAD
    engine implementation.
    """

Raised when the requested VAD backend is unavailable.

The VAD falls back to an RMS heuristic if webrtcvad/silero are not installed.

def list_models() -> list[dict[str, str]]:
 74def list_models() -> list[dict[str, str]]:
 75    """Return available wake word models with their descriptions.
 76
 77    Each entry is a dict with keys: ``name``, ``description``, ``version``.
 78
 79    Example::
 80
 81        >>> from violawake_sdk import list_models
 82        >>> for m in list_models():
 83        ...     print(f"{m['name']:20s} {m['description']}")
 84    """
 85    from violawake_sdk.models import MODEL_REGISTRY
 86
 87    seen: set[str] = set()
 88    result: list[dict[str, str]] = []
 89    for name, spec in MODEL_REGISTRY.items():
 90        # Deduplicate aliases (e.g. "viola" -> "temporal_cnn")
 91        if spec.name in seen:
 92            continue
 93        # Hide deprecated, package-managed, and non-wake-word models
 94        if "DEPRECATED" in spec.description:
 95            continue
 96        if spec.name in ("oww_backbone", "kokoro_v1_0", "kokoro_voices_v1_0"):
 97            continue
 98        seen.add(spec.name)
 99        result.append(
100            {
101                "name": name,
102                "description": spec.description,
103                "version": spec.version,
104            }
105        )
106    return result

Return available wake word models with their descriptions.

Each entry is a dict with keys: name, description, version.

Example::

>>> from violawake_sdk import list_models
>>> for m in list_models():
...     print(f"{m['name']:20s} {m['description']}")
def list_voices() -> list[str]:
def list_voices() -> list[str]:
    """List the TTS voice names accepted by ``TTSEngine``.

    The ``[tts]`` extra is needed for actual synthesis; listing voices is
    intended to work without it. A fresh list is returned on every call.

    Example::

        >>> from violawake_sdk import list_voices
        >>> list_voices()
        ['af_heart', 'af_bella', 'af_sarah', ...]
    """
    from violawake_sdk.tts import AVAILABLE_VOICES

    voices = [*AVAILABLE_VOICES]
    return voices

Return available TTS voice names for use with TTSEngine.

Requires the [tts] extra to be installed for actual synthesis, but this function always works for discovery.

Example::

>>> from violawake_sdk import list_voices
>>> list_voices()
['af_heart', 'af_bella', 'af_sarah', ...]
__version__ = '0.2.2'