Metadata-Version: 2.4
Name: openspeechapi
Version: 0.2.10
Summary: Unified speech interface for STT/TTS providers
Requires-Python: >=3.11
Requires-Dist: httpx>=0.27
Requires-Dist: loguru>=0.7
Requires-Dist: msgpack>=1.0
Requires-Dist: pydantic>=2.0
Requires-Dist: pyyaml>=6.0
Provides-Extra: alibaba
Provides-Extra: alibaba-stt
Provides-Extra: alibaba-tts
Provides-Extra: all
Requires-Dist: elevenlabs; extra == 'all'
Requires-Dist: faster-whisper; extra == 'all'
Requires-Dist: openai; extra == 'all'
Requires-Dist: openai-whisper; extra == 'all'
Requires-Dist: piper-tts; extra == 'all'
Requires-Dist: pyttsx3; (sys_platform == 'win32') and extra == 'all'
Requires-Dist: torchaudio; extra == 'all'
Requires-Dist: tts; extra == 'all'
Requires-Dist: websockets; extra == 'all'
Provides-Extra: assemblyai-stt
Provides-Extra: audio
Requires-Dist: numpy; extra == 'audio'
Requires-Dist: sounddevice; extra == 'audio'
Provides-Extra: azure
Provides-Extra: azure-stt
Provides-Extra: azure-tts
Provides-Extra: baidu
Provides-Extra: baidu-stt
Provides-Extra: baidu-tts
Provides-Extra: canary-qwen-stt
Provides-Extra: cloud
Requires-Dist: websockets; extra == 'cloud'
Provides-Extra: coqui-tts
Requires-Dist: tts; extra == 'coqui-tts'
Provides-Extra: cosyvoice-tts
Requires-Dist: torchaudio; extra == 'cosyvoice-tts'
Provides-Extra: deepgram
Requires-Dist: websockets; extra == 'deepgram'
Provides-Extra: deepgram-stt
Requires-Dist: websockets; extra == 'deepgram-stt'
Provides-Extra: deepgram-tts
Provides-Extra: dev
Requires-Dist: numpy; extra == 'dev'
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
Requires-Dist: pytest-cov; extra == 'dev'
Requires-Dist: pytest-dotenv; extra == 'dev'
Requires-Dist: pytest>=8.0; extra == 'dev'
Requires-Dist: ruff==0.15.*; extra == 'dev'
Provides-Extra: dolphin-stt
Requires-Dist: dataoceanai-dolphin; extra == 'dolphin-stt'
Requires-Dist: torchcodec; extra == 'dolphin-stt'
Provides-Extra: elevenlabs
Requires-Dist: elevenlabs; extra == 'elevenlabs'
Requires-Dist: websockets; extra == 'elevenlabs'
Provides-Extra: elevenlabs-stt
Requires-Dist: websockets; extra == 'elevenlabs-stt'
Provides-Extra: elevenlabs-tts
Requires-Dist: elevenlabs; extra == 'elevenlabs-tts'
Provides-Extra: faster-whisper-stt
Requires-Dist: faster-whisper; extra == 'faster-whisper-stt'
Provides-Extra: fireredasr-stt
Requires-Dist: fireredasr; extra == 'fireredasr-stt'
Provides-Extra: fish-speech-tts
Provides-Extra: funasr-stt
Requires-Dist: funasr>=1.1.0; extra == 'funasr-stt'
Requires-Dist: torch; extra == 'funasr-stt'
Requires-Dist: torchaudio; extra == 'funasr-stt'
Provides-Extra: gemma4-stt
Requires-Dist: mlx-vlm<0.6.2,>=0.6.1; (sys_platform == 'darwin') and extra == 'gemma4-stt'
Provides-Extra: google
Provides-Extra: google-stt
Provides-Extra: google-tts
Provides-Extra: iflytek
Requires-Dist: websockets; extra == 'iflytek'
Provides-Extra: iflytek-stt
Requires-Dist: websockets; extra == 'iflytek-stt'
Provides-Extra: iflytek-tts
Requires-Dist: websockets; extra == 'iflytek-tts'
Provides-Extra: kimi-audio-stt
Requires-Dist: torch; extra == 'kimi-audio-stt'
Provides-Extra: macos-native
Provides-Extra: minimax-tts
Provides-Extra: mlx-whisper-stt
Requires-Dist: mlx-whisper; (sys_platform == 'darwin') and extra == 'mlx-whisper-stt'
Provides-Extra: mms-stt
Requires-Dist: soundfile; extra == 'mms-stt'
Requires-Dist: torch; extra == 'mms-stt'
Requires-Dist: transformers; extra == 'mms-stt'
Provides-Extra: moonshine-stt
Requires-Dist: soundfile; extra == 'moonshine-stt'
Requires-Dist: torch; extra == 'moonshine-stt'
Requires-Dist: transformers; extra == 'moonshine-stt'
Provides-Extra: openai
Requires-Dist: openai; extra == 'openai'
Provides-Extra: openai-stt
Requires-Dist: openai; extra == 'openai-stt'
Provides-Extra: openai-tts
Requires-Dist: openai; extra == 'openai-tts'
Provides-Extra: paraformer-stt
Requires-Dist: funasr>=1.1.0; extra == 'paraformer-stt'
Requires-Dist: torch; extra == 'paraformer-stt'
Requires-Dist: torchaudio; extra == 'paraformer-stt'
Provides-Extra: parakeet-stt
Requires-Dist: parakeet-mlx; (sys_platform == 'darwin') and extra == 'parakeet-stt'
Provides-Extra: phi4-multimodal-stt
Requires-Dist: accelerate; extra == 'phi4-multimodal-stt'
Requires-Dist: backoff; extra == 'phi4-multimodal-stt'
Requires-Dist: peft; extra == 'phi4-multimodal-stt'
Requires-Dist: pillow; extra == 'phi4-multimodal-stt'
Requires-Dist: scipy; extra == 'phi4-multimodal-stt'
Requires-Dist: soundfile; extra == 'phi4-multimodal-stt'
Requires-Dist: torch; extra == 'phi4-multimodal-stt'
Requires-Dist: torchvision; extra == 'phi4-multimodal-stt'
Requires-Dist: transformers; extra == 'phi4-multimodal-stt'
Provides-Extra: piper-tts
Requires-Dist: piper-tts; extra == 'piper-tts'
Provides-Extra: qwen3-asr-stt
Requires-Dist: modelscope; extra == 'qwen3-asr-stt'
Requires-Dist: qwen-asr; extra == 'qwen3-asr-stt'
Provides-Extra: qwen3-omni-stt
Requires-Dist: accelerate; extra == 'qwen3-omni-stt'
Requires-Dist: qwen-omni-utils; extra == 'qwen3-omni-stt'
Requires-Dist: torch; extra == 'qwen3-omni-stt'
Requires-Dist: transformers; extra == 'qwen3-omni-stt'
Provides-Extra: sensevoice-stt
Requires-Dist: funasr>=1.1.0; extra == 'sensevoice-stt'
Requires-Dist: torch; extra == 'sensevoice-stt'
Requires-Dist: torchaudio; extra == 'sensevoice-stt'
Provides-Extra: server
Requires-Dist: fastapi; extra == 'server'
Requires-Dist: python-multipart; extra == 'server'
Requires-Dist: uvicorn; extra == 'server'
Requires-Dist: websockets; extra == 'server'
Provides-Extra: sherpa-onnx-stt
Requires-Dist: websockets; extra == 'sherpa-onnx-stt'
Provides-Extra: tencent
Provides-Extra: tencent-stt
Provides-Extra: tencent-tts
Provides-Extra: tracing
Requires-Dist: opentelemetry-api; extra == 'tracing'
Requires-Dist: opentelemetry-sdk; extra == 'tracing'
Provides-Extra: volcengine
Provides-Extra: volcengine-stt
Provides-Extra: volcengine-tts
Provides-Extra: vosk-stt
Requires-Dist: huggingface-hub; extra == 'vosk-stt'
Requires-Dist: vosk; extra == 'vosk-stt'
Provides-Extra: voxtral-stt
Requires-Dist: accelerate; extra == 'voxtral-stt'
Requires-Dist: mistral-common[audio]>=1.8.1; extra == 'voxtral-stt'
Requires-Dist: torch; extra == 'voxtral-stt'
Requires-Dist: transformers>=4.54.0; extra == 'voxtral-stt'
Provides-Extra: wenet-stt
Provides-Extra: whisper-stt
Requires-Dist: openai-whisper; extra == 'whisper-stt'
Provides-Extra: whisperlivekit-stt
Requires-Dist: websockets; extra == 'whisperlivekit-stt'
Provides-Extra: windows-native
Requires-Dist: pyttsx3; (sys_platform == 'win32') and extra == 'windows-native'
Description-Content-Type: text/markdown

# OpenSpeechAPI

> Unified speech interface for STT/TTS providers — one API, multiple backends.

OpenSpeechAPI 提供统一的语音接口，通过字符串指定 provider 即可切换不同的 STT/TTS 后端（云端 API、本地模型），无需关心底层实现。

## Quick Start

### 安装

**方式一 · 通过 PyPI 安装(直接使用)**
```bash
pip install "openspeechapi[server]"           # 起 HTTP 服务 / WebUI 必须带 [server](fastapi/uvicorn)
pip install "openspeechapi[server,openai]"    # 服务 + 指定 provider
pip install "openspeechapi[server,all]"       # 服务 + 全部 provider
pip install openspeechapi                      # 仅核心库(库模式;不含服务,也起不了 server)
```

**方式二 · 源码安装(开发,可编辑)**
```bash
git clone https://github.com/wingsfly/OpenSpeechAPI.git
cd OpenSpeechAPI
uv venv && uv pip install -e ".[server,dev]"  # 或 pip install -e ".[server,dev]";按需换 .[all] 等
```

> ⚠️ 纯 `pip install openspeechapi`(核心库)**不含 fastapi/uvicorn**,无法 `serve`;起服务请带 `[server]`。
> 两种方式启动服务的差异见下方 [启动服务](#启动服务)。

### 30 秒上手 — TTS

```python
import asyncio
from openspeechapi import create_provider

async def main():
    tts = create_provider("openai-tts", api_key="sk-...")
    await tts.start()

    audio = await tts.synthesize("Hello, OpenSpeechAPI!")

    import wave
    with wave.open("output.wav", "wb") as wf:
        wf.setnchannels(audio.channels)
        wf.setsampwidth(2)
        wf.setframerate(audio.sample_rate)
        wf.writeframes(audio.data)

    await tts.stop()

asyncio.run(main())
```

### 30 秒上手 — STT

```python
import asyncio
from openspeechapi import create_provider, AudioData, AudioFormat
from pathlib import Path

async def main():
    stt = create_provider("faster-whisper", model_size="tiny")
    await stt.start()

    audio = AudioData(
        data=Path("output.wav").read_bytes(),
        sample_rate=16000, channels=1, format=AudioFormat.WAV,
    )
    result = await stt.transcribe(audio)
    print(result.text)        # "Hello, OpenSpeechAPI!"
    print(result.language)    # "en"
    print(result.confidence)  # 0.98

    await stt.stop()

asyncio.run(main())
```

### macOS 零依赖快速上手

在 macOS 上无需任何 API Key 或模型下载，开箱即用：

```bash
# 1. 克隆项目
git clone https://github.com/wingsfly/OpenSpeechAPI.git
cd OpenSpeechAPI

# 2. 安装（仅核心包 + 服务依赖）
pip install -e ".[server]"

# 3. 启动服务和 WebUI
python -m openspeechapi.cli --config providers.yaml serve

# 4. 浏览器打开 http://127.0.0.1:8600/ui/
#    - TTS：选择 macos_tts → 选择发音人（如 Tingting）→ 输入文本 → Run TTS
#    - STT：前往 Engine Catalog → macOS STT → Install（自动下载预编译包）
```

#### macOS STT 安装（通过 WebUI）

`macos-stt` 默认未写入配置（避免"假可用"），需通过 Engine Catalog 一键安装：

1. WebUI → **Engine Catalog** → macOS STT → **Install**
   - 优先下载 CI 预构建的 universal `.app`（无需 Xcode，通过 `gh` 自动完成）
   - 若 `gh` 不可用或资产下载失败，自动回退到本地编译（需 Xcode Command Line Tools）
2. 安装完成后，**手动授权语音识别**（每台机器一次）：
   ```bash
   open scripts/engines/macos-stt/MacOSSTTHelper.app
   # 弹出对话框后点击"允许"
   ```
3. **手动下载听写语言模型**（每台机器一次）：
   系统设置 > 键盘 > 听写 > 下载所需语言模型（中文/英文等）

安装完成后 `macos_stt` 自动写入配置并热重载，Dashboard 显示 healthy。

> 授权和听写模型下载是 macOS TCC 系统限制，无法自动化，必须每台机器手动执行一次。
> 详细机制见 [docs/architecture/native-engine-install.md](docs/architecture/native-engine-install.md)。

## CLI Demo

无需写代码，直接在命令行体验：

```bash
# TTS：文本 → 语音
python -m openspeechapi.demo tts -t "Hello world" -o output.wav

# STT：语音 → 文本
python -m openspeechapi.demo stt -i output.wav -p faster-whisper

# Roundtrip：文本 → TTS → STT → 文本
python -m openspeechapi.demo roundtrip -t "Hello world"

# Compare：多引擎对比
python -m openspeechapi.demo compare -i output.wav -p openai-stt,faster-whisper

# REPL：交互模式
python -m openspeechapi.demo repl

# WebUI（Phase A）
python -m openspeechapi.cli serve --host 0.0.0.0 --port 8600
# 浏览器打开 http://127.0.0.1:8600/ui/

# 实时 STT：优先使用 WebSocket PCM 流式（/v1/stt/stream），
# 若浏览器或链路异常会自动回退到分片 HTTP 转写模式。
```

### 本地引擎管理（实验特性）

```bash
# 1) 下载/更新运行镜像
python -m openspeechapi.cli engine install --name fish-speech --runtime docker --follow

# 2) 启动本地引擎（含健康检查）
python -m openspeechapi.cli engine start --name fish-speech --runtime docker --follow

# 3) 查看运行状态/日志
python -m openspeechapi.cli engine status --name fish-speech --runtime docker
python -m openspeechapi.cli engine logs --name fish-speech --runtime docker --lines 200

# 4) 停止
python -m openspeechapi.cli engine stop --name fish-speech --runtime docker --follow

# 5) 跨进程查询任务
python -m openspeechapi.cli engine task list --name fish-speech --limit 20
python -m openspeechapi.cli engine task status --task-id <TASK_ID>
python -m openspeechapi.cli engine task follow --task-id <TASK_ID>
python -m openspeechapi.cli engine task cancel --task-id <TASK_ID>
```

进度反馈会显示 task id、阶段、百分比和当前消息，便于追踪长耗时任务。

#### STT 本地模型引擎（复用已有模型路径）

```bash
# faster-whisper 模型资产（native，无常驻服务）
python -m openspeechapi.cli engine install --name faster-whisper --runtime native --follow
python -m openspeechapi.cli engine start   --name faster-whisper --runtime native --follow
python -m openspeechapi.cli engine status  --name faster-whisper --runtime native

# whisper 模型资产（native，无常驻服务）
python -m openspeechapi.cli engine install --name whisper --runtime native --follow
python -m openspeechapi.cli engine start   --name whisper --runtime native --follow
python -m openspeechapi.cli engine status  --name whisper --runtime native
```

说明：安装会优先读取 `~/.aim/config.json` 与 `~/.aim/registry.json` 中的 provision 信息来定位模型；若 AIM 未命中，再回退到默认本地路径候选。若仍未找到，可按配置走“模拟下载”流程以验证安装进度。

### Demo 音频播放

```bash
# 合成后直接播放
python -m openspeechapi.demo tts -t "Hello world" --play

# 指定播放参数
python -m openspeechapi.demo tts -t "Hello world" --play \
  --play-backend sounddevice --play-device 2 --play-volume 0.8
```

## Providers

### 已实现

| Provider | 类型 | 说明 | 执行模式 | 安装 |
|----------|------|------|----------|------|
| `openai-stt` | STT | OpenAI Whisper API（云端） | remote | `pip install -e ".[openai]"` |
| `faster-whisper` | STT | 本地 Whisper 推理（GPU/CPU） | subprocess | `pip install -e ".[faster-whisper-stt]"` |
| `whisper` | STT | OpenAI Whisper 本地推理（CPU/GPU） | subprocess | `pip install -e ".[whisper-stt]"` |
| `whisperlivekit-stt` | STT | WhisperLiveKit 本地服务（Deepgram 兼容 WS，支持 MLX 后端） | local | `pip install -e ".[whisperlivekit-stt]"` |
| `elevenlabs-stt` | STT | ElevenLabs Scribe API（云端，支持实时流式 WS + 批量） | remote | `pip install -e ".[elevenlabs-stt]"` |
| `deepgram` | STT | Deepgram API（云端，支持实时流式） | remote | `pip install -e ".[deepgram]"` |
| `gemma4` | STT | Google Gemma 4 多模态 ASR（macOS/MLX 本地，E2B/E4B，>30s 自动分段；任务：转写 / 翻译(任意目标语言) / 理解 / 问答 / 语种识别） | subprocess | `pip install -e ".[gemma4-stt]"` |
| `sensevoice` | STT | SenseVoice-Small 本地多语种 ASR（FunASR，zh/粤/en/ja/ko，比 Whisper 快 ~15-50×） | subprocess | `pip install -e ".[sensevoice-stt]"` |
| `qwen3-asr` | STT | Qwen3-ASR 本地多语种 ASR（2026 开源 SOTA，中/方言/英，0.6B/1.7B） | subprocess | `pip install -e ".[qwen3-asr-stt]"` |
| `mlx-whisper` | STT | Whisper on Apple MLX（本地，large-v3 / turbo，中/英多语种，仅 Apple Silicon） | subprocess | `pip install -e ".[mlx-whisper-stt]"` |
| `paraformer` | STT | Paraformer 本地 ASR（FunASR，普通话 SOTA 级，VAD+标点，zh/en） | subprocess | `pip install -e ".[paraformer-stt]"` |
| `funasr` | STT | FunASR 总入口（任选模型库 + VAD/标点/说话人分离） | subprocess | `pip install -e ".[funasr-stt]"` |
| `fireredasr` | STT | 小红书 FireRedASR（普通话 SOTA+方言+英文，歌词识别，AED/LLM） | subprocess | `pip install -e ".[fireredasr-stt]"` |
| `dolphin` | STT | DataoceanAI Dolphin（40 东方语种 + 22 中文方言，small/base） | subprocess | `pip install -e ".[dolphin-stt]"` |
| `wenet` | STT | WeNet U2++ Conformer（生产级，zh/en 预置；流式后续） | subprocess | WebUI Engines 安装,或 `pip install 'wenet @ git+https://github.com/wenet-e2e/wenet.git'` |
| `canary-qwen` | STT | NVIDIA Canary-Qwen-2.5B（Open ASR 英文第1，SALM；仅英文，需 NeMo+GPU） | subprocess | WebUI Engines 安装,或 `pip install 'nemo_toolkit[asr] @ git+https://github.com/NVIDIA/NeMo.git'` |
| `parakeet` | STT | NVIDIA Parakeet-TDT on MLX（最快，v2 英文/v3 欧语；中文弱，仅 Apple Silicon） | subprocess | `pip install -e ".[parakeet-stt]"` |
| `qwen3-omni` | STT | Qwen3-Omni-30B 全模态 LLM（ASR+理解，zh/en+；需大 GPU ~60GB） | subprocess | `pip install -e ".[qwen3-omni-stt]"` |
| `voxtral` | STT | Mistral Voxtral（Mini-3B/Small-24B，转写+理解，多语种；建议 GPU） | subprocess | `pip install -e ".[voxtral-stt]"` |
| `phi4-multimodal` | STT | 微软 Phi-4-multimodal（多模态 LLM，ASR+理解，zh/en+；建议 GPU） | subprocess | `pip install -e ".[phi4-multimodal-stt]"` |
| `kimi-audio` | STT | 月之暗面 Kimi-Audio-7B（音频基础模型，ASR+理解，zh/en；需 Linux+CUDA/flash-attn） | subprocess | WebUI Engines 安装,或 `pip install 'kimi-audio @ git+https://github.com/MoonshotAI/Kimi-Audio.git'` |
| `moonshine` | STT | Useful Sensors Moonshine（边缘/实时英文 ASR，tiny/base，轻量） | subprocess | `pip install -e ".[moonshine-stt]"` |
| `vosk` | STT | Vosk（Kaldi 离线，20+ 语言含 zh/en，轻量低资源） | subprocess | `pip install -e ".[vosk-stt]"` |
| `mms` | STT | Meta MMS（Wav2Vec2-CTC，1000+ 语言含 zh/en，按 ISO 639-3 选语言） | subprocess | `pip install -e ".[mms-stt]"` |
| `openai-tts` | TTS | OpenAI Speech API（云端，支持流式） | remote | `pip install -e ".[openai]"` |
| `elevenlabs` | TTS | ElevenLabs 高质量语音（云端，支持 HTTP/WS 流式） | remote | `pip install -e ".[elevenlabs-tts]"` |
| `minimax` | TTS | Minimax 语音合成（云端） | remote | `pip install -e ".[minimax-tts]"` |
| `cosyvoice` | TTS | CosyVoice 本地中文语音合成（GPU） | subprocess | 需手动安装 CosyVoice |
| `fish-speech` | TTS | Fish-Speech 本地多语 TTS + voice clone | local | `pip install -e ".[fish-speech-tts]"` |
| `piper` | TTS | Piper 轻量级本地 TTS（CPU 即可） | in_process | `pip install -e ".[piper-tts]"` |
| `macos-say` | TTS | macOS 内置语音合成（`say` 命令，零依赖） | in_process | 无需安装，macOS 自带 |
| `macos-stt` | STT | macOS 内置语音识别（SFSpeechRecognizer） | in_process | WebUI Engine Catalog → Install（预编译优先，编译兜底） |

### Stub（待实现）

`coqui`

### 查看所有 provider

```python
from openspeechapi import list_providers
print(list_providers())
# 输出示例（节选；实际列表取决于当前版本已注册的 provider）：
# ['coqui', 'cosyvoice', 'deepgram', 'elevenlabs', 'faster-whisper',
#  'fish-speech', 'minimax', 'openai-stt', 'openai-tts', 'piper', 'whisper',
#  'whisperlivekit-stt', ...]
```

> **音频输入格式**:STT 上传支持 WAV/PCM/MP3/FLAC/OGG/WebM 等。引擎不能直接处理的格式会由服务端自动转为 16k 单声道 WAV(压缩格式需 `ffmpeg`);缺 ffmpeg 且格式不被支持时返回 400,Web UI 会在上传/录音前拦截提示。详见 [docs/architecture/audio-format-negotiation.md](docs/architecture/audio-format-negotiation.md)。

## Provider 参数

### `openai-stt`

```python
create_provider("openai-stt",
    api_key="sk-...",         # 必填，OpenAI API Key
    model="whisper-1",        # 模型名称
)
```

转录选项通过 `STTOptions` 传入：

```python
from openspeechapi import STTOptions
result = await stt.transcribe(audio, STTOptions(
    language="zh",            # 语言提示
    prompt="技术会议记录",     # 上下文提示
    temperature=0.0,          # 0.0-1.0
))
```

### `faster-whisper`

```python
create_provider("faster-whisper",
    model_size="base",        # tiny / base / small / medium / large-v3
    device="auto",            # auto / cuda / cpu
    compute_type="default",   # default / int8 / float16
    beam_size=5,              # beam search 宽度
    download_root=None,       # 模型缓存目录
)
```

### `gemma4`

```python
create_provider("gemma4",
    model="mlx-community/gemma-4-E4B-it-8bit",  # E2B/E4B（8bit 翻译更准；勿用 12B）
    task="transcribe",          # transcribe｜translate｜understand｜qa｜detect_language
    target_language="English",  # task=translate 的目标语言（任意语言）
    include_transcript=False,   # task=translate：同时输出源转写 + 译文
)
```

macOS / Apple Silicon 本地多模态 ASR（mlx-vlm）。5 个任务及全部字段可在 Web UI 的 Lab「Advanced Options」按请求覆盖。详见 [docs/architecture/gemma4-stt-provider.md](docs/architecture/gemma4-stt-provider.md)。

### `sensevoice`

```python
create_provider("sensevoice",
    model="FunAudioLLM/SenseVoiceSmall",
    language="auto",      # auto|zh|en|yue|ja|ko|nospeech
    device="cpu",         # cpu|mps|cuda
    use_itn=True,         # 标点/数字规整
)
```

FunASR 本地多语种 ASR（zh/粤/en/ja/ko），非自回归、极快；全部字段可在 Lab「Advanced Options」按请求覆盖。详见 [docs/architecture/sensevoice-stt-provider.md](docs/architecture/sensevoice-stt-provider.md)。

### `qwen3-asr`

```python
create_provider("qwen3-asr",
    model="Qwen/Qwen3-ASR-0.6B",  # 或 Qwen/Qwen3-ASR-1.7B
    language="auto",              # auto|Chinese|English|Cantonese|Japanese|Korean
    device="cpu",                 # cpu|mps|cuda
)
```

阿里 Qwen3-ASR（2026 开源 ASR SOTA，中/方言/英）本地推理（qwen-asr 包）。需另装 `torch`。详见 [docs/architecture/qwen3-asr-stt-provider.md](docs/architecture/qwen3-asr-stt-provider.md)。

### `mlx-whisper`

```python
create_provider("mlx-whisper",
    model="mlx-community/whisper-large-v3-turbo",  # 或 whisper-large-v3-mlx
    language="auto",                               # auto|en|zh|yue|ja|ko|...
)
```

Apple Silicon 原生 Whisper（MLX），large-v3 / turbo，中英文多语种。仅 macOS/Apple Silicon。详见 [docs/architecture/mlx-whisper-stt-provider.md](docs/architecture/mlx-whisper-stt-provider.md)。

### `paraformer`

```python
create_provider("paraformer",
    model="funasr/paraformer-zh",  # 或 funasr/paraformer-en
    vad=True, punc=True,           # VAD 切分 + 标点恢复
)
```

阿里 Paraformer（FunASR），普通话 SOTA 级非自回归 ASR，带 VAD + 标点。详见 [docs/architecture/paraformer-stt-provider.md](docs/architecture/paraformer-stt-provider.md)。

### `funasr`

```python
create_provider("funasr",
    model="funasr/paraformer-zh",  # 模型库任意条目
    spk=True,                      # CAM++ 说话人分离 → [spk0]/[spk1] 标注
)
```

FunASR 通用总入口:任选模型库模型 + VAD/标点/**说话人分离**。详见 [docs/architecture/funasr-stt-provider.md](docs/architecture/funasr-stt-provider.md)。

### `fireredasr`

```python
create_provider("fireredasr",
    model_type="aed",   # aed(≤60s) | llm(≤30s);权重自动下载
)
```

小红书 FireRedASR,普通话公开基准 SOTA + 方言 + 英文,歌词识别强。详见 [docs/architecture/fireredasr-stt-provider.md](docs/architecture/fireredasr-stt-provider.md)。

### `dolphin`

```python
create_provider("dolphin",
    model_name="small",   # small | base
    lang_sym="zh", region_sym="CN",   # 留空则自动检测
)
```

DataoceanAI Dolphin,40 种东方语言 + 22 种中文方言。详见 [docs/architecture/dolphin-stt-provider.md](docs/architecture/dolphin-stt-provider.md)。

### `wenet`

```python
create_provider("wenet",
    model="chinese",   # chinese | english
)
```

WeNet 生产级 U2++ Conformer(zh/en 预置)。从 git 安装(不在 PyPI)。详见 [docs/architecture/wenet-stt-provider.md](docs/architecture/wenet-stt-provider.md)。

### `canary-qwen`

```python
create_provider("canary-qwen",
    model="nvidia/canary-qwen-2.5b",
    device="cuda", dtype="bfloat16",   # 仅英文;强烈建议 GPU
)
```

NVIDIA Canary-Qwen-2.5B(Open ASR 英文第 1,SALM)。**仅英文**;NeMo 重型安装 + 建议 GPU。详见 [docs/architecture/canary-qwen-stt-provider.md](docs/architecture/canary-qwen-stt-provider.md)。

### `parakeet`

```python
create_provider("parakeet",
    model="mlx-community/parakeet-tdt-0.6b-v2",  # v2 英文;v3 + 欧洲语言
)
```

NVIDIA Parakeet-TDT on Apple MLX,榜上最快。英文/欧语为主,**中文弱**;仅 Apple Silicon。详见 [docs/architecture/parakeet-stt-provider.md](docs/architecture/parakeet-stt-provider.md)。

### `qwen3-omni`

```python
create_provider("qwen3-omni",
    model="Qwen/Qwen3-Omni-30B-A3B-Instruct",
    prompt="Transcribe the audio into text.",   # 改成问题即可做音频问答
)
```

阿里 Qwen3-Omni-30B-A3B 全模态 LLM(ASR + 音频理解,zh/en+)。**需大显存 GPU(~60GB),笔记本装不下**。详见 [docs/architecture/qwen3-omni-stt-provider.md](docs/architecture/qwen3-omni-stt-provider.md)。

### `voxtral`

```python
create_provider("voxtral",
    model="mistralai/Voxtral-Mini-3B-2507",  # 或 Voxtral-Small-24B-2507
    language="en",
)
```

Mistral Voxtral(转写 + 音频理解,多语种)。3B/24B,建议 GPU。详见 [docs/architecture/voxtral-stt-provider.md](docs/architecture/voxtral-stt-provider.md)。

### `phi4-multimodal`

```python
create_provider("phi4-multimodal",
    model="microsoft/Phi-4-multimodal-instruct",
    prompt="Transcribe the audio clip into text.",
)
```

微软 Phi-4-multimodal,紧凑多模态 LLM(ASR + 音频理解,zh/en+)。建议 GPU。详见 [docs/architecture/phi4-multimodal-stt-provider.md](docs/architecture/phi4-multimodal-stt-provider.md)。

### `kimi-audio`

```python
create_provider("kimi-audio",
    model="moonshotai/Kimi-Audio-7B-Instruct",
    prompt="Please transcribe the audio into text.",
)
```

月之暗面 Kimi-Audio-7B 音频基础模型(ASR + 音频理解,zh/en)。git 安装 + 建议 GPU。详见 [docs/architecture/kimi-audio-stt-provider.md](docs/architecture/kimi-audio-stt-provider.md)。

### `moonshine`

```python
create_provider("moonshine",
    model="UsefulSensors/moonshine-base",  # base | tiny
)
```

Useful Sensors Moonshine,边缘/实时英文 ASR,轻量快速。详见 [docs/architecture/moonshine-stt-provider.md](docs/architecture/moonshine-stt-provider.md)。

### `vosk`

```python
create_provider("vosk",
    model="vosk-model-small-en-us-0.15",  # 中文: vosk-model-small-cn-0.22
)
```

Vosk(Kaldi 离线),20+ 语言,轻量低资源,模型自动下载。详见 [docs/architecture/vosk-stt-provider.md](docs/architecture/vosk-stt-provider.md)。

### `mms`

```python
create_provider("mms",
    model="facebook/mms-1b-all",
    language="eng",   # ISO 639-3: eng / cmn / yue / jpn ...
)
```

Meta MMS(Wav2Vec2-CTC),1000+ 语言含中英,按 **ISO 639-3** 码切换语言适配器;CTC 输出小写无标点。详见 [docs/architecture/mms-stt-provider.md](docs/architecture/mms-stt-provider.md)。

### `openai-tts`

```python
create_provider("openai-tts",
    api_key="sk-...",         # 必填，OpenAI API Key
    model="tts-1",            # tts-1 / tts-1-hd
    voice="alloy",            # alloy / echo / fable / onyx / nova / shimmer
    response_format="pcm",    # 输出格式
)
```

合成选项通过 `TTSOptions` 传入：

```python
from openspeechapi import TTSOptions
audio = await tts.synthesize("Hello", TTSOptions(
    voice="nova",             # 覆盖默认声音
    speed=1.2,                # 语速倍率
))
```

### `deepgram`

```python
create_provider("deepgram",
    api_key="...",            # 必填，Deepgram API Key
    model="nova-2",           # 模型名称
    language="en",            # 默认语言
    punctuate=True,           # 自动标点
    smart_format=True,        # 智能格式化
)
```

支持实时流式转录（`transcribe_stream`），详见[流式 STT](#流式-stt) 章节。

### `elevenlabs`

```python
create_provider("elevenlabs",
    api_key="...",            # 必填，ElevenLabs API Key
    voice_id="21m00Tcm4TlvDq8ikWAM",  # 声音 ID
    model_id="eleven_monolingual_v1",  # 模型
    stability=0.5,            # 声音稳定性
    similarity_boost=0.75,    # 相似度增强
)
```

### `minimax`

```python
create_provider("minimax",
    api_key="...",            # 必填，Minimax API Key
    group_id="...",           # 必填，Minimax Group ID
    model="speech-01-turbo",  # 模型
    voice_id="male-qn-qingse", # 声音 ID
    speed=1.0,                # 语速
)
```

### `cosyvoice`

```python
create_provider("cosyvoice",
    model_dir="/path/to/model",  # 必填，本地模型目录
    device="auto",               # auto / cuda / cpu
    spk_id="中文女",              # 说话人 ID
)
```

### `fish-speech`

```python
create_provider("fish-speech",
    api_url="http://localhost:8080",  # Fish-Speech 本地服务地址
    reference_audio=None,            # 参考音频路径（voice clone）
)
```

### `piper`

```python
create_provider("piper",
    model_path="/path/to/model.onnx",  # 必填，模型文件路径
    config_path="/path/to/config.json", # 必填，配置文件路径
    use_cuda=False,           # 是否使用 GPU
    length_scale=1.0,         # 语速（越大越慢）
    noise_scale=0.667,        # 噪声比例
)
```

### `macos-say`（macOS 原生 TTS）

零额外依赖，使用 macOS 内置 `say` 命令，支持系统所有发音人。

```python
create_provider("macos-say",
    default_voice="Tingting",  # 默认发音人（say -v '?' 查看全部）
    default_rate=200,          # 默认语速（words per minute）
)
```

支持通过 `list_voices()` 获取所有可用发音人（按语言分组）。合成时通过 `TTSOptions(voice="Samantha", speed=1.5)` 指定发音人和语速。

### `macos-stt`（macOS 原生 STT）

使用 macOS 内置 SFSpeechRecognizer，通过 Swift CLI 助手（`.app` bundle）实现。
**推荐通过 WebUI Engine Catalog 一键安装**（自动下载预编译 universal 包，无需 Xcode）。

```python
create_provider("macos-stt",
    language="zh-CN",          # 默认识别语言
    binary_path="",            # Swift 工具路径，空则自动检测
)
```

**安装方式（推荐）：** WebUI → Engine Catalog → macOS STT → **Install**

安装流程：优先下载预编译包（方案 B，`gh release download`）；预编译包不可用时自动回退到本地编译（方案 C，`bash install.sh`，需 Xcode CLT）。
详见 [docs/architecture/native-engine-install.md](docs/architecture/native-engine-install.md)。

**每台机器必须手动完成一次（无法自动化）：**

```bash
# 1. 授权语音识别权限（安装后运行，弹出对话框后点击"允许"）
open scripts/engines/macos-stt/MacOSSTTHelper.app

# 2. 验证授权状态
scripts/engines/macos-stt/MacOSSTTHelper.app/Contents/MacOS/macos-stt-helper --check --language en-US
```

- **系统设置 > 键盘 > 听写** → 下载对应语言的离线听写模型（中文、英文等）
- macOS 13+ 支持完全离线识别，旧版本需联网

**高级 / 离线手动安装（不依赖 WebUI 或 `gh`）：**

```bash
# 需要 Xcode Command Line Tools（xcode-select --install）
bash scripts/engines/macos-stt/install.sh
```

## HTTP 服务 + Client 模式

### 启动服务

**pip 安装后**(已带 `[server]`)—— 配置自动解析/生成,开箱即起:
```bash
openspeechapi serve                 # 自动解析配置;没有则生成默认(macOS 默认 macos_tts)
openspeechapi serve --port 8600     # 指定端口
```

**源码目录运行**:
```bash
python -m openspeechapi.cli serve   # 或 openspeechapi serve;在仓库目录优先用 ./providers.yaml
```

启动后打开 WebUI:**http://127.0.0.1:8600/ui/**

**配置解析顺序**(`--config` 可放在 `serve` **前或后**,例如 `openspeechapi --config x serve` 或 `openspeechapi serve --config x`):
1. 显式 `--config <path>`
2. 当前目录 `./providers.yaml`(源码目录运行时优先)
3. `~/.config/openspeechapi/providers.yaml`(遵循 `XDG_CONFIG_HOME`)
4. 都没有 → 在 `~/.config/openspeechapi/providers.yaml` **自动生成**一份可用默认配置

### Python Client（与 Library 模式接口一致）

```python
from openspeechapi import Client

async with Client("http://localhost:8600") as c:
    # STT
    result = await c.stt.transcribe("faster-whisper", audio)

    # TTS
    audio = await c.tts.synthesize("openai-tts", "Hello world")

    # FanOut
    result = await c.stt.fanout(["openai-stt", "faster-whisper"], audio, strategy="collect_all")

    # 管理
    providers = await c.list_providers()
    health = await c.health()
```

### REST API

```bash
# STT
curl -X POST http://localhost:8600/v1/stt/transcribe \
  -F audio=@audio.wav -F provider=faster-whisper

# TTS
curl -X POST http://localhost:8600/v1/tts/synthesize \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello", "provider": "openai-tts"}' --output out.wav

# 管理
curl http://localhost:8600/v1/providers
curl http://localhost:8600/v1/health
curl http://localhost:8600/v1/metrics
```

## 高级用法

### Config-Driven（YAML 配置）

```yaml
# providers.yaml
providers:
  cloud-stt:
    provider: openai-stt
    exec_mode: remote
    settings:
      api_key: ${OPENAI_API_KEY}

  local-stt:
    provider: faster-whisper
    exec_mode: subprocess      # 独立进程，隔离 GPU 内存
    settings:
      model_size: large-v3
      device: cuda
```

`exec_mode` 约定：
- `subprocess`：子进程模型推理（IPC）
- `local`：本地服务引擎（HTTP/HTTPS）
- `remote`：云端服务 API
- `in_process`：预留给真正进程内推理（兼容旧配置，建议迁移）

```python
from openspeechapi import ServiceDispatcher, ProviderRegistry
from openspeechapi.providers.stt.openai import OpenAISTT
from openspeechapi.providers.stt.faster_whisper import FasterWhisperSTT

registry = ProviderRegistry()
registry.register("openai-stt", OpenAISTT)
registry.register("faster-whisper", FasterWhisperSTT)

dispatcher = ServiceDispatcher.from_config("providers.yaml", registry)
await dispatcher.start()

result = await dispatcher.stt.transcribe("cloud-stt", audio)
await dispatcher.stop()
```

### FanOut — 多引擎并发

```python
from openspeechapi.dispatch.fanout import FirstCompleted, CollectAll

# 取最快返回的结果
result = await dispatcher.stt.fanout(
    ["cloud-stt", "local-stt"], audio, strategy=FirstCompleted()
)

# 收集所有结果对比
results = await dispatcher.stt.fanout(
    ["cloud-stt", "local-stt"], audio, strategy=CollectAll()
)
for name, t in results.successes.items():
    print(f"{name}: {t.text}")
```

### Result Filters

```yaml
providers:
  my-stt:
    provider: faster-whisper
    exec_mode: subprocess
    settings:
      model_size: base
    filters:
      - type: confidence
        min: 0.8              # 过滤低置信度结果
      - type: language
        allow: ["zh", "en"]   # 只保留中英文
```

### Observers（可观测性）

```python
from openspeechapi.observe.metrics import MetricsObserver
from openspeechapi.observe.debug import DebugLogObserver

dispatcher.add_observer(MetricsObserver())    # TTFB、耗时、吞吐
dispatcher.add_observer(DebugLogObserver())   # 详细日志
```

内置 5 个 Observer：`MetricsObserver` `LatencyObserver` `DebugLogObserver` `UsageObserver` `TracingObserver`

## 数据模型

```python
AudioData(data=bytes, sample_rate=int, channels=int, format=AudioFormat, duration_ms=int|None)
Transcription(text=str, language=str|None, confidence=float|None, words=list[Word]|None)
Word(text=str, start_ms=int, end_ms=int, confidence=float|None)
STTOptions(language=str|None, prompt=str|None, temperature=float|None)
TTSOptions(voice=str|None, speed=float, output_format=AudioFormat)
AudioFormat: PCM_16K | PCM_44K | WAV | AIFF | MP3 | OGG | FLAC | OPUS
```

## 项目结构

```
openspeechapi/
  core/           # L1: Provider 抽象层（models, enums, base, registry）
  providers/      # Provider 适配器（stt/ 5个含macos, tts/ 8个含macos）
  utils/           # 工具模块（audio_converter, audio_playback）
  dispatch/       # L2: 调度层（dispatcher, executors, fanout, filters）
  observe/        # 可观测性（metrics, latency, debug, usage, tracing）
  server/         # L3: FastAPI HTTP/WebSocket 服务
  client/         # Python 薄客户端
  factory.py      # create_provider() 工厂函数
  config.py       # YAML 配置加载
  cli.py          # openspeechapi list / check / serve / engine
  demo.py         # 交互式 demo CLI
examples/         # 示例脚本（Library 模式 + Client 模式）
tests/            # 332 tests（unit + integration + E2E）
Dockerfile        # 容器化部署
docker-compose.yml
.github/workflows/ci.yml  # GitHub Actions CI
```

## 环境变量

| 变量 | 用途 |
|------|------|
| `OPENAI_API_KEY` | OpenAI STT/TTS 所需的 API Key |
| `DEEPGRAM_API_KEY` | Deepgram STT 所需的 API Key |
| `ELEVENLABS_API_KEY` | ElevenLabs TTS 所需的 API Key |
| `MINIMAX_API_KEY` | Minimax TTS 所需的 API Key |
| `OPENSPEECH_API_KEY` | HTTP 服务 Bearer token 认证 Key |

支持 `.env` 文件自动加载（需 `python-dotenv`）。

## 部署

**Docker:**
```bash
# 构建并启动
docker-compose up -d

# 查看日志
docker-compose logs -f

# GPU 支持（编辑 docker-compose.yml 取消注释 openspeechapi-gpu 服务）
```

**直接启动:**
```bash
openspeechapi serve --config providers.yaml --port 8600
```

## 认证

在 `providers.yaml` 中配置 API Key 认证：

```yaml
server:
  auth:
    enabled: true
    api_keys:
      - ${OPENSPEECH_API_KEY}
```

启用后所有 REST 请求需携带 Bearer token：
```bash
curl -H "Authorization: Bearer your-key" http://localhost:8600/v1/providers
```

WebSocket 通过查询参数传递：
```
ws://localhost:8600/v1/stt/stream?provider=deepgram&token=your-key
```

`/v1/health` 端点免认证。不配置 `server.auth` 则无认证（开发模式）。

## 流式 STT

Deepgram 支持实时流式转录：

```python
async with Client("http://localhost:8600") as c:
    async def audio_source():
        # 从麦克风或文件读取 PCM 音频块
        yield pcm_chunk

    async for transcription in c.stt.transcribe_stream("deepgram", audio_source()):
        print(transcription.text)
```

WebSocket 方式：
```
ws://localhost:8600/v1/stt/stream?provider=deepgram
# 发送: binary PCM audio frames
# 接收: {"type": "partial", "text": "..."}
```

## CI

项目使用 GitHub Actions 自动化测试。每次 push 到 main 或 PR 时自动运行：
- ruff lint
- 单元测试 + 集成测试
- 代码覆盖率检查（≥70%）

## License

Private — personal multi-project reuse.
