curl /health
{"ok": true, "vllm": true, "whisper": true, "tts": true}
curl /v1/models
| Parameter | Type | Required | Description |
|---|---|---|---|
| model | string | required | Must be llama-3.1-8b-instruct |
| messages | array | required | Array of {role, content} objects |
| max_tokens | int | optional | Max tokens to generate |
| temperature | float | optional | 0 = deterministic, default 1.0 |
| top_p | float | optional | Nucleus sampling threshold |
| stream | bool | optional | Stream tokens via SSE |
| stop | string/array | optional | Stop sequence(s) |
curl /v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3.1-8b-instruct",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the capital of France?"}
],
"max_tokens": 256,
"temperature": 0.7
}'
curl /v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3.1-8b-instruct",
"messages": [{"role": "user", "content": "Tell me a short joke"}],
"max_tokens": 128,
"stream": true
}'
from openai import OpenAI
client = OpenAI(base_url="/v1", api_key="dummy")
resp = client.chat.completions.create(
model="llama-3.1-8b-instruct",
messages=[{"role": "user", "content": "Hello!"}],
max_tokens=256,
)
print(resp.choices[0].message.content)
| Parameter | Type | Required | Description |
|---|---|---|---|
| file | file | required | Audio file — wav, mp3, m4a, ogg, flac, webm |
| model | string | required | Any value accepted, e.g. whisper-1 |
| response_format | string | optional | json · text · verbose_json · srt · vtt |
| language | string | optional | ISO-639-1 code (e.g. en, hi). Auto-detect if omitted. |
| timestamp_granularities[] | string | optional | word for word-level timestamps (use with verbose_json) |
curl /v1/audio/transcriptions \
-F "file=@audio.wav" \
-F "model=whisper-1"
curl /v1/audio/transcriptions \
-F "file=@audio.wav" \
-F "model=whisper-1" \
-F "response_format=verbose_json" \
-F "timestamp_granularities[]=word"
curl /v1/audio/transcriptions \
-F "file=@audio.mp3" \
-F "model=whisper-1" \
-F "response_format=srt"
with open("audio.wav", "rb") as f:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
    )
print(transcript.text)
| Parameter | Type | Required | Description |
|---|---|---|---|
| input | string | required | Text to synthesize (max ~500 words) |
| voice | string | optional | OpenAI alias or raw Kokoro voice name (default: alloy) |
| response_format | string | optional | mp3 (default) · wav · opus · aac · flac · pcm |
| speed | float | optional | 0.5 – 2.0, default 1.0 |
| model | string | optional | Any value accepted, e.g. tts-1 |
curl /v1/audio/speech \
-H "Content-Type: application/json" \
-d '{"model":"tts-1","input":"Hello, welcome to the debate.","voice":"nova"}' \
--output speech.mp3
curl /v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"model": "tts-1",
"input": "This is slightly faster speech.",
"voice": "onyx",
"speed": 1.25,
"response_format": "wav"
}' \
--output speech.wav
response = client.audio.speech.create(
model="tts-1",
input="Hello from the debate stage.",
voice="nova",
)
response.stream_to_file("output.mp3")