Metadata-Version: 2.4
Name: isa_model
Version: 0.6.0
Summary: Unified AI model serving framework with API streaming support
Author: isA_Model Contributors
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: fastapi>=0.95.0
Requires-Dist: uvicorn>=0.22.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: httpx>=0.23.0
Requires-Dist: cachetools>=5.3.0
Requires-Dist: requests>=2.28.0
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: python-dotenv>=1.0.0
Requires-Dist: numpy>=1.20.0
Requires-Dist: psycopg2-binary>=2.9.0
Requires-Dist: asyncpg>=0.28.0
Requires-Dist: alembic>=1.13.0
Requires-Dist: sqlalchemy[asyncio]>=2.0.0
Requires-Dist: slowapi>=0.1.8
Requires-Dist: circuitbreaker>=1.3.2
Requires-Dist: structlog>=23.1.0
Requires-Dist: psutil>=5.9.0
Requires-Dist: redis>=4.5.0
Requires-Dist: tenacity>=8.2.0
Requires-Dist: prometheus-client>=0.21.0
Requires-Dist: cryptography>=42.0.0
Provides-Extra: cloud
Requires-Dist: openai>=1.10.0; extra == "cloud"
Requires-Dist: anthropic>=0.40.0; extra == "cloud"
Requires-Dist: replicate>=0.23.0; extra == "cloud"
Requires-Dist: cerebras-cloud-sdk>=1.0.0; extra == "cloud"
Requires-Dist: modal>=0.63.0; extra == "cloud"
Requires-Dist: grpclib>=0.4.7; extra == "cloud"
Requires-Dist: python-logging-loki>=0.3.1; extra == "cloud"
Requires-Dist: huggingface-hub>=0.16.0; extra == "cloud"
Requires-Dist: docker>=6.0.0; extra == "cloud"
Requires-Dist: influxdb-client>=1.36.0; extra == "cloud"
Requires-Dist: tiktoken>=0.5.0; extra == "cloud"
Requires-Dist: isa-common>=0.5.0; extra == "cloud"
Provides-Extra: local
Requires-Dist: torch>=2.0.0; extra == "local"
Requires-Dist: transformers>=4.30.0; extra == "local"
Requires-Dist: accelerate>=0.20.0; extra == "local"
Requires-Dist: huggingface-hub>=0.16.0; extra == "local"
Requires-Dist: safetensors>=0.4.1; extra == "local"
Requires-Dist: sentencepiece>=0.1.99; extra == "local"
Provides-Extra: training
Requires-Dist: datasets>=2.10.0; extra == "training"
Requires-Dist: peft>=0.4.0; extra == "training"
Requires-Dist: trl>=0.4.0; extra == "training"
Requires-Dist: bitsandbytes>=0.39.0; extra == "training"
Requires-Dist: agentlightning<0.4,>=0.3.0; extra == "training"
Provides-Extra: prediction
Requires-Dist: prophet>=1.1.5; extra == "prediction"
Requires-Dist: pandas>=1.5.0; extra == "prediction"
Requires-Dist: scikit-learn>=1.3.0; extra == "prediction"
Requires-Dist: joblib>=1.3.0; extra == "prediction"
Requires-Dist: xgboost>=2.0.0; extra == "prediction"
Requires-Dist: shap>=0.43.0; extra == "prediction"
Requires-Dist: matplotlib>=3.5.0; extra == "prediction"
Provides-Extra: sleep-data
Requires-Dist: kaggle>=1.5.16; extra == "sleep-data"
Requires-Dist: mne>=1.7.0; extra == "sleep-data"
Requires-Dist: wfdb>=4.1.0; extra == "sleep-data"
Provides-Extra: sleep-train
Requires-Dist: mne>=1.7.0; extra == "sleep-train"
Requires-Dist: torch>=2.0.0; extra == "sleep-train"
Requires-Dist: scipy>=1.10.0; extra == "sleep-train"
Requires-Dist: pywavelets>=1.4.0; extra == "sleep-train"
Requires-Dist: scikit-learn>=1.3.0; extra == "sleep-train"
Requires-Dist: mlflow>=2.4.0; extra == "sleep-train"
Provides-Extra: audio
Requires-Dist: librosa>=0.10.1; extra == "audio"
Requires-Dist: soundfile>=0.12.1; extra == "audio"
Requires-Dist: numba>=0.57.0; extra == "audio"
Provides-Extra: vision
Requires-Dist: Pillow>=10.0.1; extra == "vision"
Requires-Dist: torchvision>=0.15.2; extra == "vision"
Provides-Extra: langchain
Requires-Dist: langchain-core>=0.1.0; extra == "langchain"
Requires-Dist: langchain-openai>=0.0.2; extra == "langchain"
Provides-Extra: storage
Requires-Dist: boto3>=1.26.0; extra == "storage"
Requires-Dist: google-cloud-storage>=2.7.0; extra == "storage"
Provides-Extra: monitoring
Requires-Dist: mlflow>=2.4.0; extra == "monitoring"
Requires-Dist: redis>=4.5.0; extra == "monitoring"
Requires-Dist: influxdb-client>=1.36.0; extra == "monitoring"
Requires-Dist: pgvector>=0.2.0; extra == "monitoring"
Requires-Dist: python-logging-loki>=0.3.1; extra == "monitoring"
Provides-Extra: k8s
Requires-Dist: kubernetes>=25.3.0; extra == "k8s"
Provides-Extra: feast
Requires-Dist: feast[postgres,redis]>=0.40.0; extra == "feast"
Provides-Extra: gpu-cloud
Requires-Dist: runpod>=1.0.0; extra == "gpu-cloud"
Requires-Dist: ollama>=0.3.0; extra == "gpu-cloud"
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: pandas>=1.5.0; extra == "dev"
Requires-Dist: scipy>=1.10.0; extra == "dev"
Requires-Dist: black>=22.0.0; extra == "dev"
Requires-Dist: flake8>=4.0.0; extra == "dev"
Requires-Dist: mypy>=0.991; extra == "dev"
Requires-Dist: twine>=4.0.0; extra == "dev"
Requires-Dist: websockets>=11.0; extra == "dev"
Requires-Dist: locust>=2.20.0; extra == "dev"
Provides-Extra: api-only
Requires-Dist: isa-model[cloud,langchain]; extra == "api-only"
Provides-Extra: full-local
Requires-Dist: isa-model[audio,langchain,local,training,vision]; extra == "full-local"
Provides-Extra: production
Requires-Dist: isa-model[cloud,k8s,monitoring,storage]; extra == "production"
Provides-Extra: staging
Requires-Dist: isa-model[cloud,langchain,monitoring,storage]; extra == "staging"
Requires-Dist: python-consul>=1.1.0; extra == "staging"
Provides-Extra: staging-minimal
Requires-Dist: isa-model[cloud,langchain,storage]; extra == "staging-minimal"
Requires-Dist: influxdb-client>=1.36.0; extra == "staging-minimal"
Requires-Dist: python-logging-loki>=0.3.1; extra == "staging-minimal"
Requires-Dist: python-consul>=1.1.0; extra == "staging-minimal"
Provides-Extra: all
Requires-Dist: isa-model[audio,cloud,gpu-cloud,k8s,langchain,local,monitoring,prediction,storage,training,vision]; extra == "all"

# isA_Model - AI Model Serving & Training Platform

[![CI](https://github.com/xenoISA/isA_Model/actions/workflows/ci.yml/badge.svg)](https://github.com/xenoISA/isA_Model/actions/workflows/ci.yml)
[![Release](https://github.com/xenoISA/isA_Model/actions/workflows/release.yml/badge.svg)](https://github.com/xenoISA/isA_Model/actions/workflows/release.yml)
[![Security Scan](https://github.com/xenoISA/isA_Model/actions/workflows/security.yml/badge.svg)](https://github.com/xenoISA/isA_Model/actions/workflows/security.yml)

> **Operators:** see [`docs/PRODUCTION_READINESS.md`](./docs/PRODUCTION_READINESS.md) for the component-by-component status matrix (what's actually deployed vs Helm-only vs planned).

A comprehensive Python platform for AI model serving, training, and optimization. It provides a unified interface to multiple AI providers, intelligent model selection, LLM caching, multi-modal capabilities, and Lightning-based training workflows.

**Current Version:** 0.6.0

## Table of Contents

- [Architecture Overview](#architecture-overview)
- [Core Components](#core-components)
- [Installation](#installation)
- [Quick Start](#quick-start)
- [AI Model Serving](#1-ai-model-serving-isa_modelinference)
- [Lightning Training](#2-lightning-training-isa_modeltraininglightning)
- [Multi-Modal Services](#multi-modal-services)
- [Examples](#examples)
- [Documentation](#documentation)
- [Development](#development)

## Architecture Overview

```
┌────────────────────────────────────────────────────────────┐
│                     isA_Model Platform                     │
│                                                            │
│  ┌─────────────────┐  ┌─────────────────┐  ┌─────────────┐ │
│  │  Model Serving  │  │  Lightning      │  │  Core       │ │
│  │                 │  │  Training       │  │  Services   │ │
│  │ • Multi-Provider│  │                 │  │ • Config    │ │
│  │ • LLM Caching   │  │ • APO/GRPO      │  │ • Discovery │ │
│  │ • Tool Calling  │  │ • Closed-Loop   │  │ • Logging   │ │
│  │ • Multi-Modal   │  │ • Custom        │  │ • Events    │ │
│  └─────────────────┘  └─────────────────┘  └─────────────┘ │
└────────────────────────────────────────────────────────────┘
```

## Core Components

### 1. AI Model Serving (`isa_model/inference/`)
- **Multi-Provider Support**: OpenAI, Replicate, Ollama, Cerebras, OpenRouter
- **Intelligent Caching**: Production-grade LLM caching with Redis backend
- **Tool Calling**: OpenAI-compatible function calling interface
- **Multi-Modal**: Text, Vision, Audio, Video, Embeddings
- **Streaming Support**: Real-time streaming for all providers

### 2. Lightning Training (`isa_model/training/lightning/`)
- **Algorithm Framework**: APO, GRPO, Closed-Loop, Custom algorithms
- **Data Pipeline**: Automated trace collection and conversion
- **Job Management**: RESTful API for training lifecycle
- **Event-Driven**: NATS-based coordination and monitoring
- **Storage Abstraction**: Memory and PostgreSQL backends

### 3. Core Services (`isa_model/core/`)
- **Configuration**: Environment-based config management
- **Discovery**: Consul-based service registration
- **Logging**: Structured logging with Loki integration
- **Pricing**: Cost tracking and optimization
- **Database**: PostgreSQL gRPC client abstraction

### 4. Deployment (`isa_model/deployment/`)
- **Kubernetes**: Production-ready K8s manifests
- **Docker**: Multi-stage Dockerfiles for all components
- **Modal**: Serverless deployment support
- **Triton**: NVIDIA Triton Inference Server integration

## Installation

### Basic Installation

```bash
pip install isa_model
```

### Installation with Optional Dependencies

```bash
# Cloud API providers (OpenAI, Anthropic, Replicate, Cerebras, Modal)
pip install isa_model[cloud]

# Local inference (PyTorch + transformers)
pip install isa_model[local]

# Audio processing
pip install isa_model[audio]

# Vision processing
pip install isa_model[vision]

# LangChain integration
pip install isa_model[langchain]

# Monitoring (MLflow, Redis, InfluxDB, pgvector, Loki logging)
pip install isa_model[monitoring]

# Full installation (all features)
pip install isa_model[all]

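# Optimized for staging (cloud, langchain, monitoring, storage, Consul)
pip install isa_model[staging]

# Optimized for production (cloud, k8s, monitoring, storage)
pip install isa_model[production]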
```

## Quick Start

### Using the Async Client (Recommended)

The `AsyncISAModel` client provides an OpenAI-compatible interface:

```python
from isa_model.inference_client import AsyncISAModel
import asyncio

async def main():
    async with AsyncISAModel(base_url="http://localhost:8082") as client:
        # Simple chat
        response = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Hello!"}]
        )
        print(response.choices[0].message.content)

        # Streaming chat
        stream = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Count to 5"}],
            stream=True
        )
        async for chunk in stream:
            if chunk.choices[0].delta.content:
                print(chunk.choices[0].delta.content, end="", flush=True)

asyncio.run(main())
```

### Using AIFactory (Direct Service Access)

For more control, use the AIFactory to get service instances:

```python
from isa_model.inference.ai_factory import AIFactory

factory = AIFactory.get_instance()

# Use OpenAI with API key
llm = factory.get_llm(
    model_name="gpt-4o-mini", 
    provider="openai", 
    api_key="your-openai-api-key-here"
)

# Use local Ollama model (no API key needed)
llm = factory.get_llm(model_name="llama3.1", provider="ollama")
```

## Core Features

### Multi-Modal AI Services

- **LLM (Text Generation)**: OpenAI (GPT-4, GPT-4o-mini), Ollama (Llama, Qwen), Cerebras, OpenRouter (DeepSeek-R1)
- **Vision**: Image analysis (GPT-4o, ISA OmniParser), Image generation (DALL-E, Flux, Nano-Banana)
- **Audio**: Speech-to-Text (Whisper, GPT-4o-transcribe), Text-to-Speech (OpenAI TTS, Replicate)
- **Video**: Text-to-Video (ByteDance Seedance-1-Pro)
- **Embeddings**: Text embeddings (OpenAI, Ollama), Document reranking (Jina Reranker v2)

### Intelligent Features

- **Smart Model Selection**: Automatically choose the best model based on task and input
- **LLM Caching**: Two-layer cache (streaming + non-streaming) with 50-100x speedup
- **Tool Calling**: Function calling with OpenAI-compatible interface
- **Streaming Support**: Real-time streaming for all text generation
- **Format Negotiation**: Supports OpenAI dict, LangChain message formats

### Enterprise Features

- **Cost Tracking**: Automatic cost calculation and tracking
- **Graceful Degradation**: Cache failures don't break requests
- **Feature Flags**: Environment-based feature control
- **Monitoring**: Redis-backed metrics, hit rate tracking
- **Multi-Provider**: Easy provider switching without code changes

## API Client Usage

### Comprehensive Example

See `docs/guidance/examples/model_client_examples_async.py` for complete examples covering:

```python
from isa_model.inference_client import AsyncISAModel

async with AsyncISAModel(base_url="http://localhost:8082") as client:
    # 1. Simple chat
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello!"}]
    )
    
    # 2. Streaming chat
    stream = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Tell a story"}],
        stream=True
    )
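    async for chunk in stream:
        # Some chunks carry no content (e.g. role or finish chunks); skip them
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)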
    
    # 3. JSON mode (structured output)
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Generate a person profile"}],
        response_format={"type": "json_object"}
    )
    
    # 4. Function calling
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "What's the weather?"}],
        tools=[{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string"}
                    }
                }
            }
        }]
    )
    
    # 5. Vision analysis
    vision = await client.vision.completions.create(
        image="https://example.com/image.jpg",
        prompt="Describe this image",
        model="gpt-4o-mini",
        provider="openai"
    )
    
    # 6. Image generation
    image = await client.images.generate(
        prompt="A beautiful sunset over mountains",
        model="dall-e-3",
        size="1024x1024",
        provider="openai"
    )
    
    # 7. Embeddings
    embedding = await client.embeddings.create(
        input="This is a test sentence",
        model="text-embedding-3-small"
    )
    
    # 8. Speech-to-Text
    transcription = await client.audio.transcriptions.create(
        file="audio.wav",
        model="gpt-4o-mini-transcribe"
    )
```

### Client Test Results

**Async Client**: 11/11 examples passed (100% success rate)  
**Sync Client**: 5/9 attempted (streaming and TTS have limitations)

**Recommendation**: Always use `AsyncISAModel` for production workloads.

## LLM Caching

**Added in v0.5.7**: Production-grade LLM inference caching (Phase 2 implementation complete).

### Features

- **Streaming Cache with Replay**: 15ms per chunk delay for natural streaming feel
- **Non-Streaming Cache**: Instant responses (~5ms vs 500ms)
- **Temperature-Based TTL**: Smart expiration (temp=0 → 24h, temp=0.3 → 1h, temp=0.7 → 5min)
- **Graceful Degradation**: Cache failure = automatic pass-through to LLM
- **Real-time Monitoring**: Hit rate, replay stats, time saved tracking

### Quick Setup

```bash
# Enable cache
export ENABLE_LLM_CACHE=true
export REDIS_HOST=localhost
export REDIS_PORT=50055

# Start service
python -m isa_model.serving.api.main
```

### Performance Gains

| Scenario | First Request | Cached Request | Speedup | Cost Saving |
|----------|--------------|----------------|---------|-------------|
| Non-streaming chat | 500ms | 5ms | **100x** | 100% |
| Streaming chat | 3000ms | 2500ms | 1.2x | 100% |
| Code generation | 2000ms | 8ms | **250x** | 100% |

**Expected Savings** (40% hit rate, 1000 req/day):
- Daily: $0.40
- Monthly: $12
- Annual: $144

For high-traffic systems (100K req/day): **$1,200/month savings**

### Cache Management

```bash
# Get cache statistics
curl http://localhost:8082/api/v1/cache/stats

# Invalidate model cache (when model updates)
curl -X POST http://localhost:8082/api/v1/cache/invalidate/openai/gpt-4o-mini

# Clear all cache
curl -X POST http://localhost:8082/api/v1/cache/clear
```

See [docs/CACHE_QUICKSTART.md](docs/CACHE_QUICKSTART.md) for complete documentation.

## DeepSeek-R1 Reasoning Model

**Added in v0.5.7**: Support for DeepSeek-R1, a powerful reasoning model that shows its thought process.

### Features

- **Visible Reasoning**: See the model's thinking with `show_reasoning=True`
- **Streaming Tool Calling**: Call tools while streaming reasoning process
- **Token Tracking**: Separate tracking for reasoning tokens vs completion tokens
- **Cost Optimization**: Reasoning tokens charged at input token rate ($0.55/1M)

### Basic Usage

```python
from isa_model.inference.ai_factory import AIFactory

factory = AIFactory.get_instance()
llm = factory.get_llm(provider="openrouter", model_name="deepseek-r1")

# Without reasoning (only final answer)
response = await llm.ainvoke("If 2x + 5 = 11, what is x?", show_reasoning=False)

# With reasoning (see thought process)
response = await llm.ainvoke("If 2x + 5 = 11, what is x?", show_reasoning=True)
# Output includes: [思考: ...] tags ("思考" means "thinking") showing reasoning steps

# Get token usage
usage = llm.get_last_token_usage()
print(f"Reasoning tokens: {usage['reasoning_tokens']}")
print(f"Completion tokens: {usage['completion_tokens']}")
```

### Streaming with Reasoning

```python
async for chunk in llm.astream("Calculate 15 × 23", show_reasoning=True):
    if chunk.startswith('[思考:') and chunk.endswith(']'):
        # Reasoning tokens (gray text)
        reasoning = chunk[4:-1]
        print(f"\033[90m{reasoning}\033[0m", end="", flush=True)
    else:
        # Normal content
        print(chunk, end="", flush=True)
```

See [docs/guidance/examples/deepseek_r1_reasoning_example.py](docs/guidance/examples/deepseek_r1_reasoning_example.py) and [docs/guidance/deepseek-r1.md](docs/guidance/deepseek-r1.md) for complete examples.

## Multi-Modal Services

### Speech-to-Text (4 Models)

```python
# Basic transcription (fastest, cheapest)
transcription = await client.audio.transcriptions.create(
    file="audio.wav",
    model="gpt-4o-mini-transcribe"  # NEW default model
)

# High quality transcription
transcription = await client.audio.transcriptions.create(
    file="audio.wav",
    model="gpt-4o-transcribe"  # Highest quality
)

# With speaker diarization
transcription = await client.audio.transcriptions.create(
    file="audio.wav",
    model="gpt-4o-transcribe-diarize",
    enable_diarization=True,
    response_format="diarized_json"
)
# Returns: segments with speaker labels, timestamps

# Legacy Whisper model
transcription = await client.audio.transcriptions.create(
    file="audio.wav",
    model="whisper-1"  # Legacy
)
```

### Video Generation

```python
# Text-to-Video with ByteDance Seedance-1-Pro
response = await client._underlying_client.invoke(
    input_data="The sun rises slowly between tall buildings...",
    task="generate",
    service_type="video_generation",
    provider="replicate",
    model="seedance-1-pro",
    duration=5,
    fps=24,
    resolution="1080p",
    aspect_ratio="16:9"
)
```

### Multi-Image Input

```python
# Google Nano-Banana (Multi-Image Style Transfer)
response = await client._underlying_client.invoke(
    input_data="Make the sheets in the style of the logo",
    task="img2img",
    service_type="image_generation",
    provider="replicate",
    model="nano-banana",
    init_image=[
        "https://example.com/image1.png",
        "https://example.com/image2.png"
    ],
    aspect_ratio="match_input_image"
)
```

### ISA Proprietary Services

```python
# ISA OmniParser - UI Detection
vision = await client.vision.completions.create(
    image="https://example.com/ui-screenshot.jpg",
    prompt="Detect UI elements",
    model="isa-omniparser-ui-detection",
    provider="isa"
)

# ISA Jina Reranker v2 - Document Reranking
response = await client._underlying_client.invoke(
    input_data="What is machine learning?",
    task="rerank",
    service_type="embedding",
    provider="isa",
    model="isa-jina-reranker-v2-service",
    documents=[
        "Machine learning is a subset of AI...",
        "Python is a programming language...",
        "Neural networks are computational models..."
    ]
)
```

## Tool Calling

### OpenAI-Compatible Function Calling

```python
from isa_model.inference_client import AsyncISAModel
import json

WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"}
            },
            "required": ["location"]
        }
    }
}

async with AsyncISAModel() as client:
    # Request with tool
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
        tools=[WEATHER_TOOL]
    )
    
    # Check if tool was called
    if response.choices[0].message.tool_calls:
        tool_call = response.choices[0].message.tool_calls[0]
        print(f"Tool: {tool_call.function.name}")
        print(f"Args: {tool_call.function.arguments}")
        
        # Execute tool (your implementation)
        args = json.loads(tool_call.function.arguments)
        result = get_weather(**args)
        
        # Continue conversation with tool result
        messages = [
            {"role": "user", "content": "What's the weather in Tokyo?"},
            {
                "role": "assistant",
                "tool_calls": [{
                    "id": tool_call.id,
                    "type": "function",
                    "function": {
                        "name": tool_call.function.name,
                        "arguments": tool_call.function.arguments
                    }
                }]
            },
            {
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result)
            }
        ]
        
        final = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages
        )
        print(final.choices[0].message.content)
```

### Streaming Tool Calling (DeepSeek-R1)

```python
# Tool calls appear at the end of stream in delta.tool_calls
stream = await client.chat.completions.create(
    model="deepseek-r1",
    provider="openrouter",
    messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
    tools=[WEATHER_TOOL],
    stream=True,
    show_reasoning=True
)

tool_calls = []
async for chunk in stream:
    delta = chunk.choices[0].delta
    
    # Collect reasoning and content
    if delta.content:
        print(delta.content, end="", flush=True)
    
    # Collect tool calls (appear at end)
    if delta.tool_calls:
        tool_calls.extend(delta.tool_calls)

# Execute tools after stream completes
for tc in tool_calls:
    args = json.loads(tc.function.arguments)
    result = execute_tool(**args)
```

See [docs/guidance/examples/tool_call_streaming_example.py](docs/guidance/examples/tool_call_streaming_example.py) for complete agent loop implementation.

## Examples

All runnable examples are in `docs/guidance/examples/`:

- **[model_client_examples_async.py](docs/guidance/examples/model_client_examples_async.py)**: Comprehensive async client examples (11/11 passed)
  - Simple chat, streaming, multiple providers
  - JSON mode, function calling
  - Vision, embeddings, image generation
  - Format negotiation, error handling
  - Speech-to-Text, ISA services

- **[model_client_examples_sync.py](docs/guidance/examples/model_client_examples_sync.py)**: Sync client (basic usage only, has limitations)

- **[deepseek_r1_reasoning_example.py](docs/guidance/examples/deepseek_r1_reasoning_example.py)**: DeepSeek-R1 reasoning examples
  - Basic math, complex problems
  - Streaming with reasoning
  - Code generation, multi-turn chat

- **[tool_call_streaming_example.py](docs/guidance/examples/tool_call_streaming_example.py)**: Tool calling examples
  - Basic streaming tool calls
  - Complete agent loop
  - DeepSeek-R1 reasoning + tools

- **[nano_banana_example.py](docs/guidance/examples/nano_banana_example.py)**: Multi-image style transfer

- **[seedance_video_example.py](docs/guidance/examples/seedance_video_example.py)**: Text-to-video generation

See [docs/guidance/examples/README.md](docs/guidance/examples/README.md) for detailed documentation.

## Documentation

Comprehensive documentation is available in the `docs/` directory:

```
docs/
├── overview/           → Project vision, goals, architecture
├── research/           → Research findings and exploration
├── domain/             → Domain concepts and knowledge models
├── prd/                → Product requirements documents
├── design/             → Technical design specifications
└── guidance/           → Developer guides and tutorials
    └── examples/       → Runnable Python scripts
```

### Getting Started

- **[Quick Start](docs/guidance/quickstart.md)**: Get started in 5 minutes
- **[LLM Services](docs/guidance/llm-services.md)**: Text generation and chat
- **[Tool Calling](docs/guidance/tool-calling.md)**: Function calling guide
- **[Providers](docs/guidance/providers.md)**: Configure model providers
- **[Caching](docs/guidance/caching.md)**: Cache optimization
- **[DeepSeek R1](docs/guidance/deepseek-r1.md)**: Reasoning model with tool calls

### Project Documentation

- **[Project Overview](docs/overview/README.md)**: Vision, goals, architecture
- **[Product Requirements](docs/prd/README.md)**: Feature specifications
- **[Technical Design](docs/design/README.md)**: System design documents

## Development

### Installing for Development

```bash
git clone https://github.com/xenoISA/isA_Model.git
cd isA_Model

# Install with all dependencies
pip install -e ".[all]"

# Or install with specific extras
pip install -e ".[cloud,langchain,dev]"
```

### Environment Setup

For local development, copy the example deployment env file into a gitignored local override:

```bash
cp deployment/environments/dev.env.example deployment/environments/dev.env
# or create deployment/environments/dev.local.env instead
```

Then fill in your local secrets, for example:

```bash
OPENAI_API_KEY=your-openai-key
REPLICATE_API_TOKEN=your-replicate-token
INTERNAL_SERVICE_SECRET=your-local-internal-secret
```

### Running the Server

```bash
# Start the FastAPI server
python -m isa_model.serving.api.main

# Or with uvicorn
uvicorn isa_model.serving.api.fastapi_server:app --host 0.0.0.0 --port 8082
```

### Running Tests

```bash
# Run async client examples (recommended)
python docs/guidance/examples/model_client_examples_async.py

# Run specific tests
python tests/test_stt_models.py

# Run cache tests
bash tests/cache_test.sh
```

### Building and Publishing

```bash
# Update version in pyproject.toml
# Current version: 0.6.0

# Build the package
python -m build

# Upload to PyPI
python -m twine upload dist/isa_model-0.6.0* --username __token__ --password "$PYPI_API_TOKEN"
```

## What's New in v0.5.7

### LLM Caching (Phase 2 Complete)
- **Streaming Cache + Replay**: Natural streaming feel with 15ms/chunk delay
- **Non-Streaming Cache**: 100x speedup for deterministic queries
- **Temperature-Based TTL**: Smart caching based on output randomness
- **Real-time Monitoring**: Hit rate tracking, time saved statistics
- **Production Ready**: Feature flags, graceful degradation, zero-impact deployment

### DeepSeek-R1 Support
- **Visible Reasoning**: See model's thought process with `show_reasoning=True`
- **Streaming Tool Calling**: Function calling with reasoning visibility
- **Token Tracking**: Separate reasoning token counting and cost tracking
- **Agent Loop Support**: Complete multi-turn conversation with tools

### Enhanced Multi-Modal
- **Speech-to-Text**: 4 models (Whisper, gpt-4o-mini-transcribe, gpt-4o-transcribe, gpt-4o-transcribe-diarize)
- **Video Generation**: ByteDance Seedance-1-Pro text-to-video
- **Multi-Image Input**: Google Nano-Banana style transfer
- **ISA Services**: OmniParser UI detection, Jina Reranker v2

### Client Improvements
- **100% Pass Rate**: AsyncISAModel client (11/11 examples)
- **Format Negotiation**: OpenAI dict + LangChain message support
- **Better Error Handling**: Informative error messages and graceful failures
- **Resource Cleanup**: Proper context manager support

### Infrastructure
- **Consul Integration**: Service discovery and dynamic routing
- **Redis Caching**: Production-grade caching backend
- **Monitoring**: Comprehensive metrics and logging
- **Feature Flags**: Environment-based feature control

## Supported Providers

| Provider | LLM | Vision | Audio | Image Gen | Video | Embeddings |
|----------|-----|--------|-------|-----------|-------|------------|
| **OpenAI** | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| **Replicate** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| **Ollama** | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| **Cerebras** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **OpenRouter** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **ISA** | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ |

**Note**: OpenRouter provider includes DeepSeek-R1 reasoning model.

## Cost Optimization

### LLM Caching Benefits

With 40% cache hit rate on 1,000 requests/day:
- **Daily savings**: $0.40
- **Monthly savings**: $12
- **Annual savings**: $144

For high-traffic production (100K req/day):
- **Monthly savings**: $1,200+

### Model Selection Strategy

- **Development/Testing**: Use `gpt-4o-mini` or `ollama` (local, free)
- **Production**: Cache with `temperature=0` for deterministic queries
- **Creative Tasks**: Use higher temperature, shorter TTL
- **Code Generation**: Cache aggressively (24h TTL for temp=0)

## Architecture

```
isa_model/
├── client.py                  # Unified ISAModelClient
├── inference_client.py        # OpenAI-compatible client
├── inference/
│   ├── ai_factory.py         # Service factory
│   ├── services/             # Service implementations
│   │   ├── llm/             # LLM services
│   │   ├── vision/          # Vision services
│   │   ├── audio/           # Audio services (STT/TTS)
│   │   ├── img/             # Image generation
│   │   ├── video/           # Video generation
│   │   └── embedding/       # Embedding services
│   └── cache/               # LLM caching layer
├── serving/
│   └── api/                 # FastAPI server
├── core/
│   ├── config/              # Configuration management
│   ├── models/              # Model registry
│   └── services/            # Core services
└── deployment/              # Kubernetes, Docker configs
```

## Roadmap

### Phase 3: Semantic Caching (Planned)
- Embedding-based similarity matching
- Cache hits even with different wording
- Target: 60-80% hit rate (vs 40% exact match)

### Future Features
- Cache warming on model updates
- Distributed locking for multi-instance consistency
- Per-user cache namespaces
- A/B testing framework
- Advanced cost analytics

## License

MIT License - see LICENSE file for details.

## Contributing

Contributions are welcome! Please:

1. Fork the repository
2. Create a feature branch
3. Make your changes with tests
4. Submit a pull request

See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.

## Support

- **Documentation**: See `docs/` directory
- **Examples**: See `docs/guidance/examples/` directory
- **Issues**: Open an issue on GitHub
- **Discussions**: GitHub Discussions

## Acknowledgments

Built with:
- FastAPI for high-performance API serving
- Redis for production-grade caching
- OpenAI SDK compatibility layer
- LangChain integration support
- Comprehensive provider ecosystem

---

**Ready to get started?** Check out [docs/guidance/examples/](docs/guidance/examples/) for comprehensive usage examples!
