# Flexible GraphRAG Configuration

# =============================================================================
# Document Parser Configuration (Docling / LlamaParse)
# =============================================================================

# Document Parser Configuration
# DOCUMENT_PARSER can be "docling" (default, open-source) or "llamaparse" (cloud-based, requires API key)
# Docling: Free, open-source, runs locally, good for most document types
# LlamaParse: Cloud-based parsing service from LlamaIndex, advanced multimodal parsing, requires API key
DOCUMENT_PARSER=docling
#DOCUMENT_PARSER=llamaparse

# Save parsing results to files for inspection (works for both Docling and LlamaParse)
# When enabled, saves parsing outputs to ./parsing_output/
# - Docling: Saves both markdown (.md) and plaintext (.txt) outputs + metadata JSON
# - LlamaParse: Saves both markdown (.md) and plaintext (.txt) outputs + metadata JSON
# Useful for debugging and understanding how parsers interpret documents
#SAVE_PARSING_OUTPUT=false
#SAVE_PARSING_OUTPUT=true

# Format to use for knowledge graph extraction (what gets sent to LLM for entity/relation extraction)
# Applies to both Docling and LlamaParse
# Options:
#   "auto" (default) - Use markdown if tables detected, otherwise use plaintext
#   "markdown" - Always use markdown format (better for documents with tables/structure)
#   "plaintext" - Always use plaintext format (better for text-heavy documents)
# Note: Both formats are always saved to disk when SAVE_PARSING_OUTPUT=true
#PARSER_FORMAT_FOR_EXTRACTION=auto
#PARSER_FORMAT_FOR_EXTRACTION=markdown
#PARSER_FORMAT_FOR_EXTRACTION=plaintext

# =============================================================================
# Docling Config

# Docling Device Configuration (only used if DOCUMENT_PARSER=docling)
# Options:
#   "auto" (default) - Automatically uses GPU if available, falls back to CPU
#   "cpu" - Force CPU-only processing
#   "cuda" - Force CUDA/GPU processing (requires CUDA-capable GPU and PyTorch with CUDA)
#   "mps" - Force Apple Metal Performance Shaders (Mac with Apple Silicon)
# Note: GPU processing significantly speeds up PDF/document conversion with tables
#DOCLING_DEVICE=auto
#DOCLING_DEVICE=cpu
#DOCLING_DEVICE=cuda

# =============================================================================
# LlamaParse Config

# LlamaParse API Key (only needed if DOCUMENT_PARSER=llamaparse)
# Get your API key from https://cloud.llamaindex.ai/ (bottom left: "API Keys", then "Generate New Key")
# LlamaParse offers high-quality document parsing of complex documents
#LLAMAPARSE_API_KEY=llx-your-api-key-here

# LlamaParse Mode Configuration (only used if DOCUMENT_PARSER=llamaparse)
# Options:
#   "parse_page_without_llm" - 1 credit/page (cheapest, simple, text-only output, no markdown)
#   "parse_page_with_llm" - 3 credits/page (good balance, uses LLM, markdown output) - DEFAULT
#   "parse_page_with_agent" - 10-90 credits/page (best quality, markdown output, requires LLAMAPARSE_AGENT_MODEL)
#LLAMAPARSE_MODE=parse_page_with_llm

# LlamaParse Agent Model (only used if LLAMAPARSE_MODE=parse_page_with_agent)
# Available models from LlamaCloud (costs vary by model):
#   "openai-gpt-4-1-mini" - 10 credits/page (recommended balance)
#   Other models available - check LlamaCloud for full list / credit cost
#LLAMAPARSE_AGENT_MODEL=openai-gpt-4-1-mini


# =============================================================================
# Data Source Configuration
# =============================================================================

# Example configurations for different data sources:
# WEB_CONFIG={"url": "https://example.com/page"}
# WIKIPEDIA_CONFIG={"query": "artificial intelligence", "language": "en", "max_docs": 1}
# YOUTUBE_CONFIG={"url": "https://www.youtube.com/watch?v=VIDEO_ID", "chunk_size_seconds": 60}

# Amazon S3 Configuration
# S3_CONFIG={"bucket_name": "my-bucket", "access_key": "AKIAIOSFODNN7EXAMPLE", "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "region_name": "us-east-1", "prefix": "documents/"}
# Alternative format using "bucket" (S3Reader compatible):
# S3_CONFIG={"bucket": "my-bucket", "access_key": "AKIAIOSFODNN7EXAMPLE", "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "region_name": "us-east-1", "prefix": "documents/"}
# Required: bucket_name (or bucket), access_key, secret_key
# Optional: region_name (if not specified, falls back to S3_REGION_NAME env var or default "us-east-1"), prefix (folder path within bucket)
# Note: S3_CONFIG takes precedence over individual S3_* environment variables (see section 5 below)

# Google Cloud Storage Configuration
# GCS_CONFIG={"bucket_name": "my-bucket", "project_id": "my-project-123", "credentials": "{\"type\":\"service_account\",\"project_id\":\"my-project\",...}", "prefix": "documents/"}
# Required: bucket_name, project_id, credentials (service account JSON as string)
# Optional: prefix (folder path), folder_name, service_account_key_path (alternative to credentials string)
# Note: credentials should be JSON service account key from GCP Console as a string

# Google Drive Configuration
# GOOGLE_DRIVE_CONFIG={"credentials": "{\"type\":\"service_account\",\"project_id\":\"my-project\",...}", "folder_id": "1A2B3C4D5E6F7G8H9I0J", "file_ids": ["file1-id", "file2-id"], "query": "name contains 'report'"}
# Required: Either credentials (service account JSON string) OR credentials_path OR token_path
# Optional: folder_id, file_ids (specific files), query (search query), credentials_path, token_path
# Note: credentials should be service account JSON key or OAuth credentials file path

# Azure Blob Storage Configuration
# AZURE_BLOB_CONFIG={"container_name": "documents", "account_url": "https://myaccount.blob.core.windows.net", "account_name": "myaccount", "account_key": "base64key==", "prefix": "folder/"}
# Required: container_name, account_url, account_name, account_key
# Optional: blob (specific blob name), prefix (folder path), connection_string (alternative to account_key)

# Microsoft OneDrive Configuration
# ONEDRIVE_CONFIG={"user_principal_name": "admin@tenant.onmicrosoft.com", "client_id": "12345678-1234-1234-1234-123456789abc", "client_secret": "XFA8Q~abc123...", "tenant_id": "87654321-4321-4321-4321-cba987654321", "folder_path": "/Documents"}
# Required: user_principal_name (M365 email), client_id, client_secret, tenant_id
# Optional: folder_path (default: / for root), folder_id, file_ids
# Setup instructions: docs/DATA-SOURCE-CONFIGURATION.md

# Microsoft SharePoint Configuration
# SHAREPOINT_CONFIG={"client_id": "12345678-1234-1234-1234-123456789abc", "client_secret": "XFA8Q~abc123...", "tenant_id": "87654321-4321-4321-4321-cba987654321", "site_name": "MySite", "folder_path": "/Shared Documents"}
# Required: client_id, client_secret, tenant_id, site_name (SharePoint site name, not full URL)
# Optional: site_id, folder_path (default: /), folder_id, file_ids
# Setup instructions: docs/DATA-SOURCE-CONFIGURATION.md

# Box Configuration
# Developer Token (simplest, expires in 60 minutes):
# BOX_CONFIG={"folder_id": "0", "developer_token": "your_developer_token"}
# Client Credentials Grant (production, long-lived):
# BOX_CONFIG={"folder_id": "0", "client_id": "abc123", "client_secret": "xyz789", "user_id": "12345678"}
# For detailed setup instructions, see docs/DATA-SOURCE-CONFIGURATION.md

# Sample Text for Testing (optional - overrides default Dune text)
# SAMPLE_TEXT="Luke Skywalker is a Jedi Knight from Tatooine. His father is Darth Vader, formerly known as Anakin Skywalker."

# ====================================================================
# 1. GRAPH DATABASE CONFIGURATION
# ====================================================================

GRAPH_DB=neo4j
#GRAPH_DB=kuzu
#GRAPH_DB=falkordb
#GRAPH_DB=arcadedb
#GRAPH_DB=memgraph
#GRAPH_DB=nebula
#GRAPH_DB=neptune
#GRAPH_DB=neptune_analytics
#GRAPH_DB=none

# Enable knowledge graph extraction 
#ENABLE_KNOWLEDGE_GRAPH=false
ENABLE_KNOWLEDGE_GRAPH=true

# Knowledge graph extractor type: "simple", "schema", or "dynamic"
# simple: Basic extraction without schema constraints (uses SimpleLLMPathExtractor) 
# schema: Uses provided or built-in schema for structured extraction (uses SchemaLLMPathExtractor) - default
# Use SchemaLLMPathExtractor with strict=True when you have a well-defined domain and want to ensure consistency
# Use SchemaLLMPathExtractor with strict=False when you want some flexibility while still being guided by a schema
# (for bedrock, fireworks, groq DynamicLLMPathExtractor is used instead of SchemaLLMPathExtractor to avoid a LlamaIndex issue)
# dynamic: More flexible extraction that can expand beyond an initial schema, or run with no schema provided (uses DynamicLLMPathExtractor)
KG_EXTRACTOR_TYPE=schema
#KG_EXTRACTOR_TYPE=dynamic
#KG_EXTRACTOR_TYPE=simple

# Graph Database Connection Configurations:

# Neo4j
GRAPH_DB_CONFIG={"url": "bolt://localhost:7687", "username": "neo4j", "password": "password"}

# Kuzu (database file will be created as ./kuzu_db/database.kz)
# use_structured_schema: false (default) - enables initial entity types and schema (not generic entity type schema)
# use_vector_index: false (default) - enables Kuzu's built-in vector capabilities for internal use
#GRAPH_DB_CONFIG={"db_path": "./kuzu_db/database.kz", "use_structured_schema": false, "use_vector_index": false}

# FalkorDB
#GRAPH_DB_CONFIG={"url": "falkor://localhost:6379"}
#GRAPH_DB_CONFIG={"url": "falkor://localhost:6379", "database": "falkor"}

# ArcadeDB (multi-model database with graph capabilities)
# include_basic_schema: true (default) - initial types: PERSON, ORGANIZATION, LOCATION, PLACE + Entity, TextChunk, MENTIONS
#                       false - initial types: Entity, TextChunk, MENTIONS only
#                       For both settings: LlamaIndex PathExtractors/KG_EXTRACTOR_TYPE control additional types
# Remote mode (default) — requires a running ArcadeDB server
#GRAPH_DB_CONFIG={"host": "localhost", "port": 2480, "username": "root", "password": "playwithdata", "database": "flexible_graphrag", "include_basic_schema": true}
#GRAPH_DB_CONFIG={"host": "localhost", "port": 2480, "username": "root", "password": "playwithdata", "database": "flexible_graphrag", "include_basic_schema": false}
# Embedded mode — no separate server required; install arcadedb-embedded>=26.2.1 first
# embedded_server: false (default) - pure in-process, no HTTP endpoint exposed
# embedded_server: true - also starts an HTTP server on embedded_server_port (default 2482)
#GRAPH_DB_CONFIG={"mode": "embedded", "db_path": "./arcadedb_data", "database": "flexible_graphrag", "include_basic_schema": true}
#GRAPH_DB_CONFIG={"mode": "embedded", "db_path": "./arcadedb_data", "database": "flexible_graphrag", "embedded_server": true, "embedded_server_port": 2482, "embedded_server_password": "playwithdata", "include_basic_schema": true}

# MemGraph (real-time graph database)
# Basic configuration:
#GRAPH_DB_CONFIG={"url": "bolt://localhost:7688", "username": "", "password": ""}
# Full configuration with database parameter:
#GRAPH_DB_CONFIG={"url": "bolt://localhost:7688", "username": "", "password": "", "database": "memgraph"}

# NebulaGraph (distributed graph database)
# Basic configuration (uses defaults for connection):
#GRAPH_DB_CONFIG={"space": "flexible_graphrag", "overwrite": true}
# Full configuration with connection parameters (address/port format):
#GRAPH_DB_CONFIG={"space": "flexible_graphrag", "overwrite": true, "address": "localhost", "port": 9669, "username": "root", "password": "nebula"}
# Full configuration with URL format:
#GRAPH_DB_CONFIG={"space": "flexible_graphrag", "overwrite": true, "url": "nebula://localhost:9669", "username": "root", "password": "nebula"}
# Alternative format (space_name also supported for backward compatibility):
#GRAPH_DB_CONFIG={"space_name": "flexible_graphrag", "address": "localhost", "port": 9669, "username": "root", "password": "nebula"}

# Amazon Neptune (managed graph database service)
# With explicit AWS credentials:
#GRAPH_DB_CONFIG={"host": "your-neptune-cluster.cluster-xyz.us-east-1.neptune.amazonaws.com", "port": 8182, "region": "us-east-1", "access_key": "your_access_key", "secret_key": "your_secret_key"}
# With AWS credentials profile (alternative approach):
#GRAPH_DB_CONFIG={"host": "your-neptune-cluster.cluster-xyz.us-east-1.neptune.amazonaws.com", "port": 8182, "region": "us-east-1", "credentials_profile_name": "my-aws-profile"}

# Amazon Neptune Analytics (serverless graph analytics engine)
# NOTE: Neptune Analytics has non-atomic vector index limitations
# The system automatically sets embed_kg_nodes=False for Neptune Analytics to avoid vector conflicts
# Use a separate VECTOR_DB for embeddings when using Neptune Analytics
# With explicit AWS credentials:
#GRAPH_DB_CONFIG={"graph_identifier": "g-1234567890", "region": "us-east-1", "access_key": "your_access_key", "secret_key": "your_secret_key"}
# With AWS credentials profile (alternative approach):
#GRAPH_DB_CONFIG={"graph_identifier": "g-1234567890", "region": "us-east-1", "credentials_profile_name": "my-aws-profile"}
# Using default AWS credentials (from environment variables, IAM role, etc.):
#GRAPH_DB_CONFIG={"graph_identifier": "g-1234567890", "region": "us-east-1"}
# Note: Neptune Analytics requires a region to be specified
# Recommended: Use Neptune Analytics for graph + separate VECTOR_DB for embeddings


# ====================================================================
# SCHEMA CONFIGURATION - Controls entity and relationship extraction
# ====================================================================

# default uses the schema built into LlamaIndex (no schema passed to SchemaLLMPathExtractor or DynamicLLMPathExtractor)
SCHEMA_NAME=default

# Schema set to sample uses SAMPLE_SCHEMA in config.py
# "entities": ["PERSON", "ORGANIZATION", "LOCATION", "PLACE", "TECHNOLOGY", "PROJECT"],
# "relations": ["WORKS_FOR", "LOCATED_IN", "USES", "COLLABORATES_WITH", "DEVELOPS", "HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
# "validation_schema": [ list of [entity-type, relation-type, entity-type] triplets ]
#SCHEMA_NAME=sample

# Custom schema example for SchemaLLMPathExtractor (and DynamicLLMPathExtractor)
#SCHEMA_NAME=cmis_press
#SCHEMAS=[{"name": "cmis_press", "schema": {"entities": ["PERSON", "ORGANIZATION", "TECHNOLOGY", "SPECIFICATION", "CONCEPT"], "relations": ["WORKS_FOR", "DEVELOPS", "SUPPORTS", "IMPLEMENTS", "COLLABORATES_WITH", "ANNOUNCES"], "validation_schema": [["PERSON", "WORKS_FOR", "ORGANIZATION"], ["PERSON", "DEVELOPS", "TECHNOLOGY"], ["ORGANIZATION", "SUPPORTS", "SPECIFICATION"], ["ORGANIZATION", "IMPLEMENTS", "TECHNOLOGY"], ["ORGANIZATION", "COLLABORATES_WITH", "ORGANIZATION"], ["ORGANIZATION", "ANNOUNCES", "SPECIFICATION"]], "strict": false}}]

# ====================================================================
# 2. VECTOR DATABASE CONFIGURATION  
# ====================================================================

#VECTOR_DB=neo4j
VECTOR_DB=qdrant
#VECTOR_DB=elasticsearch
#VECTOR_DB=opensearch
#VECTOR_DB=chroma
#VECTOR_DB=milvus
#VECTOR_DB=weaviate
#VECTOR_DB=pinecone
#VECTOR_DB=postgres
#VECTOR_DB=lancedb
#VECTOR_DB=none

# Vector Database Connection Configurations:

# Qdrant
VECTOR_DB_CONFIG={"host": "localhost", "port": 6333, "collection_name": "hybrid_search_vector", "https": false}

# Elasticsearch
#VECTOR_DB_CONFIG={"url": "http://localhost:9200", "index_name": "hybrid_search_vector"}

# OpenSearch Configuration (Vector Store) - Dense vector search
#VECTOR_DB_CONFIG={"url": "http://localhost:9201", "index_name": "hybrid_search_vector"}

# Neo4j VECTOR database configuration (separate from graph)
#VECTOR_DB_CONFIG={"url": "bolt://localhost:7687", "username": "neo4j", "password": "password", "index_name": "hybrid_search_vector", "database": "neo4j"}

# Chroma (local vector database with persistence)
# Local mode (file-based storage):
#VECTOR_DB_CONFIG={"persist_directory": "./chroma_db", "collection_name": "hybrid_search"}
# HTTP mode (connect to remote ChromaDB server):
#VECTOR_DB_CONFIG={"host": "localhost", "port": 8001, "collection_name": "hybrid_search"}

# Milvus (scalable vector database)
#VECTOR_DB_CONFIG={"host": "localhost", "port": 19530, "collection_name": "hybrid_search", "username": "root", "password": "milvus"}

# Weaviate (vector search engine with semantic capabilities)
# For local Docker instance (no authentication):
#VECTOR_DB_CONFIG={"url": "http://localhost:8081", "index_name": "HybridSearch"}
# For authenticated instance (with API key):
#VECTOR_DB_CONFIG={"url": "http://localhost:8081", "index_name": "HybridSearch", "api_key": "your_weaviate_api_key"}

# Pinecone (managed serverless vector database service)
# Sign up at https://app.pinecone.io (free starter plan available)
# Note: dimension is auto-detected from your embedding model, don't include it in config
#VECTOR_DB_CONFIG={"api_key": "your_pinecone_api_key", "region": "us-east-1", "cloud": "aws", "index_name": "hybrid-search", "metric": "cosine"}

# PostgreSQL with pgvector extension
#VECTOR_DB_CONFIG={"host": "localhost", "port": 5433, "database": "postgres", "username": "postgres", "password": "password", "table_name": "hybrid_search_vectors"}

# LanceDB (modern embedded vector database)
#VECTOR_DB_CONFIG={"uri": "./lancedb", "table_name": "hybrid_search", "vector_column_name": "vector", "text_column_name": "text"}


# ====================================================================
# 3. SEARCH DATABASE CONFIGURATION
# ====================================================================

#SEARCH_DB=bm25
SEARCH_DB=elasticsearch
#SEARCH_DB=opensearch
#SEARCH_DB=none

# Search Database Connection Configurations:
# Elasticsearch
SEARCH_DB_CONFIG={"url": "http://localhost:9200", "index_name": "hybrid_search_fulltext"}

# OpenSearch Configuration (Search Store) - BM25 fulltext search
#SEARCH_DB_CONFIG={"url": "http://localhost:9201", "index_name": "hybrid_search_fulltext"}

# ====================================================================
# 4. LLM CONFIGURATION
# ====================================================================
# For detailed configuration examples and all options, see: docs/LLM-EMBEDDING-CONFIG.md

# LLM Provider Selection
LLM_PROVIDER=openai
#LLM_PROVIDER=ollama
#LLM_PROVIDER=azure_openai
#LLM_PROVIDER=gemini
#LLM_PROVIDER=anthropic
#LLM_PROVIDER=vertex_ai
#LLM_PROVIDER=bedrock
#LLM_PROVIDER=groq
#LLM_PROVIDER=fireworks

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY=your-openai-api-key-here
OPENAI_MODEL=gpt-4o-mini
# OPENAI_TIMEOUT=120.0  # LLM request timeout in seconds (default: 2 minutes)

# Azure OpenAI Configuration (if using Azure OpenAI)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples
# AZURE_OPENAI_API_KEY=your-azure-openai-key
# AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
# AZURE_OPENAI_ENGINE=gpt-4o-mini  # Your deployment name
# AZURE_OPENAI_MODEL=gpt-4o-mini
# AZURE_OPENAI_API_VERSION=2024-12-01-preview

# Google Gemini Configuration (if using Gemini)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples
# GOOGLE_API_KEY=your-google-api-key
# GEMINI_MODEL=gemini-2.0-flash
# Note: Gemini has known issues with graph search (use with GRAPH_DB=none for vector+search only)

# Anthropic Claude Configuration (if using Claude)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples
# ANTHROPIC_API_KEY=your-anthropic-api-key
# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
# Note: Claude only creates chunk nodes during graph building (no entity/relationship extraction)

# Google Vertex AI Configuration (if using Vertex AI)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples and package options
# VERTEX_AI_PROJECT=your-gcp-project-id  # Required
# VERTEX_AI_LOCATION=us-central1
# VERTEX_AI_MODEL=gemini-2.0-flash-001
# VERTEX_AI_CREDENTIALS_PATH=/path/to/service-account-key.json  # Optional
# See: https://developers.llamaindex.ai/python/examples/llm/google_genai/#vertex-ai-support

# Amazon Bedrock Configuration (if using Bedrock)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples and available models
# BEDROCK_MODEL=anthropic.claude-3-5-sonnet-20241022-v2:0
# BEDROCK_REGION=us-east-1
# BEDROCK_ACCESS_KEY=your-aws-access-key  # Optional: uses default AWS credentials if not provided
# BEDROCK_SECRET_KEY=your-aws-secret-key

# Groq Configuration (if using Groq - ultra-fast inference)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples and available models
# See pricing: https://groq.com/pricing
# GROQ_API_KEY=your-groq-api-key  # Get from https://console.groq.com
# GROQ_MODEL=llama-3.3-70b-versatile  # Recommended for speed
# Note: Groq doesn't provide embeddings - defaults to local Ollama embeddings

# Fireworks AI Configuration (if using Fireworks - fast compound AI)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples and available models
# See models: https://fireworks.ai/models
# FIREWORKS_API_KEY=your-fireworks-api-key  # Get from https://fireworks.ai
# FIREWORKS_MODEL=accounts/fireworks/models/llama-v3p3-70b-instruct

# Ollama Configuration (if using Ollama - local LLM)
# See docs/OLLAMA-CONFIGURATION.md for detailed setup and optimization
#OLLAMA_MODEL=llama3.2:3b              # 3B params - good balance, recommended default
#OLLAMA_BASE_URL=http://localhost:11434
#OLLAMA_TIMEOUT=900.0  # 15 minutes timeout for graph extraction (default: 900.0)
# Note: Graph extraction with Ollama can be slow (5-15 minutes per document)
# Increase OLLAMA_TIMEOUT if you get ReadTimeout errors during processing

# ====================================================================
# EMBEDDING CONFIGURATION (Independent of LLM Provider)
# ====================================================================
# Embeddings can be configured separately from your LLM provider.
# For detailed configuration examples, see: docs/LLM-EMBEDDING-CONFIG.md
#
# EMBEDDING_KIND: openai, ollama, google, vertex, azure, bedrock, fireworks
# EMBEDDING_MODEL: Specific model name
# EMBEDDING_DIMENSION: Optional explicit dimension override
#
# Quick Examples (see docs for full details):
# - Local Ollama embeddings: EMBEDDING_KIND=ollama, EMBEDDING_MODEL=nomic-embed-text (768 dims)
# - OpenAI embeddings: EMBEDDING_KIND=openai, EMBEDDING_MODEL=text-embedding-3-small (1536 dims)
# - Vertex AI embeddings: EMBEDDING_KIND=vertex, EMBEDDING_MODEL=text-embedding-004 (768 dims)
# - Bedrock embeddings: EMBEDDING_KIND=bedrock, EMBEDDING_MODEL=amazon.titan-embed-text-v2:0 (1024 dims)
# - Fireworks embeddings: EMBEDDING_KIND=fireworks, EMBEDDING_MODEL=nomic-ai/nomic-embed-text-v1.5 (768 dims)

# OpenAI Embeddings
EMBEDDING_KIND=openai
EMBEDDING_MODEL=text-embedding-3-small

# Ollama Embeddings (if using Ollama embeddings)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed model options
# WARNING: all-minilm (384 dims, 512 context) is NOT recommended - its 512-token limit causes errors
# when embedding graph nodes (combined entity + relationship text). Use nomic-embed-text instead.
# EMBEDDING_KIND=ollama
#EMBEDDING_MODEL=nomic-embed-text       # 768 dims, 8192 context - recommended default
#EMBEDDING_MODEL=mxbai-embed-large      # 1024 dims, 512 context - higher quality but limited context
#EMBEDDING_MODEL=all-minilm             # 384 dims, 512 context - NOT RECOMMENDED (context too small)

# ====================================================================
# 5. CONTENT SOURCES CONFIGURATION
# ====================================================================

# Data Source Type (mainly for REST API and MCP server usage)
# Options: filesystem, cmis, alfresco, upload, web, wikipedia, youtube, s3, gcs, azure_blob, onedrive, sharepoint, box, google_drive
# Note: UI clients typically override this setting with their own data source selection
DATA_SOURCE=filesystem

# Source Paths (for filesystem data source - REST API and MCP server)
# Note: UI clients use file upload dialogs instead of this configuration
# For path format examples, see docs/DATA-SOURCE-CONFIGURATION.md
SOURCE_PATHS=["./sample-docs/cmispress.txt"]

# CMIS Configuration (if using CMIS)
CMIS_URL=http://localhost:8080/alfresco/api/-default-/public/cmis/versions/1.1/atom
CMIS_USERNAME=admin
CMIS_PASSWORD=admin

# Alfresco Configuration (if using Alfresco)
# Note: Requires python-alfresco-api >= 1.1.5 for optimal performance
# Paths: Use short format like "/Shared/GraphRAG" (recommended - matches Alfresco Share UI)
#        Full format like "/Company Home/Shared/GraphRAG" also works (prefix is auto-stripped)
#        Both formats supported - system handles /Company Home prefix automatically
ALFRESCO_URL=http://localhost:8080
ALFRESCO_USERNAME=admin
ALFRESCO_PASSWORD=admin
# ALFRESCO_STOMP_PORT: ActiveMQ STOMP port for real-time events (default: 61613)
# Note: If you've changed the ActiveMQ STOMP port in docker-compose (e.g., to 8613), set it here
#ALFRESCO_STOMP_PORT=8613

# Amazon S3 Configuration (if using S3)
# Note: These values serve as defaults/fallbacks when S3_CONFIG is not provided or incomplete
# S3_CONFIG (in the Data Source Configuration section above) takes precedence; these are used only as fallbacks
# S3_REGION_NAME=us-east-1           # AWS region for S3 bucket (default: us-east-1 if not in S3_CONFIG)
# S3_BUCKET_NAME=my-bucket           # Default bucket name (used if not specified in S3_CONFIG)
# S3_PREFIX=documents/               # Default prefix/path within bucket
# S3_PREFIX=""                       # Or leave empty for whole bucket (top level)
# S3_ACCESS_KEY=myaccesskeyid        # AWS access key ID (used if not specified in S3_CONFIG)
# S3_SECRET_KEY=mysecretaccesskey    # AWS secret access key (used if not specified in S3_CONFIG)

# ====================================================================
# DATABASE CONNECTION DETAILS (Individual Configs)
# ====================================================================

# Neo4j Configuration (for both vector and graph storage)
# Browser URL: http://localhost:7474/browser (for database management and queries)
NEO4J_URI=bolt://localhost:7687
NEO4J_USER=neo4j
NEO4J_PASSWORD=password
# NEO4J_DATABASE=neo4j  # Optional: specify database name (default: neo4j)

# Neo4j AuraDB (cloud) example:
# NEO4J_URI=neo4j+s://<dbid>.databases.neo4j.io
# NEO4J_USER=neo4j
# NEO4J_PASSWORD=<aura-generated-password>
# Console URL: https://console.neo4j.io

# Elasticsearch Configuration
ELASTICSEARCH_URL=http://localhost:9200
ELASTICSEARCH_USERNAME=
ELASTICSEARCH_PASSWORD=

# Qdrant Configuration
QDRANT_HOST=localhost
QDRANT_PORT=6333
QDRANT_API_KEY=
# QDRANT_COLLECTION=hybrid_search  # Optional: collection name
# QDRANT_HTTPS=false  # Use HTTPS for remote Qdrant instances

# Weaviate Configuration
WEAVIATE_URL=http://localhost:8081
WEAVIATE_INDEX_NAME=HybridSearch
# WEAVIATE_API_KEY=your_weaviate_api_key  # Optional: for authenticated instances
# WEAVIATE_TEXT_KEY=content  # Optional: field name for text content

# OpenSearch Configuration
OPENSEARCH_URL=http://localhost:9201
OPENSEARCH_USERNAME=
OPENSEARCH_PASSWORD=

# Processing Configuration
CHUNK_SIZE=1024
CHUNK_OVERLAP=128

#CHUNK_SIZE=512
#CHUNK_OVERLAP=64

# Knowledge Graph Extraction Limits (configurable for different content densities)
# MAX_TRIPLETS_PER_CHUNK: Used by DynamicLLMPathExtractor and SchemaLLMPathExtractor - controls how many entity-relationship triplets can be extracted per text chunk
# MAX_PATHS_PER_CHUNK: Used by SimpleLLMPathExtractor - controls how many relationship paths can be extracted per text chunk
# Higher values allow more comprehensive extraction from dense content but may increase processing time
# Lower values are faster but may miss entities/relationships in complex documents
# The commented-out values of 100 below are higher than the usual 20-25 seen in examples;
# they are there mainly to rule out these limits as the cause of limited extraction.
#MAX_TRIPLETS_PER_CHUNK=100
#MAX_PATHS_PER_CHUNK=100

MAX_TRIPLETS_PER_CHUNK=20
MAX_PATHS_PER_CHUNK=20

# Timeout configurations moved to docs/TIMEOUT-CONFIGURATIONS.md
# Uncomment and adjust these if you need custom timeout values:
# 
# DOCLING_TIMEOUT=600  # Docling conversion timeout per document (seconds, default: 10 minutes)
# DOCLING_CANCEL_CHECK_INTERVAL=0.5  # How often to check for cancellation (seconds)
#   - We wrap Docling in background executor with periodic polling for cancellation
#   - Lower = more responsive, higher = less CPU overhead
# 
# KG_EXTRACTION_TIMEOUT=3600  # Knowledge graph extraction timeout per document
# KG_CANCEL_CHECK_INTERVAL=2.0  # How often to check for cancellation during KG extraction
# 
# OPENAI_TIMEOUT=120.0  # For OpenAI LLM requests
# OLLAMA_TIMEOUT=900.0  # For Ollama LLM requests (15 minutes; matches the default stated in the Ollama configuration section)

# ====================================================================
# 6. INCREMENTAL UPDATES CONFIGURATION (Optional - for auto-sync monitoring)
# ====================================================================

# Enable incremental updates system (automatic monitoring and sync)
# Set to "true" to enable auto-sync capabilities
# Requires PostgreSQL for state management
# Default: false (incremental updates disabled)
ENABLE_INCREMENTAL_UPDATES=false

# PostgreSQL connection for state management (required if ENABLE_INCREMENTAL_UPDATES=true)
# Recommended: Reuse existing postgres-pgvector service on port 5433
# Create database: flexible_graphrag_incremental (separate from pgvector database)
# Setup: See flexible-graphrag/incremental_updates/schema.sql
# POSTGRES_INCREMENTAL_URL=postgresql://postgres:password@localhost:5433/flexible_graphrag_incremental

# Optional: Periodic refresh interval (seconds) - how often to scan datasources
# Default: 3600 (1 hour) - datasources refresh automatically every hour
# Individual datasources can override this via API
# INCREMENTAL_REFRESH_INTERVAL=3600

# Optional: Watchdog filesystem delay (seconds) - debounce for file changes
# Default: 60 (1 minute) - filesystem changes are processed after 1 minute of inactivity
# Individual datasources can override this via API
# INCREMENTAL_WATCHDOG_DELAY=60

# Note: Individual datasources created via UI "Enable Sync" checkbox can override
# these defaults with custom refresh intervals and watchdog delays per datasource

# Setup Instructions: See flexible-graphrag/incremental_updates/SETUP-GUIDE.md
# - PostgreSQL schema setup
# - Database creation steps
# - Configuration details
# - Troubleshooting guide

# =============================================================================
# 7. OBSERVABILITY CONFIGURATION (OpenTelemetry Traces and Metrics)
# =============================================================================
# Enable observability with OpenTelemetry, Prometheus, Grafana, Jaeger
# Set to "true" to enable automatic instrumentation of LlamaIndex operations
ENABLE_OBSERVABILITY=false

# Observability Backend Mode (choose your telemetry producer)
# Options:
#   "openinference" - Default, trace-focused, requires spanmetrics connector for token metrics
#   "openlit" - Alternative with built-in token metrics (gen_ai_usage_*_tokens_total)
#   "both" - DUAL MODE (recommended!) - Best of both worlds!
#     - OpenInference: Detailed traces with full LlamaIndex operation visibility
#     - OpenLIT: Token metrics, cost tracking, VectorDB metrics out-of-the-box
#     - Custom metrics: Graph extraction, retrieval, document processing
# Recommendation: Use "both" for complete observability with all metrics
OBSERVABILITY_BACKEND=both

# OTLP (OpenTelemetry Protocol) exporter endpoint
# Default: http://localhost:4318 (OTLP HTTP receiver)
# For gRPC: use http://localhost:4317
# For cloud services (SigNoz, Langfuse, etc.), use their endpoint
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318

# Service metadata for traces
OTEL_SERVICE_NAME=flexible-graphrag
OTEL_SERVICE_VERSION=1.0.0
OTEL_SERVICE_NAMESPACE=llm-apps

# Enable automatic LlamaIndex instrumentation (captures all LlamaIndex operations)
# Set to "false" if you only want manual tracing with custom decorators
ENABLE_LLAMA_INDEX_INSTRUMENTATION=true