# Flexible GraphRAG Configuration

# ====================================================================
# Property Graph, RDF Graph, Vector, Search Database Selection
# ====================================================================

# Property graph store
# LI + LC (both LlamaIndex and LangChain backends supported):
PG_GRAPH_DB=neo4j
#PG_GRAPH_DB=arcadedb
#PG_GRAPH_DB=falkordb
#PG_GRAPH_DB=ladybug
#PG_GRAPH_DB=memgraph
#PG_GRAPH_DB=nebula
#PG_GRAPH_DB=neptune
#PG_GRAPH_DB=neptune_analytics
# LI only (LlamaIndex backend only):
#PG_GRAPH_DB=spanner
# LC only (LangChain backend only — auto-sets GRAPH_BACKEND=langchain):
#PG_GRAPH_DB=arangodb
#PG_GRAPH_DB=apache_age
#PG_GRAPH_DB=cosmos_gremlin
#PG_GRAPH_DB=hugegraph
#PG_GRAPH_DB=tigergraph
#PG_GRAPH_DB=surrealdb
#PG_GRAPH_DB=none

# RDF graph store
# (can have both a rdf and a property graph store)
#RDF_GRAPH_DB=fuseki
#RDF_GRAPH_DB=oxigraph
#RDF_GRAPH_DB=graphdb
#RDF_GRAPH_DB=neptune_rdf
RDF_GRAPH_DB=none

#VECTOR_DB=neo4j
VECTOR_DB=qdrant
#VECTOR_DB=elasticsearch
#VECTOR_DB=opensearch
#VECTOR_DB=chroma
#VECTOR_DB=milvus
#VECTOR_DB=weaviate
#VECTOR_DB=pinecone
#VECTOR_DB=postgres
#VECTOR_DB=lancedb
#VECTOR_DB=none

#SEARCH_DB=bm25
SEARCH_DB=elasticsearch
#SEARCH_DB=opensearch
#SEARCH_DB=none


# ====================================================================
# FRAMEWORK CONFIGURATION (LlamaIndex, LangChain)
# ====================================================================

# The framework settings below (chunker, graph, vector, search, KG extractor, fusion retrieval)
# can be configured as all llamaindex (default), all langchain, or a mix of the two.
# Ingest/reading always uses LlamaIndex readers

# Chunker backend: llamaindex (default, SentenceSplitter) | langchain (text splitters)
CHUNKER_BACKEND=llamaindex
#CHUNKER_BACKEND=langchain

# Property graph backend.
# LC-only stores (arangodb, apache_age, cosmos_gremlin, hugegraph, tigergraph,
# surrealdb) auto-select langchain — no need to set explicitly for those.
# Note: spanner is LI-only (LlamaIndex backend only); LangChain is not supported for it.
GRAPH_BACKEND=llamaindex
#GRAPH_BACKEND=langchain

VECTOR_BACKEND=llamaindex
#VECTOR_BACKEND=langchain

SEARCH_BACKEND=llamaindex
#SEARCH_BACKEND=langchain

# KG extraction: llamaindex (default, SchemaLLMPathExtractor / DynamicLLMPathExtractor)
# or langchain (LLMGraphTransformer). Does NOT auto-select for LC-only stores —
# set explicitly when GRAPH_BACKEND=langchain and you want LLMGraphTransformer.
KG_EXTRACTOR_BACKEND=llamaindex
#KG_EXTRACTOR_BACKEND=langchain

# Retrieval fusion framework
# 'llamaindex' (default) — QueryFusionRetriever with relative_score mode;
# 'langchain'            — LangChain EnsembleRetriever (RRF);
RETRIEVAL_FUSION=llamaindex
#RETRIEVAL_FUSION=langchain


# =============================================================================
# Document Parser Configuration (Docling / LlamaParse)
# =============================================================================

# Document Parser Configuration
# DOCUMENT_PARSER can be "docling" (default, open-source) or "llamaparse" (cloud-based, requires API key)
# Docling: Free, open-source, runs locally, good for most document types
# LlamaParse: Cloud-based parsing service from LlamaIndex, advanced multimodal parsing, requires API key
DOCUMENT_PARSER=docling
#DOCUMENT_PARSER=llamaparse

# Save parsing results to files for inspection (works for both Docling and LlamaParse)
# When enabled, saves parsing outputs to ./parsing_output/
# - Docling: Saves both markdown (.md) and plaintext (.txt) outputs + metadata JSON
# - LlamaParse: Saves both markdown (.md) and plaintext (.txt) outputs + metadata JSON
# Useful for debugging and understanding how parsers interpret documents
#SAVE_PARSING_OUTPUT=false
#SAVE_PARSING_OUTPUT=true

# Format to use for knowledge graph extraction (what gets sent to LLM for entity/relation extraction)
# Applies to both Docling and LlamaParse
# Options:
#   "auto" (default) - Use markdown if tables detected, otherwise use plaintext
#   "markdown" - Always use markdown format (better for documents with tables/structure)
#   "plaintext" - Always use plaintext format (better for text-heavy documents)
# Note: Both formats are always saved to disk when SAVE_PARSING_OUTPUT=true
#PARSER_FORMAT_FOR_EXTRACTION=auto
#PARSER_FORMAT_FOR_EXTRACTION=markdown
#PARSER_FORMAT_FOR_EXTRACTION=plaintext

# =============================================================================
# Docling Config

# Docling Device Configuration (only used if DOCUMENT_PARSER=docling)
# Options:
#   "auto" (default) - Automatically uses GPU if available, falls back to CPU
#   "cpu" - Force CPU-only processing
#   "cuda" - Force CUDA/GPU processing (requires CUDA-capable GPU and PyTorch with CUDA)
#   "mps" - Force Apple Metal Performance Shaders (Mac with Apple Silicon)
# Note: GPU processing significantly speeds up PDF/document conversion with tables
#DOCLING_DEVICE=auto
#DOCLING_DEVICE=cpu
#DOCLING_DEVICE=cuda

# Enable OCR for scanned documents or images (default: false)
# Only needed if ingesting scanned PDFs or image files with no embedded text layer.
# Engine choices:
#   auto          — docling picks best available engine (default when DOCLING_OCR=true)
#   rapidocr      — already in docling-slim[standard], no extra install needed
#   easyocr       — install: uv pip install -e ".[docling-ocr-easyocr]"
#   tesseract_cli — RECOMMENDED on Windows — uses system tesseract.exe only (no pip package).
#                   Install Tesseract then ensure it is on PATH (new terminal after install).
#                   Typical path: "C:\\Program Files\\Tesseract-OCR"
#                   Linux: apt install tesseract-ocr
#   tesserocr     — rarely worth it on Windows: pip must compile C++ bindings; the build runs
#                   "tesseract" during install, so tesseract.exe must already be on PATH (same terminal).
#                   You also need MSVC Build Tools + Tesseract/Leptonica dev headers; errors like
#                   "Tesseract library not found in LIBPATH" or "Failed to extract tesseract version"
#                   mean use tesseract_cli instead or fix PATH and install a full dev stack.
#                   Linux: apt install tesseract-ocr libtesseract-dev libleptonica-dev build-essential
#                   then: uv pip install -e ".[docling-ocr-tesserocr]"
#   ocrmac        — macOS only; install: uv pip install -e ".[docling-ocr-ocrmac]"
#DOCLING_OCR=false
#DOCLING_OCR_ENGINE=auto

# =============================================================================
# LlamaParse Config
# Note: uses llama-parse 0.6.x (1.0 API) pinned in pyproject.toml with llama-cloud<2.0.
# The 2.0 unified SDK (llama-cloud-py) requires a separate migration — not yet implemented.

# LlamaParse API Key (only needed if DOCUMENT_PARSER=llamaparse)
# Get your API key from https://cloud.llamaindex.ai/  bottom left "API Keys", then "Generate New Key"
# LlamaParse offers high-quality document parsing of complex documents
#LLAMAPARSE_API_KEY=llx-your-api-key-here

# LlamaParse Mode Configuration (only used if DOCUMENT_PARSER=llamaparse)
# Options:
#   "parse_page_without_llm" - 1 credit/page (cheapest, simple, text-only output, no markdown)
#   "parse_page_with_llm" - 3 credits/page (good balance, uses LLM, markdown output) - DEFAULT
#   "parse_page_with_agent" - 10-90 credits/page (best quality, markdown output, requires LLAMAPARSE_AGENT_MODEL)
#LLAMAPARSE_MODE=parse_page_without_llm
LLAMAPARSE_MODE=parse_page_with_llm
#LLAMAPARSE_MODE=parse_page_with_agent

# LlamaParse Agent Model (only used if LLAMAPARSE_MODE=parse_page_with_agent)
# Available models from LlamaCloud (costs vary by model):
#   "openai-gpt-4-1-mini" - 10 credits/page (recommended balance)
#   Other models available - check LlamaCloud for full list / credit cost
#LLAMAPARSE_AGENT_MODEL=openai-gpt-4-1-mini


# =============================================================================
# Data Source Configuration
# =============================================================================

# Example configurations for different data sources:
# WEB_CONFIG={"url": "https://example.com/page"}
# WIKIPEDIA_CONFIG={"query": "artificial intelligence", "language": "en", "max_docs": 1}
# YOUTUBE_CONFIG={"url": "https://www.youtube.com/watch?v=VIDEO_ID", "chunk_size_seconds": 60}

# Amazon S3 Configuration
# S3_CONFIG={"bucket_name": "my-bucket", "access_key": "AKIAIOSFODNN7EXAMPLE", "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "region_name": "us-east-1", "prefix": "documents/"}
# Alternative format using "bucket" (S3Reader compatible):
# S3_CONFIG={"bucket": "my-bucket", "access_key": "AKIAIOSFODNN7EXAMPLE", "secret_key": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", "region_name": "us-east-1", "prefix": "documents/"}
# Required: bucket_name (or bucket), access_key, secret_key
# Optional: region_name (if not specified, falls back to S3_REGION_NAME env var or default "us-east-1"), prefix (folder path within bucket)
# Note: S3_CONFIG takes precedence over individual S3_* environment variables (see section 5 below)

# Google Cloud Storage Configuration
# GCS_CONFIG={"bucket_name": "my-bucket", "project_id": "my-project-123", "credentials": "{\"type\":\"service_account\",\"project_id\":\"my-project\",...}", "prefix": "documents/"}
# Required: bucket_name, project_id, credentials (service account JSON as string)
# Optional: prefix (folder path), folder_name, service_account_key_path (alternative to credentials string)
# Note: credentials should be JSON service account key from GCP Console as a string

# Google Drive Configuration
# GOOGLE_DRIVE_CONFIG={"credentials": "{\"type\":\"service_account\",\"project_id\":\"my-project\",...}", "folder_id": "1A2B3C4D5E6F7G8H9I0J", "file_ids": ["file1-id", "file2-id"], "query": "name contains 'report'"}
# Required: Either credentials (service account JSON string) OR credentials_path OR token_path
# Optional: folder_id, file_ids (specific files), query (search query), credentials_path, token_path
# Note: credentials should be service account JSON key or OAuth credentials file path

# Azure Blob Storage Configuration
# AZURE_BLOB_CONFIG={"container_name": "documents", "account_url": "https://myaccount.blob.core.windows.net", "account_name": "myaccount", "account_key": "base64key==", "prefix": "folder/"}
# Required: container_name, account_url, account_name, account_key
# Optional: blob (specific blob name), prefix (folder path), connection_string (alternative to account_key)

# Microsoft OneDrive Configuration
# ONEDRIVE_CONFIG={"user_principal_name": "admin@tenant.onmicrosoft.com", "client_id": "12345678-1234-1234-1234-123456789abc", "client_secret": "XFA8Q~abc123...", "tenant_id": "87654321-4321-4321-4321-cba987654321", "folder_path": "/Documents"}
# Required: user_principal_name (M365 email), client_id, client_secret, tenant_id
# Optional: folder_path (default: / for root), folder_id, file_ids
# Setup instructions: docs/DATA-SOURCE-CONFIGURATION.md

# Microsoft SharePoint Configuration
# SHAREPOINT_CONFIG={"client_id": "12345678-1234-1234-1234-123456789abc", "client_secret": "XFA8Q~abc123...", "tenant_id": "87654321-4321-4321-4321-cba987654321", "site_name": "MySite", "folder_path": "/Shared Documents"}
# Required: client_id, client_secret, tenant_id, site_name (SharePoint site name, not full URL)
# Optional: site_id, folder_path (default: /), folder_id, file_ids
# Setup instructions: docs/DATA-SOURCE-CONFIGURATION.md

# Box Configuration
# Developer Token (simplest, expires in 60 minutes):
# BOX_CONFIG={"folder_id": "0", "developer_token": "your_developer_token"}
# Client Credentials Grant (production, long-lived):
# BOX_CONFIG={"folder_id": "0", "client_id": "abc123", "client_secret": "xyz789", "user_id": "12345678"}
# For detailed setup instructions, see docs/DATA-SOURCE-CONFIGURATION.md

# Sample Text for Testing (optional - overrides default Dune text)
# SAMPLE_TEXT="Luke Skywalker is a Jedi Knight from Tatooine. His father is Darth Vader, formerly known as Anakin Skywalker."


# ====================================================================
# GRAPH DATABASE CONFIGURATION
# ====================================================================

# Enable knowledge graph extraction 
#ENABLE_KNOWLEDGE_GRAPH=false
ENABLE_KNOWLEDGE_GRAPH=true

# Knowledge graph extractor type: "simple", "schema", or "dynamic"
# simple: Basic extraction without schema constraints (uses SimpleLLMPathExtractor) 
# schema: Uses provided or built-in schema for structured extraction (uses SchemaLLMPathExtractor) - default
# Use SchemaLLMPathExtractor with strict=True when you have a well-defined domain and want to ensure consistency
# Use SchemaLLMPathExtractor with strict=False when you want some flexibility while still being guided by a schema
# (for bedrock, fireworks, groq DynamicLLMPathExtractor is used instead of SchemaLLMPathExtractor to avoid a LlamaIndex issue)
# dynamic: More flexible extraction that can expand beyond the initial schema, or run with no provided schema (uses DynamicLLMPathExtractor)
KG_EXTRACTOR_TYPE=schema
#KG_EXTRACTOR_TYPE=dynamic
#KG_EXTRACTOR_TYPE=simple

# Graph Database Connection Configurations:
# Each store has its own config variable — set once, switch stores by changing PG_GRAPH_DB= above.
# All configs can coexist in this file; only the one matching PG_GRAPH_DB is active.
# Precedence: {TYPE}_GRAPH_DB_CONFIG > GRAPH_DB_CONFIG > individual env vars

# Neo4j (active when PG_GRAPH_DB=neo4j)
NEO4J_GRAPH_DB_CONFIG={"url": "bolt://localhost:7687", "username": "neo4j", "password": "password"}

# Ladybug — embedded property graph (active when PG_GRAPH_DB=ladybug)
# use_vector_index: true (default) - use Ladybug's built-in vector index for similarity search
#LADYBUG_GRAPH_DB_CONFIG={"db_dir": "./ladybug", "db_file": "database.lbug", "use_vector_index": true, "has_structured_schema": false}
LADYBUG_GRAPH_DB_CONFIG={"db_dir": "./ladybug", "db_file": "database.lbug", "use_vector_index": false, "has_structured_schema": true, "strict_schema": false}
#LADYBUG_GRAPH_DB_CONFIG={"db_dir": "./ladybug", "db_file": "database.lbug", "use_vector_index": false, "has_structured_schema": true, "strict_schema": true}
# individual vars still work as fallback:
#LADYBUG_DB_DIR=./ladybug
#LADYBUG_DB_FILE=database.lbug
#LADYBUG_USE_VECTOR_INDEX=true
#LADYBUG_USE_VECTOR_INDEX=false
#LADYBUG_STRUCTURED_SCHEMA=false
#LADYBUG_STRUCTURED_SCHEMA=true
#LADYBUG_STRICT_SCHEMA=false
#LADYBUG_STRICT_SCHEMA=true

# FalkorDB (active when PG_GRAPH_DB=falkordb)
#FALKORDB_GRAPH_DB_CONFIG={"url": "falkor://localhost:6379"}
FALKORDB_GRAPH_DB_CONFIG={"url": "falkor://localhost:6379", "database": "falkor"}


# ArcadeDB (active when PG_GRAPH_DB=arcadedb)

# Remote mode (default) — requires a running ArcadeDB server

# With LlamaIndex (GRAPH_BACKEND=llamaindex) (default)
# include_basic_schema: true (default) - initial types: PERSON, ORGANIZATION, LOCATION, PLACE + Entity, TextChunk, MENTIONS
#                       false - initial types: Entity, TextChunk, MENTIONS only
ARCADEDB_GRAPH_DB_CONFIG={"host": "localhost", "port": 2480, "username": "root", "password": "playwithdata", "database": "flexible_graphrag", "include_basic_schema": true}
#ARCADEDB_GRAPH_DB_CONFIG={"host": "localhost", "port": 2480, "username": "root", "password": "playwithdata", "database": "flexible_graphrag", "include_basic_schema": false}

# With LangChain (GRAPH_BACKEND=langchain) instead of LlamaIndex
# requires Bolt port 7689 and Bolt plugin config — see docker/includes/arcadedb.yaml
# Note: bolt_port 7689 avoids conflict with Neo4j (7687) and Memgraph (7688). Port 2480 is also needed.
#ARCADEDB_GRAPH_DB_CONFIG={"host": "localhost", "port": 2480, "bolt_port": 7689, "username": "root", "password": "playwithdata", "database": "flexible_graphrag"}

# Embedded mode — no separate server required; install arcadedb-embedded>=26.2.1 first

# embedded_server: false (default) - pure in-process, no HTTP endpoint exposed
# embedded_server: true - also starts an HTTP server on embedded_server_port (default 2482)
# include_basic_schema: true (default) - initial types: PERSON, ORGANIZATION, LOCATION, PLACE + Entity, TextChunk, MENTIONS
#                       false - initial types: Entity, TextChunk, MENTIONS only
# With LlamaIndex (GRAPH_BACKEND=llamaindex) (default)
#ARCADEDB_GRAPH_DB_CONFIG={"mode": "embedded", "db_path": "./arcadedb_data", "database": "flexible_graphrag", "include_basic_schema": true}
#ARCADEDB_GRAPH_DB_CONFIG={"mode": "embedded", "db_path": "./arcadedb_data", "database": "flexible_graphrag", "embedded_server": true, "embedded_server_port": 2482, "embedded_server_password": "playwithdata", "include_basic_schema": true}


# Memgraph (active when PG_GRAPH_DB=memgraph)
#MEMGRAPH_GRAPH_DB_CONFIG={"url": "bolt://localhost:7688", "username": "", "password": ""}
MEMGRAPH_GRAPH_DB_CONFIG={"url": "bolt://localhost:7688", "username": "", "password": "", "database": "memgraph"}

# NebulaGraph (active when PG_GRAPH_DB=nebula)
# Note: Nebula Studio (localhost:7001) login — use host "nebula-graphd", port 9669, user root, password nebula.
NEBULA_GRAPH_DB_CONFIG={"space": "flexible_graphrag", "overwrite": true, "address": "localhost", "port": 9669, "username": "root", "password": "nebula"}
# Basic configuration (uses defaults for connection):
#NEBULA_GRAPH_DB_CONFIG={"space": "flexible_graphrag", "overwrite": true}
# Without overwrite (space must be pre-created manually):
#NEBULA_GRAPH_DB_CONFIG={"space": "flexible_graphrag", "address": "localhost", "port": 9669, "username": "root", "password": "nebula"}
# Full configuration with URL format:
#NEBULA_GRAPH_DB_CONFIG={"space": "flexible_graphrag", "overwrite": true, "url": "nebula://localhost:9669", "username": "root", "password": "nebula"}
# Alternative format (space_name also supported for backward compatibility):
#NEBULA_GRAPH_DB_CONFIG={"space_name": "flexible_graphrag", "address": "localhost", "port": 9669, "username": "root", "password": "nebula"}

# Amazon Neptune (active when PG_GRAPH_DB=neptune)
#NEPTUNE_GRAPH_DB_CONFIG={"host": "your-neptune-cluster.cluster-xyz.us-east-1.neptune.amazonaws.com", "port": 8182, "region": "us-east-1"}
# With explicit AWS credentials:
#NEPTUNE_GRAPH_DB_CONFIG={"host": "your-neptune-cluster.cluster-xyz.us-east-1.neptune.amazonaws.com", "port": 8182, "region": "us-east-1", "access_key": "your_access_key", "secret_key": "your_secret_key"}
# With AWS credentials profile (alternative approach):
#NEPTUNE_GRAPH_DB_CONFIG={"host": "your-neptune-cluster.cluster-xyz.us-east-1.neptune.amazonaws.com", "port": 8182, "region": "us-east-1", "credentials_profile_name": "my-aws-profile"}

# Amazon Neptune Analytics (active when PG_GRAPH_DB=neptune_analytics)
# With explicit AWS credentials:
NEPTUNE_ANALYTICS_GRAPH_DB_CONFIG={"graph_identifier": "g-1234567890", "region": "us-east-1", "access_key": "your_access_key", "secret_key": "your_secret_key"}
# With AWS credentials profile (alternative approach):
#NEPTUNE_ANALYTICS_GRAPH_DB_CONFIG={"graph_identifier": "g-1234567890", "region": "us-east-1", "credentials_profile_name": "my-aws-profile"}
# Using default AWS credentials (from environment variables, IAM role, etc.):
#NEPTUNE_ANALYTICS_GRAPH_DB_CONFIG={"graph_identifier": "g-1234567890", "region": "us-east-1"}

# LangChain-only property graph stores

# ArangoDB (active when PG_GRAPH_DB=arangodb)
ARANGODB_GRAPH_DB_CONFIG={"url": "http://localhost:8529", "database": "flexible-graphrag", "username": "root", "password": "testpass", "graph_name": "knowledge_graph"}

# Apache AGE / PostgreSQL — AGE graph + pgvector in one database (port 5434)
# Database: flexible_graphrag_age  (separate from postgres-pgvector at 5433)
# AGE_SEARCH_TYPE: "hybrid" (vector + full-text RRF) or "vector" (pure similarity)
APACHE_AGE_GRAPH_DB_CONFIG={"host": "localhost", "port": 5434, "database": "flexible_graphrag_age", "username": "postgres", "password": "password", "graph_name": "knowledge_graph", "collection_name": "langchain_age_vectors", "search_type": "hybrid"}

# Azure Cosmos DB Gremlin API (active when PG_GRAPH_DB=cosmos_gremlin)
# Cloud (Azure Cosmos DB for Gremlin):
#   url:                   wss://<account>.gremlin.cosmos.azure.com:443/
#   username:              /dbs/<database>/colls/<graph-container>
#   password:              primary key from Azure portal (Keys -> Primary Key)
#   partition_key_property: property name matching your container's partition key path
#                           (e.g. container created with /partitionKey -> use "partitionKey")
#                           Default: "partitionKey" (auto-applied for cosmos.azure.com URLs)
#   partition_key_value:    fixed value written on every vertex — keeps all graph data in one
#                           logical partition so traversals never cross partition boundaries.
#                           Do NOT use entity type — that scatters vertices. Default: "graph"
#
# Auto-create graph container (optional): add resource_group + subscription_id.
#   Requires: pip install azure-mgmt-cosmosdb azure-identity
#   The adapter will create the Gremlin database and graph container on startup if they
#   don't exist. Without these keys the auto-create is silently skipped.
#
#   IMPORTANT — use ClientSecretCredential (service principal) to avoid antivirus false
#   positives on Windows. DefaultAzureCredential spawns az CLI / PowerShell subprocesses
#   to fetch tokens, which Norton 360 and similar tools flag as suspicious behavior.
#   Add tenant_id + client_id + client_secret to use pure-HTTPS auth instead:
#     tenant_id:    Azure AD tenant ID (Portal: Azure Active Directory -> Overview)
#     client_id:    App registration Application (client) ID
#     client_secret: App registration client secret value
#   Create a service principal: az ad sp create-for-rbac --name flexible-graphrag-cosmos
#   Grant it the Cosmos DB Operator role on your account:
#     az role assignment create --assignee <client_id> \
#       --role "Cosmos DB Operator" --scope /subscriptions/<sub>/resourceGroups/<rg>
#
# Manual creation (Azure Portal): Cosmos DB account -> Data Explorer -> New Graph
#   Database id: graphdb | Graph id: knowledge_graph | Partition key: /partitionKey
# Manual creation (Azure CLI):
#   az cosmosdb gremlin database create --account-name <acct> --name graphdb --resource-group <rg>
#   az cosmosdb gremlin graph create --account-name <acct> --database-name graphdb \
#       --name knowledge_graph --partition-key-path /partitionKey --resource-group <rg>
#COSMOS_GREMLIN_GRAPH_DB_CONFIG={"url": "wss://my-cosmos.gremlin.cosmos.azure.com:443/", "username": "/dbs/graphdb/colls/knowledge_graph", "password": "your_primary_key==", "partition_key_property": "partitionKey", "partition_key_value": "graph"}
#COSMOS_GREMLIN_GRAPH_DB_CONFIG={"url": "wss://my-cosmos.gremlin.cosmos.azure.com:443/", "username": "/dbs/graphdb/colls/knowledge_graph", "password": "your_primary_key==", "partition_key_property": "partitionKey", "partition_key_value": "graph", "subscription_id": "your-sub-id", "resource_group": "your-rg", "tenant_id": "your-tenant-id", "client_id": "your-client-id", "client_secret": "your-client-secret"}
# Local TinkerPop Gremlin Server (Docker port 8182 — docker/includes/gremlin-server.yaml):
COSMOS_GREMLIN_GRAPH_DB_CONFIG={"url": "ws://localhost:8182/gremlin", "username": "/", "password": ""}

# Google Cloud Spanner Graph (active when PG_GRAPH_DB=spanner)
# LI-only: uses llama-index-spanner (SpannerPropertyGraphStore). Cloud only — no emulator support
# (the Spanner emulator supports SQL only, not Spanner Graph).
# Install: uv pip install -e ".[spanner-extras]" && uv pip uninstall llama-index
# (having both llama-index and llama-index-core causes version conflicts — uninstall the meta-package)
# LC is not supported — langchain-google-spanner requires langchain-core<1.0 (incompatible).
#
# use_flexible_schema (default true): creates {graph_name}_NODE + {graph_name}_EDGE Spanner
#   tables with a JSON properties column — "schemaless" mode. DO NOT pre-create tables or run
#   CREATE PROPERTY GRAPH manually; the library creates everything on first ingest.
#
# Authentication (in order of priority):
#   1. credentials_file key in config (service-account JSON)
#   2. GOOGLE_APPLICATION_CREDENTIALS environment variable
#   3. gcs.json auto-detected next to flexible-graphrag/ root (if present)
#   4. Application Default Credentials (gcloud auth application-default login / GCE)
#SPANNER_GRAPH_DB_CONFIG={"project_id": "my-gcp-project", "instance_id": "my-spanner-instance", "database_id": "my-database", "graph_name": "knowledge_graph"}
SPANNER_GRAPH_DB_CONFIG={"project_id": "my-gcp-project", "instance_id": "my-spanner-instance", "database_id": "my-database", "graph_name": "knowledge_graph", "credentials_file": "./gcs.json"}

# HugeGraph (active when PG_GRAPH_DB=hugegraph; retrieval-only — add_graph_documents limited)
HUGEGRAPH_GRAPH_DB_CONFIG={"host": "localhost", "port": 8082, "username": "admin", "password": "password", "database": "hugegraph"}

# TigerGraph (active when PG_GRAPH_DB=tigergraph; retrieval-only — GSQL-based)
# Docker: docker/includes/tigergraph.yaml  — GraphStudio UI: http://localhost:14240
# Port 14240 = GraphStudio / REST API (direct 1:1 host mapping)
# Port 9002  = RESTPP / GSQL HTTP API (container 9000 remapped; 9000 conflicts with Milvus MinIO)
TIGERGRAPH_GRAPH_DB_CONFIG={"host": "http://localhost", "port": 14240, "restpp_port": 9002, "database": "MyGraph", "username": "tigergraph", "password": "tigergraph"}

# SurrealDB (active when PG_GRAPH_DB=surrealdb)
# Requires the surrealdb-extras group — NOT included in langchain-extras due to conflict:
#   langchain-surrealdb==0.2.x pins surrealdb==1.0.8 strictly, conflicting with surrealdb>=2.0.0
#   and downgrading langchain-core from 1.3.x to 1.1.3.
# Install separately (accepts the downgrade):
#   uv pip install -e ".[surrealdb-extras]"
SURREALDB_GRAPH_DB_CONFIG={"url": "ws://localhost:8010/rpc", "namespace": "test", "database": "flexible_graphrag", "username": "root", "password": "root"}


# ====================================================================
# SCHEMA CONFIGURATION - Controls entity and relationship extraction
# ====================================================================

# default uses the schema built into LlamaIndex (no schema passed to SchemaLLMPathExtractor or DynamicLLMPathExtractor)
SCHEMA_NAME=default

# Schema set to sample uses SAMPLE_SCHEMA in config.py
# "entities": ["PERSON", "ORGANIZATION", "LOCATION", "PLACE", "TECHNOLOGY", "PROJECT"],
# "relations": ["WORKS_FOR", "LOCATED_IN", "USES", "COLLABORATES_WITH", "DEVELOPS", "HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
# "validation_schema": [ bunch of type relation-type type triplets ]
#SCHEMA_NAME=sample

# Custom schema example for SchemaLLMPathExtractor (and DynamicLLMPathExtractor)
#SCHEMA_NAME=cmis_press
#SCHEMAS=[{"name": "cmis_press", "schema": {"entities": ["PERSON", "ORGANIZATION", "TECHNOLOGY", "SPECIFICATION", "CONCEPT"], "relations": ["WORKS_FOR", "DEVELOPS", "SUPPORTS", "IMPLEMENTS", "COLLABORATES_WITH", "ANNOUNCES"], "validation_schema": [["PERSON", "WORKS_FOR", "ORGANIZATION"], ["PERSON", "DEVELOPS", "TECHNOLOGY"], ["ORGANIZATION", "SUPPORTS", "SPECIFICATION"], ["ORGANIZATION", "IMPLEMENTS", "TECHNOLOGY"], ["ORGANIZATION", "COLLABORATES_WITH", "ORGANIZATION"], ["ORGANIZATION", "ANNOUNCES", "SPECIFICATION"]], "strict": false}}]

STRICT_SCHEMA_VALIDATION=false
#STRICT_SCHEMA_VALIDATION=true
# true = only extract types in schema; false = schema guides but LLM can go beyond it (default)

# Disable entity and relation property extraction
# Default: false - properties enabled for all providers (OpenAI uses function calling to avoid the
# "additionalProperties: false" structured output constraint)
# Set to true only if a specific LLM provider or model produces errors during property extraction
DISABLE_PROPERTIES=false
#DISABLE_PROPERTIES=true


# ====================================================================
# VECTOR DATABASE CONFIGURATION  
# ====================================================================


# Vector Database Connection Configurations:
# Each store has its own config variable — set once, switch stores by changing VECTOR_DB= above.
# All configs can coexist in this file; only the one matching VECTOR_DB is active.
# Precedence: {TYPE}_VECTOR_DB_CONFIG > VECTOR_DB_CONFIG > individual env vars
#
# NOTE: If you change embedding dimensions (e.g. switching models), you must
# delete and recreate the vector index — existing embeddings are incompatible.

# Qdrant (active when VECTOR_DB=qdrant)
QDRANT_VECTOR_DB_CONFIG={"host": "localhost", "port": 6333, "collection_name": "hybrid_search_vector", "https": false}

# Elasticsearch (active when VECTOR_DB=elasticsearch)
ELASTICSEARCH_VECTOR_DB_CONFIG={"url": "http://localhost:9200", "index_name": "hybrid_search_vector"}

# OpenSearch (active when VECTOR_DB=opensearch)
OPENSEARCH_VECTOR_DB_CONFIG={"url": "http://localhost:9201", "index_name": "hybrid_search_vector"}

# Neo4j vector store (active when VECTOR_DB=neo4j; separate from PG_GRAPH_DB)
NEO4J_VECTOR_DB_CONFIG={"url": "bolt://localhost:7687", "username": "neo4j", "password": "password", "index_name": "hybrid_search_vector", "database": "neo4j"}

# Chroma (active when VECTOR_DB=chroma)
# Local file mode:
CHROMA_VECTOR_DB_CONFIG={"persist_directory": "./chroma_db", "collection_name": "hybrid_search"}
# HTTP client mode:
#CHROMA_VECTOR_DB_CONFIG={"host": "localhost", "port": 8001, "collection_name": "hybrid_search"}

# Milvus (active when VECTOR_DB=milvus)
MILVUS_VECTOR_DB_CONFIG={"host": "localhost", "port": 19530, "collection_name": "hybrid_search", "username": "root", "password": "milvus"}

# Weaviate (active when VECTOR_DB=weaviate)
# For local Docker instance (no authentication):
WEAVIATE_VECTOR_DB_CONFIG={"url": "http://localhost:8081", "index_name": "HybridSearch"}
# For authenticated instance (with API key):
#WEAVIATE_VECTOR_DB_CONFIG={"url": "http://localhost:8081", "index_name": "HybridSearch", "api_key": "your_weaviate_api_key"}

# Pinecone (active when VECTOR_DB=pinecone)
# Sign up at https://app.pinecone.io (free starter plan available)
# Note: dimension is auto-detected from your embedding model, don't include it in config
PINECONE_VECTOR_DB_CONFIG={"api_key": "your_pinecone_api_key", "region": "us-east-1", "cloud": "aws", "index_name": "hybrid-search", "metric": "cosine"}

# PostgreSQL pgvector (active when VECTOR_DB=postgres)
POSTGRES_VECTOR_DB_CONFIG={"host": "localhost", "port": 5433, "database": "postgres", "username": "postgres", "password": "password", "table_name": "hybrid_search_vectors"}

# LanceDB (active when VECTOR_DB=lancedb)
LANCEDB_VECTOR_DB_CONFIG={"uri": "./lancedb", "table_name": "hybrid_search", "vector_column_name": "vector", "text_column_name": "text"}


# ====================================================================
# SEARCH DATABASE CONFIGURATION
# ====================================================================


# Search Database Connection Configurations:
# Precedence: {TYPE}_SEARCH_DB_CONFIG > SEARCH_DB_CONFIG > individual env vars

# Elasticsearch (active when SEARCH_DB=elasticsearch)
ELASTICSEARCH_SEARCH_DB_CONFIG={"url": "http://localhost:9200", "index_name": "hybrid_search_fulltext"}
#ELASTICSEARCH_SEARCH_DB_CONFIG={"url": "https://localhost:9200", "index_name": "hybrid_search_fulltext", "username": "elastic", "password": "password", "verify_certs": false}

# OpenSearch (active when SEARCH_DB=opensearch)
# Default port for OpenSearch is 9200; use 9201 if running alongside Elasticsearch
OPENSEARCH_SEARCH_DB_CONFIG={"url": "http://localhost:9201", "index_name": "hybrid_search_fulltext"}
#OPENSEARCH_SEARCH_DB_CONFIG={"url": "https://localhost:9201", "index_name": "hybrid_search_fulltext", "username": "admin", "password": "admin", "verify_certs": false}


# ====================================================================
# LLM CONFIGURATION
# ====================================================================

# For detailed configuration examples and all options, see: docs/LLM-EMBEDDING-CONFIG.md

# LLM Provider Selection
LLM_PROVIDER=openai
#LLM_PROVIDER=ollama
#LLM_PROVIDER=azure_openai
#LLM_PROVIDER=gemini
#LLM_PROVIDER=anthropic
#LLM_PROVIDER=vertex_ai
#LLM_PROVIDER=bedrock
#LLM_PROVIDER=groq
#LLM_PROVIDER=fireworks
#LLM_PROVIDER=openai_like    # Any OpenAI-compatible API that is NOT vLLM (LM Studio, LocalAI, Llamafile, Jan, etc.)
#LLM_PROVIDER=vllm           # vLLM — HTTP server/Docker (all platforms) or in-process Python package (Linux/macOS only); see vLLM section below
#LLM_PROVIDER=litellm        # LiteLLM proxy (100+ providers)
#LLM_PROVIDER=openrouter     # OpenRouter (200+ models via unified API)

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY=your-openai-api-key-here
OPENAI_MODEL=gpt-4.1-mini
#OPENAI_MODEL=gpt-5-mini        #  LlamaIndex classifies gpt-5 family as reasoning (O1) models
                                 #   → temperature is forced to 1.0 regardless of OPENAI_TEMPERATURE
                                 #   → non-deterministic extraction, use gpt-4.1-mini instead
#OPENAI_MODEL=gpt-4o
#OPENAI_MODEL=gpt-4o-mini
# gpt-4.1-mini:  $0.40/$1.60 per 1M tokens, 1M ctx   — best value; respects temperature=0.0
# gpt-5-mini:    $0.25/$2.00 per 1M tokens, 400K ctx  — reasoning model; temp forced to 1.0 by LlamaIndex
# gpt-4o:        $2.50/$10.00 per 1M tokens, 128K ctx — high quality, more expensive
# gpt-4o-mini:   $0.15/$0.60 per 1M tokens, 128K ctx  — fastest/cheapest, lower extraction quality
OPENAI_TEMPERATURE=0.0  # 0.0 = deterministic extraction (recommended)
# OPENAI_TIMEOUT=120.0  # LLM request timeout in seconds (default: 2 minutes)

# Ollama Configuration (if using Ollama - local LLM)
# See docs/OLLAMA-CONFIGURATION.md for detailed setup and optimization
# See docs/LLM/LLM-TESTING-RESULTS.md for full Ollama benchmark results
OLLAMA_MODEL=gpt-oss:20b             # 20B params - RECOMMENDED for graph extraction
#OLLAMA_MODEL=llama3.1:8b             # 8B params  - NOT RECOMMENDED: very slow, hangs on 2nd doc
#OLLAMA_MODEL=llama3.2:3b             # 3B params  - NOT RECOMMENDED: stalls at 8192 context, low entity count
OLLAMA_BASE_URL=http://localhost:11434
OLLAMA_TIMEOUT=900.0  # 15 minutes timeout for graph extraction (default: 900.0)

# Azure OpenAI Configuration (if using Azure OpenAI)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples
AZURE_OPENAI_API_KEY=your-azure-openai-key
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
AZURE_OPENAI_ENGINE=gpt-4.1-mini
AZURE_OPENAI_MODEL=gpt-4.1-mini
AZURE_OPENAI_API_VERSION=2024-12-01-preview

# Google Gemini Configuration (if using Gemini)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples
GOOGLE_API_KEY=your-google-api-key
GEMINI_MODEL=gemini-3-flash-preview    # fast model - recommended default (gemini-2.5 shutdown 6/17/26)
# GEMINI_MODEL=gemini-3.1-pro-preview    # pro quality
# GEMINI_MODEL=gemini-2.5-flash          # shutdown 6/17/26

# Anthropic Claude Configuration (if using Claude)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples
ANTHROPIC_API_KEY=your-anthropic-api-key
ANTHROPIC_MODEL=claude-sonnet-4-5-20250929

# Google Vertex AI Configuration (if using Vertex AI)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples and package options
# VERTEX_AI_PROJECT=your-gcp-project-id  # Required
# VERTEX_AI_LOCATION=us-central1

VERTEX_AI_MODEL=gemini-3-flash-preview
# VERTEX_AI_MODEL=gemini-2.5-flash        # shutdown no earlier than 10/16/26
# VERTEX_AI_MODEL=gemini-3.1-pro-preview  # pro quality option
# VERTEX_AI_CREDENTIALS_PATH=/path/to/service-account-key.json  # Optional
# See: https://developers.llamaindex.ai/python/examples/llm/google_genai/#vertex-ai-support

# Amazon Bedrock Configuration (if using Bedrock)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples and available models
BEDROCK_MODEL=us.anthropic.claude-sonnet-4-5-20250929-v1:0
# BEDROCK_MODEL=openai.gpt-oss-20b-1:0
# BEDROCK_REGION=us-east-1
# BEDROCK_ACCESS_KEY=your-aws-access-key  # Optional: uses default AWS credentials if not provided
# BEDROCK_SECRET_KEY=your-aws-secret-key

# Groq Configuration (if using Groq - ultra-fast inference)
# See docs/LLM-EMBEDDING-CONFIG.md for detailed examples and available models
# See pricing: https://groq.com/pricing
# All Groq models have 131,072 context window; LlamaIndex doesn't know Groq-specific models
# and defaults to 3900 tokens — flexible-graphrag overrides this automatically.
# GROQ_API_KEY=your-groq-api-key  # Get from https://console.groq.com
GROQ_MODEL=openai/gpt-oss-20b      # 131072 ctx, 65536 max completion, 1000 t/s — recommended
# GROQ_MODEL=openai/gpt-oss-120b     # 131072 ctx, 65536 max completion, 500 t/s
# GROQ_MODEL=llama-3.3-70b-versatile # 131072 ctx, 32768 max completion, 280 t/s
# GROQ_MODEL=llama-3.1-8b-instant    # 131072 ctx, 131072 max completion, 560 t/s (smaller model)
# Note: Groq doesn't provide embeddings - defaults to local Ollama embeddings

# Fireworks AI Configuration (if using Fireworks - fast compound AI)
# Filter to: Serverless deployment + Function calling support
# All models below confirmed serverless + function calling.
# Uses DynamicLLMPathExtractor (SchemaLLMPathExtractor throws exceptions).
# flexible-graphrag uses streaming mode (_FireworksStreaming subclass) to bypass the
#   non-streaming 4096 max_tokens cap. Default max_tokens=16384.
# FIREWORKS_API_KEY=your-fireworks-api-key  # Get from https://fireworks.ai
FIREWORKS_MODEL=accounts/fireworks/models/gpt-oss-120b
#FIREWORKS_MODEL=accounts/fireworks/models/deepseek-v3p2
#FIREWORKS_MODEL=accounts/fireworks/models/minimax-m2p5
#FIREWORKS_MODEL=accounts/fireworks/models/kimi-k2p5
#FIREWORKS_MODEL=accounts/fireworks/models/qwen3-vl-30b-a3b-thinking

# OpenAI-Like Configuration (any OpenAI-compatible API — LM Studio, LocalAI, Llamafile, Jan, etc.)
# Use for third-party OpenAI-compatible servers that are NOT vLLM.
# For vLLM use LLM_PROVIDER=vllm and the VLLM_* vars below.
# Examples: localhost:1234/v1 (LM Studio), localhost:8181/v1 (LocalAI — note: 8080 conflicts with Alfresco), localhost:11434/v1 (Ollama /v1)
OPENAI_LIKE_MODEL=local-model
OPENAI_LIKE_API_BASE=http://localhost:1234/v1
# OPENAI_LIKE_API_KEY=local                   # Any string if not required
# OPENAI_LIKE_CONTEXT_WINDOW=4096
# OPENAI_LIKE_FUNCTION_CALLING=true           # Set false if model doesn't support tool calling
# OPENAI_LIKE_TIMEOUT=120.0

# vLLM Configuration — high-performance local inference (Docker or in-process)
# See docs/LLM/LLM-EMBEDDING-CONFIG.md section 18 for full setup guide.
#
# VLLM_MODE controls which path is used:
#   server    (default) — connects to a running vLLM HTTP server (Docker or vllm serve)
#   inprocess           — loads the model in-process via the vLLM Python package (Linux/macOS only)
#
# OPTION 1: vLLM via Docker (all platforms — Windows, macOS, Linux) — RECOMMENDED
# Start: uncomment includes/vllm.yaml in docker/docker-compose.yaml
# LLM_PROVIDER=vllm
# VLLM_MODE=server
# VLLM_API_BASE=http://localhost:8002/v1     # standalone backend; use http://vllm:8000/v1 in full-stack Docker
# VLLM_MODEL=Qwen/Qwen2.5-7B-Instruct       # HuggingFace model ID served by the container
# VLLM_API_KEY=local                         # any string (vLLM server doesn't require auth by default)
# VLLM_CONTEXT_WINDOW=8192
# VLLM_FUNCTION_CALLING=false                # set true only if model supports tool calling
# VLLM_TIMEOUT=120.0
# VLLM_TEMPERATURE=0.1
#
# Docker Compose also substitutes these VLLM_* vars into the container startup command:
# VLLM_MODEL=Qwen/Qwen2.5-7B-Instruct   # HuggingFace model ID (default: Qwen2.5-7B-Instruct)
# VLLM_MAX_MODEL_LEN=8192               # max context length in tokens (default: 8192)
# VLLM_GPU_UTIL=0.90                    # fraction of GPU VRAM to use, 0.0-1.0 (default: 0.90)
# VLLM_DTYPE=auto                       # weight dtype: auto, float16, bfloat16 (default: auto)
# VLLM_MAX_NUM_SEQS=16                  # max concurrent sequences (default: 16)
# HF_TOKEN=                             # HuggingFace token — only needed for gated models
#
# OPTION 2: vLLM Python package (Linux / macOS only — not available on Windows)
# Requires: uv pip install vllm  (CUDA required)
# If SchemaLLMPathExtractor returns 0 entities, add KG_EXTRACTOR_TYPE=dynamic in .env.
# LLM_PROVIDER=vllm
# VLLM_MODE=inprocess
# VLLM_MODEL=Qwen/Qwen2.5-7B-Instruct
# VLLM_API_URL=http://localhost:8002     # optional: connect to an already-running vLLM process
# VLLM_MAX_MODEL_LEN=8192
# VLLM_MAX_NEW_TOKENS=2048              # max tokens to generate per request
# VLLM_GPU_UTIL=0.90
# VLLM_DTYPE=auto
# VLLM_MAX_NUM_SEQS=16
# VLLM_TEMPERATURE=0.1
# HF_TOKEN=

# LiteLLM Configuration (proxy for 100+ providers)
# Requires proxy extra: pip install "litellm[proxy]"  (cmd: pip install litellm[proxy])
# Run: litellm --model ollama/gpt-oss:20b --port 4000
# Docs: https://docs.litellm.ai/docs/proxy/quick_start
# LITELLM_MODEL=ollama/gpt-oss:20b            # Model name as configured in LiteLLM
# LITELLM_API_BASE=http://localhost:11434     # Direct to Ollama (LiteLLM routes ollama/* models here automatically)
LITELLM_MODEL=gpt-4.1-mini                   # Model name as configured in LiteLLM
LITELLM_API_BASE=http://localhost:4000/v1    # LiteLLM proxy endpoint
# LITELLM_API_KEY=local                       # LiteLLM master key (or "local" if none)
# LITELLM_FUNCTION_CALLING=true

# OpenRouter Configuration (unified API for 200+ models)
# Get API key from https://openrouter.ai/keys (free tier available)
# OPENROUTER_API_KEY=sk-or-your-key-here
OPENROUTER_MODEL=openai/gpt-4.1-mini         # or anthropic/claude-3-5-sonnet, meta-llama/llama-3.3-70b-instruct, etc.


# LLM Extraction Mode
# Controls how LlamaIndex calls the LLM for structured KG extraction (entity/relation extraction).
#   function   (default) — tool/function calling mode; most reliable for property extraction.
#              Avoids the OpenAI "additionalProperties: false" structured output bug.
#   json_schema           — structured output / JSON schema mode (PydanticProgramMode.DEFAULT)
#   auto                  — let LlamaIndex decide per provider (also maps to DEFAULT internally)
# Only change this if a specific model/server behaves poorly with function calling.
# Per-provider overrides: OPENAI_LIKE_FUNCTION_CALLING, LITELLM_FUNCTION_CALLING
#LLM_EXTRACTION_MODE=function
#LLM_EXTRACTION_MODE=json_schema
#LLM_EXTRACTION_MODE=auto

# ====================================================================
# EMBEDDING CONFIGURATION (Independent of LLM Provider)
# ====================================================================

# Embeddings are configured independently of your LLM provider.
# You can mix any LLM provider with any embedding provider.
# For detailed configuration examples, see: docs/LLM/LLM-EMBEDDING-CONFIG.md
#
# EMBEDDING_DIMENSION: Optional explicit dimension override (auto-detected for known models)
#EMBEDDING_DIMENSION=768   

EMBEDDING_KIND=openai
#EMBEDDING_KIND=ollama
#EMBEDDING_KIND=google
#EMBEDDING_KIND=vertex
#EMBEDDING_KIND=azure
#EMBEDDING_KIND=bedrock
#EMBEDDING_KIND=fireworks
#EMBEDDING_KIND=openai_like
#EMBEDDING_KIND=litellm

# OpenAI Embeddings (default — active when EMBEDDING_KIND=openai)
OPENAI_EMBEDDING_MODEL=text-embedding-3-small       # 1536 dims
#OPENAI_EMBEDDING_MODEL=text-embedding-3-large      # 3072 dims

# Ollama Embeddings (active when EMBEDDING_KIND=ollama)
# WARNING: all-minilm (384 dims, 512 context) is NOT recommended - its 512-token limit causes errors
# when embedding graph nodes (combined entity + relationship text). Use nomic-embed-text instead.
OLLAMA_EMBEDDING_MODEL=nomic-embed-text              # 768 dims, 8192 context - recommended
#OLLAMA_EMBEDDING_MODEL=mxbai-embed-large            # 1024 dims, 512 context - higher quality but limited context
#OLLAMA_EMBEDDING_MODEL=all-minilm                   # 384 dims, 512 context - NOT RECOMMENDED (context too small)

# Google Gemini Embeddings (active when EMBEDDING_KIND=google)
GOOGLE_EMBEDDING_MODEL=gemini-embedding-2-preview    # 768 dims - recommended
#GOOGLE_EMBEDDING_MODEL=gemini-embedding-001         # 768 dims - stable GA alternative

# Google Vertex AI Embeddings (active when EMBEDDING_KIND=vertex)
VERTEX_EMBEDDING_MODEL=gemini-embedding-2-preview    # 768 dims - recommended
#VERTEX_EMBEDDING_MODEL=gemini-embedding-001         # 768 dims - stable GA alternative
#VERTEX_EMBEDDING_MODEL=text-embedding-005           # 768 dims - text-only option

# Azure Embeddings (active when EMBEDDING_KIND=azure)
AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-small    # deployment name (if different from model name)
AZURE_EMBEDDING_MODEL=text-embedding-3-small         # model name in your Azure resource
AZURE_EMBEDDING_API_VERSION=2024-02-01

# Bedrock Embeddings (active when EMBEDDING_KIND=bedrock)
BEDROCK_EMBEDDING_MODEL=amazon.titan-embed-text-v2:0 # 1024 dims

# Fireworks Embeddings (active when EMBEDDING_KIND=fireworks)
FIREWORKS_EMBEDDING_MODEL=nomic-ai/nomic-embed-text-v1.5  # 768 dims

# OpenAI-Like Embeddings (active when EMBEDDING_KIND=openai_like)
# Any server with a /v1/embeddings endpoint: Ollama, LM Studio, LocalAI, vLLM, Llamafile, etc.
# Confirmed working: nomic-embed-text via Ollama (localhost:11434/v1) with OpenAI gpt-4.1-mini (2026-03-20)
OPENAI_LIKE_EMBEDDING_MODEL=nomic-embed-text
OPENAI_LIKE_EMBEDDING_API_BASE=http://localhost:11434/v1  # Ollama default — or your server's /v1 URL
# OPENAI_LIKE_API_KEY=local
# Use EMBEDDING_DIMENSION above (e.g. 768 for nomic-embed-text) — required, no auto-detection for openai_like

# LiteLLM Embeddings (active when EMBEDDING_KIND=litellm)
# Route embeddings through a LiteLLM proxy to any backend (OpenAI, Ollama, Bedrock, etc.)
# Confirmed working: text-embedding-3-small via LiteLLM proxy (2026-03-19)
LITELLM_EMBEDDING_MODEL=text-embedding-3-small
#LITELLM_EMBEDDING_MODEL=ollama/nomic-embed-text
#LITELLM_EMBEDDING_API_BASE=http://localhost:11434
LITELLM_EMBEDDING_API_BASE=http://localhost:4000/v1       # Must include /v1 suffix
# LITELLM_API_KEY=local                                   # LiteLLM master key

# ====================================================================
# CONTENT SOURCES CONFIGURATION
# ====================================================================

# Data Source Type (mainly for REST API and MCP server usage)
# Options: filesystem, cmis, alfresco, upload, web, wikipedia, youtube, s3, gcs, azure_blob, onedrive, sharepoint, box, google_drive
# Note: UI clients typically override this setting with their own data source selection
DATA_SOURCE=filesystem

# Source Paths (for filesystem data source - REST API and MCP server)
# Note: UI clients use file upload dialogs instead of this configuration
# For path format examples, see docs/DATA-SOURCE-CONFIGURATION.md
SOURCE_PATHS=["./sample-docs/cmispress.txt"]

# CMIS Configuration (if using CMIS)
CMIS_URL=http://localhost:8080/alfresco/api/-default-/public/cmis/versions/1.1/atom
CMIS_USERNAME=admin
CMIS_PASSWORD=admin

# Alfresco Configuration (if using Alfresco)
# Note: Requires python-alfresco-api >= 1.1.5 for optimal performance
# Paths: Use short format like "/Shared/GraphRAG" (recommended - matches Alfresco Share UI)
#        Full format like "/Company Home/Shared/GraphRAG" also works (prefix is auto-stripped)
#        Both formats supported - system handles /Company Home prefix automatically
ALFRESCO_URL=http://localhost:8080
ALFRESCO_USERNAME=admin
ALFRESCO_PASSWORD=admin
# ALFRESCO_STOMP_PORT: ActiveMQ STOMP port for real-time events (default: 61613)
# Note: If you've changed the ActiveMQ STOMP port in docker-compose (e.g., to 8613), set it here
ALFRESCO_STOMP_PORT=8613

# Amazon S3 Configuration (if using S3)
# Note: These values serve as defaults/fallbacks when S3_CONFIG is not provided or incomplete
# S3_CONFIG (in section above) takes precedence, these are used as fallbacks
# S3_REGION_NAME=us-east-1           # AWS region for S3 bucket (default: us-east-1 if not in S3_CONFIG)
# S3_BUCKET_NAME=my-bucket           # Default bucket name (used if not specified in S3_CONFIG)
# S3_PREFIX=documents/               # Default prefix/path within bucket
# S3_PREFIX=""                       # Or leave empty for whole bucket (top level)
# S3_ACCESS_KEY=myaccesskeyid        # AWS access key ID (used if not specified in S3_CONFIG)
# S3_SECRET_KEY=mysecretaccesskey    # AWS secret access key (used if not specified in S3_CONFIG)

# ====================================================================
# DATABASE CONNECTION DETAILS (Individual Configs)
# ====================================================================

# Neo4j Configuration (for both vector and graph storage)
# Browser URL: http://localhost:7474/browser (for database management and queries)
NEO4J_URI=bolt://localhost:7687
NEO4J_USER=neo4j
NEO4J_PASSWORD=password
# NEO4J_DATABASE=neo4j  # Optional: specify database name (default: neo4j)

# Neo4j AuraDB (cloud) example:
# NEO4J_URI=neo4j+s://<dbid>.databases.neo4j.io
# NEO4J_USER=neo4j
# NEO4J_PASSWORD=<aura-generated-password>
# Console URL: https://console.neo4j.io

# Elasticsearch Configuration
ELASTICSEARCH_URL=http://localhost:9200
ELASTICSEARCH_USERNAME=
ELASTICSEARCH_PASSWORD=

# Qdrant Configuration
QDRANT_HOST=localhost
QDRANT_PORT=6333
QDRANT_API_KEY=
# QDRANT_COLLECTION=hybrid_search  # Optional: collection name
# QDRANT_HTTPS=false  # Use HTTPS for remote Qdrant instances

# Weaviate Configuration
WEAVIATE_URL=http://localhost:8081
WEAVIATE_INDEX_NAME=HybridSearch
# WEAVIATE_API_KEY=your_weaviate_api_key  # Optional: for authenticated instances
# WEAVIATE_TEXT_KEY=content  # Optional: field name for text content

# OpenSearch Configuration
OPENSEARCH_URL=http://localhost:9201
OPENSEARCH_USERNAME=
OPENSEARCH_PASSWORD=

# Processing Configuration

# CHUNK_SIZE and CHUNK_OVERLAP are in CHARACTERS for both llamaindex and langchain backends.
CHUNK_SIZE=2048
CHUNK_OVERLAP=128

#CHUNK_SIZE=1024
#CHUNK_OVERLAP=128

#CHUNK_SIZE=512
#CHUNK_OVERLAP=64

# Knowledge Graph Extraction Limits (configurable for different content densities)
# MAX_TRIPLETS_PER_CHUNK: Used by DynamicLLMPathExtractor and SchemaLLMPathExtractor - controls how many entity-relationship triplets can be extracted per text chunk
# MAX_PATHS_PER_CHUNK: Used by SimpleLLMPathExtractor - controls how many relationship paths can be extracted per text chunk
# Higher values allow more comprehensive extraction from dense content but may increase processing time
# Lower values are faster but may miss entities/relationships in complex documents
# The commented-out alternatives below (50, 100) are higher than the usual 20-25 seen in
# examples; they exist mainly to rule out this limit as the cause of sparse extraction.
#MAX_TRIPLETS_PER_CHUNK=100
#MAX_PATHS_PER_CHUNK=100

#MAX_TRIPLETS_PER_CHUNK=50
#MAX_PATHS_PER_CHUNK=50

MAX_TRIPLETS_PER_CHUNK=20
MAX_PATHS_PER_CHUNK=20

# Timeout configurations moved to docs/TIMEOUT-CONFIGURATIONS.md
# Uncomment and adjust these if you need custom timeout values:
# 
# DOCLING_TIMEOUT=600  # Docling conversion timeout per document (seconds, default: 10 minutes)
# DOCLING_CANCEL_CHECK_INTERVAL=0.5  # How often to check for cancellation (seconds)
#   - We wrap Docling in background executor with periodic polling for cancellation
#   - Lower = more responsive, higher = less CPU overhead
# 
# KG_EXTRACTION_TIMEOUT=3600  # Knowledge graph extraction timeout per document
# KG_CANCEL_CHECK_INTERVAL=2.0  # How often to check for cancellation during KG extraction
# 


# ====================================================================
# INCREMENTAL UPDATES CONFIGURATION (Optional - for auto-sync monitoring)
# ====================================================================

# Enable incremental updates system (automatic monitoring and sync)
# Set to "true" to enable auto-sync capabilities
# Requires PostgreSQL for state management
# Default: false (incremental updates disabled)
ENABLE_INCREMENTAL_UPDATES=false
#ENABLE_INCREMENTAL_UPDATES=true

# PostgreSQL connection for state management (required if ENABLE_INCREMENTAL_UPDATES=true)
# Recommended: Reuse existing postgres-pgvector service on port 5433
# Create database: flexible_graphrag_incremental (separate from pgvector database)
# Setup: See flexible-graphrag/incremental_updates/schema.sql
POSTGRES_INCREMENTAL_URL=postgresql://postgres:password@localhost:5433/flexible_graphrag_incremental

# Optional: Periodic refresh interval (seconds) - how often to scan datasources
# Default: 3600 (1 hour) - datasources refresh automatically every hour
# Individual datasources can override this via API
# INCREMENTAL_REFRESH_INTERVAL=3600

# Optional: Watchdog filesystem delay (seconds) - debounce for file changes
# Default: 60 (1 minute) - filesystem changes are processed after 1 minute of inactivity
# Individual datasources can override this via API
# INCREMENTAL_WATCHDOG_DELAY=60

# Note: Individual datasources created via UI "Enable Sync" checkbox can override
# these defaults with custom refresh intervals and watchdog delays per datasource

# Setup Instructions: See flexible-graphrag/incremental_updates/SETUP-GUIDE.md
# - PostgreSQL schema setup
# - Database creation steps
# - Configuration details
# - Troubleshooting guide

# =============================================================================
# OBSERVABILITY CONFIGURATION (OpenTelemetry Traces and Metrics)
# =============================================================================

# Enable observability with OpenTelemetry, Prometheus, Grafana, Jaeger
# Set to "true" to enable automatic instrumentation of LlamaIndex operations
ENABLE_OBSERVABILITY=false
#ENABLE_OBSERVABILITY=true

# Observability Backend Mode (choose your telemetry producer)
# Options:
#   "openinference" - Default, trace-focused, requires spanmetrics connector for token metrics
#   "openlit" - Alternative with built-in token metrics (gen_ai_usage_*_tokens_total)
#   "both" - DUAL MODE (recommended!) - Best of both worlds!
#     - OpenInference: Detailed traces with full LlamaIndex operation visibility
#     - OpenLIT: Token metrics, cost tracking, VectorDB metrics out-of-the-box
#     - Custom metrics: Graph extraction, retrieval, document processing
# Recommendation: Use "both" for complete observability with all metrics

#OBSERVABILITY_BACKEND=openlit
#OBSERVABILITY_BACKEND=openinference
OBSERVABILITY_BACKEND=both

# OTLP (OpenTelemetry Protocol) exporter endpoint
# Default: http://localhost:4318 (OTLP HTTP receiver)
# For gRPC: use http://localhost:4317
# For cloud services (SigNoz, Langfuse, etc.), use their endpoint
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318

# Service metadata for traces
OTEL_SERVICE_NAME=flexible-graphrag
OTEL_SERVICE_VERSION=1.0.0
OTEL_SERVICE_NAMESPACE=llm-apps

# Enable automatic LlamaIndex instrumentation (captures all LlamaIndex operations)
# Set to "false" if you only want manual tracing with custom decorators
ENABLE_LLAMA_INDEX_INSTRUMENTATION=true


# =============================================================================
# LangChain Configuration Details
# =============================================================================

# When GRAPH_BACKEND=langchain, LangChain's add_graph_documents() handles ingestion
# and a LangChain property graph QA chain handles retrieval.  For LangChain-only
# stores (arangodb, apache_age, cosmos_gremlin, spanner, hugegraph, tigergraph,
# surrealdb), GRAPH_BACKEND=langchain is selected automatically.
#
# Set PG_GRAPH_DB to the graph database you want to use; set GRAPH_BACKEND=langchain
# to use LangChain for both ingestion and retrieval against that store.
#
# Supported PG_GRAPH_DB values for LangChain backend:
#   neo4j            - Neo4j (bolt) via langchain_neo4j > langchain_community
#   memgraph         - Memgraph (bolt) via langchain_memgraph > langchain_community
#   ladybug          - not supported for LangChain backend (LlamaIndex only)
#   falkordb         - FalkorDB via langchain_community
#   hugegraph        - Apache HugeGraph via langchain_community        [LC-only]
#   nebula           - NebulaGraph via langchain_community
#   tigergraph       - TigerGraph via langchain_community              [LC-only]
#   arcadedb         - ArcadeDB via langchain_arcadedb  (uv pip install langchain-arcadedb)
#   arangodb         - ArangoDB via langchain_community chains         [LC-only]
#   neptune          - Neptune Database OpenCypher via langchain_aws
#   neptune_analytics- Neptune Analytics via langchain_aws
#   apache_age       - Apache AGE (PostgreSQL Cypher) via langchain_community [LC-only]
#   cosmos_gremlin   - Azure Cosmos DB Gremlin via langchain_community  [LC-only]
#   spanner          - Google Spanner Graph via langchain_google_spanner [LC-only]
#   surrealdb        - SurrealDB via langchain_community               [LC-only]
#
# Connection details are read from the same PG_GRAPH_DB connection as for llama-index
# (e.g. for neo4j: graph_config url/username/password, from NEO4J_GRAPH_DB_CONFIG)

# USE_LC_TEXT_TO_GRAPH: opt-in Cypher/SPARQL/SurrealQL text-to-query retrieval.
# Only applies when GRAPH_BACKEND=langchain. Ignored for GRAPH_BACKEND=llamaindex.
#
# Non-vector stores (apache_age, hugegraph, nebula, tigergraph, cosmos_gremlin, neptune):
#   TextToGraphQueryRetriever is ALWAYS active — this flag has no effect.
# Vector-capable stores (neo4j):
#   Default false — GraphEntityVectorRetriever handles graph retrieval.
#   Set true to also add TextToGraphQueryRetriever alongside vector retrieval.
#USE_LC_TEXT_TO_GRAPH=true
USE_LC_TEXT_TO_GRAPH=false

# LangChain PG vector search — entity embedding similarity search (GraphEntityVectorRetriever).
# neo4j: searches the __Entity__[embedding] index (Neo4jVector).
# FalkorDB: searches its own Chunk-node vector index (FalkorDBVector).
# For neo4j only: USE_PG_NEIGHBORHOOD is auto-enabled so entity seeds are walked to text chunks.
#LANGCHAIN_PG_VECTOR_SEARCH=true
LANGCHAIN_PG_VECTOR_SEARCH=false

# LangChain PG neighborhood — k-hop graph walk from entity seeds to connected text chunks.
# Neo4j only (other stores not implemented). Requires LANGCHAIN_PG_VECTOR_SEARCH=true for seeds.
# Auto-enabled for neo4j when LANGCHAIN_PG_VECTOR_SEARCH=true; set explicitly to silence the log.
#USE_PG_NEIGHBORHOOD=true
USE_PG_NEIGHBORHOOD=false
#PG_NEIGHBORHOOD_HOPS=2
#PG_NEIGHBORHOOD_TOP_K_SEEDS=10

# Index / label / property names.
# NODE_LABEL: __Entity__ works for BOTH ingestion backends —
#   LlamaIndex (PropertyGraphIndex) always creates __Entity__ nodes.
#   LangChain (add_graph_documents + baseEntityLabel=True) adds __Entity__ as a second label
#   alongside the specific type (Person, Organization, ...) on every node.
# TEXT_PROPERTY depends on ingestion backend:
#   name  — LlamaIndex ingestion (GRAPH_BACKEND=llamaindex, default)
#   id    — LangChain ingestion  (GRAPH_BACKEND=langchain)
#LANGCHAIN_PG_VECTOR_INDEX=entity
#LANGCHAIN_PG_VECTOR_NODE_LABEL=__Entity__
#LANGCHAIN_PG_VECTOR_EMBEDDING_PROPERTY=embedding
#LANGCHAIN_PG_VECTOR_TEXT_PROPERTY=id      # default when GRAPH_BACKEND=langchain (LangChain ingestion writes id)
#LANGCHAIN_PG_VECTOR_TEXT_PROPERTY=name    # default when GRAPH_BACKEND=llamaindex (LlamaIndex ingestion writes name)

# Hybrid search returns up to three types of result nodes per active graph:
#   1. Text chunks  — always returned from VECTOR_DB / SEARCH_DB
#   2. AI answer    — always: the LLM answer generated from graph context
#   3. Graph input  — optional: the entity names / rows the LLM received as
#                     input (formatted as "Graph results: Alice (Person), ...")
# When BOTH a property graph (PG_GRAPH_DB) and an RDF graph (RDF_GRAPH_DB) are
# active, types 2 and 3 appear once per graph — up to 5 results total:
#   1. Text chunks  (VECTOR_DB / SEARCH_DB)
#   2. AI answer    from property graph
#   3. AI answer    from RDF graph
#   4. Graph input  from property graph  (only when true)
#   5. Graph input  from RDF graph       (only when true)
# false = text chunks + AI answer only (one AI answer per active graph)  — default, cleaner
# true  = all types including raw graph input rows (prefixed "Graph results: ...")
#LANGCHAIN_PG_INTERMEDIATE_STEPS=false
#LANGCHAIN_PG_INTERMEDIATE_STEPS=true

# Synonym exploder — LLM-based query rewriter applied before selected retrievers.
# Generates synonyms/related keywords to improve KEYWORD/BM25 recall.
#
# WHEN TO USE: keyword/BM25 search (Elasticsearch, OpenSearch, BM25).
#   "Acme staff" and "Acme employees" are different BM25 queries — synonyms fix this.
#
# WHEN NOT TO USE: vector and graph retrievers.
#   Vector stores (LI + LC) use embedding similarity — the model handles synonyms
#   implicitly. Explicit expansion just multiplies vector calls with little gain.
#   LI PropertyGraph seeds entity lookup via vector search — same reasoning.
#   Graph QA chains (RDF, Cypher) fire a full LLM call per synonym — N cycles =
#   N near-duplicate answers and N× latency.
#
# SYNONYM_EXPLODER_SCOPE: comma-separated retriever tags, or all / none.
# Tags (use only RECOMMENDED ones unless you have a specific reason):
#   llamaindex_search           — LI BM25 / Elasticsearch / OpenSearch  [RECOMMENDED]
#   langchain_search            — LC Elasticsearch / OpenSearch          [RECOMMENDED]
#   llamaindex_vector           — LI VectorStoreIndex  [not recommended — embedding handles it]
#   langchain_vector            — LC vector store      [not recommended — same reason]
#   llamaindex_pg_graph         — LI PropertyGraph VectorContextRetriever [not recommended]
#   langchain_pg_vector         — LC PG entity vector retriever
#   langchain_pg_neighborhood   — PG neighborhood k-hop retriever
#   langchain_rdf_graph         — LC RDF/SPARQL retriever  [AVOID — N LLM cycles per synonym]
#   langchain_pg_graph          — LC Cypher QA retriever   [AVOID — same reason]
USE_SYNONYM_EXPLODER=false
#USE_SYNONYM_EXPLODER=true
SYNONYM_EXPLODER_MAX_KEYWORDS=8
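# Default scope: none (disabled). To enable for keyword search backends, replace the
# active SYNONYM_EXPLODER_SCOPE line with one of the commented-out examples that follow it.
# Unused tags are silently pruned at startup — no error if backend doesn't match.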
SYNONYM_EXPLODER_SCOPE=none
#SYNONYM_EXPLODER_SCOPE=llamaindex_search,langchain_search            # BM25 / Elasticsearch only
#SYNONYM_EXPLODER_SCOPE=llamaindex_search,langchain_search,langchain_pg_vector  # + LC PG vector

# LangChain splitter type — only active when CHUNKER_BACKEND=langchain
# Options:
#   recursive           — RecursiveCharacterTextSplitter (default, best for prose/code)
#   character           — CharacterTextSplitter (simple separator-based)
#   token               — TokenTextSplitter (tiktoken token-accurate)
#   markdown            — MarkdownTextSplitter (preserves Markdown structure)
#   python              — PythonCodeTextSplitter (syntax-aware for .py files)
#   sentence_transformers — SentenceTransformersTokenTextSplitter (requires sentence-transformers)
#LC_SPLITTER_TYPE=recursive

# Each can be: llamaindex | langchain | both (default llamaindex)
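# Pre-configures the LLM / embedding for the selected framework; in mixed llamaindex +
# langchain setups the other framework's instance is presumably created dynamically when needed.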
#LLM_BACKEND=llamaindex
#EMBEDDING_BACKEND=llamaindex


# ====================================================================
# RDF / TRIPLE STORE CONFIGURATION
# ====================================================================

# Flexible GraphRAG supports storing extracted knowledge graphs in RDF
# triple stores (Fuseki, GraphDB, Oxigraph, Amazon Neptune RDF) in addition to
# or instead of the configured property graph (Neo4j, etc.).
#
# Use RDF_GRAPH_DB (set near the top of this file) to select the RDF store.
# Setting RDF_GRAPH_DB=none (the default) disables all RDF storage and retrieval.
# RDF_GRAPH_DB controls both ingestion AND retrieval in one setting.

# RDF_ANNOTATION_SYNTAX: how relation properties are encoded in RDF output
#   rdf_1.2      (default) - RDF 1.2 inline {| |} annotation syntax
#                          Turtle 1.2 standard; cleaner and preferred for new data.
#                          Example: <s> <p> <o> {| <prop> "value" |} .
#                          Requires: Fuseki 5 (Jena 5), GraphDB 10+, Oxigraph 0.4+
#   rdf_star             - Legacy << >> assertion-annotation lines
#                          Example: << <s> <p> <o> >>  <prop>  "value" .
#                          Same store support as rdf_1.2, older syntax.
#   flat                 - Plain compound-predicate triples, no annotation semantics
#                          Example: <s>  onto:rel__prop  "value" .
#                          Works with any SPARQL 1.1 triple store.
RDF_ANNOTATION_SYNTAX=rdf_1.2

# Base namespace for entity instance URIs in RDF output.
# "Alice Johnson" -> <https://integratedsemantics.org/flexible-graphrag/kg/alice_johnson>
RDF_BASE_NAMESPACE=https://integratedsemantics.org/flexible-graphrag/kg/

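# With an annotation syntax (rdf_1.2 or rdf_star), relation properties are stored as
# statement annotations. Example in the legacy rdf_star (<< >>) form:
#   <<:alice onto:WORKS_FOR :techcorp>> onto:since "2020"^^xsd:integer .
# Fuseki, GraphDB and Oxigraph support this natively; see the version requirements
# listed under RDF_ANNOTATION_SYNTAX above. Use "flat" for stores without such support.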

# -------------------------------------------------------------------
# RDF Store Connection vars (used when RDF_GRAPH_DB != none)
# -------------------------------------------------------------------

# Apache Fuseki  (used when RDF_GRAPH_DB=fuseki)
# FUSEKI_USERNAME / FUSEKI_PASSWORD below are sample placeholder credentials;
# replace them for anything other than local development.
FUSEKI_BASE_URL=http://localhost:3030
FUSEKI_DATASET=flexible-graphrag
FUSEKI_USERNAME=admin
FUSEKI_PASSWORD=admin

# Ontotext GraphDB  (used when RDF_GRAPH_DB=graphdb)
# GRAPHDB_USERNAME / GRAPHDB_PASSWORD below are sample placeholder credentials;
# replace them for anything other than local development.
GRAPHDB_BASE_URL=http://localhost:7200
GRAPHDB_REPOSITORY=flexible-graphrag
GRAPHDB_USERNAME=admin
GRAPHDB_PASSWORD=admin

# Oxigraph  (used when RDF_GRAPH_DB=oxigraph)
# HTTP mode (preferred — uses Docker container at port 7878, no file locking):
OXIGRAPH_URL=http://localhost:7878
# Embedded mode (single-process only, not recommended when API server is running):
#OXIGRAPH_STORE_PATH=./data/oxigraph_store

# Amazon Neptune RDF/SPARQL  (used when RDF_GRAPH_DB=neptune_rdf)
# Same cluster that PG_GRAPH_DB=neptune uses — Neptune is "OneGraph" (RDF + OpenCypher coexist).
# NEPTUNE_RDF_HOST is the cluster endpoint hostname only (no https://, no port).
#NEPTUNE_RDF_HOST=db-neptune-1.cluster-xxxxxxxxxxxx.us-east-1.neptune.amazonaws.com
#NEPTUNE_RDF_PORT=8182
#NEPTUNE_RDF_REGION=us-east-1
#NEPTUNE_RDF_USE_IAM_AUTH=true
#NEPTUNE_RDF_USE_HTTPS=true
# Explicit AWS credentials (optional — omit to use env AWS_ACCESS_KEY_ID / instance profile):
#NEPTUNE_RDF_AWS_ACCESS_KEY_ID=
#NEPTUNE_RDF_AWS_SECRET_ACCESS_KEY=

# ===========================
# Query Routing Defaults
# ===========================
# QUERY_ROUTING_DEFAULT: Where to route graph queries
#  - property_graph: Query configured property graph (Neo4j, etc.) only (Cypher)
#  - sparql: Query RDF stores only (Fuseki/GraphDB/Oxigraph/Neptune RDF)
#  - hybrid: Query both property graph AND RDF stores, merge results
#  - auto: Automatically choose based on query structure
# Note: This is different from "hybrid search" (vector+fulltext+graph)
# NOTE(review): sparql / hybrid presumably need RDF_GRAPH_DB != none, since
# RDF_GRAPH_DB=none disables all RDF storage and retrieval — confirm.
QUERY_ROUTING_DEFAULT=property_graph

# Enable/disable query language support
SUPPORT_SPARQL=true
SUPPORT_CYPHER=true

# -------------------------------------------------------------------
# RDF Export
# -------------------------------------------------------------------
# Expose /export/rdf API endpoint to dump property graph as RDF
ENABLE_RDF_EXPORT=false
# Serialization format: turtle | ntriples | rdfxml | nquads
RDF_EXPORT_FORMAT=turtle

# Base namespace for entity instance URIs (named-graph URI = this value without trailing slash).
# Default: https://integratedsemantics.org/flexible-graphrag/kg/
# NOTE: RDF_BASE_NAMESPACE is already assigned in the RDF / TRIPLE STORE CONFIGURATION
# section earlier in this file. Edit that line rather than uncommenting this one, so the
# same key is not defined twice.
#RDF_BASE_NAMESPACE=https://integratedsemantics.org/flexible-graphrag/kg/

# Namespace for ontology predicates / types (onto: prefix in SPARQL queries).
# Default: https://integratedsemantics.org/flexible-graphrag/ontology#
#RDF_ONTOLOGY_NAMESPACE=https://integratedsemantics.org/flexible-graphrag/ontology#

# ===========================
# RDF Retrieval Configuration
# ===========================
# RDF retrieval is automatically enabled when RDF_GRAPH_DB != none.
#
# Weight applied to RDF retrieval results in fusion (range 0.0-1.0).
# Higher = more emphasis on graph-based context
RDF_RETRIEVAL_WEIGHT=0.3

# Number of results to return from RDF retriever
RDF_RETRIEVAL_TOP_K=5


# ===========================
# Ontology Configuration
# ===========================
# Ontology Turtle files are not shipped with the pip wheel. Clone the repository or copy
# the sample .ttl files, then point ONTOLOGY_PATH, ONTOLOGY_PATHS, or ONTOLOGY_DIR at them.
# Prefer absolute paths in production.
#
# Repository layout (monorepo root): sample ontologies are in schemas/ next to the inner
# flexible-graphrag/ package folder. If your working directory is flexible-graphrag/flexible-graphrag/
# (the directory that contains this env-sample and main.py), use ../schemas/ as below.
#
# Sample files in schemas/ (names may evolve; check the repo):
#   company_classes.ttl        - OWL classes + object properties (company domain)
#   company_properties.ttl    - datatype properties
#   common_ontology.ttl       - shared upper-ish classes (Person, Place, ...)
#   foaf_ontology.ttl         - optional FOAF include; add to ONTOLOGY_PATHS if desired
#   company_ontology.ttl      - optional monolithic bundle (all-in-one alternative to split files)
#
# Enable ontology-based knowledge graph extraction
# When enabled, uses RDF/RDFS/OWL ontology schemas for entity/relation extraction (OWL is optional)
USE_ONTOLOGY=false
#USE_ONTOLOGY=true

# --- Single ontology file (simplest; lowest precedence of the three options) ---
# All classes, object properties, and datatype properties in one file.
#ONTOLOGY_PATH=../schemas/company_ontology.ttl

# --- Comma-separated list of specific files (second precedence) ---
# Split ontologies are a common pattern: one file for classes/relations,
# another for datatype properties, plus optional third-party ontologies
# (FOAF, Dublin Core, schema.org) or a generic common_ontology.ttl.
# NOTE(review): this value is active while USE_ONTOLOGY=false; presumably it is only
# read once USE_ONTOLOGY=true — confirm.
ONTOLOGY_PATHS=../schemas/company_classes.ttl,../schemas/company_properties.ttl,../schemas/common_ontology.ttl

# --- Directory of ontology files (highest precedence) ---
# All .ttl / .rdf / .owl / .n3 / .nt files in the directory are loaded and merged.
# Useful when maintaining many domain ontologies or mixing published ontologies.
#ONTOLOGY_DIR=../schemas/

# Serialization format for ONTOLOGY_PATH and ONTOLOGY_PATHS.
# Directory loading (ONTOLOGY_DIR) auto-detects format from file extension.
# Options: turtle | rdfxml | ntriples | nquads
# (The option list is kept on its own line: not every env-file loader strips a
# trailing "# ..." comment, so an inline comment could become part of the value.)
ONTOLOGY_FORMAT=turtle
