FROM python:3.10-slim

# Build-time configuration. Every value has a default and can be overridden
# with `--build-arg NAME=value`.
ARG SOLR_URL=http://opentapioca-solr:8983/solr
ARG SOLR_COLLECTION=tapioca
ARG CONTAINER_PORT=8457

# Persist the build arguments into the runtime environment of the container.
# SOLR_HOST is fixed here (it is not derived from SOLR_URL); it is the host
# name the start-up command waits on before launching the application.
ENV SOLR_URL=${SOLR_URL} \
    SOLR_COLLECTION=${SOLR_COLLECTION} \
    CONTAINER_PORT=${CONTAINER_PORT} \
    SOLR_HOST=opentapioca-solr

# All following relative paths (and the final `python app.py`) resolve here.
WORKDIR /app

# Install the OS packages used by later build steps (curl for HTTP downloads,
# unzip for archive extraction). The apt package lists are removed in the same
# layer so they never end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Download the OpenTapioca resources (dataset doi:10.25625/9OBIVL, version 1.0)
# from GRO.data and unpack them into /app/data/opentapioca_entity_linker.
#
# Dataverse API token — two ways to supply it:
#   1. Preferred: a BuildKit secret, which is never written to an image layer
#      or to `docker history`:
#        docker build --secret id=dataverse_api_token,src=./token.txt .
#   2. Legacy fallback: `--build-arg DATAVERSE_API_TOKEN=...`. This still works
#      but the value is recorded in the image build metadata, so avoid it for
#      real credentials.
# When both are given the secret wins. When neither is given the request is
# sent with an empty X-Dataverse-key value, exactly as before.
#
# The secret mount requires BuildKit (the default builder in current Docker).
ARG DATAVERSE_API_TOKEN
RUN --mount=type=secret,id=dataverse_api_token \
    set -eu; \
    token="${DATAVERSE_API_TOKEN:-}"; \
    if [ -s /run/secrets/dataverse_api_token ]; then \
      token="$(cat /run/secrets/dataverse_api_token)"; \
    fi; \
    echo "Checking GRO.data availability..."; \
    if ! curl -sf --max-time 15 "https://data.goettingen-research-online.de/api/info/version" > /dev/null; then \
      echo "ERROR: GRO.data (data.goettingen-research-online.de) is not reachable or in maintenance mode. Retry later." >&2; \
      exit 1; \
    fi; \
    echo "GRO.data is available. Downloading OpenTapioca data..."; \
    mkdir -p /app/data/opentapioca_entity_linker; \
    curl -fL --retry 3 --retry-delay 5 \
      -H "X-Dataverse-key:${token}" \
      -o /tmp/dataverse_files.zip \
      "https://data.goettingen-research-online.de/api/access/dataset/:persistentId/?persistentId=doi:10.25625/9OBIVL&version=1.0"; \
    unzip /tmp/dataverse_files.zip -d /app/data/opentapioca_entity_linker/; \
    rm /tmp/dataverse_files.zip

# Bring in only the dependency manifest, so the pip layer below is rebuilt
# only when the requirements file itself changes.
COPY requirements_opentapioca.txt /app/

# Install the Python dependencies without keeping pip's download cache.
RUN pip install --no-cache-dir --requirement /app/requirements_opentapioca.txt

# Ship the helper script that the container start command runs first, and
# mark it executable so it can be invoked directly by path.
COPY wait-for-solr.sh /app/
RUN chmod +x /app/wait-for-solr.sh

# Document the port OpenTapioca listens on (taken from CONTAINER_PORT).
# EXPOSE is informational only; publishing the port is up to the runtime.
EXPOSE ${CONTAINER_PORT}

# Start OpenTapioca only after Solr is ready.
#
# Exec (JSON) form is used so no implicit `/bin/sh -c` wrapper is added. A
# shell is still needed to expand SOLR_HOST at container start, so it is
# invoked explicitly and immediately replaced via `exec`; the wait script then
# becomes PID 1 and receives the SIGTERM sent by `docker stop`.
# The host:port argument is quoted so it is always passed as a single word.
#
# NOTE(review): signals only reach the Python process if wait-for-solr.sh
# itself ends with `exec "$@"` - confirm in that script.
# NOTE(review): no instruction visible in this Dockerfile copies app.py into
# /app; presumably it is bind-mounted or supplied elsewhere - confirm.
CMD ["/bin/sh", "-c", "exec /app/wait-for-solr.sh \"${SOLR_HOST}:8983\" python app.py"]
