services:
  # Main orchestration server - handles inference and embeddings
  predict-otron-9000:
    build:
      context: .
      dockerfile: crates/predict-otron-9000/Dockerfile
    ports:
      - "8080:8080"
    environment:
      - SERVER_PORT=8080
      - RUST_LOG=${RUST_LOG:-info}
      - HF_TOKEN=${HF_TOKEN}
      - HF_HOME=/app/.hf-cache
    volumes:
      # Mount HF cache to persist downloaded models
      - hf-cache:/app/.hf-cache
      # Mount FastEmbed cache for embeddings
      - fastembed-cache:/app/.fastembed_cache
    networks:
      - predict-otron-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080"]
      interval: 5s
      timeout: 1s
      retries: 10
      start_period: 10s

volumes:
  # Persistent storage for Hugging Face model cache
  hf-cache:
    driver: local
  # Persistent storage for FastEmbed model cache
  fastembed-cache:
    driver: local

networks:
  predict-otron-network:
    driver: bridge
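
# Usage sketch (assumptions: Docker Compose v2 is installed, this file lives at the
# repo root as docker-compose.yml, and HF_TOKEN is exported in the shell or defined
# in a sibling .env file so the ${HF_TOKEN} substitution above resolves):
#
#   docker compose up --build -d                 # build the image and start the stack
#   curl http://localhost:8080                   # probe the same endpoint the healthcheck uses
#   docker compose logs -f predict-otron-9000    # follow server logs
#   docker compose down                          # stop; named volumes keep the model caches
#
# The named volumes mean downloaded Hugging Face and FastEmbed models survive
# container restarts; add `-v` to `docker compose down` only if you want to
# discard those caches.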