add root dockerfile that contains binaries for all services

2025-09-08 22:46:44 +00:00 · 2025-09-04 14:54:20 -04:00
parent fb5098eba6
commit 296d4dbe7e
13 changed files with 189 additions and 255 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,35 @@
 # NOTE: patterns are anchored at the build-context root; prefix a pattern
 # with `**/` to match it at any depth.
 # Git
 .git
 .gitignore
 # Rust
 target/
 # Documentation (`*.md` already covers README.md). Kept root-level only;
 # NOTE(review): confirm no crate embeds its own markdown before widening.
 *.md
 # IDE
 **/.vscode/
 **/.idea/
 **/*.swp
 **/*.swo
 # OS
 **/.DS_Store
 **/Thumbs.db
 # Logs
 **/*.log
 # Environment (also keeps nested env files out of `COPY crates/`)
 **/.env
 **/.env.local
 # Dependencies
 **/node_modules/
 # Build artifacts
 dist/
 build/
 **/.fastembed_cache
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7041,7 +7041,7 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 [[package]]
 name = "utils"
-version = "0.0.0"
+version = "0.1.4"
 dependencies = [
 "ab_glyph",
 "accelerate-src",
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,50 @@
 # Multi-stage build for predict-otron-9000 workspace.
 # One image carries all three server binaries; the default CMD can be
 # overridden to start a different one.
 #
 # The builder is pinned to the same Debian release as the runtime stage so the
 # binaries link against the glibc/OpenSSL versions that exist at runtime.
 FROM rust:1-bookworm AS builder
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    pkg-config \
    libssl-dev \
    && rm -rf /var/lib/apt/lists/*
 # Create app directory
 WORKDIR /app
 # Copy workspace files
 COPY Cargo.toml Cargo.lock ./
 COPY crates/ ./crates/
 COPY integration/ ./integration/
 # Build all 3 main server binaries in release mode
 RUN cargo build --release -p predict-otron-9000 --bin predict-otron-9000 --no-default-features -p embeddings-engine --bin embeddings-engine -p inference-engine --bin inference-engine
 # Runtime stage (keep the Debian release in sync with the builder above)
 FROM debian:bookworm-slim AS runtime
 # Install runtime dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl3 \
    && rm -rf /var/lib/apt/lists/*
 # Create app user
 RUN useradd -r -s /bin/false -m -d /app appuser
 # Set working directory
 WORKDIR /app
 # Copy binaries from builder stage
 COPY --from=builder /app/target/release/predict-otron-9000 ./bin/
 COPY --from=builder /app/target/release/embeddings-engine ./bin/
 COPY --from=builder /app/target/release/inference-engine ./bin/
 # Make binaries executable and change ownership
 RUN chmod +x ./bin/* && chown -R appuser:appuser /app
 # Switch to non-root user
 USER appuser
 # Expose ports (adjust as needed based on your services)
 EXPOSE 8080 8081 8082
 # Default command (can be overridden)
 CMD ["./bin/predict-otron-9000"]
--- a/crates/chat-ui/Cargo.toml
+++ b/crates/chat-ui/Cargo.toml
@@ -3,6 +3,7 @@ name = "chat-ui"
 version = "0.1.0"
 edition = "2021"
 [lib]
 crate-type = ["cdylib", "rlib"]
@@ -122,3 +123,7 @@ lib-default-features = false
 #
 # Optional. Defaults to "release".
 lib-profile-release = "release"
 [[bin]]
 name = "chat-ui"
 path = "src/main.rs"
--- a/crates/embeddings-engine/Cargo.toml
+++ b/crates/embeddings-engine/Cargo.toml
@@ -28,12 +28,13 @@ once_cell = "1.19.0"
 [package.metadata.compose]
-image = "ghcr.io/geoffsee/embeddings-service:latest"
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 port = 8080
-
+cmd = ["./bin/embeddings-engine"]
 # generates kubernetes manifests
 [package.metadata.kube]
-image = "ghcr.io/geoffsee/embeddings-service:latest"
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 cmd = ["./bin/embeddings-engine"]
 replicas = 1
 port = 8080
--- a/crates/embeddings-engine/Dockerfile
+++ b/crates/embeddings-engine/Dockerfile
@@ -1,42 +0,0 @@
 # ---- Build stage ----
 FROM rust:1-slim-bullseye AS builder
 WORKDIR /usr/src/app
 # Install build dependencies
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        pkg-config \
        libssl-dev \
        build-essential \
        && rm -rf /var/lib/apt/lists/*
 # Cache deps first
 COPY . ./
 RUN rm -rf src
 RUN mkdir src && echo "fn main() {}" > src/main.rs && echo "// lib" > src/lib.rs && cargo build --release
 RUN rm -rf src
 # Copy real sources and build
 COPY . .
 RUN cargo build --release
 # ---- Runtime stage ----
 FROM debian:bullseye-slim
 # Install only what the compiled binary needs
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libssl1.1 \
        ca-certificates \
        && rm -rf /var/lib/apt/lists/*
 # Copy binary from builder
 COPY --from=builder /usr/src/app/target/release/embeddings-engine /usr/local/bin/
 # Run as non-root user for safety
 RUN useradd -m appuser
 USER appuser
 EXPOSE 8080
 CMD ["embeddings-engine"]
--- a/crates/inference-engine/Cargo.toml
+++ b/crates/inference-engine/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "inference-engine"
 version.workspace = true
-edition = "2021"
+edition = "2024"
 [dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git" }
@@ -31,14 +31,21 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 reborrow = "0.5.5"
 futures-util = "0.3.31"
-gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
+gemma-runner = { path = "../../integration/gemma-runner" }
-llama-runner = { path = "../../integration/llama-runner", features = ["metal"]}
+llama-runner = { path = "../../integration/llama-runner" }
 embeddings-engine = { path = "../embeddings-engine" }
 [target.'cfg(target_os = "linux")'.dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git", default-features = false }
 candle-nn = { git = "https://github.com/huggingface/candle.git", default-features = false }
 candle-transformers = { git = "https://github.com/huggingface/candle.git", default-features = false }
 [target.'cfg(target_os = "macos")'.dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
 candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
 candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
 gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
 llama-runner = { path = "../../integration/llama-runner", features = ["metal"] }
 [dev-dependencies]
@@ -62,15 +69,19 @@ bindgen_cuda = { version = "0.1.1", optional = true }
 [features]
 bin = []
 [[bin]]
 name = "inference-engine"
 path = "src/main.rs"
 [package.metadata.compose]
-image = "ghcr.io/geoffsee/inference-engine:latest"
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 cmd = ["./bin/inference-engine"]
 port = 8080
 # generates kubernetes manifests
 [package.metadata.kube]
-image = "ghcr.io/geoffsee/inference-service:latest"
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
-replicas = 1
+cmd = ["./bin/inference-engine"]
 port = 8080
 replicas = 1
--- a/crates/inference-engine/Dockerfile
+++ b/crates/inference-engine/Dockerfile
@@ -1,86 +0,0 @@
 # ---- Build stage ----
 FROM rust:1-slim-bullseye AS builder
 WORKDIR /usr/src/app
 # Install build dependencies including CUDA toolkit for GPU support
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        pkg-config \
        libssl-dev \
        build-essential \
        wget \
        gnupg2 \
        curl \
        && rm -rf /var/lib/apt/lists/*
 # Install CUDA toolkit (optional, for GPU support)
 # This is a minimal CUDA installation for building
 RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        cuda-minimal-build-11-8 \
        libcublas-dev-11-8 \
        libcurand-dev-11-8 \
        && rm -rf /var/lib/apt/lists/* \
        && rm cuda-keyring_1.0-1_all.deb
 # Set CUDA environment variables
 ENV CUDA_HOME=/usr/local/cuda
 ENV PATH=${CUDA_HOME}/bin:${PATH}
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
 # Copy the entire workspace to get access to all crates
 COPY . ./
 # Cache dependencies first - create dummy source files
 RUN rm -rf crates/inference-engine/src
 RUN mkdir -p crates/inference-engine/src && \
    echo "fn main() {}" > crates/inference-engine/src/main.rs && \
    echo "fn main() {}" > crates/inference-engine/src/cli_main.rs && \
    echo "// lib" > crates/inference-engine/src/lib.rs && \
    cargo build --release --bin cli --package inference-engine
 # Remove dummy source and copy real sources
 RUN rm -rf crates/inference-engine/src
 COPY . .
 # Build the actual CLI binary
 RUN cargo build --release --bin cli --package inference-engine
 # ---- Runtime stage ----
 FROM debian:bullseye-slim
 # Install runtime dependencies
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libssl1.1 \
        ca-certificates \
        && rm -rf /var/lib/apt/lists/*
 # Install CUDA runtime libraries (optional, for GPU support at runtime)
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        wget \
        gnupg2 \
        && wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb \
        && dpkg -i cuda-keyring_1.0-1_all.deb \
        && apt-get update \
        && apt-get install -y --no-install-recommends \
            cuda-cudart-11-8 \
            libcublas11 \
            libcurand10 \
        && rm -rf /var/lib/apt/lists/* \
        && rm cuda-keyring_1.0-1_all.deb \
        && apt-get purge -y wget gnupg2
 # Copy binary from builder
 COPY --from=builder /usr/src/app/target/release/cli /usr/local/bin/inference-cli
 # Run as non-root user for safety
 RUN useradd -m appuser
 USER appuser
 EXPOSE 8080
 CMD ["inference-cli"]
--- a/crates/inference-engine/src/main.rs
+++ b/crates/inference-engine/src/main.rs
@@ -0,0 +1,23 @@
 use inference_engine::{create_router, init_tracing, get_server_config, AppState};
 use tokio::net::TcpListener;
 use tracing::info;
 /// Entry point for the standalone inference-engine server.
 ///
 /// Initialises tracing, builds the router from a default [`AppState`], binds a
 /// TCP listener on the address returned by `get_server_config`, and serves
 /// requests on it.
 ///
 /// # Errors
 ///
 /// Returns an error if the listener cannot be bound or if serving fails.
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
    init_tracing();
    let app = create_router(AppState::default());
    // Only the combined address is used here; the separate host and port
    // components are deliberately ignored to avoid unused-variable warnings.
    let (_, _, server_address) = get_server_config();
    let listener = TcpListener::bind(&server_address).await?;
    info!("Inference Engine server starting on http://{}", server_address);
    info!("Available endpoints:");
    info!("  POST /v1/chat/completions - OpenAI-compatible chat completions");
    info!("  GET  /v1/models         - List available models");
    axum::serve(listener, app).await?;
    Ok(())
 }
--- a/crates/predict-otron-9000/Cargo.toml
+++ b/crates/predict-otron-9000/Cargo.toml
@@ -29,7 +29,7 @@ inference-engine = { path = "../inference-engine" }
 # Dependencies for leptos web app
 #leptos-app = { path = "../leptos-app", features = ["ssr"] }
-chat-ui = { path = "../chat-ui", features = ["ssr", "hydrate"], optional = false }
+chat-ui = { path = "../chat-ui", features = ["ssr", "hydrate"], optional = true }
 mime_guess = "2.0.5"
 log = "0.4.27"
@@ -39,15 +39,20 @@ log = "0.4.27"
 name = "predict-otron-9000"
 image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 port = 8080
-
+cmd = ["./bin/predict-otron-9000"]
 # generates kubernetes manifests
 [package.metadata.kube]
 image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 replicas = 1
 port = 8080
 cmd = ["./bin/predict-otron-9000"]
 # SERVER_CONFIG Example: {\"serverMode\":\"HighAvailability\",\"services\":{\"inference_url\":\"http://custom-inference:9000\",\"embeddings_url\":\"http://custom-embeddings:9001\"}}
 # you can generate this via node to avoid toil
 # const server_config = {serverMode: "HighAvailability", services: {inference_url: "http://custom-inference:9000", embeddings_url: "http://custom-embeddings:9001"} };
 # console.log(JSON.stringify(server_config).replace(/"/g, '\\"'));
 env = { SERVER_CONFIG = "<your-json-value-here>" }
 [features]
 default = ["ui"]
 ui = ["dep:chat-ui"]
--- a/crates/predict-otron-9000/Dockerfile
+++ b/crates/predict-otron-9000/Dockerfile
@@ -1,89 +0,0 @@
 # ---- Build stage ----
 FROM rust:1-slim-bullseye AS builder
 WORKDIR /usr/src/app
 # Install build dependencies including CUDA toolkit for GPU support (needed for inference-engine dependency)
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        pkg-config \
        libssl-dev \
        build-essential \
        wget \
        gnupg2 \
        curl \
        && rm -rf /var/lib/apt/lists/*
 # Install CUDA toolkit (required for inference-engine dependency)
 # This is a minimal CUDA installation for building
 RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        cuda-minimal-build-11-8 \
        libcublas-dev-11-8 \
        libcurand-dev-11-8 \
        && rm -rf /var/lib/apt/lists/* \
        && rm cuda-keyring_1.0-1_all.deb
 # Set CUDA environment variables
 ENV CUDA_HOME=/usr/local/cuda
 ENV PATH=${CUDA_HOME}/bin:${PATH}
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
 # Copy the entire workspace to get access to all crates (needed for local dependencies)
 COPY . ./
 # Cache dependencies first - create dummy source files for all crates
 RUN rm -rf crates/predict-otron-9000/src crates/inference-engine/src crates/embeddings-engine/src
 RUN mkdir -p crates/predict-otron-9000/src crates/inference-engine/src crates/embeddings-engine/src && \
    echo "fn main() {}" > crates/predict-otron-9000/src/main.rs && \
    echo "fn main() {}" > crates/inference-engine/src/main.rs && \
    echo "fn main() {}" > crates/inference-engine/src/cli_main.rs && \
    echo "// lib" > crates/inference-engine/src/lib.rs && \
    echo "fn main() {}" > crates/embeddings-engine/src/main.rs && \
    echo "// lib" > crates/embeddings-engine/src/lib.rs && \
    cargo build --release --bin predict-otron-9000 --package predict-otron-9000
 # Remove dummy sources and copy real sources
 RUN rm -rf crates/predict-otron-9000/src crates/inference-engine/src crates/embeddings-engine/src
 COPY . .
 # Build the actual binary
 RUN cargo build --release --bin predict-otron-9000 --package predict-otron-9000
 # ---- Runtime stage ----
 FROM debian:bullseye-slim
 # Install runtime dependencies
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libssl1.1 \
        ca-certificates \
        && rm -rf /var/lib/apt/lists/*
 # Install CUDA runtime libraries (required for inference-engine dependency)
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        wget \
        gnupg2 \
        && wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb \
        && dpkg -i cuda-keyring_1.0-1_all.deb \
        && apt-get update \
        && apt-get install -y --no-install-recommends \
            cuda-cudart-11-8 \
            libcublas11 \
            libcurand10 \
        && rm -rf /var/lib/apt/lists/* \
        && rm cuda-keyring_1.0-1_all.deb \
        && apt-get purge -y wget gnupg2
 # Copy binary from builder
 COPY --from=builder /usr/src/app/target/release/predict-otron-9000 /usr/local/bin/
 # Run as non-root user for safety
 RUN useradd -m appuser
 USER appuser
 EXPOSE 8080
 CMD ["predict-otron-9000"]
--- a/crates/predict-otron-9000/src/main.rs
+++ b/crates/predict-otron-9000/src/main.rs
@@ -4,27 +4,32 @@ mod middleware;
 mod standalone_mode;
 use crate::standalone_mode::create_standalone_router;
 use axum::http::StatusCode as AxumStatusCode;
 use axum::http::header;
 use axum::response::IntoResponse;
 use axum::routing::get;
-use axum::{Router, http::Uri, response::Html, serve};
+use axum::{Router, serve};
 use config::ServerConfig;
 use ha_mode::create_ha_router;
 use inference_engine::AppState;
 use log::info;
 use middleware::{MetricsLayer, MetricsLoggerFuture, MetricsStore};
 use mime_guess::from_path;
 use rust_embed::Embed;
 use std::env;
-use std::path::Component::ParentDir;
+
 #[cfg(feature = "ui")]
 use axum::http::StatusCode as AxumStatusCode;
 #[cfg(feature = "ui")]
 use axum::http::header;
 #[cfg(feature = "ui")]
 use axum::response::IntoResponse;
 #[cfg(feature = "ui")]
 use axum::http::Uri;
 #[cfg(feature = "ui")]
 use mime_guess::from_path;
 #[cfg(feature = "ui")]
 use rust_embed::Embed;
 use tokio::net::TcpListener;
 use tower::MakeService;
 use tower_http::classify::ServerErrorsFailureClass::StatusCode;
 use tower_http::cors::{Any, CorsLayer};
 use tower_http::trace::TraceLayer;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
 #[cfg(feature = "ui")]
 #[derive(Embed)]
 #[folder = "../../target/site"]
 #[include = "*.js"]
@@ -33,6 +38,7 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
 #[include = "*.ico"]
 struct Asset;
 #[cfg(feature = "ui")]
 async fn static_handler(uri: Uri) -> axum::response::Response {
    // Strip the leading `/`
    let path = uri.path().trim_start_matches('/');
@@ -110,17 +116,22 @@ async fn main() {
    // Create metrics layer
    let metrics_layer = MetricsLayer::new(metrics_store);
    let leptos_config = chat_ui::app::AppConfig::default();
    // Create the leptos router for the web frontend
    let leptos_router = chat_ui::app::create_router(leptos_config.config.leptos_options);
    // Merge the service router with base routes and add middleware layers
-    let app = Router::new()
+    let mut app = Router::new()
        .route("/pkg/{*path}", get(static_handler))
        .route("/health", get(|| async { "ok" }))
-        .merge(service_router)
+        .merge(service_router);
-        .merge(leptos_router)
+
    // Add UI routes if the UI feature is enabled
    #[cfg(feature = "ui")]
    {
        let leptos_config = chat_ui::app::AppConfig::default();
        let leptos_router = chat_ui::app::create_router(leptos_config.config.leptos_options);
        app = app
            .route("/pkg/{*path}", get(static_handler))
            .merge(leptos_router);
    }
    let app = app
        .layer(metrics_layer) // Add metrics tracking
        .layer(cors)
        .layer(TraceLayer::new_for_http());
@@ -141,6 +152,7 @@ async fn main() {
    );
    tracing::info!("Performance metrics tracking enabled - summary logs every 60 seconds");
    tracing::info!("Available endpoints:");
    #[cfg(feature = "ui")]
    tracing::info!("  GET  / - Leptos chat web application");
    tracing::info!("  GET  /health - Health check");
    tracing::info!("  POST /v1/models - List Models");
--- a/integration/utils/Cargo.toml
+++ b/integration/utils/Cargo.toml
@@ -1,17 +1,15 @@
 [package]
 name = "utils"
 version = "0.1.4"
 edition = "2021"
 [lib]
 path = "src/lib.rs"
 [dependencies]
 accelerate-src = {version = "0.3.2", optional = true }
 candle-nn = {version = "0.9.1" }
 candle-transformers = {version = "0.9.1" }
 candle-flash-attn = {version = "0.9.1", optional = true }
 candle-onnx = {version = "0.9.1", optional = true }
 candle-core="0.9.1"
 csv = "1.3.0"
 anyhow = "1.0.99"
 cudarc = {version = "0.17.3", optional = true }
@@ -86,3 +84,14 @@ mimi = ["cpal", "symphonia", "rubato"]
 snac = ["cpal", "symphonia", "rubato"]
 depth_anything_v2 = ["palette", "enterpolation"]
 tekken = ["tekken-rs"]
 # Platform-specific candle dependencies
 [target.'cfg(target_os = "linux")'.dependencies]
 candle-nn = {version = "0.9.1", default-features = false }
 candle-transformers = {version = "0.9.1", default-features = false }
 candle-core = {version = "0.9.1", default-features = false }
 [target.'cfg(not(target_os = "linux"))'.dependencies]
 candle-nn = {version = "0.9.1" }
 candle-transformers = {version = "0.9.1" }
 candle-core = {version = "0.9.1" }