add root dockerfile that contains binaries for all services

geoffsee
2025-09-04 14:54:20 -04:00
parent fb5098eba6
commit 296d4dbe7e
13 changed files with 189 additions and 255 deletions

crates/inference-engine/Cargo.toml

@@ -1,7 +1,7 @@
[package]
name = "inference-engine"
version.workspace = true
edition = "2021"
edition = "2024"
[dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git" }
@@ -31,14 +31,21 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] }
uuid = { version = "1.7.0", features = ["v4"] }
reborrow = "0.5.5"
futures-util = "0.3.31"
gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
llama-runner = { path = "../../integration/llama-runner", features = ["metal"]}
gemma-runner = { path = "../../integration/gemma-runner" }
llama-runner = { path = "../../integration/llama-runner" }
embeddings-engine = { path = "../embeddings-engine" }
[target.'cfg(target_os = "linux")'.dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git", default-features = false }
candle-nn = { git = "https://github.com/huggingface/candle.git", default-features = false }
candle-transformers = { git = "https://github.com/huggingface/candle.git", default-features = false }
[target.'cfg(target_os = "macos")'.dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
llama-runner = { path = "../../integration/llama-runner", features = ["metal"] }
[dev-dependencies]
@@ -62,15 +69,19 @@ bindgen_cuda = { version = "0.1.1", optional = true }
[features]
bin = []
[[bin]]
name = "inference-engine"
path = "src/main.rs"
[package.metadata.compose]
image = "ghcr.io/geoffsee/inference-engine:latest"
image = "ghcr.io/geoffsee/predict-otron-9000:latest"
cmd = ["./bin/inference-engine"]
port = 8080
# generates kubernetes manifests
[package.metadata.kube]
image = "ghcr.io/geoffsee/inference-service:latest"
replicas = 1
image = "ghcr.io/geoffsee/predict-otron-9000:latest"
cmd = ["./bin/inference-engine"]
port = 8080
replicas = 1

Dockerfile for the inference-engine CLI (deleted)

@@ -1,86 +0,0 @@
# ---- Build stage ----
FROM rust:1-slim-bullseye AS builder
WORKDIR /usr/src/app
# Install build dependencies including CUDA toolkit for GPU support
RUN apt-get update && \
apt-get install -y --no-install-recommends \
pkg-config \
libssl-dev \
build-essential \
wget \
gnupg2 \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install CUDA toolkit (optional, for GPU support)
# This is a minimal CUDA installation for building
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-minimal-build-11-8 \
libcublas-dev-11-8 \
libcurand-dev-11-8 \
&& rm -rf /var/lib/apt/lists/* \
&& rm cuda-keyring_1.0-1_all.deb
# Set CUDA environment variables
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
# Copy the entire workspace to get access to all crates
COPY . ./
# Cache dependencies first - create dummy source files
RUN rm -rf crates/inference-engine/src
RUN mkdir -p crates/inference-engine/src && \
echo "fn main() {}" > crates/inference-engine/src/main.rs && \
echo "fn main() {}" > crates/inference-engine/src/cli_main.rs && \
echo "// lib" > crates/inference-engine/src/lib.rs && \
cargo build --release --bin cli --package inference-engine
# Remove dummy source and copy real sources
RUN rm -rf crates/inference-engine/src
COPY . .
# Build the actual CLI binary
RUN cargo build --release --bin cli --package inference-engine
# ---- Runtime stage ----
FROM debian:bullseye-slim
# Install runtime dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libssl1.1 \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Install CUDA runtime libraries (optional, for GPU support at runtime)
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
gnupg2 \
&& wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
cuda-cudart-11-8 \
libcublas11 \
libcurand10 \
&& rm -rf /var/lib/apt/lists/* \
&& rm cuda-keyring_1.0-1_all.deb \
&& apt-get purge -y wget gnupg2
# Copy binary from builder
COPY --from=builder /usr/src/app/target/release/cli /usr/local/bin/inference-cli
# Run as non-root user for safety
RUN useradd -m appuser
USER appuser
EXPOSE 8080
CMD ["inference-cli"]
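
The root Dockerfile named in the commit title is not shown in this excerpt. Below is a minimal sketch of what such a shared image could look like, assuming it reuses the base images and non-root setup of the deleted per-crate Dockerfile above, builds every workspace binary, and places them under ./bin so the compose/kube metadata's cmd = ["./bin/inference-engine"] can select one per service. The embeddings-engine binary name is an assumption; the actual file may differ.

# ---- Build stage (sketch) ----
FROM rust:1-slim-bullseye AS builder
WORKDIR /usr/src/app
RUN apt-get update && \
    apt-get install -y --no-install-recommends pkg-config libssl-dev build-essential && \
    rm -rf /var/lib/apt/lists/*
COPY . .
# Build all workspace binaries; the Linux target section in Cargo.toml above
# uses candle without the Metal feature, so no GPU toolchain is required here.
RUN cargo build --release --workspace

# ---- Runtime stage (sketch) ----
FROM debian:bullseye-slim
RUN apt-get update && \
    apt-get install -y --no-install-recommends libssl1.1 ca-certificates && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Collect every service binary under ./bin so one image can back all services;
# compose/kube metadata picks the entrypoint, e.g. cmd = ["./bin/inference-engine"].
COPY --from=builder /usr/src/app/target/release/inference-engine ./bin/
# embeddings-engine is assumed to build a binary of the same name
COPY --from=builder /usr/src/app/target/release/embeddings-engine ./bin/
RUN useradd -m appuser
USER appuser
EXPOSE 8080
CMD ["./bin/inference-engine"]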

crates/inference-engine/src/main.rs (new file)

@@ -0,0 +1,23 @@
use inference_engine::{create_router, init_tracing, get_server_config, AppState};
use tokio::net::TcpListener;
use tracing::info;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    init_tracing();

    let app_state = AppState::default();
    let app = create_router(app_state);
    let (server_host, server_port, server_address) = get_server_config();
    let listener = TcpListener::bind(&server_address).await?;

    info!("Inference Engine server starting on http://{}", server_address);
    info!("Available endpoints:");
    info!(" POST /v1/chat/completions - OpenAI-compatible chat completions");
    info!(" GET /v1/models - List available models");

    axum::serve(listener, app).await?;
    Ok(())
}