add root dockerfile that contains binaries for all services

geoffsee
2025-09-04 14:54:20 -04:00
parent fb5098eba6
commit 296d4dbe7e
13 changed files with 189 additions and 255 deletions

crates/inference-engine/Cargo.toml

@@ -1,7 +1,7 @@
[package]
name = "inference-engine"
version.workspace = true
edition = "2021"
edition = "2024"
[dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git" }
@@ -31,14 +31,21 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] }
uuid = { version = "1.7.0", features = ["v4"] }
reborrow = "0.5.5"
futures-util = "0.3.31"
gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
llama-runner = { path = "../../integration/llama-runner", features = ["metal"]}
gemma-runner = { path = "../../integration/gemma-runner" }
llama-runner = { path = "../../integration/llama-runner" }
embeddings-engine = { path = "../embeddings-engine" }
[target.'cfg(target_os = "linux")'.dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git", default-features = false }
candle-nn = { git = "https://github.com/huggingface/candle.git", default-features = false }
candle-transformers = { git = "https://github.com/huggingface/candle.git", default-features = false }
[target.'cfg(target_os = "macos")'.dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
llama-runner = { path = "../../integration/llama-runner", features = ["metal"] }
[dev-dependencies]
@@ -62,15 +69,19 @@ bindgen_cuda = { version = "0.1.1", optional = true }
[features]
bin = []
[[bin]]
name = "inference-engine"
path = "src/main.rs"
[package.metadata.compose]
image = "ghcr.io/geoffsee/inference-engine:latest"
image = "ghcr.io/geoffsee/predict-otron-9000:latest"
cmd = ["./bin/inference-engine"]
port = 8080
# generates kubernetes manifests
[package.metadata.kube]
image = "ghcr.io/geoffsee/inference-service:latest"
replicas = 1
image = "ghcr.io/geoffsee/predict-otron-9000:latest"
cmd = ["./bin/inference-engine"]
port = 8080
replicas = 1

Dockerfile for the inference-engine CLI (deleted)

@@ -1,86 +0,0 @@
# ---- Build stage ----
FROM rust:1-slim-bullseye AS builder
WORKDIR /usr/src/app
# Install build dependencies including CUDA toolkit for GPU support
RUN apt-get update && \
apt-get install -y --no-install-recommends \
pkg-config \
libssl-dev \
build-essential \
wget \
gnupg2 \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install CUDA toolkit (optional, for GPU support)
# This is a minimal CUDA installation for building
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-minimal-build-11-8 \
libcublas-dev-11-8 \
libcurand-dev-11-8 \
&& rm -rf /var/lib/apt/lists/* \
&& rm cuda-keyring_1.0-1_all.deb
# Set CUDA environment variables
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
# Copy the entire workspace to get access to all crates
COPY . ./
# Cache dependencies first - create dummy source files
RUN rm -rf crates/inference-engine/src
RUN mkdir -p crates/inference-engine/src && \
echo "fn main() {}" > crates/inference-engine/src/main.rs && \
echo "fn main() {}" > crates/inference-engine/src/cli_main.rs && \
echo "// lib" > crates/inference-engine/src/lib.rs && \
cargo build --release --bin cli --package inference-engine
# Remove dummy source and copy real sources
RUN rm -rf crates/inference-engine/src
COPY . .
# Build the actual CLI binary
RUN cargo build --release --bin cli --package inference-engine
# ---- Runtime stage ----
FROM debian:bullseye-slim
# Install runtime dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libssl1.1 \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Install CUDA runtime libraries (optional, for GPU support at runtime)
RUN apt-get update && \
apt-get install -y --no-install-recommends \
wget \
gnupg2 \
&& wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
cuda-cudart-11-8 \
libcublas11 \
libcurand10 \
&& rm -rf /var/lib/apt/lists/* \
&& rm cuda-keyring_1.0-1_all.deb \
&& apt-get purge -y wget gnupg2
# Copy binary from builder
COPY --from=builder /usr/src/app/target/release/cli /usr/local/bin/inference-cli
# Run as non-root user for safety
RUN useradd -m appuser
USER appuser
EXPOSE 8080
CMD ["inference-cli"]
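
The root Dockerfile named in the commit title is not shown in this excerpt. Below is a minimal sketch of what such a shared image could look like, assuming it reuses the base images and non-root setup of the deleted per-crate Dockerfile above, builds every workspace binary, and places them under ./bin so the compose/kube metadata's cmd = ["./bin/inference-engine"] can select one per service. The embeddings-engine binary name is an assumption; the actual file may differ.

# ---- Build stage (sketch) ----
FROM rust:1-slim-bullseye AS builder
WORKDIR /usr/src/app
RUN apt-get update && \
    apt-get install -y --no-install-recommends pkg-config libssl-dev build-essential && \
    rm -rf /var/lib/apt/lists/*
COPY . .
# Build all workspace binaries; the Linux target section in Cargo.toml above
# uses candle without the Metal feature, so no GPU toolchain is required here.
RUN cargo build --release --workspace

# ---- Runtime stage (sketch) ----
FROM debian:bullseye-slim
RUN apt-get update && \
    apt-get install -y --no-install-recommends libssl1.1 ca-certificates && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Collect every service binary under ./bin so one image can back all services;
# compose/kube metadata picks the entrypoint, e.g. cmd = ["./bin/inference-engine"].
COPY --from=builder /usr/src/app/target/release/inference-engine ./bin/
# embeddings-engine is assumed to build a binary of the same name
COPY --from=builder /usr/src/app/target/release/embeddings-engine ./bin/
RUN useradd -m appuser
USER appuser
EXPOSE 8080
CMD ["./bin/inference-engine"]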

crates/inference-engine/src/main.rs (new file)

@@ -0,0 +1,23 @@
use inference_engine::{create_router, init_tracing, get_server_config, AppState};
use tokio::net::TcpListener;
use tracing::info;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    init_tracing();

    let app_state = AppState::default();
    let app = create_router(app_state);
    let (server_host, server_port, server_address) = get_server_config();
    let listener = TcpListener::bind(&server_address).await?;

    info!("Inference Engine server starting on http://{}", server_address);
    info!("Available endpoints:");
    info!(" POST /v1/chat/completions - OpenAI-compatible chat completions");
    info!(" GET /v1/models - List available models");

    axum::serve(listener, app).await?;
    Ok(())
}