mirror of https://github.com/geoffsee/predict-otron-9001.git
synced 2025-09-08 22:46:44 +00:00
add root dockerfile that contains binaries for all services
crates/inference-engine/Cargo.toml

@@ -1,7 +1,7 @@
[package]
name = "inference-engine"
version.workspace = true
-edition = "2021"
+edition = "2024"

[dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git" }

@@ -31,14 +31,21 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] }
uuid = { version = "1.7.0", features = ["v4"] }
reborrow = "0.5.5"
futures-util = "0.3.31"
-gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
-llama-runner = { path = "../../integration/llama-runner", features = ["metal"]}
+gemma-runner = { path = "../../integration/gemma-runner" }
+llama-runner = { path = "../../integration/llama-runner" }
embeddings-engine = { path = "../embeddings-engine" }

+[target.'cfg(target_os = "linux")'.dependencies]
+candle-core = { git = "https://github.com/huggingface/candle.git", default-features = false }
+candle-nn = { git = "https://github.com/huggingface/candle.git", default-features = false }
+candle-transformers = { git = "https://github.com/huggingface/candle.git", default-features = false }
+
[target.'cfg(target_os = "macos")'.dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
+gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
+llama-runner = { path = "../../integration/llama-runner", features = ["metal"] }

[dev-dependencies]

@@ -62,15 +69,19 @@ bindgen_cuda = { version = "0.1.1", optional = true }
[features]
bin = []

[[bin]]
name = "inference-engine"
path = "src/main.rs"

[package.metadata.compose]
-image = "ghcr.io/geoffsee/inference-engine:latest"
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
cmd = ["./bin/inference-engine"]
port = 8080

# generates kubernetes manifests
[package.metadata.kube]
-image = "ghcr.io/geoffsee/inference-service:latest"
-replicas = 1
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
+cmd = ["./bin/inference-engine"]
+port = 8080
+replicas = 1
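Note on the Cargo.toml change above: the Metal feature gating moves out of the shared [dependencies] table and into the macOS-only target section, while a new Linux target section pulls the Candle crates with default-features = false. As a minimal illustration (not code from this repository; the function names are hypothetical), the same cfg(target_os) predicate used in those target tables can also drive conditional compilation in Rust source:

// Illustrative sketch only: the platform split from the Cargo.toml target
// tables expressed as conditional compilation. Function names are made up
// and are not part of inference-engine.
#[cfg(target_os = "macos")]
fn acceleration_backend() -> &'static str {
    // On macOS the Metal-enabled candle/runner crates are compiled in.
    "metal"
}

#[cfg(not(target_os = "macos"))]
fn acceleration_backend() -> &'static str {
    // Elsewhere (e.g. the Linux target above, built with
    // default-features = false) the CPU path is used.
    "cpu"
}

fn main() {
    println!("selected backend: {}", acceleration_backend());
}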
Deleted Dockerfile (inference-engine)

@@ -1,86 +0,0 @@
# ---- Build stage ----
FROM rust:1-slim-bullseye AS builder

WORKDIR /usr/src/app

# Install build dependencies including CUDA toolkit for GPU support
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    pkg-config \
    libssl-dev \
    build-essential \
    wget \
    gnupg2 \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install CUDA toolkit (optional, for GPU support)
# This is a minimal CUDA installation for building
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    cuda-minimal-build-11-8 \
    libcublas-dev-11-8 \
    libcurand-dev-11-8 \
    && rm -rf /var/lib/apt/lists/* \
    && rm cuda-keyring_1.0-1_all.deb

# Set CUDA environment variables
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Copy the entire workspace to get access to all crates
COPY . ./

# Cache dependencies first - create dummy source files
RUN rm -rf crates/inference-engine/src
RUN mkdir -p crates/inference-engine/src && \
    echo "fn main() {}" > crates/inference-engine/src/main.rs && \
    echo "fn main() {}" > crates/inference-engine/src/cli_main.rs && \
    echo "// lib" > crates/inference-engine/src/lib.rs && \
    cargo build --release --bin cli --package inference-engine

# Remove dummy source and copy real sources
RUN rm -rf crates/inference-engine/src
COPY . .

# Build the actual CLI binary
RUN cargo build --release --bin cli --package inference-engine

# ---- Runtime stage ----
FROM debian:bullseye-slim

# Install runtime dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    libssl1.1 \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install CUDA runtime libraries (optional, for GPU support at runtime)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    wget \
    gnupg2 \
    && wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb \
    && dpkg -i cuda-keyring_1.0-1_all.deb \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
    cuda-cudart-11-8 \
    libcublas11 \
    libcurand10 \
    && rm -rf /var/lib/apt/lists/* \
    && rm cuda-keyring_1.0-1_all.deb \
    && apt-get purge -y wget gnupg2

# Copy binary from builder
COPY --from=builder /usr/src/app/target/release/cli /usr/local/bin/inference-cli

# Run as non-root user for safety
RUN useradd -m appuser
USER appuser

EXPOSE 8080
CMD ["inference-cli"]
crates/inference-engine/src/main.rs (new file, 23 lines)
@@ -0,0 +1,23 @@
use inference_engine::{create_router, init_tracing, get_server_config, AppState};
use tokio::net::TcpListener;
use tracing::info;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    init_tracing();

    let app_state = AppState::default();
    let app = create_router(app_state);

    let (server_host, server_port, server_address) = get_server_config();
    let listener = TcpListener::bind(&server_address).await?;

    info!("Inference Engine server starting on http://{}", server_address);
    info!("Available endpoints:");
    info!("  POST /v1/chat/completions - OpenAI-compatible chat completions");
    info!("  GET  /v1/models - List available models");

    axum::serve(listener, app).await?;

    Ok(())
}
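Note on main.rs: because the service is exposed through inference_engine::create_router, a root binary like the one this commit introduces could mount the same router alongside the other services. The sketch below is an assumption about how that composition might look, not the repository's actual root binary; only create_router, init_tracing, get_server_config, and AppState are confirmed by the file above, and the commented-out embeddings_engine call is a guess at a sibling API.

// Hypothetical sketch of a combined binary; only the inference_engine items
// shown in main.rs above are known APIs. Everything else is an assumption.
use inference_engine::{create_router, init_tracing, AppState};
use tokio::net::TcpListener;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    init_tracing();

    // Merge the inference routes into one router; additional services could be
    // merged the same way so a single image serves every endpoint on one port.
    let app = axum::Router::new()
        .merge(create_router(AppState::default()));
        // .merge(embeddings_engine::create_router()) // assumed, unverified API

    // Port 8080 matches the package.metadata.compose entry; illustrative only.
    let listener = TcpListener::bind("0.0.0.0:8080").await?;
    axum::serve(listener, app).await?;
    Ok(())
}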