diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..9ac97d7
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,35 @@
+# Git
+.git
+.gitignore
+
+# Rust
+target/
+
+# Documentation
+README.md
+*.md
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
+
+# Environment
+.env
+.env.local
+
+# Dependencies
+node_modules/
+
+# Build artifacts
+dist/
+build/
+.fastembed_cache
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 4b92fb9..353e374 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7041,7 +7041,7 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"

 [[package]]
 name = "utils"
-version = "0.0.0"
+version = "0.1.4"
 dependencies = [
  "ab_glyph",
  "accelerate-src",
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..2a446c0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,50 @@
+# Multi-stage build for predict-otron-9000 workspace
+FROM rust:1 AS builder
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    pkg-config \
+    libssl-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create app directory
+WORKDIR /app
+
+# Copy workspace files
+COPY Cargo.toml Cargo.lock ./
+COPY crates/ ./crates/
+COPY integration/ ./integration/
+
+# Build all 3 main server binaries in release mode
+RUN cargo build --release -p predict-otron-9000 --bin predict-otron-9000 --no-default-features -p embeddings-engine --bin embeddings-engine -p inference-engine --bin inference-engine
+
+# Runtime stage
+FROM debian:bookworm-slim AS runtime
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y \
+    ca-certificates \
+    libssl3 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create app user
+RUN useradd -r -s /bin/false -m -d /app appuser
+
+# Set working directory
+WORKDIR /app
+
+# Copy binaries from builder stage
+COPY --from=builder /app/target/release/predict-otron-9000 ./bin/
+COPY --from=builder /app/target/release/embeddings-engine ./bin/
+COPY --from=builder /app/target/release/inference-engine ./bin/
+# Make binaries executable and change ownership
+RUN chmod +x ./bin/* && chown -R appuser:appuser /app
+
+# Switch to non-root user
+USER appuser
+
+# Expose ports (adjust as needed based on your services)
+EXPOSE 8080 8081 8082
+
+# Default command (can be overridden)
+CMD ["./bin/predict-otron-9000"]
\ No newline at end of file
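Editor's note: all three services now ship in this one image, with CMD selecting the binary. A minimal local-usage sketch, assuming each service listens on 8080 inside the container as the per-crate compose metadata suggests; the tag name and host ports are illustrative:

    # Build the consolidated workspace image
    docker build -t predict-otron-9000 .
    # The default CMD starts the main gateway
    docker run -p 8080:8080 predict-otron-9000
    # Override CMD to start one of the other bundled servers
    docker run -p 8081:8080 predict-otron-9000 ./bin/embeddings-engine
    docker run -p 8082:8080 predict-otron-9000 ./bin/inference-engine
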
diff --git a/crates/chat-ui/Cargo.toml b/crates/chat-ui/Cargo.toml
index e66dc32..e51bfb8 100644
--- a/crates/chat-ui/Cargo.toml
+++ b/crates/chat-ui/Cargo.toml
@@ -3,6 +3,7 @@ name = "chat-ui"
 version = "0.1.0"
 edition = "2021"

+
 [lib]
 crate-type = ["cdylib", "rlib"]
@@ -122,3 +123,7 @@ lib-default-features = false
 #
 # Optional. Defaults to "release".
 lib-profile-release = "release"
+
+[[bin]]
+name = "chat-ui"
+path = "src/main.rs"
\ No newline at end of file
diff --git a/crates/embeddings-engine/Cargo.toml b/crates/embeddings-engine/Cargo.toml
index ffc0b96..76efe88 100644
--- a/crates/embeddings-engine/Cargo.toml
+++ b/crates/embeddings-engine/Cargo.toml
@@ -28,12 +28,13 @@ once_cell = "1.19.0"


 [package.metadata.compose]
-image = "ghcr.io/geoffsee/embeddings-service:latest"
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 port = 8080
-
+cmd = ["./bin/embeddings-engine"]
 # generates kubernetes manifests
 [package.metadata.kube]
-image = "ghcr.io/geoffsee/embeddings-service:latest"
+image = "ghcr.io/geoffsee/predict-otron-9000:latest"
+cmd = ["./bin/embeddings-engine"]
 replicas = 1
 port = 8080
\ No newline at end of file
diff --git a/crates/embeddings-engine/Dockerfile b/crates/embeddings-engine/Dockerfile
deleted file mode 100644
index f266d39..0000000
--- a/crates/embeddings-engine/Dockerfile
+++ /dev/null
@@ -1,42 +0,0 @@
-# ---- Build stage ----
-FROM rust:1-slim-bullseye AS builder
-
-WORKDIR /usr/src/app
-
-# Install build dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    pkg-config \
-    libssl-dev \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# Cache deps first
-COPY . ./
-RUN rm -rf src
-RUN mkdir src && echo "fn main() {}" > src/main.rs && echo "// lib" > src/lib.rs && cargo build --release
-RUN rm -rf src
-
-# Copy real sources and build
-COPY . .
-RUN cargo build --release
-
-# ---- Runtime stage ----
-FROM debian:bullseye-slim
-
-# Install only what the compiled binary needs
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    libssl1.1 \
-    ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-
-# Copy binary from builder
-COPY --from=builder /usr/src/app/target/release/embeddings-engine /usr/local/bin/
-
-# Run as non-root user for safety
-RUN useradd -m appuser
-USER appuser
-
-EXPOSE 8080
-CMD ["embeddings-engine"]
\ No newline at end of file
diff --git a/crates/inference-engine/Cargo.toml b/crates/inference-engine/Cargo.toml
index fbefd88..692924f 100644
--- a/crates/inference-engine/Cargo.toml
+++ b/crates/inference-engine/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "inference-engine"
 version.workspace = true
-edition = "2021"
+edition = "2024"

 [dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git" }
@@ -31,14 +31,21 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 reborrow = "0.5.5"
 futures-util = "0.3.31"
-gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
-llama-runner = { path = "../../integration/llama-runner", features = ["metal"]}
+gemma-runner = { path = "../../integration/gemma-runner" }
+llama-runner = { path = "../../integration/llama-runner" }
 embeddings-engine = { path = "../embeddings-engine" }

+[target.'cfg(target_os = "linux")'.dependencies]
+candle-core = { git = "https://github.com/huggingface/candle.git", default-features = false }
+candle-nn = { git = "https://github.com/huggingface/candle.git", default-features = false }
+candle-transformers = { git = "https://github.com/huggingface/candle.git", default-features = false }
+
 [target.'cfg(target_os = "macos")'.dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
 candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
"https://github.com/huggingface/candle.git", features = ["metal"] } +gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] } +llama-runner = { path = "../../integration/llama-runner", features = ["metal"] } [dev-dependencies] @@ -62,15 +69,19 @@ bindgen_cuda = { version = "0.1.1", optional = true } [features] bin = [] +[[bin]] +name = "inference-engine" +path = "src/main.rs" [package.metadata.compose] -image = "ghcr.io/geoffsee/inference-engine:latest" +image = "ghcr.io/geoffsee/predict-otron-9000:latest" +cmd = ["./bin/inference-engine"] port = 8080 - # generates kubernetes manifests [package.metadata.kube] -image = "ghcr.io/geoffsee/inference-service:latest" -replicas = 1 +image = "ghcr.io/geoffsee/predict-otron-9000:latest" +cmd = ["./bin/inference-engine"] port = 8080 +replicas = 1 diff --git a/crates/inference-engine/Dockerfile b/crates/inference-engine/Dockerfile deleted file mode 100644 index 489a207..0000000 --- a/crates/inference-engine/Dockerfile +++ /dev/null @@ -1,86 +0,0 @@ -# ---- Build stage ---- -FROM rust:1-slim-bullseye AS builder - -WORKDIR /usr/src/app - -# Install build dependencies including CUDA toolkit for GPU support -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - pkg-config \ - libssl-dev \ - build-essential \ - wget \ - gnupg2 \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Install CUDA toolkit (optional, for GPU support) -# This is a minimal CUDA installation for building -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \ - dpkg -i cuda-keyring_1.0-1_all.deb && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - cuda-minimal-build-11-8 \ - libcublas-dev-11-8 \ - libcurand-dev-11-8 \ - && rm -rf /var/lib/apt/lists/* \ - && rm cuda-keyring_1.0-1_all.deb - -# Set CUDA environment variables -ENV CUDA_HOME=/usr/local/cuda -ENV PATH=${CUDA_HOME}/bin:${PATH} -ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} - -# Copy the entire workspace to get access to all crates -COPY . ./ - -# Cache dependencies first - create dummy source files -RUN rm -rf crates/inference-engine/src -RUN mkdir -p crates/inference-engine/src && \ - echo "fn main() {}" > crates/inference-engine/src/main.rs && \ - echo "fn main() {}" > crates/inference-engine/src/cli_main.rs && \ - echo "// lib" > crates/inference-engine/src/lib.rs && \ - cargo build --release --bin cli --package inference-engine - -# Remove dummy source and copy real sources -RUN rm -rf crates/inference-engine/src -COPY . . 
-
-# Build the actual CLI binary
-RUN cargo build --release --bin cli --package inference-engine
-
-# ---- Runtime stage ----
-FROM debian:bullseye-slim
-
-# Install runtime dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    libssl1.1 \
-    ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install CUDA runtime libraries (optional, for GPU support at runtime)
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    gnupg2 \
-    && wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb \
-    && dpkg -i cuda-keyring_1.0-1_all.deb \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends \
-    cuda-cudart-11-8 \
-    libcublas11 \
-    libcurand10 \
-    && rm -rf /var/lib/apt/lists/* \
-    && rm cuda-keyring_1.0-1_all.deb \
-    && apt-get purge -y wget gnupg2
-
-# Copy binary from builder
-COPY --from=builder /usr/src/app/target/release/cli /usr/local/bin/inference-cli
-
-# Run as non-root user for safety
-RUN useradd -m appuser
-USER appuser
-
-EXPOSE 8080
-CMD ["inference-cli"]
\ No newline at end of file
diff --git a/crates/inference-engine/src/main.rs b/crates/inference-engine/src/main.rs
new file mode 100644
index 0000000..690c9fc
--- /dev/null
+++ b/crates/inference-engine/src/main.rs
@@ -0,0 +1,23 @@
+use inference_engine::{create_router, init_tracing, get_server_config, AppState};
+use tokio::net::TcpListener;
+use tracing::info;
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    init_tracing();
+
+    let app_state = AppState::default();
+    let app = create_router(app_state);
+
+    let (server_host, server_port, server_address) = get_server_config();
+    let listener = TcpListener::bind(&server_address).await?;
+
+    info!("Inference Engine server starting on http://{}", server_address);
+    info!("Available endpoints:");
+    info!("  POST /v1/chat/completions - OpenAI-compatible chat completions");
+    info!("  GET  /v1/models - List available models");
+
+    axum::serve(listener, app).await?;
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/crates/predict-otron-9000/Cargo.toml b/crates/predict-otron-9000/Cargo.toml
index b3dabb0..8130a6a 100644
--- a/crates/predict-otron-9000/Cargo.toml
+++ b/crates/predict-otron-9000/Cargo.toml
@@ -29,7 +29,7 @@ inference-engine = { path = "../inference-engine" }

 # Dependencies for leptos web app
 #leptos-app = { path = "../leptos-app", features = ["ssr"] }
-chat-ui = { path = "../chat-ui", features = ["ssr", "hydrate"], optional = false }
+chat-ui = { path = "../chat-ui", features = ["ssr", "hydrate"], optional = true }
 mime_guess = "2.0.5"
 log = "0.4.27"
@@ -39,15 +39,20 @@
 name = "predict-otron-9000"
 image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 port = 8080
-
+cmd = ["./bin/predict-otron-9000"]
 # generates kubernetes manifests
 [package.metadata.kube]
 image = "ghcr.io/geoffsee/predict-otron-9000:latest"
 replicas = 1
 port = 8080
+cmd = ["./bin/predict-otron-9000"]

 # SERVER_CONFIG Example: {\"serverMode\":\"HighAvailability\",\"services\":{\"inference_url\":\"http://custom-inference:9000\",\"embeddings_url\":\"http://custom-embeddings:9001\"}}
 # you can generate this via node to avoid toil
 # const server_config = {serverMode: "HighAvailability", services: {inference_url: "http://custom-inference:9000", embeddings_url: "http://custom-embeddings:9001"} };
 # console.log(JSON.stringify(server_config).replace(/"/g, '\\"'));
 env = { SERVER_CONFIG = "" }
+
+[features]
+default = ["ui"]
+ui = ["dep:chat-ui"]
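Editor's note on the `ui` feature introduced above: with `default = ["ui"]`, a plain `cargo build` still compiles in the chat front end, and it is the workspace Dockerfile's `--no-default-features` flag that produces the headless gateway. A minimal sketch using only flags that appear in this diff:

    # Gateway with the embedded chat UI (default features)
    cargo build --release -p predict-otron-9000
    # Headless gateway; the optional chat-ui dependency is never built
    cargo build --release -p predict-otron-9000 --no-default-features
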
diff --git a/crates/predict-otron-9000/Dockerfile b/crates/predict-otron-9000/Dockerfile
deleted file mode 100644
index a849b21..0000000
--- a/crates/predict-otron-9000/Dockerfile
+++ /dev/null
@@ -1,89 +0,0 @@
-# ---- Build stage ----
-FROM rust:1-slim-bullseye AS builder
-
-WORKDIR /usr/src/app
-
-# Install build dependencies including CUDA toolkit for GPU support (needed for inference-engine dependency)
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    pkg-config \
-    libssl-dev \
-    build-essential \
-    wget \
-    gnupg2 \
-    curl \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install CUDA toolkit (required for inference-engine dependency)
-# This is a minimal CUDA installation for building
-RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
-    dpkg -i cuda-keyring_1.0-1_all.deb && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-    cuda-minimal-build-11-8 \
-    libcublas-dev-11-8 \
-    libcurand-dev-11-8 \
-    && rm -rf /var/lib/apt/lists/* \
-    && rm cuda-keyring_1.0-1_all.deb
-
-# Set CUDA environment variables
-ENV CUDA_HOME=/usr/local/cuda
-ENV PATH=${CUDA_HOME}/bin:${PATH}
-ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
-
-# Copy the entire workspace to get access to all crates (needed for local dependencies)
-COPY . ./
-
-# Cache dependencies first - create dummy source files for all crates
-RUN rm -rf crates/predict-otron-9000/src crates/inference-engine/src crates/embeddings-engine/src
-RUN mkdir -p crates/predict-otron-9000/src crates/inference-engine/src crates/embeddings-engine/src && \
-    echo "fn main() {}" > crates/predict-otron-9000/src/main.rs && \
-    echo "fn main() {}" > crates/inference-engine/src/main.rs && \
-    echo "fn main() {}" > crates/inference-engine/src/cli_main.rs && \
-    echo "// lib" > crates/inference-engine/src/lib.rs && \
-    echo "fn main() {}" > crates/embeddings-engine/src/main.rs && \
-    echo "// lib" > crates/embeddings-engine/src/lib.rs && \
-    cargo build --release --bin predict-otron-9000 --package predict-otron-9000
-
-# Remove dummy sources and copy real sources
-RUN rm -rf crates/predict-otron-9000/src crates/inference-engine/src crates/embeddings-engine/src
-COPY . .
-
-# Build the actual binary
-RUN cargo build --release --bin predict-otron-9000 --package predict-otron-9000
-
-# ---- Runtime stage ----
-FROM debian:bullseye-slim
-
-# Install runtime dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    libssl1.1 \
-    ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install CUDA runtime libraries (required for inference-engine dependency)
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    wget \
-    gnupg2 \
-    && wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb \
-    && dpkg -i cuda-keyring_1.0-1_all.deb \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends \
-    cuda-cudart-11-8 \
-    libcublas11 \
-    libcurand10 \
-    && rm -rf /var/lib/apt/lists/* \
-    && rm cuda-keyring_1.0-1_all.deb \
-    && apt-get purge -y wget gnupg2
-
-# Copy binary from builder
-COPY --from=builder /usr/src/app/target/release/predict-otron-9000 /usr/local/bin/
-
-# Run as non-root user for safety
-RUN useradd -m appuser
-USER appuser
-
-EXPOSE 8080
-CMD ["predict-otron-9000"]
\ No newline at end of file
diff --git a/crates/predict-otron-9000/src/main.rs b/crates/predict-otron-9000/src/main.rs
index 7418ed1..b263190 100644
--- a/crates/predict-otron-9000/src/main.rs
+++ b/crates/predict-otron-9000/src/main.rs
@@ -4,27 +4,32 @@ mod middleware;
 mod standalone_mode;

 use crate::standalone_mode::create_standalone_router;
-use axum::http::StatusCode as AxumStatusCode;
-use axum::http::header;
-use axum::response::IntoResponse;
 use axum::routing::get;
-use axum::{Router, http::Uri, response::Html, serve};
+use axum::{Router, serve};
 use config::ServerConfig;
 use ha_mode::create_ha_router;
-use inference_engine::AppState;
-use log::info;
 use middleware::{MetricsLayer, MetricsLoggerFuture, MetricsStore};
-use mime_guess::from_path;
-use rust_embed::Embed;
 use std::env;
-use std::path::Component::ParentDir;
+
+#[cfg(feature = "ui")]
+use axum::http::StatusCode as AxumStatusCode;
+#[cfg(feature = "ui")]
+use axum::http::header;
+#[cfg(feature = "ui")]
+use axum::response::IntoResponse;
+#[cfg(feature = "ui")]
+use axum::http::Uri;
+#[cfg(feature = "ui")]
+use mime_guess::from_path;
+#[cfg(feature = "ui")]
+use rust_embed::Embed;
 use tokio::net::TcpListener;
-use tower::MakeService;
-use tower_http::classify::ServerErrorsFailureClass::StatusCode;
 use tower_http::cors::{Any, CorsLayer};
 use tower_http::trace::TraceLayer;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
+
+#[cfg(feature = "ui")]
 #[derive(Embed)]
 #[folder = "../../target/site"]
 #[include = "*.js"]
@@ -33,6 +38,7 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
 #[include = "*.ico"]
 struct Asset;

+#[cfg(feature = "ui")]
 async fn static_handler(uri: Uri) -> axum::response::Response {
     // Strip the leading `/`
     let path = uri.path().trim_start_matches('/');
@@ -110,17 +116,22 @@ async fn main() {
     // Create metrics layer
     let metrics_layer = MetricsLayer::new(metrics_store);

-    let leptos_config = chat_ui::app::AppConfig::default();
-
-    // Create the leptos router for the web frontend
-    let leptos_router = chat_ui::app::create_router(leptos_config.config.leptos_options);
-
     // Merge the service router with base routes and add middleware layers
-    let app = Router::new()
-        .route("/pkg/{*path}", get(static_handler))
+    let mut app = Router::new()
         .route("/health", get(|| async { "ok" }))
-        .merge(service_router)
-        .merge(leptos_router)
+        .merge(service_router);
+
+    // Add UI routes if the UI feature is enabled
+    #[cfg(feature = "ui")]
+    {
+        let leptos_config = chat_ui::app::AppConfig::default();
+        let leptos_router = chat_ui::app::create_router(leptos_config.config.leptos_options);
+        app = app
+            .route("/pkg/{*path}", get(static_handler))
+            .merge(leptos_router);
+    }
+
+    let app = app
         .layer(metrics_layer) // Add metrics tracking
         .layer(cors)
         .layer(TraceLayer::new_for_http());
@@ -141,6 +152,7 @@ async fn main() {
     );
     tracing::info!("Performance metrics tracking enabled - summary logs every 60 seconds");
     tracing::info!("Available endpoints:");
+    #[cfg(feature = "ui")]
     tracing::info!("  GET  / - Leptos chat web application");
     tracing::info!("  GET  /health - Health check");
     tracing::info!("  POST /v1/models - List Models");
diff --git a/integration/utils/Cargo.toml b/integration/utils/Cargo.toml
index fa474c5..dc96482 100644
--- a/integration/utils/Cargo.toml
+++ b/integration/utils/Cargo.toml
@@ -1,17 +1,15 @@
 [package]
 name = "utils"
+version = "0.1.4"
+edition = "2021"

 [lib]
 path = "src/lib.rs"

 [dependencies]
 accelerate-src = {version = "0.3.2", optional = true }
-candle-nn = {version = "0.9.1" }
-candle-transformers = {version = "0.9.1" }
-
 candle-flash-attn = {version = "0.9.1", optional = true }
 candle-onnx = {version = "0.9.1", optional = true }
-candle-core="0.9.1"
 csv = "1.3.0"
 anyhow = "1.0.99"
 cudarc = {version = "0.17.3", optional = true }
@@ -85,4 +83,15 @@ encodec = ["cpal", "symphonia", "rubato"]
 mimi = ["cpal", "symphonia", "rubato"]
 snac = ["cpal", "symphonia", "rubato"]
 depth_anything_v2 = ["palette", "enterpolation"]
-tekken = ["tekken-rs"]
\ No newline at end of file
+tekken = ["tekken-rs"]
+
+# Platform-specific candle dependencies
+[target.'cfg(target_os = "linux")'.dependencies]
+candle-nn = {version = "0.9.1", default-features = false }
+candle-transformers = {version = "0.9.1", default-features = false }
+candle-core = {version = "0.9.1", default-features = false }
+
+[target.'cfg(not(target_os = "linux"))'.dependencies]
+candle-nn = {version = "0.9.1" }
+candle-transformers = {version = "0.9.1" }
+candle-core = {version = "0.9.1" }
\ No newline at end of file
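Editor's note: one way to sanity-check the target-specific candle tables above is to compare cargo's feature resolution across platforms with `cargo tree`; the Linux target triple below is illustrative:

    # Feature resolution for a Linux build (candle default features disabled)
    cargo tree -p utils -e features --target x86_64-unknown-linux-gnu | grep candle-core
    # Feature resolution for the host platform (e.g. macOS)
    cargo tree -p utils -e features | grep candle-core
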