From ff55d882c77e34f08bf0c46ec847e3b1eb5eb793 Mon Sep 17 00:00:00 2001
From: geoffsee <>
Date: Thu, 4 Sep 2025 12:27:13 -0400
Subject: [PATCH] reorg + update docs with new paths

---
 .gitignore                                     |   1 +
 Cargo.lock                                     |   1 +
 Cargo.toml                                     |  10 +-
 README.md                                      |  23 +-
 crates/embeddings-engine/src/lib.rs            | 432 ++++++++++++++++--
 crates/embeddings-engine/src/main.rs           | 129 +-----
 crates/inference-engine/Cargo.toml             |   5 +-
 crates/inference-engine/src/server.rs          |  21 +-
 crates/predict-otron-9000/src/main.rs          |   1 +
 docs/ARCHITECTURE.md                           |  25 +-
 {crates => integration}/cli/Cargo.toml         |   0
 {crates => integration}/cli/README.md          |   2 +-
 {crates => integration}/cli/build.rs           |   0
 {crates => integration}/cli/bun_target.rs      |   0
 integration/cli/package/bun.lock               |  17 +
 {crates => integration}/cli/package/cli.ts     |   0
 .../cli/package/package.json                   |   0
 {crates => integration}/cli/src/main.rs        |   0
 .../gemma-runner/Cargo.toml                    |   2 +-
 .../gemma-runner/README.md                     |   0
 .../gemma-runner/src/gemma_api.rs              |   0
 .../gemma-runner/src/gemma_cli.rs              |   0
 .../gemma-runner/src/lib.rs                    |   0
 .../gemma-runner/src/main.rs                   |   0
 .../helm-chart-tool/Cargo.toml                 |   0
 .../helm-chart-tool/README.md                  |   0
 .../helm-chart-tool/src/main.rs                |   2 +-
 .../llama-runner/Cargo.toml                    |   0
 .../llama-runner/README.md                     |   0
 .../llama-runner/src/lib.rs                    |   0
 .../llama-runner/src/llama_api.rs              |   0
 .../llama-runner/src/llama_cli.rs              |   0
 .../llama-runner/src/main.rs                   |   0
 {crates => integration}/utils/Cargo.toml       |   0
 {crates => integration}/utils/src/audio.rs     |   0
 {crates => integration}/utils/src/bs1770.rs    |   0
 .../utils/src/coco_classes.rs                  |   0
 {crates => integration}/utils/src/imagenet.rs  |   0
 {crates => integration}/utils/src/lib.rs       |   0
 {crates => integration}/utils/src/main.rs      |   0
 .../utils/src/token_output_stream.rs           |   0
 {crates => integration}/utils/src/wav.rs       |   0
 package.json                                   |   4 +-
 43 files changed, 493 insertions(+), 182 deletions(-)
 rename {crates => integration}/cli/Cargo.toml (100%)
 rename {crates => integration}/cli/README.md (96%)
 rename {crates => integration}/cli/build.rs (100%)
 rename {crates => integration}/cli/bun_target.rs (100%)
 create mode 100644 integration/cli/package/bun.lock
 rename {crates => integration}/cli/package/cli.ts (100%)
 rename {crates => integration}/cli/package/package.json (100%)
 rename {crates => integration}/cli/src/main.rs (100%)
 rename {crates => integration}/gemma-runner/Cargo.toml (97%)
 rename {crates => integration}/gemma-runner/README.md (100%)
 rename {crates => integration}/gemma-runner/src/gemma_api.rs (100%)
 rename {crates => integration}/gemma-runner/src/gemma_cli.rs (100%)
 rename {crates => integration}/gemma-runner/src/lib.rs (100%)
 rename {crates => integration}/gemma-runner/src/main.rs (100%)
 rename {crates => integration}/helm-chart-tool/Cargo.toml (100%)
 rename {crates => integration}/helm-chart-tool/README.md (100%)
 rename {crates => integration}/helm-chart-tool/src/main.rs (99%)
 rename {crates => integration}/llama-runner/Cargo.toml (100%)
 rename {crates => integration}/llama-runner/README.md (100%)
 rename {crates => integration}/llama-runner/src/lib.rs (100%)
 rename {crates => integration}/llama-runner/src/llama_api.rs (100%)
 rename {crates => integration}/llama-runner/src/llama_cli.rs (100%)
 rename {crates => integration}/llama-runner/src/main.rs (100%)
 rename {crates => integration}/utils/Cargo.toml (100%)
 rename {crates => integration}/utils/src/audio.rs (100%)
 rename {crates => integration}/utils/src/bs1770.rs (100%)
 rename {crates => integration}/utils/src/coco_classes.rs (100%)
 rename {crates => integration}/utils/src/imagenet.rs (100%)
 rename {crates => integration}/utils/src/lib.rs (100%)
 rename {crates => integration}/utils/src/main.rs (100%)
 rename {crates => integration}/utils/src/token_output_stream.rs (100%)
 rename {crates => integration}/utils/src/wav.rs (100%)

diff --git a/.gitignore b/.gitignore
index 0ce0aea..8dd381d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -77,3 +77,4 @@ venv/
 !/scripts/cli.ts
 /**/.*.bun-build
 /AGENTS.md
+.claude
diff --git a/Cargo.lock b/Cargo.lock
index 8fa9f2f..4b92fb9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2905,6 +2905,7 @@ dependencies = [
 "clap",
 "cpal",
 "either",
+ "embeddings-engine",
 "futures-util",
 "gemma-runner",
 "imageproc 0.24.0",
diff --git a/Cargo.toml b/Cargo.toml
index 77c7a12..9424302 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,12 +3,12 @@ members = [
     "crates/predict-otron-9000",
     "crates/inference-engine",
     "crates/embeddings-engine",
-    "crates/helm-chart-tool",
-    "crates/llama-runner",
-    "crates/gemma-runner",
-    "crates/cli",
+    "integration/helm-chart-tool",
+    "integration/llama-runner",
+    "integration/gemma-runner",
+    "integration/cli",
     "crates/chat-ui"
-, "crates/utils"]
+, "integration/utils"]
 
 default-members = ["crates/predict-otron-9000"]
 resolver = "2"
diff --git a/README.md b/README.md
index 091a908..8d85b2d 100644
--- a/README.md
+++ b/README.md
@@ -53,14 +53,17 @@ The project uses a 9-crate Rust workspace plus TypeScript components:
 crates/
 ├── predict-otron-9000/      # Main orchestration server (Rust 2024)
 ├── inference-engine/        # Multi-model inference orchestrator (Rust 2021)
+├── embeddings-engine/       # FastEmbed embeddings service (Rust 2024)
+└── chat-ui/                 # WASM web frontend (Rust 2021)
+
+integration/
+├── cli/                     # CLI client crate (Rust 2024)
+│   └── package/
+│       └── cli.ts           # TypeScript/Bun CLI client
 ├── gemma-runner/            # Gemma model inference via Candle (Rust 2021)
 ├── llama-runner/            # Llama model inference via Candle (Rust 2021)
-├── embeddings-engine/       # FastEmbed embeddings service (Rust 2024)
-├── chat-ui/                 # WASM web frontend (Rust 2021)
 ├── helm-chart-tool/         # Kubernetes deployment tooling (Rust 2024)
-└── cli/                     # CLI client crate (Rust 2024)
-    └── package/
-        └── cli.ts           # TypeScript/Bun CLI client
+└── utils/                   # Shared utilities (Rust 2021)
 ```
 
 ### Service Architecture
@@ -160,16 +163,16 @@ cd crates/chat-ui
 #### TypeScript CLI Client
 ```bash
 # List available models
-cd crates/cli/package && bun run cli.ts --list-models
+cd integration/cli/package && bun run cli.ts --list-models
 
 # Chat completion
-cd crates/cli/package && bun run cli.ts "What is the capital of France?"
+cd integration/cli/package && bun run cli.ts "What is the capital of France?"
 
 # With specific model
-cd crates/cli/package && bun run cli.ts --model gemma-3-1b-it --prompt "Hello, world!"
+cd integration/cli/package && bun run cli.ts --model gemma-3-1b-it --prompt "Hello, world!"
 
 # Show help
-cd crates/cli/package && bun run cli.ts --help
+cd integration/cli/package && bun run cli.ts --help
 ```
 
 ## API Usage
@@ -464,7 +467,7 @@ curl -s http://localhost:8080/v1/models | jq
 
 **CLI client test:**
 ```bash
-cd crates/cli/package && bun run cli.ts "What is 2+2?"
+cd integration/cli/package && bun run cli.ts "What is 2+2?"
 ```
 
 **Web frontend:**
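Reviewer note (not part of the patch): the embeddings route reworked below accepts any supported FastEmbed model by name, so the README's curl tests can be extended to verify per-model dimensions. A minimal sketch, assuming the server is listening on localhost:8080 as the README's other examples do:

```bash
# Select a 384-dimensional model per request (see parse_embedding_model below).
curl -s http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "BAAI/bge-small-en-v1.5", "input": "Hello, world!"}' \
  | jq '.data[0].embedding | length'   # expected: 384
```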
diff --git a/crates/embeddings-engine/src/lib.rs b/crates/embeddings-engine/src/lib.rs
index 787edba..2433214 100644
--- a/crates/embeddings-engine/src/lib.rs
+++ b/crates/embeddings-engine/src/lib.rs
@@ -1,43 +1,183 @@
 use async_openai::types::{CreateEmbeddingRequest, EmbeddingInput};
-use axum::{Json, Router, response::Json as ResponseJson, routing::post};
+use axum::{Json, Router, response::Json as ResponseJson, routing::{get, post}, http::StatusCode};
 use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
 use once_cell::sync::Lazy;
+use serde::Serialize;
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
 use tower_http::trace::TraceLayer;
 use tracing;
 
-// Persistent model instance (singleton pattern)
-static EMBEDDING_MODEL: Lazy<TextEmbedding> = Lazy::new(|| {
-    tracing::info!("Initializing persistent embedding model (singleton)");
+// Cache for multiple embedding models
+static MODEL_CACHE: Lazy<RwLock<HashMap<EmbeddingModel, Arc<TextEmbedding>>>> = Lazy::new(|| {
+    RwLock::new(HashMap::new())
+});
+
+#[derive(Serialize)]
+pub struct ModelInfo {
+    pub id: String,
+    pub object: String,
+    pub owned_by: String,
+    pub description: String,
+    pub dimensions: usize,
+}
+
+#[derive(Serialize)]
+pub struct ModelsResponse {
+    pub object: String,
+    pub data: Vec<ModelInfo>,
+}
+
+// Function to convert model name strings to EmbeddingModel enum variants
+fn parse_embedding_model(model_name: &str) -> Result<EmbeddingModel, String> {
+    match model_name {
+        // Sentence Transformers models
+        "sentence-transformers/all-MiniLM-L6-v2" | "all-minilm-l6-v2" => Ok(EmbeddingModel::AllMiniLML6V2),
+        "sentence-transformers/all-MiniLM-L6-v2-q" | "all-minilm-l6-v2-q" => Ok(EmbeddingModel::AllMiniLML6V2Q),
+        "sentence-transformers/all-MiniLM-L12-v2" | "all-minilm-l12-v2" => Ok(EmbeddingModel::AllMiniLML12V2),
+        "sentence-transformers/all-MiniLM-L12-v2-q" | "all-minilm-l12-v2-q" => Ok(EmbeddingModel::AllMiniLML12V2Q),
+
+        // BGE models
+        "BAAI/bge-base-en-v1.5" | "bge-base-en-v1.5" => Ok(EmbeddingModel::BGEBaseENV15),
+        "BAAI/bge-base-en-v1.5-q" | "bge-base-en-v1.5-q" => Ok(EmbeddingModel::BGEBaseENV15Q),
+        "BAAI/bge-large-en-v1.5" | "bge-large-en-v1.5" => Ok(EmbeddingModel::BGELargeENV15),
+        "BAAI/bge-large-en-v1.5-q" | "bge-large-en-v1.5-q" => Ok(EmbeddingModel::BGELargeENV15Q),
+        "BAAI/bge-small-en-v1.5" | "bge-small-en-v1.5" => Ok(EmbeddingModel::BGESmallENV15),
+        "BAAI/bge-small-en-v1.5-q" | "bge-small-en-v1.5-q" => Ok(EmbeddingModel::BGESmallENV15Q),
+        "BAAI/bge-small-zh-v1.5" | "bge-small-zh-v1.5" => Ok(EmbeddingModel::BGESmallZHV15),
+        "BAAI/bge-large-zh-v1.5" | "bge-large-zh-v1.5" => Ok(EmbeddingModel::BGELargeZHV15),
+
+        // Nomic models
+        "nomic-ai/nomic-embed-text-v1" | "nomic-embed-text-v1" => Ok(EmbeddingModel::NomicEmbedTextV1),
+        "nomic-ai/nomic-embed-text-v1.5" | "nomic-embed-text-v1.5" | "nomic-text-embed" => Ok(EmbeddingModel::NomicEmbedTextV15),
+        "nomic-ai/nomic-embed-text-v1.5-q" | "nomic-embed-text-v1.5-q" => Ok(EmbeddingModel::NomicEmbedTextV15Q),
+
+        // Paraphrase models
+        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" | "paraphrase-multilingual-minilm-l12-v2" => Ok(EmbeddingModel::ParaphraseMLMiniLML12V2),
+        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2-q" | "paraphrase-multilingual-minilm-l12-v2-q" => Ok(EmbeddingModel::ParaphraseMLMiniLML12V2Q),
+        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" | "paraphrase-multilingual-mpnet-base-v2" => Ok(EmbeddingModel::ParaphraseMLMpnetBaseV2),
+
+        // ModernBert
+        "lightonai/modernbert-embed-large" | "modernbert-embed-large" => Ok(EmbeddingModel::ModernBertEmbedLarge),
+
+        // Multilingual E5 models
+        "intfloat/multilingual-e5-small" | "multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
+        "intfloat/multilingual-e5-base" | "multilingual-e5-base" => Ok(EmbeddingModel::MultilingualE5Base),
+        "intfloat/multilingual-e5-large" | "multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large),
+
+        // Mixedbread models
+        "mixedbread-ai/mxbai-embed-large-v1" | "mxbai-embed-large-v1" => Ok(EmbeddingModel::MxbaiEmbedLargeV1),
+        "mixedbread-ai/mxbai-embed-large-v1-q" | "mxbai-embed-large-v1-q" => Ok(EmbeddingModel::MxbaiEmbedLargeV1Q),
+
+        // GTE models
+        "Alibaba-NLP/gte-base-en-v1.5" | "gte-base-en-v1.5" => Ok(EmbeddingModel::GTEBaseENV15),
+        "Alibaba-NLP/gte-base-en-v1.5-q" | "gte-base-en-v1.5-q" => Ok(EmbeddingModel::GTEBaseENV15Q),
+        "Alibaba-NLP/gte-large-en-v1.5" | "gte-large-en-v1.5" => Ok(EmbeddingModel::GTELargeENV15),
+        "Alibaba-NLP/gte-large-en-v1.5-q" | "gte-large-en-v1.5-q" => Ok(EmbeddingModel::GTELargeENV15Q),
+
+        // CLIP model
+        "Qdrant/clip-ViT-B-32-text" | "clip-vit-b-32" => Ok(EmbeddingModel::ClipVitB32),
+
+        // Jina model
+        "jinaai/jina-embeddings-v2-base-code" | "jina-embeddings-v2-base-code" => Ok(EmbeddingModel::JinaEmbeddingsV2BaseCode),
+
+        _ => Err(format!("Unsupported embedding model: {}", model_name)),
+    }
+}
+
+// Function to get model dimensions
+fn get_model_dimensions(model: &EmbeddingModel) -> usize {
+    match model {
+        EmbeddingModel::AllMiniLML6V2 | EmbeddingModel::AllMiniLML6V2Q => 384,
+        EmbeddingModel::AllMiniLML12V2 | EmbeddingModel::AllMiniLML12V2Q => 384,
+        EmbeddingModel::BGEBaseENV15 | EmbeddingModel::BGEBaseENV15Q => 768,
+        EmbeddingModel::BGELargeENV15 | EmbeddingModel::BGELargeENV15Q => 1024,
+        EmbeddingModel::BGESmallENV15 | EmbeddingModel::BGESmallENV15Q => 384,
+        EmbeddingModel::BGESmallZHV15 => 512,
+        EmbeddingModel::BGELargeZHV15 => 1024,
+        EmbeddingModel::NomicEmbedTextV1 | EmbeddingModel::NomicEmbedTextV15 | EmbeddingModel::NomicEmbedTextV15Q => 768,
+        EmbeddingModel::ParaphraseMLMiniLML12V2 | EmbeddingModel::ParaphraseMLMiniLML12V2Q => 384,
+        EmbeddingModel::ParaphraseMLMpnetBaseV2 => 768,
+        EmbeddingModel::ModernBertEmbedLarge => 1024,
+        EmbeddingModel::MultilingualE5Small => 384,
+        EmbeddingModel::MultilingualE5Base => 768,
+        EmbeddingModel::MultilingualE5Large => 1024,
+        EmbeddingModel::MxbaiEmbedLargeV1 | EmbeddingModel::MxbaiEmbedLargeV1Q => 1024,
+        EmbeddingModel::GTEBaseENV15 | EmbeddingModel::GTEBaseENV15Q => 768,
+        EmbeddingModel::GTELargeENV15 | EmbeddingModel::GTELargeENV15Q => 1024,
+        EmbeddingModel::ClipVitB32 => 512,
+        EmbeddingModel::JinaEmbeddingsV2BaseCode => 768,
+    }
+}
+
+// Function to get or create a model from cache
+fn get_or_create_model(embedding_model: EmbeddingModel) -> Result<Arc<TextEmbedding>, String> {
+    // First try to get from cache (read lock)
+    {
+        let cache = MODEL_CACHE.read().map_err(|e| format!("Failed to acquire read lock: {}", e))?;
+        if let Some(model) = cache.get(&embedding_model) {
+            tracing::debug!("Using cached model: {:?}", embedding_model);
+            return Ok(Arc::clone(model));
+        }
+    }
+
+    // Model not in cache, create it (write lock)
+    let mut cache = MODEL_CACHE.write().map_err(|e| format!("Failed to acquire write lock: {}", e))?;
+
+    // Double-check after acquiring write lock
+    if let Some(model) = cache.get(&embedding_model) {
+        tracing::debug!("Using cached model (double-check): {:?}", embedding_model);
+        return Ok(Arc::clone(model));
+    }
+
+    tracing::info!("Initializing new embedding model: {:?}", embedding_model);
     let model_start_time = std::time::Instant::now();
-
+
     let model = TextEmbedding::try_new(
-        InitOptions::new(EmbeddingModel::NomicEmbedTextV15).with_show_download_progress(true),
+        InitOptions::new(embedding_model.clone()).with_show_download_progress(true),
     )
-    .expect("Failed to initialize persistent embedding model");
-
+    .map_err(|e| format!("Failed to initialize model {:?}: {}", embedding_model, e))?;
+
     let model_init_time = model_start_time.elapsed();
     tracing::info!(
-        "Persistent embedding model initialized in {:.2?}",
+        "Embedding model {:?} initialized in {:.2?}",
+        embedding_model,
         model_init_time
     );
-
-    model
-});
+
+    let model_arc = Arc::new(model);
+    cache.insert(embedding_model.clone(), Arc::clone(&model_arc));
+    Ok(model_arc)
+}
 
 pub async fn embeddings_create(
     Json(payload): Json<CreateEmbeddingRequest>,
-) -> ResponseJson<serde_json::Value> {
+) -> Result<ResponseJson<serde_json::Value>, (StatusCode, String)> {
     // Start timing the entire process
     let start_time = std::time::Instant::now();
 
-    // Phase 1: Access persistent model instance
+    // Phase 1: Parse and get the embedding model
     let model_start_time = std::time::Instant::now();
-
-    // Access the lazy-initialized persistent model instance
-    // This will only initialize the model on the first request
+
+    let embedding_model = match parse_embedding_model(&payload.model) {
+        Ok(model) => model,
+        Err(e) => {
+            tracing::error!("Invalid model requested: {}", e);
+            return Err((StatusCode::BAD_REQUEST, format!("Invalid model: {}", e)));
+        }
+    };
+
+    let model = match get_or_create_model(embedding_model.clone()) {
+        Ok(model) => model,
+        Err(e) => {
+            tracing::error!("Failed to get/create model: {}", e);
+            return Err((StatusCode::INTERNAL_SERVER_ERROR, format!("Model initialization failed: {}", e)));
+        }
+    };
+
     let model_access_time = model_start_time.elapsed();
     tracing::debug!(
-        "Persistent model access completed in {:.2?}",
+        "Model access/creation completed in {:.2?}",
         model_access_time
     );
 
@@ -65,9 +205,12 @@ pub async fn embeddings_create(
 
     // Phase 3: Generate embeddings
     let embedding_start_time = std::time::Instant::now();
 
-    let embeddings = EMBEDDING_MODEL
+    let embeddings = model
         .embed(texts_from_embedding_input, None)
-        .expect("failed to embed document");
+        .map_err(|e| {
+            tracing::error!("Failed to generate embeddings: {}", e);
+            (StatusCode::INTERNAL_SERVER_ERROR, format!("Embedding generation failed: {}", e))
+        })?;
 
     let embedding_generation_time = embedding_start_time.elapsed();
     tracing::info!(
@@ -117,8 +260,9 @@ pub async fn embeddings_create(
             // Generate a random non-zero embedding
             use rand::Rng;
             let mut rng = rand::thread_rng();
-            let mut random_embedding = Vec::with_capacity(768);
-            for _ in 0..768 {
+            let expected_dimensions = get_model_dimensions(&embedding_model);
+            let mut random_embedding = Vec::with_capacity(expected_dimensions);
+            for _ in 0..expected_dimensions {
                 // Generate random values between -1.0 and 1.0, excluding 0
                 let mut val = 0.0;
                 while val == 0.0 {
@@ -138,18 +282,19 @@ pub async fn embeddings_create(
             random_embedding
         } else {
             // Check if dimensions parameter is provided and pad the embeddings if necessary
-            let mut padded_embedding = embeddings[0].clone();
+            let padded_embedding = embeddings[0].clone();
 
-            // If the client expects 768 dimensions but our model produces fewer, pad with zeros
-            let target_dimension = 768;
-            if padded_embedding.len() < target_dimension {
-                let padding_needed = target_dimension - padded_embedding.len();
-                tracing::trace!(
-                    "Padding embedding with {} zeros to reach {} dimensions",
-                    padding_needed,
-                    target_dimension
+            // Use the actual model dimensions instead of hardcoded 768
+            let actual_dimensions = padded_embedding.len();
+            let expected_dimensions = get_model_dimensions(&embedding_model);
+
+            if actual_dimensions != expected_dimensions {
+                tracing::warn!(
+                    "Model {:?} produced {} dimensions but expected {}",
+                    embedding_model,
+                    actual_dimensions,
+                    expected_dimensions
                 );
-                padded_embedding.extend(vec![0.0; padding_needed]);
             }
 
             padded_embedding
@@ -203,11 +348,232 @@ pub async fn embeddings_create(
         postprocessing_time
     );
 
-    ResponseJson(response)
+    Ok(ResponseJson(response))
+}
+
+pub async fn models_list() -> ResponseJson<ModelsResponse> {
+    let models = vec![
+        ModelInfo {
+            id: "sentence-transformers/all-MiniLM-L6-v2".to_string(),
+            object: "model".to_string(),
+            owned_by: "sentence-transformers".to_string(),
+            description: "Sentence Transformer model, MiniLM-L6-v2".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "sentence-transformers/all-MiniLM-L6-v2-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "sentence-transformers".to_string(),
+            description: "Quantized Sentence Transformer model, MiniLM-L6-v2".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "sentence-transformers/all-MiniLM-L12-v2".to_string(),
+            object: "model".to_string(),
+            owned_by: "sentence-transformers".to_string(),
+            description: "Sentence Transformer model, MiniLM-L12-v2".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "sentence-transformers/all-MiniLM-L12-v2-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "sentence-transformers".to_string(),
+            description: "Quantized Sentence Transformer model, MiniLM-L12-v2".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "BAAI/bge-base-en-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "v1.5 release of the base English model".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "BAAI/bge-base-en-v1.5-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "Quantized v1.5 release of the base English model".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "BAAI/bge-large-en-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "v1.5 release of the large English model".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "BAAI/bge-large-en-v1.5-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "Quantized v1.5 release of the large English model".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "BAAI/bge-small-en-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "v1.5 release of the fast and default English model".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "BAAI/bge-small-en-v1.5-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "Quantized v1.5 release of the fast and default English model".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "BAAI/bge-small-zh-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "v1.5 release of the small Chinese model".to_string(),
+            dimensions: 512,
+        },
+        ModelInfo {
+            id: "BAAI/bge-large-zh-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "BAAI".to_string(),
+            description: "v1.5 release of the large Chinese model".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "nomic-ai/nomic-embed-text-v1".to_string(),
+            object: "model".to_string(),
+            owned_by: "nomic-ai".to_string(),
+            description: "8192 context length English model".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "nomic-ai/nomic-embed-text-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "nomic-ai".to_string(),
+            description: "v1.5 release of the 8192 context length English model".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "nomic-ai/nomic-embed-text-v1.5-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "nomic-ai".to_string(),
+            description: "Quantized v1.5 release of the 8192 context length English model".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2".to_string(),
+            object: "model".to_string(),
+            owned_by: "sentence-transformers".to_string(),
+            description: "Multi-lingual model".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "sentence-transformers".to_string(),
+            description: "Quantized Multi-lingual model".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".to_string(),
+            object: "model".to_string(),
+            owned_by: "sentence-transformers".to_string(),
+            description: "Sentence-transformers model for tasks like clustering or semantic search".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "lightonai/modernbert-embed-large".to_string(),
+            object: "model".to_string(),
+            owned_by: "lightonai".to_string(),
+            description: "Large model of ModernBert Text Embeddings".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "intfloat/multilingual-e5-small".to_string(),
+            object: "model".to_string(),
+            owned_by: "intfloat".to_string(),
+            description: "Small model of multilingual E5 Text Embeddings".to_string(),
+            dimensions: 384,
+        },
+        ModelInfo {
+            id: "intfloat/multilingual-e5-base".to_string(),
+            object: "model".to_string(),
+            owned_by: "intfloat".to_string(),
+            description: "Base model of multilingual E5 Text Embeddings".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "intfloat/multilingual-e5-large".to_string(),
+            object: "model".to_string(),
+            owned_by: "intfloat".to_string(),
+            description: "Large model of multilingual E5 Text Embeddings".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "mixedbread-ai/mxbai-embed-large-v1".to_string(),
+            object: "model".to_string(),
+            owned_by: "mixedbread-ai".to_string(),
+            description: "Large English embedding model from Mixedbread.ai".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "mixedbread-ai/mxbai-embed-large-v1-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "mixedbread-ai".to_string(),
+            description: "Quantized Large English embedding model from Mixedbread.ai".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "Alibaba-NLP/gte-base-en-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "Alibaba-NLP".to_string(),
+            description: "Base multilingual embedding model from Alibaba".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "Alibaba-NLP/gte-base-en-v1.5-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "Alibaba-NLP".to_string(),
+            description: "Quantized Base multilingual embedding model from Alibaba".to_string(),
+            dimensions: 768,
+        },
+        ModelInfo {
+            id: "Alibaba-NLP/gte-large-en-v1.5".to_string(),
+            object: "model".to_string(),
+            owned_by: "Alibaba-NLP".to_string(),
+            description: "Large multilingual embedding model from Alibaba".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "Alibaba-NLP/gte-large-en-v1.5-q".to_string(),
+            object: "model".to_string(),
+            owned_by: "Alibaba-NLP".to_string(),
+            description: "Quantized Large multilingual embedding model from Alibaba".to_string(),
+            dimensions: 1024,
+        },
+        ModelInfo {
+            id: "Qdrant/clip-ViT-B-32-text".to_string(),
+            object: "model".to_string(),
+            owned_by: "Qdrant".to_string(),
+            description: "CLIP text encoder based on ViT-B/32".to_string(),
+            dimensions: 512,
+        },
+        ModelInfo {
+            id: "jinaai/jina-embeddings-v2-base-code".to_string(),
+            object: "model".to_string(),
+            owned_by: "jinaai".to_string(),
+            description: "Jina embeddings v2 base code".to_string(),
+            dimensions: 768,
+        },
+    ];
+
+    ResponseJson(ModelsResponse {
+        object: "list".to_string(),
+        data: models,
+    })
 }
 
 pub fn create_embeddings_router() -> Router {
     Router::new()
         .route("/v1/embeddings", post(embeddings_create))
+        // .route("/v1/models", get(models_list))
         .layer(TraceLayer::new_for_http())
 }
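Reviewer note (not part of the patch): a quick local check of the new error path and the model cache introduced above. This sketch assumes the service is running on localhost:8080; the log line and status code come from the handler code in this diff:

```bash
# Unknown model names are rejected by parse_embedding_model with 400 Bad Request.
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "no-such-model", "input": "x"}'   # expected: 400

# Repeated requests for the same model hit the in-process cache; with debug
# logging enabled, the second call logs "Using cached model" instead of
# re-initializing the TextEmbedding instance.
for i in 1 2; do
  curl -s -o /dev/null http://localhost:8080/v1/embeddings \
    -H "Content-Type: application/json" \
    -d '{"model": "sentence-transformers/all-MiniLM-L6-v2", "input": "warm-up"}'
done
```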
diff --git a/crates/embeddings-engine/src/main.rs b/crates/embeddings-engine/src/main.rs
index 2e58a93..44135aa 100644
--- a/crates/embeddings-engine/src/main.rs
+++ b/crates/embeddings-engine/src/main.rs
@@ -4,8 +4,6 @@ use axum::{
     response::Json as ResponseJson,
     routing::{get, post},
 };
-use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
-use serde::{Deserialize, Serialize};
 use std::env;
 use tower_http::trace::TraceLayer;
 use tracing;
@@ -13,127 +11,30 @@ use tracing;
 const DEFAULT_SERVER_HOST: &str = "127.0.0.1";
 const DEFAULT_SERVER_PORT: &str = "8080";
 
+use embeddings_engine;
+
 async fn embeddings_create(
     Json(payload): Json<CreateEmbeddingRequest>,
-) -> ResponseJson<serde_json::Value> {
-    let model = TextEmbedding::try_new(
-        InitOptions::new(EmbeddingModel::NomicEmbedTextV15).with_show_download_progress(true),
-    )
-    .expect("Failed to initialize model");
-
-    let embedding_input = payload.input;
-
-    let texts_from_embedding_input = match embedding_input {
-        EmbeddingInput::String(text) => vec![text],
-        EmbeddingInput::StringArray(texts) => texts,
-        EmbeddingInput::IntegerArray(_) => {
-            panic!("Integer array input not supported for text embeddings");
+) -> Result<ResponseJson<serde_json::Value>, axum::response::Response> {
+    match embeddings_engine::embeddings_create(Json(payload)).await {
+        Ok(response) => Ok(response),
+        Err((status_code, message)) => {
+            Err(axum::response::Response::builder()
+                .status(status_code)
+                .body(axum::body::Body::from(message))
+                .unwrap())
         }
-        EmbeddingInput::ArrayOfIntegerArray(_) => {
-            panic!("Array of integer arrays not supported for text embeddings");
-        }
-    };
+    }
+}
 
-    let embeddings = model
-        .embed(texts_from_embedding_input, None)
-        .expect("failed to embed document");
-
-    // Only log detailed embedding information at trace level to reduce log volume
-    tracing::trace!("Embeddings length: {}", embeddings.len());
-    tracing::info!("Embedding dimension: {}", embeddings[0].len());
-
-    // Log the first 10 values of the original embedding at trace level
-    tracing::trace!(
-        "Original embedding preview: {:?}",
-        &embeddings[0][..10.min(embeddings[0].len())]
-    );
-
-    // Check if there are any NaN or zero values in the original embedding
-    let nan_count = embeddings[0].iter().filter(|&&x| x.is_nan()).count();
-    let zero_count = embeddings[0].iter().filter(|&&x| x == 0.0).count();
-    tracing::trace!(
-        "Original embedding stats: NaN count={}, zero count={}",
-        nan_count,
-        zero_count
-    );
-
-    // Create the final embedding
-    let final_embedding = {
-        // Check if the embedding is all zeros
-        let all_zeros = embeddings[0].iter().all(|&x| x == 0.0);
-        if all_zeros {
-            tracing::warn!("Embedding is all zeros. Generating random non-zero embedding.");
-
-            // Generate a random non-zero embedding
-            use rand::Rng;
-            let mut rng = rand::thread_rng();
-            let mut random_embedding = Vec::with_capacity(768);
-            for _ in 0..768 {
-                // Generate random values between -1.0 and 1.0, excluding 0
-                let mut val = 0.0;
-                while val == 0.0 {
-                    val = rng.gen_range(-1.0..1.0);
-                }
-                random_embedding.push(val);
-            }
-
-            // Normalize the random embedding
-            let norm: f32 = random_embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
-            for i in 0..random_embedding.len() {
-                random_embedding[i] /= norm;
-            }
-
-            random_embedding
-        } else {
-            // Check if dimensions parameter is provided and pad the embeddings if necessary
-            let mut padded_embedding = embeddings[0].clone();
-
-            // If the client expects 768 dimensions but our model produces fewer, pad with zeros
-            let target_dimension = 768;
-            if padded_embedding.len() < target_dimension {
-                let padding_needed = target_dimension - padded_embedding.len();
-                tracing::trace!(
-                    "Padding embedding with {} zeros to reach {} dimensions",
-                    padding_needed,
-                    target_dimension
-                );
-                padded_embedding.extend(vec![0.0; padding_needed]);
-            }
-
-            padded_embedding
-        }
-    };
-
-    tracing::trace!("Final embedding dimension: {}", final_embedding.len());
-
-    // Log the first 10 values of the final embedding at trace level
-    tracing::trace!(
-        "Final embedding preview: {:?}",
-        &final_embedding[..10.min(final_embedding.len())]
-    );
-
-    // Return a response that matches the OpenAI API format
-    let response = serde_json::json!({
-        "object": "list",
-        "data": [
-            {
-                "object": "embedding",
-                "index": 0,
-                "embedding": final_embedding
-            }
-        ],
-        "model": payload.model,
-        "usage": {
-            "prompt_tokens": 0,
-            "total_tokens": 0
-        }
-    });
-    ResponseJson(response)
+async fn models_list() -> ResponseJson<embeddings_engine::ModelsResponse> {
+    embeddings_engine::models_list().await
 }
 
 fn create_app() -> Router {
     Router::new()
         .route("/v1/embeddings", post(embeddings_create))
+        .route("/v1/models", get(models_list))
         .layer(TraceLayer::new_for_http())
 }
 
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
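Reviewer note (not part of the patch): with the `/v1/models` route wired into `create_app()` above, the standalone embeddings server now serves the model catalog. A sketch, assuming the defaults `127.0.0.1:8080` from this file:

```bash
# List the embedding model catalog exposed by the new GET route.
curl -s http://127.0.0.1:8080/v1/models | jq -r '.data[].id'
```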
diff --git a/crates/inference-engine/Cargo.toml b/crates/inference-engine/Cargo.toml
index 1857d59..fbefd88 100644
--- a/crates/inference-engine/Cargo.toml
+++ b/crates/inference-engine/Cargo.toml
@@ -31,8 +31,9 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 reborrow = "0.5.5"
 futures-util = "0.3.31"
-gemma-runner = { path = "../gemma-runner", features = ["metal"] }
-llama-runner = { path = "../llama-runner", features = ["metal"]}
+gemma-runner = { path = "../../integration/gemma-runner", features = ["metal"] }
+llama-runner = { path = "../../integration/llama-runner", features = ["metal"]}
+embeddings-engine = { path = "../embeddings-engine" }
 
 [target.'cfg(target_os = "macos")'.dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
diff --git a/crates/inference-engine/src/server.rs b/crates/inference-engine/src/server.rs
index 79e87af..613a14e 100644
--- a/crates/inference-engine/src/server.rs
+++ b/crates/inference-engine/src/server.rs
@@ -19,6 +19,7 @@ use crate::openai_types::{
 };
 use crate::Which;
 use either::Either;
+use embeddings_engine::models_list;
 use gemma_runner::{run_gemma_api, GemmaInferenceConfig};
 use llama_runner::{run_llama_inference, LlamaInferenceConfig};
 use serde_json::Value;
@@ -530,7 +531,9 @@ pub async fn list_models() -> Json<ModelListResponse> {
         Which::Llama32_3BInstruct,
     ];
 
-    let models: Vec<Model> = which_variants.into_iter().map(|which| {
+
+
+    let mut models: Vec<Model> = which_variants.into_iter().map(|which| {
         let meta = which.meta();
         let model_id = match which {
             Which::Base2B => "gemma-2b",
@@ -566,11 +569,25 @@ pub async fn list_models() -> Json<ModelListResponse> {
         Model {
             id: model_id.to_string(),
             object: "model".to_string(),
-            created: 1686935002, // Using same timestamp as OpenAI example
+            created: 1686935002,
             owned_by: owned_by.to_string(),
         }
     }).collect();
 
+    // Get embeddings models and convert them to inference Model format
+    let embeddings_response = models_list().await;
+    let embeddings_models: Vec<Model> = embeddings_response.0.data.into_iter().map(|embedding_model| {
+        Model {
+            id: embedding_model.id,
+            object: embedding_model.object,
+            created: 1686935002,
+            owned_by: format!("{} - {}", embedding_model.owned_by, embedding_model.description),
+        }
+    }).collect();
+
+    // Add embeddings models to the main models list
+    models.extend(embeddings_models);
+
     Json(ModelListResponse {
         object: "list".to_string(),
         data: models,
diff --git a/crates/predict-otron-9000/src/main.rs b/crates/predict-otron-9000/src/main.rs
index 6d4a517..e5f2db7 100644
--- a/crates/predict-otron-9000/src/main.rs
+++ b/crates/predict-otron-9000/src/main.rs
@@ -144,6 +144,7 @@ async fn main() {
     tracing::info!("Available endpoints:");
     tracing::info!("  GET  / - Leptos chat web application");
     tracing::info!("  GET  /health - Health check");
+    tracing::info!("  GET  /v1/models - List Models");
     tracing::info!("  POST /v1/embeddings - Text embeddings API");
     tracing::info!("  POST /v1/chat/completions - Chat completions API");
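Reviewer note (not part of the patch): after the `server.rs` change above, the unified `/v1/models` response mixes chat and embedding models, and embedding entries carry the combined `"{owner} - {description}"` string in `owned_by`. A sketch to eyeball the merged list, assuming the orchestration server on localhost:8080:

```bash
# Show each model id alongside its owned_by field; embedding entries stand out
# by their "owner - description" format.
curl -s http://localhost:8080/v1/models \
  | jq -r '.data[] | "\(.id)\t\(.owned_by)"'
```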
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index 44ffbc6..2c32ecc 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -61,20 +61,22 @@ graph TD
         A[predict-otron-9000<br/>Edition: 2024<br/>Port: 8080]
     end
 
-    subgraph "AI Services"
+    subgraph "AI Services (crates/)"
         B[inference-engine<br/>Edition: 2021<br/>Port: 8080<br/>Multi-model orchestrator]
-        J[gemma-runner<br/>Edition: 2021<br/>Gemma via Candle]
-        K[llama-runner<br/>Edition: 2021<br/>Llama via Candle]
         C[embeddings-engine<br/>Edition: 2024<br/>Port: 8080<br/>FastEmbed]
     end
 
-    subgraph "Frontend"
+    subgraph "Frontend (crates/)"
         D[chat-ui<br/>Edition: 2021<br/>Port: 8788<br/>WASM UI]
     end
-
-    subgraph "Tooling"
+
+    subgraph "Integration Tools (integration/)"
         L[helm-chart-tool<br/>Edition: 2024<br/>K8s deployment]
         E[cli<br/>Edition: 2024<br/>TypeScript/Bun CLI]
+        M[gemma-runner<br/>Edition: 2021<br/>Gemma via Candle]
+        N[llama-runner<br/>Edition: 2021<br/>Llama via Candle]
+        O[utils<br/>Edition: 2021<br/>Shared utilities]
     end
     end
@@ -82,10 +84,10 @@ graph TD
     A --> B
     A --> C
     A --> D
-    B --> J
-    B --> K
-    J -.-> F[Candle 0.9.1]
-    K -.-> F
+    B --> M
+    B --> N
+    M -.-> F[Candle 0.9.1]
+    N -.-> F
     C -.-> G[FastEmbed 4.x]
     D -.-> H[Leptos 0.8.0]
     E -.-> I[OpenAI SDK 5.16+]
@@ -93,12 +95,13 @@ graph TD
     style A fill:#e1f5fe
     style B fill:#f3e5f5
-    style J fill:#f3e5f5
-    style K fill:#f3e5f5
     style C fill:#e8f5e8
     style D fill:#fff3e0
     style E fill:#fce4ec
     style L fill:#fff9c4
+    style M fill:#f3e5f5
+    style N fill:#f3e5f5
+    style O fill:#fff9c4
 ```
 
 ## Deployment Configurations
diff --git a/crates/cli/Cargo.toml b/integration/cli/Cargo.toml
similarity index 100%
rename from crates/cli/Cargo.toml
rename to integration/cli/Cargo.toml
diff --git a/crates/cli/README.md b/integration/cli/README.md
similarity index 96%
rename from crates/cli/README.md
rename to integration/cli/README.md
index f93bdab..1d880cc 100644
--- a/crates/cli/README.md
+++ b/integration/cli/README.md
@@ -14,7 +14,7 @@ Options:
   --help         Show this help message
 
 Examples:
-  cd crates/cli/package
+  cd integration/cli/package
   bun run cli.ts "What is the capital of France?"
   bun run cli.ts --model gemma-3-1b-it --prompt "Hello, world!"
   bun run cli.ts --prompt "Who was the 16th president of the United States?"
diff --git a/crates/cli/build.rs b/integration/cli/build.rs
similarity index 100%
rename from crates/cli/build.rs
rename to integration/cli/build.rs
diff --git a/crates/cli/bun_target.rs b/integration/cli/bun_target.rs
similarity index 100%
rename from crates/cli/bun_target.rs
rename to integration/cli/bun_target.rs
diff --git a/integration/cli/package/bun.lock b/integration/cli/package/bun.lock
new file mode 100644
index 0000000..d75a688
--- /dev/null
+++ b/integration/cli/package/bun.lock
@@ -0,0 +1,17 @@
+{
+  "lockfileVersion": 1,
+  "workspaces": {
+    "": {
+      "name": "cli",
+      "dependencies": {
+        "install": "^0.13.0",
+        "openai": "^5.16.0",
+      },
+    },
+  },
+  "packages": {
+    "install": ["install@0.13.0", "", {}, "sha512-zDml/jzr2PKU9I8J/xyZBQn8rPCAY//UOYNmR01XwNwyfhEWObo2SWfSl1+0tm1u6PhxLwDnfsT/6jB7OUxqFA=="],
+
+    "openai": ["openai@5.19.1", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.23.8" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-zSqnUF7oR9ksmpusKkpUgkNrj8Sl57U+OyzO8jzc7LUjTMg4DRfR3uCm+EIMA6iw06sRPNp4t7ojp3sCpEUZRQ=="],
+  }
+}
diff --git a/crates/cli/package/cli.ts b/integration/cli/package/cli.ts
similarity index 100%
rename from crates/cli/package/cli.ts
rename to integration/cli/package/cli.ts
diff --git a/crates/cli/package/package.json b/integration/cli/package/package.json
similarity index 100%
rename from crates/cli/package/package.json
rename to integration/cli/package/package.json
diff --git a/crates/cli/src/main.rs b/integration/cli/src/main.rs
similarity index 100%
rename from crates/cli/src/main.rs
rename to integration/cli/src/main.rs
diff --git a/crates/gemma-runner/Cargo.toml b/integration/gemma-runner/Cargo.toml
similarity index 97%
rename from crates/gemma-runner/Cargo.toml
rename to integration/gemma-runner/Cargo.toml
index 57154db..1becbcc 100644
--- a/crates/gemma-runner/Cargo.toml
+++ b/integration/gemma-runner/Cargo.toml
@@ -18,7 +18,7 @@ serde_json = "1.0"
 tracing = "0.1"
 tracing-chrome = "0.7"
 tracing-subscriber = "0.3"
-utils = {path = "../utils"}
+utils = {path = "../utils" }
 
 [target.'cfg(target_os = "macos")'.dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
diff --git a/crates/gemma-runner/README.md b/integration/gemma-runner/README.md
similarity index 100%
rename from crates/gemma-runner/README.md
rename to integration/gemma-runner/README.md
diff --git a/crates/gemma-runner/src/gemma_api.rs b/integration/gemma-runner/src/gemma_api.rs
similarity index 100%
rename from crates/gemma-runner/src/gemma_api.rs
rename to integration/gemma-runner/src/gemma_api.rs
diff --git a/crates/gemma-runner/src/gemma_cli.rs b/integration/gemma-runner/src/gemma_cli.rs
similarity index 100%
rename from crates/gemma-runner/src/gemma_cli.rs
rename to integration/gemma-runner/src/gemma_cli.rs
diff --git a/crates/gemma-runner/src/lib.rs b/integration/gemma-runner/src/lib.rs
similarity index 100%
rename from crates/gemma-runner/src/lib.rs
rename to integration/gemma-runner/src/lib.rs
diff --git a/crates/gemma-runner/src/main.rs b/integration/gemma-runner/src/main.rs
similarity index 100%
rename from crates/gemma-runner/src/main.rs
rename to integration/gemma-runner/src/main.rs
diff --git a/crates/helm-chart-tool/Cargo.toml b/integration/helm-chart-tool/Cargo.toml
similarity index 100%
rename from crates/helm-chart-tool/Cargo.toml
rename to integration/helm-chart-tool/Cargo.toml
diff --git a/crates/helm-chart-tool/README.md b/integration/helm-chart-tool/README.md
similarity index 100%
rename from crates/helm-chart-tool/README.md
rename to integration/helm-chart-tool/README.md
diff --git a/crates/helm-chart-tool/src/main.rs b/integration/helm-chart-tool/src/main.rs
similarity index 99%
rename from crates/helm-chart-tool/src/main.rs
rename to integration/helm-chart-tool/src/main.rs
index 3d9ab37..888bb5a 100644
--- a/crates/helm-chart-tool/src/main.rs
+++ b/integration/helm-chart-tool/src/main.rs
@@ -105,7 +105,7 @@ fn discover_services(workspace_path: &str) -> Result<Vec<ServiceInfo>> {
         .into_iter()
         .filter_map(|e| e.ok())
     {
-        if entry.file_name() == "Cargo.toml" && entry.path() != workspace_root.join("Cargo.toml") {
+        if entry.file_name() == "Cargo.toml" && entry.path() != workspace_root.join("../../../Cargo.toml") {
             if let Ok(service_info) = parse_cargo_toml(entry.path()) {
                 services.push(service_info);
             }
diff --git a/crates/llama-runner/Cargo.toml b/integration/llama-runner/Cargo.toml
similarity index 100%
rename from crates/llama-runner/Cargo.toml
rename to integration/llama-runner/Cargo.toml
diff --git a/crates/llama-runner/README.md b/integration/llama-runner/README.md
similarity index 100%
rename from crates/llama-runner/README.md
rename to integration/llama-runner/README.md
diff --git a/crates/llama-runner/src/lib.rs b/integration/llama-runner/src/lib.rs
similarity index 100%
rename from crates/llama-runner/src/lib.rs
rename to integration/llama-runner/src/lib.rs
diff --git a/crates/llama-runner/src/llama_api.rs b/integration/llama-runner/src/llama_api.rs
similarity index 100%
rename from crates/llama-runner/src/llama_api.rs
rename to integration/llama-runner/src/llama_api.rs
diff --git a/crates/llama-runner/src/llama_cli.rs b/integration/llama-runner/src/llama_cli.rs
similarity index 100%
rename from crates/llama-runner/src/llama_cli.rs
rename to integration/llama-runner/src/llama_cli.rs
diff --git a/crates/llama-runner/src/main.rs b/integration/llama-runner/src/main.rs
similarity index 100%
rename from crates/llama-runner/src/main.rs
rename to integration/llama-runner/src/main.rs
diff --git a/crates/utils/Cargo.toml b/integration/utils/Cargo.toml
similarity index 100%
rename from crates/utils/Cargo.toml
rename to integration/utils/Cargo.toml
diff --git a/crates/utils/src/audio.rs b/integration/utils/src/audio.rs
similarity index 100%
rename from crates/utils/src/audio.rs
rename to integration/utils/src/audio.rs
diff --git a/crates/utils/src/bs1770.rs b/integration/utils/src/bs1770.rs
similarity index 100%
rename from crates/utils/src/bs1770.rs
rename to integration/utils/src/bs1770.rs
diff --git a/crates/utils/src/coco_classes.rs b/integration/utils/src/coco_classes.rs
similarity index 100%
rename from crates/utils/src/coco_classes.rs
rename to integration/utils/src/coco_classes.rs
diff --git a/crates/utils/src/imagenet.rs b/integration/utils/src/imagenet.rs
similarity index 100%
rename from crates/utils/src/imagenet.rs
rename to integration/utils/src/imagenet.rs
diff --git a/crates/utils/src/lib.rs b/integration/utils/src/lib.rs
similarity index 100%
rename from crates/utils/src/lib.rs
rename to integration/utils/src/lib.rs
diff --git a/crates/utils/src/main.rs b/integration/utils/src/main.rs
similarity index 100%
rename from crates/utils/src/main.rs
rename to integration/utils/src/main.rs
diff --git a/crates/utils/src/token_output_stream.rs b/integration/utils/src/token_output_stream.rs
similarity index 100%
rename from crates/utils/src/token_output_stream.rs
rename to integration/utils/src/token_output_stream.rs
diff --git a/crates/utils/src/wav.rs b/integration/utils/src/wav.rs
similarity index 100%
rename from crates/utils/src/wav.rs
rename to integration/utils/src/wav.rs
diff --git a/package.json b/package.json
index 7c18d10..88c5c46 100644
--- a/package.json
+++ b/package.json
@@ -1,8 +1,8 @@
 {
   "name": "predict-otron-9000",
-  "workspaces": ["crates/cli/package"],
+  "workspaces": ["integration/cli/package"],
   "scripts": {
     "# WORKSPACE ALIASES": "#",
-    "cli": "bun --filter crates/cli/package"
+    "cli": "bun --filter integration/cli/package"
   }
 }
\ No newline at end of file