Supports small Llama and Gemma models

Refactor inference

Dedicated crates for Llama and Gemma inference; not yet integrated.
geoffsee committed 2025-08-29 18:15:29 -04:00
parent d06b16bb12 · commit 315ef17605
26 changed files with 2136 additions and 1402 deletions
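
The commit message describes one dedicated crate per model family, present in the tree but not yet wired into the server. A rough sketch of that shape follows; the crate names (llama_inference, gemma_inference), their types, methods, and the Llama model id are hypothetical placeholders for illustration, not code from this commit:

// Hypothetical layout of the crate split described above; every name here
// is illustrative, since the diff shown below does not include the new crates.
use gemma_inference::GemmaPipeline; // hypothetical dedicated crate
use llama_inference::LlamaPipeline; // hypothetical dedicated crate

fn sketch() -> anyhow::Result<()> {
    // Each crate owns model loading and the generation loop for its family.
    let gemma = GemmaPipeline::load("google/gemma-3-1b-it")?;
    println!("{}", gemma.generate("Hello", 64)?);

    // The Llama crate exposes the same surface but loads Llama weights
    // (model id here is a placeholder for a small Llama checkpoint).
    let llama = LlamaPipeline::load("meta-llama/Llama-3.2-1B-Instruct")?;
    println!("{}", llama.generate("Hello", 64)?);
    Ok(())
}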


@@ -67,18 +67,7 @@ async fn main() {
     let embeddings_router = embeddings_engine::create_embeddings_router();

-    // Create AppState with correct model configuration
-    use inference_engine::Which;
-    use inference_engine::server::{PipelineArgs, build_pipeline};
-    let mut pipeline_args = PipelineArgs::default();
-    pipeline_args.model_id = "google/gemma-3-1b-it".to_string();
-    pipeline_args.which = Which::InstructV3_1B;
-    let text_generation = build_pipeline(pipeline_args.clone());
-    let app_state = AppState {
-        text_generation: std::sync::Arc::new(tokio::sync::Mutex::new(text_generation)),
-        model_id: "google/gemma-3-1b-it".to_string(),
-        build_args: pipeline_args,
-    };
+    let app_state = AppState::default();

     // Get the inference router directly from the inference engine
     let inference_router = inference_engine::create_router(app_state);
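
The single-line replacement works because AppState now presumably carries a Default impl that encapsulates the deleted construction. A minimal sketch of that impl, reconstructed from the removed lines above; the Default impl itself is an assumption about the refactor, while PipelineArgs, Which, build_pipeline, and the AppState fields are all taken from the old code:

// Sketch: the Default impl AppState::default() presumably provides,
// moved into the inference_engine crate (assumption; fields and calls
// come from the construction removed in this diff).
use std::sync::Arc;
use tokio::sync::Mutex;
use inference_engine::Which;
use inference_engine::server::{PipelineArgs, build_pipeline};

impl Default for AppState {
    fn default() -> Self {
        // Same Gemma configuration the caller previously set up inline.
        let mut pipeline_args = PipelineArgs::default();
        pipeline_args.model_id = "google/gemma-3-1b-it".to_string();
        pipeline_args.which = Which::InstructV3_1B;

        let text_generation = build_pipeline(pipeline_args.clone());
        Self {
            text_generation: Arc::new(Mutex::new(text_generation)),
            model_id: "google/gemma-3-1b-it".to_string(),
            build_args: pipeline_args,
        }
    }
}

If this reading is right, the model-specific wiring now lives inside inference_engine rather than in the caller's main(), which is consistent with the commit's move toward dedicated inference crates.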