Supports small Llama and Gemma models

Refactor inference

Dedicated crates for Llama and Gemma inference; not yet integrated.
geoffsee committed 2025-08-29 18:15:29 -04:00
parent d06b16bb12 · commit 315ef17605
26 changed files with 2136 additions and 1402 deletions
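
The commit message describes one dedicated crate per model family, present in the tree but not yet wired into the server. A rough sketch of that shape follows; the crate names (llama_inference, gemma_inference), their types, methods, and the Llama model id are hypothetical placeholders for illustration, not code from this commit:

// Hypothetical layout of the crate split described above; every name here
// is illustrative, since the diff shown below does not include the new crates.
use gemma_inference::GemmaPipeline; // hypothetical dedicated crate
use llama_inference::LlamaPipeline; // hypothetical dedicated crate

fn sketch() -> anyhow::Result<()> {
    // Each crate owns model loading and the generation loop for its family.
    let gemma = GemmaPipeline::load("google/gemma-3-1b-it")?;
    println!("{}", gemma.generate("Hello", 64)?);

    // The Llama crate exposes the same surface but loads Llama weights
    // (model id here is a placeholder for a small Llama checkpoint).
    let llama = LlamaPipeline::load("meta-llama/Llama-3.2-1B-Instruct")?;
    println!("{}", llama.generate("Hello", 64)?);
    Ok(())
}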


@@ -67,18 +67,7 @@ async fn main() {
     let embeddings_router = embeddings_engine::create_embeddings_router();

-    // Create AppState with correct model configuration
-    use inference_engine::Which;
-    use inference_engine::server::{PipelineArgs, build_pipeline};
-    let mut pipeline_args = PipelineArgs::default();
-    pipeline_args.model_id = "google/gemma-3-1b-it".to_string();
-    pipeline_args.which = Which::InstructV3_1B;
-    let text_generation = build_pipeline(pipeline_args.clone());
-    let app_state = AppState {
-        text_generation: std::sync::Arc::new(tokio::sync::Mutex::new(text_generation)),
-        model_id: "google/gemma-3-1b-it".to_string(),
-        build_args: pipeline_args,
-    };
+    let app_state = AppState::default();

     // Get the inference router directly from the inference engine
     let inference_router = inference_engine::create_router(app_state);
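
The single-line replacement works because AppState now presumably carries a Default impl that encapsulates the deleted construction. A minimal sketch of that impl, reconstructed from the removed lines above; the Default impl itself is an assumption about the refactor, while PipelineArgs, Which, build_pipeline, and the AppState fields are all taken from the old code:

// Sketch: the Default impl AppState::default() presumably provides,
// moved into the inference_engine crate (assumption; fields and calls
// come from the construction removed in this diff).
use std::sync::Arc;
use tokio::sync::Mutex;
use inference_engine::Which;
use inference_engine::server::{PipelineArgs, build_pipeline};

impl Default for AppState {
    fn default() -> Self {
        // Same Gemma configuration the caller previously set up inline.
        let mut pipeline_args = PipelineArgs::default();
        pipeline_args.model_id = "google/gemma-3-1b-it".to_string();
        pipeline_args.which = Which::InstructV3_1B;

        let text_generation = build_pipeline(pipeline_args.clone());
        Self {
            text_generation: Arc::new(Mutex::new(text_generation)),
            model_id: "google/gemma-3-1b-it".to_string(),
            build_args: pipeline_args,
        }
    }
}

If this reading is right, the model-specific wiring now lives inside inference_engine rather than in the caller's main(), which is consistent with the commit's move toward dedicated inference crates.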