Refactor `apply_cached_repeat_penalty` for optimized caching and reuse, add extensive unit tests, and integrate special handling for gemma-specific models.
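
A minimal sketch of what the cached variant could look like, assuming the usual divide-positive/multiply-negative penalty rule; the signature and the scratch-set cache are illustrative, not the crate's actual API:

```rust
use std::collections::HashSet;

/// Sketch: penalize each token id from the history exactly once, using a
/// reusable scratch set so duplicate ids in a long history are skipped
/// instead of re-penalized.
fn apply_cached_repeat_penalty(
    logits: &mut [f32],
    history: &[u32],
    penalty: f32,
    penalized: &mut HashSet<u32>, // reused across steps to avoid reallocating
) {
    penalized.clear();
    for &tok in history {
        let idx = tok as usize;
        // Skip ids outside the vocab and ids already penalized this step.
        if idx >= logits.len() || !penalized.insert(tok) {
            continue;
        }
        // Common repeat-penalty convention: shrink positive logits,
        // amplify negative ones.
        logits[idx] = if logits[idx] >= 0.0 {
            logits[idx] / penalty
        } else {
            logits[idx] * penalty
        };
    }
}
```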

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing the inference engine, and adjusted handling of streaming and non-streaming chat completions.

- Add CPU fallback support for text generation when primary device is unsupported
- Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors (see the sketch after this list)
- Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations
- Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing
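
A rough sketch of the fallback shape referenced above; the `Device` enum and string-based error matching are stand-ins for whatever device and error types the engine actually uses:

```rust
/// Illustrative device handle; the real engine presumably uses its ML
/// framework's device type (e.g. a Metal/CUDA handle plus CPU).
#[derive(Clone, Copy, Debug)]
enum Device {
    Primary,
    Cpu,
}

/// Run `op` on the primary device; if it fails with an error that looks
/// like a device-compatibility or shape problem, retry once on CPU.
fn execute_with_fallback<T>(op: impl Fn(Device) -> Result<T, String>) -> Result<T, String> {
    match op(Device::Primary) {
        Ok(out) => Ok(out),
        // Only recoverable-looking errors trigger the CPU retry.
        Err(e) if e.contains("shape mismatch") || e.contains("unsupported") => op(Device::Cpu),
        Err(e) => Err(e),
    }
}
```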

Chat completion endpoint now functions with gemma3 (no streaming)
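
For reference, a non-streaming request against that endpoint could be issued like this; the port, model name, and use of `reqwest`/`tokio` are assumptions for the sketch:

```rust
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // OpenAI-style chat completion body with streaming disabled.
    let body = json!({
        "model": "gemma-3-1b-it",
        "stream": false,
        "messages": [{ "role": "user", "content": "Say hello" }]
    });
    let resp = reqwest::Client::new()
        .post("http://localhost:8080/v1/chat/completions")
        .json(&body)
        .send()
        .await?;
    println!("{}", resp.text().await?);
    Ok(())
}
```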

Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
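
A minimal sketch of such a metrics middleware in axum 0.7's `from_fn` style; the logged fields are assumptions, and a real version would also feed counters/histograms for the HTML report:

```rust
use axum::{extract::Request, middleware::Next, response::Response};
use std::time::Instant;

/// Time each request and emit path, status, and latency via `tracing`.
async fn track_metrics(req: Request, next: Next) -> Response {
    let path = req.uri().path().to_owned();
    let start = Instant::now();
    let response = next.run(req).await;
    tracing::info!(
        %path,
        status = response.status().as_u16(),
        elapsed_ms = start.elapsed().as_millis() as u64,
        "request completed"
    );
    response
}

// Wired into the router with:
//   Router::new().layer(axum::middleware::from_fn(track_metrics));
```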
Author: geoffsee
Date:   2025-08-26 01:30:26 -04:00
parent 7dd23213c9
commit 8338750beb

64 changed files with 14997 additions and 220 deletions


@@ -13,8 +13,6 @@ pub use text_generation::TextGeneration;
 pub use token_output_stream::TokenOutputStream;
 pub use server::{AppState, create_router};
 use axum::{Json, http::StatusCode, routing::post, Router};
-use serde_json;
-use std::env;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
@@ -45,26 +43,3 @@ pub fn init_tracing() {
         .with(tracing_subscriber::fmt::layer())
         .init();
 }
-
-/// Create a simplified inference router that returns appropriate error messages
-/// indicating that full model loading is required for production use
-pub fn create_inference_router() -> Router {
-    Router::new()
-        .route("/v1/chat/completions", post(simplified_chat_completions))
-}
-
-async fn simplified_chat_completions(
-    axum::Json(request): axum::Json<serde_json::Value>,
-) -> Result<Json<serde_json::Value>, (StatusCode, Json<serde_json::Value>)> {
-    // Return the same error message as the actual server implementation
-    // to indicate that full inference functionality requires proper model initialization
-    Err((
-        StatusCode::BAD_REQUEST,
-        Json(serde_json::json!({
-            "error": {
-                "message": "The OpenAI API is currently not supported due to compatibility issues with the tensor operations. Please use the CLI mode instead with: cargo run --bin inference-engine -- --prompt \"Your prompt here\"",
-                "type": "unsupported_api"
-            }
-        })),
-    ))
-}