mirror of
https://github.com/geoffsee/predict-otron-9001.git
synced 2025-09-08 22:46:44 +00:00
Refactor apply_cached_repeat_penalty
for optimized caching and reuse, add extensive unit tests, and integrate special handling for gemma-specific models.
Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing inference engine and adjusted handling of non-streaming/streaming chat completions. - Add CPU fallback support for text generation when primary device is unsupported - Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors - Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations - Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing chat completion endpoint functions with gemma3 (no streaming) Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
This commit is contained in:
101
server.log
Normal file
101
server.log
Normal file
@@ -0,0 +1,101 @@
|
||||
warning: unused imports: `Deserialize` and `Serialize`
|
||||
--> crates/embeddings-engine/src/lib.rs:9:13
|
||||
|
|
||||
9 | use serde::{Deserialize, Serialize};
|
||||
| ^^^^^^^^^^^ ^^^^^^^^^
|
||||
|
|
||||
= note: `#[warn(unused_imports)]` on by default
|
||||
|
||||
warning: unused import: `candle_core::Tensor`
|
||||
--> crates/inference-engine/src/model.rs:1:5
|
||||
|
|
||||
1 | use candle_core::Tensor;
|
||||
| ^^^^^^^^^^^^^^^^^^^
|
||||
|
|
||||
= note: `#[warn(unused_imports)]` on by default
|
||||
|
||||
warning: unused import: `Config as Config1`
|
||||
--> crates/inference-engine/src/model.rs:2:42
|
||||
|
|
||||
2 | use candle_transformers::models::gemma::{Config as Config1, Model as Model1};
|
||||
| ^^^^^^^^^^^^^^^^^
|
||||
|
||||
warning: unused import: `Config as Config2`
|
||||
--> crates/inference-engine/src/model.rs:3:43
|
||||
|
|
||||
3 | use candle_transformers::models::gemma2::{Config as Config2, Model as Model2};
|
||||
| ^^^^^^^^^^^^^^^^^
|
||||
|
||||
warning: unused import: `Config as Config3`
|
||||
--> crates/inference-engine/src/model.rs:4:43
|
||||
|
|
||||
4 | use candle_transformers::models::gemma3::{Config as Config3, Model as Model3};
|
||||
| ^^^^^^^^^^^^^^^^^
|
||||
|
||||
warning: unused import: `ArrayBuilder`
|
||||
--> crates/inference-engine/src/openai_types.rs:23:27
|
||||
|
|
||||
23 | use utoipa::openapi::{ArrayBuilder, ObjectBuilder, OneOfBuilder, RefOr, Schema...
|
||||
| ^^^^^^^^^^^^
|
||||
|
||||
warning: unused import: `IntoResponse`
|
||||
--> crates/inference-engine/src/server.rs:4:38
|
||||
|
|
||||
4 | response::{sse::Event, sse::Sse, IntoResponse},
|
||||
| ^^^^^^^^^^^^
|
||||
|
||||
warning: unused import: `future`
|
||||
--> crates/inference-engine/src/server.rs:9:31
|
||||
|
|
||||
9 | use futures_util::{StreamExt, future};
|
||||
| ^^^^^^
|
||||
|
||||
warning: unused import: `std::io::Write`
|
||||
--> crates/inference-engine/src/text_generation.rs:5:5
|
||||
|
|
||||
5 | use std::io::Write;
|
||||
| ^^^^^^^^^^^^^^
|
||||
|
||||
warning: unused import: `StreamExt`
|
||||
--> crates/inference-engine/src/server.rs:9:20
|
||||
|
|
||||
9 | use futures_util::{StreamExt, future};
|
||||
| ^^^^^^^^^
|
||||
|
||||
warning: method `apply_cached_repeat_penalty` is never used
|
||||
--> crates/inference-engine/src/text_generation.rs:47:8
|
||||
|
|
||||
22 | impl TextGeneration {
|
||||
| ------------------- method in this implementation
|
||||
...
|
||||
47 | fn apply_cached_repeat_penalty(
|
||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
|
||||
= note: `#[warn(dead_code)]` on by default
|
||||
|
||||
warning: `embeddings-engine` (lib) generated 1 warning (run `cargo fix --lib -p embeddings-engine` to apply 1 suggestion)
|
||||
warning: `inference-engine` (lib) generated 10 warnings (run `cargo fix --lib -p inference-engine` to apply 7 suggestions)
|
||||
Compiling predict-otron-9000 v0.1.0 (/Users/williamseemueller/workspace/seemueller-io/predict-otron-9000/crates/predict-otron-9000)
|
||||
warning: unused import: `axum::response::IntoResponse`
|
||||
--> crates/predict-otron-9000/src/main.rs:8:5
|
||||
|
|
||||
8 | use axum::response::IntoResponse;
|
||||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
|
||||
= note: `#[warn(unused_imports)]` on by default
|
||||
|
||||
warning: `predict-otron-9000` (bin "predict-otron-9000") generated 1 warning (run `cargo fix --bin "predict-otron-9000"` to apply 1 suggestion)
|
||||
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.99s
|
||||
Running `target/debug/predict-otron-9000`
|
||||
[2m2025-08-27T17:30:52.870803Z[0m [32m INFO[0m [2mpredict_otron_9000::middleware::metrics[0m[2m:[0m Performance metrics summary:
|
||||
avx: false, neon: true, simd128: false, f16c: false
|
||||
[2m2025-08-27T17:30:52.871489Z[0m [32m INFO[0m [2mhf_hub[0m[2m:[0m Using token file found "/Users/williamseemueller/.cache/huggingface/token"
|
||||
Checking model_id: 'google/gemma-2b-it'
|
||||
Trimmed model_id length: 18
|
||||
Using explicitly specified model type: Instruct2B
|
||||
retrieved the files in 634.552791ms
|
||||
loaded the model in 569.864625ms
|
||||
|
||||
thread 'main' panicked at crates/predict-otron-9000/src/main.rs:80:10:
|
||||
Overlapping method route. Handler for `GET /` already exists
|
||||
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
|
Reference in New Issue
Block a user