Refactor apply_cached_repeat_penalty for optimized caching and reuse, add extensive unit tests, and integrate special handling for gemma-specific models.

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing the inference engine and adjusted handling of non-streaming/streaming chat completions. - Add CPU fallback support for text generation when the primary device is unsupported - Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors - Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations - Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing - Chat completion endpoint functions with gemma3 (no streaming) - Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
2025-09-08 22:46:44 +00:00 · 2025-08-26 01:30:26 -04:00
parent 7dd23213c9
commit 8338750beb
64 changed files with 14997 additions and 220 deletions
--- a/scripts/test_request.sh
+++ b/scripts/test_request.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Smoke test for inference-engine: sends one non-streaming chat completion
+# request to $SERVER_URL/v1/chat/completions and prints the timing and reply.
+# Env: MODEL_ID (optional) overrides the model name. Uses curl, bc, grep, wc.
+echo "===== Inference Engine Test ====="
+
+# Test parameters
+SERVER_URL="http://localhost:8080"  # NOTE(review): an earlier comment here claimed a switch to port 3777, but the URL still uses 8080 — confirm the server's port
+MAX_TOKENS=10
+PROMPT="What is the capital of France?"
+MODEL="${MODEL_ID:-gemma-2-2b-it}"  # MODEL_ID from the environment wins; gemma-2-2b-it is the fallback
+
+# Create a temp directory (NOTE(review): nothing below writes to it or removes it)
+TEMP_DIR=$(mktemp -d)
+echo "Storing test results in: $TEMP_DIR"
+
+# Build the JSON body; values are interpolated unescaped, so a double quote in PROMPT or MODEL yields invalid JSON
+json_payload=$(cat <<EOF
+{
+    "model": "$MODEL", 
+    "messages": [{"role": "user", "content": "$PROMPT"}],
+    "max_tokens": $MAX_TOKENS
+}
+EOF
+)
+
+# Exit 1 early if curl cannot reach the base URL (no -f, so an HTTP error status still counts as "running")
+echo "Checking if the server is running..."
+if ! curl -s "$SERVER_URL" > /dev/null; then
+    echo "Server doesn't appear to be running at $SERVER_URL"
+    echo "Please start the server with: ./run_server.sh"
+    exit 1
+fi
+
+echo "Sending request..."
+
+# Wall-clock start in seconds.nanoseconds; %N is a GNU date extension — NOTE(review): confirm target platforms
+start_time=$(date +%s.%N)
+
+# POST the payload; --max-time 30 caps the whole transfer at 30 seconds.
+# The original note records gemma-2-2b-it at ~12.57 seconds per token on average,
+# so even with MAX_TOKENS=10 the request may be cut off before it completes.
+# NOTE(review): curl's exit status is not checked, so a timeout is not reported here.
+response=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -d "$json_payload" \
+    --max-time 30 \
+    "$SERVER_URL/v1/chat/completions")
+
+end_time=$(date +%s.%N)
+
+# Elapsed seconds as a decimal; bc is used because shell arithmetic is integer-only
+elapsed=$(echo "$end_time - $start_time" | bc)
+
+# Byte count of the "content":"..." matches printed by grep -o (newlines included); not a JSON parse
+content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)
+
+# Count response lines containing the substring "error"; the script still continues and exits 0
+error_check=$(echo "$response" | grep -c "error")
+if [ "$error_check" -gt 0 ]; then
+    echo "Error in response: $response"
+fi
+
+# Summary: elapsed time and matched content size, then the raw response body
+echo "Time: ${elapsed}s, Response size: $content_length bytes"
+echo "Response: $response"
+
+echo -e "\nTest Complete"