Refactor `apply_cached_repeat_penalty` for optimized caching and reuse, add extensive unit tests, and add special handling for Gemma-specific models.
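The caching idea, as a minimal dependency-free Rust sketch (names and structure are illustrative, not the engine's actual implementation): fold newly appended context tokens into a persistent count map so each sampling step only scans the tokens added since the previous step, then penalize the logits of every token seen so far.

use std::collections::HashMap;

// Repeat-penalty state: token occurrence counts plus the length of the
// context prefix already folded in, so each call is incremental.
struct RepeatPenaltyCache {
    penalty: f32,
    counts: HashMap<u32, u32>,
    seen_len: usize,
}

impl RepeatPenaltyCache {
    fn new(penalty: f32) -> Self {
        Self { penalty, counts: HashMap::new(), seen_len: 0 }
    }

    // Fold in only the tokens appended since the last call, then apply the
    // standard CTRL-style penalty to every token seen so far.
    fn apply(&mut self, logits: &mut [f32], context: &[u32]) {
        for &tok in &context[self.seen_len..] {
            *self.counts.entry(tok).or_insert(0) += 1;
        }
        self.seen_len = context.len();
        for &tok in self.counts.keys() {
            let l = &mut logits[tok as usize];
            *l = if *l > 0.0 { *l / self.penalty } else { *l * self.penalty };
        }
    }
}

fn main() {
    let mut cache = RepeatPenaltyCache::new(1.1);
    let mut logits = vec![2.0_f32, -1.0, 0.5, 3.0];
    cache.apply(&mut logits, &[0, 3]);    // first call scans the whole context
    cache.apply(&mut logits, &[0, 3, 3]); // later calls scan only the new token
    println!("{logits:?}");
}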

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing the inference engine, and adjusted handling of streaming and non-streaming chat completions.
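A hedged sketch of the streaming/non-streaming dispatch, assuming an axum-style handler and OpenAI-compatible payloads (neither is confirmed by this excerpt):

use std::convert::Infallible;

use axum::{
    response::{sse::{Event, Sse}, IntoResponse, Response},
    Json,
};
use futures::stream;
use serde::Deserialize;
use serde_json::json;

#[derive(Deserialize)]
struct ChatRequest {
    model: String,
    #[serde(default)]
    stream: bool,
}

// One handler, two response shapes: SSE chunks when `stream` is true,
// a single buffered JSON body otherwise.
async fn chat_completions(Json(req): Json<ChatRequest>) -> Response {
    if req.stream {
        // Placeholder tokens stand in for the real generation loop.
        let chunks = stream::iter(["Hello", " world"].into_iter().map(|tok| {
            Ok::<_, Infallible>(Event::default().data(
                json!({
                    "object": "chat.completion.chunk",
                    "choices": [{"delta": {"content": tok}}]
                })
                .to_string(),
            ))
        }));
        Sse::new(chunks).into_response()
    } else {
        // Run generation to completion and reply once.
        Json(json!({
            "object": "chat.completion",
            "model": req.model,
            "choices": [{"message": {"role": "assistant", "content": "Hello world"}}]
        }))
        .into_response()
    }
}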

- Add CPU fallback support for text generation when primary device is unsupported
- Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors (see the sketch after this list)
- Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations
- Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing
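
A minimal, dependency-free sketch of the fallback control flow described above; `Device`, `GenError`, and the closure-based runner are hypothetical stand-ins for the engine's real types:

#[derive(Clone, Copy, Debug)]
enum Device { Gpu, Cpu }

#[derive(Debug)]
enum GenError { UnsupportedDevice, ShapeMismatch }

// Try generation on the primary device; on the two error classes named
// above, retry once on CPU instead of failing the request.
fn execute_with_fallback<T>(
    primary: Device,
    run: impl Fn(Device) -> Result<T, GenError>,
) -> Result<T, GenError> {
    match run(primary) {
        Err(GenError::UnsupportedDevice | GenError::ShapeMismatch)
            if !matches!(primary, Device::Cpu) =>
        {
            run(Device::Cpu)
        }
        other => other,
    }
}

fn main() {
    let out = execute_with_fallback(Device::Gpu, |dev| match dev {
        Device::Gpu => Err(GenError::UnsupportedDevice),
        Device::Cpu => Ok("generated text"),
    });
    println!("{out:?}"); // Ok("generated text") after falling back to CPU
}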

Chat completion endpoint now functions with gemma3 (no streaming)

Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
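A sketch of what the metrics middleware could look like, assuming an axum 0.7-style `middleware::from_fn` layer (the actual middleware crate is not shown in this excerpt):

use std::time::Instant;

use axum::{extract::Request, middleware::Next, response::Response};

// Record method, path, status, and latency for every request passing through.
async fn track_metrics(req: Request, next: Next) -> Response {
    let method = req.method().clone();
    let path = req.uri().path().to_owned();
    let start = Instant::now();

    let response = next.run(req).await;

    // A real implementation would feed counters/histograms; a log line
    // stands in here.
    println!("{method} {path} -> {} in {:?}", response.status(), start.elapsed());
    response
}

// Registered on the router, e.g.:
//   Router::new().layer(axum::middleware::from_fn(track_metrics))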
geoffsee
2025-08-26 01:30:26 -04:00
parent 7dd23213c9
commit 8338750beb
64 changed files with 14997 additions and 220 deletions


@@ -0,0 +1,116 @@
#!/bin/bash
# Performance testing script for inference-engine
# This script sends a series of chat completion requests to measure performance
echo "===== Inference Engine Performance Test ====="
echo "Testing with varying prompt sizes to establish baseline performance"
# Test parameters
SERVER_URL="http://localhost:8080"
ITERATIONS=3 # Lower than the embeddings test due to longer processing time
TEST_SIZES=("small" "medium" "large")
MAX_TOKENS=50 # Limit token generation to keep tests shorter
# Define test prompts of different sizes
SMALL_PROMPT="What is the capital of France?"
MEDIUM_PROMPT="Explain the basic principles of machine learning. Include a brief overview of supervised and unsupervised learning."
LARGE_PROMPT="Write a comprehensive explanation of large language models. Include details about their architecture, training process, capabilities, limitations, and potential future developments. Also discuss ethical considerations around their use and deployment."
# Create a temp directory for test results
TEMP_DIR=$(mktemp -d)
echo "Storing test results in: $TEMP_DIR"
# Function to run a single test and record the results
run_test() {
    local size=$1
    local prompt=$2
    local output_file="${TEMP_DIR}/${size}_results.txt"

    echo -e "\n===== Testing $size prompt =====" | tee -a "$output_file"
    echo "Prompt length: $(echo "$prompt" | wc -w) words" | tee -a "$output_file"

    # Prepare the JSON payload (prompts are interpolated directly, so they
    # must not contain double quotes)
    local json_payload=$(cat <<EOF
{
    "model": "gemma-3-1b-it",
    "messages": [{"role": "user", "content": "$prompt"}],
    "max_tokens": $MAX_TOKENS
}
EOF
)

    # Run the test multiple times
    for i in $(seq 1 $ITERATIONS); do
        echo "Iteration $i:" | tee -a "$output_file"

        # Record the start time (%N requires GNU date)
        start_time=$(date +%s.%N)

        # Send the chat completion request
        response=$(curl -s -X POST \
            -H "Content-Type: application/json" \
            -d "$json_payload" \
            "$SERVER_URL/v1/chat/completions")

        end_time=$(date +%s.%N)

        # Calculate elapsed time
        elapsed=$(echo "$end_time - $start_time" | bc)

        # Measure the size of the "content" field in the response
        content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)

        # Check if we got an error (for troubleshooting)
        error_check=$(echo "$response" | grep -c "error")
        if [ "$error_check" -gt 0 ]; then
            echo "  Error in response: $response" | tee -a "$output_file"
        fi

        # Log results
        echo "  Time: ${elapsed}s, Response size: $content_length bytes" | tee -a "$output_file"

        # Add a delay between requests to allow the server to recover
        sleep 2
    done

    # Calculate the average time, stripping the "s," unit suffix before summing
    avg_time=$(grep "Time:" "$output_file" | awk '{gsub(/[s,]/, "", $2); sum+=$2} END {if (NR > 0) print sum/NR; else print "N/A"}')
    echo "Average time for $size prompt: ${avg_time}s" | tee -a "$output_file"
}
# Make sure the server is running
echo "Checking if the server is running..."
if ! curl -s "$SERVER_URL" > /dev/null; then
    echo "Server doesn't appear to be running at $SERVER_URL"
    echo "Please start the server with: ./run_server.sh"
    exit 1
fi
# Run tests for each prompt size
echo "Starting performance tests..."
run_test "small" "$SMALL_PROMPT"
run_test "medium" "$MEDIUM_PROMPT"
run_test "large" "$LARGE_PROMPT"
echo -e "\n===== Performance Test Summary ====="
for size in "${TEST_SIZES[@]}"; do
    # The logged average already carries its trailing "s" unit
    avg=$(grep "Average time for $size prompt" "${TEMP_DIR}/${size}_results.txt" | awk '{print $6}')
    if [ -z "$avg" ]; then
        avg="N/A (possible errors)"
    fi
    echo "$size prompt: $avg"
done
# Provide more detailed analysis if possible
echo -e "\n===== Performance Analysis ====="
echo "Note: The inference-engine response times include:"
echo " - Input prompt tokenization"
echo " - Model inference (token generation)"
echo " - Response post-processing"
echo "Check server logs for more detailed performance breakdown"
echo -e "\nDetailed results are available in: $TEMP_DIR"
echo "===== Test Complete ====="