Refactor apply_cached_repeat_penalty for optimized caching and reuse, add extensive unit tests, and integrate special handling for gemma-specific models.

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing inference engine and adjusted handling of non-streaming/streaming chat completions.

- Add CPU fallback support for text generation when primary device is unsupported
- Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors
- Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations
- Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing

chat completion endpoint functions with gemma3 (no streaming)

Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
This commit is contained in:
geoffsee
2025-08-26 01:30:26 -04:00
parent 7dd23213c9
commit 8338750beb
64 changed files with 14997 additions and 220 deletions

50
scripts/curl_chat.sh Executable file
View File

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
set -euo pipefail
# Simple curl helper for non-streaming chat completions.
# Usage:
#   scripts/curl_chat.sh "Who was the 16th president of the United States?"
#   MODEL_ID=google/gemma-2b-it scripts/curl_chat.sh "Hello!"
SERVER_URL=${SERVER_URL:-http://localhost:8080}
MODEL_ID=${MODEL_ID:-gemma-3-1b-it}
PROMPT=${1:-"What is the capital of France?"}
MAX_TOKENS=${MAX_TOKENS:-128}
# Timeout controls (seconds)
CONNECT_TIMEOUT=${CONNECT_TIMEOUT:-2}
MAX_TIME=${MAX_TIME:-20}

# Escape backslashes and double quotes so arbitrary prompt text cannot
# break the JSON string literal below (backslashes first — order matters).
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}
PROMPT_JSON=$(json_escape "$PROMPT")
MODEL_JSON=$(json_escape "$MODEL_ID")

cat <<EOF
[info] POST $SERVER_URL/v1/chat/completions
[info] model=$MODEL_ID, max_tokens=$MAX_TOKENS
[info] prompt=$PROMPT
[info] timeouts: connect=${CONNECT_TIMEOUT}s, max=${MAX_TIME}s
EOF

# Quick preflight to avoid long hangs when the server is down.
if ! curl -sS -o /dev/null -w "%{http_code}" \
  --connect-timeout "$CONNECT_TIMEOUT" \
  --max-time "$CONNECT_TIMEOUT" \
  "$SERVER_URL/" | grep -qE '^(200|3..)'; then
  echo "[warn] Server not reachable at $SERVER_URL (preflight failed)." >&2
  echo "[hint] Start it with ./run_server.sh or adjust SERVER_URL." >&2
  exit 7
fi

curl -sS -X POST \
  --connect-timeout "$CONNECT_TIMEOUT" \
  --max-time "$MAX_TIME" \
  -H "Content-Type: application/json" \
  "$SERVER_URL/v1/chat/completions" \
  -d @- <<JSON
{
  "model": "${MODEL_JSON}",
  "messages": [
    {"role": "user", "content": "${PROMPT_JSON}"}
  ],
  "max_tokens": ${MAX_TOKENS},
  "stream": false
}
JSON
echo

50
scripts/curl_chat_stream.sh Executable file
View File

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
set -euo pipefail
# Simple curl helper for streaming chat completions (SSE).
# Usage:
#   scripts/curl_chat_stream.sh "Who was the 16th president of the United States?"
#   MODEL_ID=google/gemma-2b-it scripts/curl_chat_stream.sh "Hello!"
SERVER_URL=${SERVER_URL:-http://localhost:8080}
MODEL_ID=${MODEL_ID:-gemma-3-1b-it}
PROMPT=${1:-"What is the capital of France?"}
MAX_TOKENS=${MAX_TOKENS:-128}
# Timeout controls (seconds) — higher than the non-streaming helper
# because the stream stays open while tokens are generated.
CONNECT_TIMEOUT=${CONNECT_TIMEOUT:-10}
MAX_TIME=${MAX_TIME:-30}

# Escape backslashes and double quotes so arbitrary prompt text cannot
# break the JSON string literal below (backslashes first — order matters).
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}
PROMPT_JSON=$(json_escape "$PROMPT")
MODEL_JSON=$(json_escape "$MODEL_ID")

cat <<EOF
[info] POST $SERVER_URL/v1/chat/completions/stream (SSE)
[info] model=$MODEL_ID, max_tokens=$MAX_TOKENS
[info] prompt=$PROMPT
[info] timeouts: connect=${CONNECT_TIMEOUT}s, max=${MAX_TIME}s
EOF

# Quick preflight to avoid long hangs when the server is down.
if ! curl -sS -o /dev/null -w "%{http_code}" \
  --connect-timeout "$CONNECT_TIMEOUT" \
  --max-time "$CONNECT_TIMEOUT" \
  "$SERVER_URL/" | grep -qE '^(200|3..)'; then
  echo "[warn] Server not reachable at $SERVER_URL (preflight failed)." >&2
  echo "[hint] Start it with ./run_server.sh or adjust SERVER_URL." >&2
  exit 7
fi

# -N disables output buffering so SSE chunks appear as they arrive.
curl -N -sS -X POST \
  --connect-timeout "$CONNECT_TIMEOUT" \
  --max-time "$MAX_TIME" \
  -H "Content-Type: application/json" \
  "$SERVER_URL/v1/chat/completions/stream" \
  -d @- <<JSON
{
  "model": "${MODEL_JSON}",
  "messages": [
    {"role": "user", "content": "${PROMPT_JSON}"}
  ],
  "max_tokens": ${MAX_TOKENS},
  "stream": true
}
JSON
echo

View File

@@ -0,0 +1,95 @@
#!/bin/bash
# Performance testing script for embeddings-engine.
# Sends a series of embedding requests of varying input sizes and reports
# per-iteration and average latency for each size class.
set -uo pipefail

echo "===== Embeddings Engine Performance Test ====="
echo "Testing with varying input sizes to establish baseline performance"

# Test parameters (overridable via environment).
SERVER_URL="${SERVER_URL:-http://localhost:8080}"
ITERATIONS="${ITERATIONS:-5}"
TEST_SIZES=("small" "medium" "large")

# Define test inputs of different sizes
SMALL_INPUT="This is a small test input for embeddings."
MEDIUM_INPUT="This is a medium-sized test input for embeddings. It contains multiple sentences with varying structure and vocabulary. The goal is to test how the embedding engine handles moderately sized inputs that might be typical in a production environment."
LARGE_INPUT="This is a large test input for embeddings. It contains multiple paragraphs with varying structure and vocabulary. The purpose of this test is to evaluate how the embedding engine performs with larger texts that might represent documents or long-form content. In a production environment, users might submit anything from short queries to entire documents for embedding, so it's important to understand the performance characteristics across different input sizes. This paragraph continues with additional text to ensure we have a sufficiently large input for testing purposes. The text doesn't need to be particularly meaningful, but it should represent a realistic workload in terms of token count and language patterns. We're particularly interested in how processing time scales with input size, as this information will help us optimize the service for different use cases and load patterns."

# Create a temp directory for test results (intentionally kept after the
# run so detailed results can be inspected).
TEMP_DIR=$(mktemp -d)
echo "Storing test results in: $TEMP_DIR"

# Escape backslashes and double quotes so text can be embedded safely in
# a JSON string literal (backslashes first — order matters).
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}

# Run one size class: sends ITERATIONS embedding requests, logs each
# latency, then appends the average to the per-size results file.
# Arguments: $1 size label, $2 input text.
run_test() {
  local size=$1
  local input=$2
  local output_file="${TEMP_DIR}/${size}_results.txt"

  echo -e "\n===== Testing $size input =====" | tee -a "$output_file"
  echo "Input length: $(echo "$input" | wc -w) words" | tee -a "$output_file"

  # Build the request body; escape the input so quotes cannot break JSON.
  # Declaration split from assignment so a failure isn't masked by 'local'.
  local json_payload
  json_payload=$(cat <<EOF
{
  "input": "$(json_escape "$input")",
  "model": "text-embedding-3-small"
}
EOF
)

  local i start_time end_time elapsed response response_bytes
  for i in $(seq 1 "$ITERATIONS"); do
    echo "Iteration $i:" | tee -a "$output_file"

    # Send the request and measure wall-clock time.
    start_time=$(date +%s.%N)
    response=$(curl -s -X POST \
      -H "Content-Type: application/json" \
      -d "$json_payload" \
      "$SERVER_URL/v1/embeddings")
    end_time=$(date +%s.%N)
    elapsed=$(echo "$end_time - $start_time" | bc)

    # Byte count of the embedding array in the response — a size proxy,
    # NOT the embedding dimension count.
    response_bytes=$(echo "$response" | grep -o '"embedding":\[[^]]*\]' | wc -c)

    echo "  Time: ${elapsed}s, Response size: $response_bytes bytes" | tee -a "$output_file"

    # Small delay so requests don't pile up on the server.
    sleep 1
  done

  # Average the recorded times for this size class.
  local avg_time
  avg_time=$(grep "Time:" "$output_file" | awk '{sum+=$2} END {print sum/NR}')
  echo "Average time for $size input: ${avg_time}s" | tee -a "$output_file"
}

# Make sure the server is running before spending time on tests.
echo "Checking if the server is running..."
if ! curl -s "$SERVER_URL" > /dev/null; then
    echo "Server doesn't appear to be running at $SERVER_URL"
    echo "Please start the server with: ./run_server.sh"
    exit 1
fi

# Run tests for each input size.
echo "Starting performance tests..."
run_test "small" "$SMALL_INPUT"
run_test "medium" "$MEDIUM_INPUT"
run_test "large" "$LARGE_INPUT"

echo -e "\n===== Performance Test Summary ====="
for size in "${TEST_SIZES[@]}"; do
    # Field 6 of the average line is "<value>s"; strip the unit suffix so
    # the summary doesn't print "1.23s seconds".
    avg=$(grep "Average time for $size input" "${TEMP_DIR}/${size}_results.txt" | awk '{print $6}')
    echo "$size input: ${avg%s} seconds"
done

echo -e "\nDetailed results are available in: $TEMP_DIR"
echo "===== Test Complete ====="

View File

@@ -0,0 +1,116 @@
#!/bin/bash
# Performance testing script for inference-engine.
# Sends a series of chat completion requests of varying prompt sizes and
# reports per-iteration and average latency for each size class.
set -uo pipefail

echo "===== Inference Engine Performance Test ====="
echo "Testing with varying prompt sizes to establish baseline performance"

# Test parameters (overridable via environment).
SERVER_URL="${SERVER_URL:-http://localhost:8080}"
ITERATIONS="${ITERATIONS:-3}"   # lower than the embeddings test: generation is slower
TEST_SIZES=("small" "medium" "large")
MAX_TOKENS="${MAX_TOKENS:-50}"  # limit token generation to keep tests short

# Define test prompts of different sizes
SMALL_PROMPT="What is the capital of France?"
MEDIUM_PROMPT="Explain the basic principles of machine learning. Include a brief overview of supervised and unsupervised learning."
LARGE_PROMPT="Write a comprehensive explanation of large language models. Include details about their architecture, training process, capabilities, limitations, and potential future developments. Also discuss ethical considerations around their use and deployment."

# Create a temp directory for test results (intentionally kept after the
# run so detailed results can be inspected).
TEMP_DIR=$(mktemp -d)
echo "Storing test results in: $TEMP_DIR"

# Escape backslashes and double quotes so text can be embedded safely in
# a JSON string literal (backslashes first — order matters).
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}

# Run one size class: sends ITERATIONS chat-completion requests, logs
# each latency, then appends the average to the per-size results file.
# Arguments: $1 size label, $2 prompt text.
run_test() {
  local size=$1
  local prompt=$2
  local output_file="${TEMP_DIR}/${size}_results.txt"

  echo -e "\n===== Testing $size prompt =====" | tee -a "$output_file"
  echo "Prompt length: $(echo "$prompt" | wc -w) words" | tee -a "$output_file"

  # Build the request body; escape the prompt so quotes cannot break JSON.
  # Declaration split from assignment so a failure isn't masked by 'local'.
  local json_payload
  json_payload=$(cat <<EOF
{
  "model": "gemma-3-1b-it",
  "messages": [{"role": "user", "content": "$(json_escape "$prompt")"}],
  "max_tokens": $MAX_TOKENS
}
EOF
)

  local i start_time end_time elapsed response content_length error_check
  for i in $(seq 1 "$ITERATIONS"); do
    echo "Iteration $i:" | tee -a "$output_file"

    # Send the request and measure wall-clock time.
    start_time=$(date +%s.%N)
    response=$(curl -s -X POST \
      -H "Content-Type: application/json" \
      -d "$json_payload" \
      "$SERVER_URL/v1/chat/completions")
    end_time=$(date +%s.%N)
    elapsed=$(echo "$end_time - $start_time" | bc)

    # Rough size of the generated content in bytes (not a token count).
    content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)

    # Surface API errors in the log to aid troubleshooting.
    error_check=$(echo "$response" | grep -c "error")
    if [ "$error_check" -gt 0 ]; then
      echo "  Error in response: $response" | tee -a "$output_file"
    fi

    echo "  Time: ${elapsed}s, Response size: $content_length bytes" | tee -a "$output_file"

    # Give the server time to recover between requests.
    sleep 2
  done

  # Average the successful iterations (skip error lines).
  local avg_time
  avg_time=$(grep "Time:" "$output_file" | grep -v "Error" | awk '{sum+=$2} END {if(NR>0) print sum/NR; else print "N/A"}')
  echo "Average time for $size prompt: ${avg_time}s" | tee -a "$output_file"
}

# Make sure the server is running before spending time on tests.
echo "Checking if the server is running..."
if ! curl -s "$SERVER_URL" > /dev/null; then
    echo "Server doesn't appear to be running at $SERVER_URL"
    echo "Please start the server with: ./run_server.sh"
    exit 1
fi

# Run tests for each prompt size.
echo "Starting performance tests..."
run_test "small" "$SMALL_PROMPT"
run_test "medium" "$MEDIUM_PROMPT"
run_test "large" "$LARGE_PROMPT"

echo -e "\n===== Performance Test Summary ====="
for size in "${TEST_SIZES[@]}"; do
    avg=$(grep "Average time for $size prompt" "${TEMP_DIR}/${size}_results.txt" | awk '{print $6}')
    if [ -z "$avg" ]; then
        avg="N/A (possible errors)"
    else
        # Field 6 already carries the "s" unit; strip it before re-adding
        # so the summary doesn't print "1.23ss" as the original did.
        avg="${avg%s}s"
    fi
    echo "$size prompt: $avg"
done

# Provide more detailed analysis if possible.
echo -e "\n===== Performance Analysis ====="
echo "Note: The inference-engine response times include:"
echo "  - Input prompt tokenization"
echo "  - Model inference (token generation)"
echo "  - Response post-processing"
echo "Check server logs for more detailed performance breakdown"

echo -e "\nDetailed results are available in: $TEMP_DIR"
echo "===== Test Complete ====="

3
scripts/run.sh Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Build and run the ptron binary, forwarding any CLI arguments.
set -euo pipefail
# exec replaces this shell so signals go straight to cargo/the binary.
exec cargo run --bin ptron -- "$@"

69
scripts/test_request.sh Executable file
View File

@@ -0,0 +1,69 @@
#!/bin/bash
# Simple smoke test for inference-engine: sends a single chat completion
# request and reports latency and response size.
set -uo pipefail

echo "===== Inference Engine Test ====="

# Test parameters (overridable via environment).
SERVER_URL="${SERVER_URL:-http://localhost:8080}"
MAX_TOKENS="${MAX_TOKENS:-10}"
PROMPT="${PROMPT:-What is the capital of France?}"
MODEL="${MODEL_ID:-gemma-2-2b-it}"  # gemma-2-2b-it as used by the original test

# Create a temp directory for test results.
# NOTE(review): nothing is written here yet; kept for parity with the
# performance scripts that store per-size logs.
TEMP_DIR=$(mktemp -d)
echo "Storing test results in: $TEMP_DIR"

# Escape backslashes and double quotes so text can be embedded safely in
# a JSON string literal (backslashes first — order matters).
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}

# Prepare JSON payload; escape values so quotes cannot break the JSON.
json_payload=$(cat <<EOF
{
  "model": "$(json_escape "$MODEL")",
  "messages": [{"role": "user", "content": "$(json_escape "$PROMPT")"}],
  "max_tokens": $MAX_TOKENS
}
EOF
)

# Make sure the server is running before sending the real request.
echo "Checking if the server is running..."
if ! curl -s "$SERVER_URL" > /dev/null; then
    echo "Server doesn't appear to be running at $SERVER_URL"
    echo "Please start the server with: ./run_server.sh"
    exit 1
fi

echo "Sending request..."

# Send the chat completion request with a 30 second timeout.
# The gemma-2-2b-it model takes ~12.57 seconds per token on average, so
# even with MAX_TOKENS=10 the request might time out before completion;
# the timeout just keeps the script from hanging indefinitely.
start_time=$(date +%s.%N)
response=$(curl -s -X POST \
    -H "Content-Type: application/json" \
    -d "$json_payload" \
    --max-time 30 \
    "$SERVER_URL/v1/chat/completions")
end_time=$(date +%s.%N)

# Calculate elapsed wall-clock time.
elapsed=$(echo "$end_time - $start_time" | bc)

# Rough size of the generated content in bytes (not a token count).
content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)

# Surface API errors for troubleshooting.
error_check=$(echo "$response" | grep -c "error")
if [ "$error_check" -gt 0 ]; then
    echo "Error in response: $response"
fi

# Log results
echo "Time: ${elapsed}s, Response size: $content_length bytes"
echo "Response: $response"
echo -e "\nTest Complete"