Refactor apply_cached_repeat_penalty for optimized caching and reuse, add extensive unit tests, and integrate special handling for gemma-specific models.

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing the inference engine and adjusted handling of non-streaming/streaming chat completions.
- Add CPU fallback support for text generation when the primary device is unsupported
- Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors
- Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations
- Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing
- Chat completion endpoint functions with gemma3 (no streaming)
- Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
2025-09-08 22:46:44 +00:00 · 2025-08-26 01:30:26 -04:00
parent 7dd23213c9
commit 8338750beb
64 changed files with 14997 additions and 220 deletions
--- a/scripts/curl_chat.sh
+++ b/scripts/curl_chat.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Simple curl helper for non-streaming chat completions
+# Usage:
+#   scripts/curl_chat.sh "Who was the 16th president of the United States?"
+#   MODEL_ID=google/gemma-2b-it scripts/curl_chat.sh "Hello!"
+
+SERVER_URL=${SERVER_URL:-http://localhost:8080}
+MODEL_ID=${MODEL_ID:-gemma-3-1b-it}
+PROMPT=${1:-"What is the capital of France?"}
+MAX_TOKENS=${MAX_TOKENS:-128}
+# Timeout controls (seconds)
+CONNECT_TIMEOUT=${CONNECT_TIMEOUT:-2}
+MAX_TIME=${MAX_TIME:-20}
+
+cat <<EOF
+[info] POST $SERVER_URL/v1/chat/completions
+[info] model=$MODEL_ID, max_tokens=$MAX_TOKENS
+[info] prompt=$PROMPT
+[info] timeouts: connect=${CONNECT_TIMEOUT}s, max=${MAX_TIME}s
+EOF
+
+# Quick preflight to avoid long hangs when server is down
+if ! curl -sS -o /dev/null -w "%{http_code}" \
+      --connect-timeout "$CONNECT_TIMEOUT" \
+      --max-time "$CONNECT_TIMEOUT" \
+      "$SERVER_URL/" | grep -qE '^(200|3..)'; then
+  echo "[warn] Server not reachable at $SERVER_URL (preflight failed)."
+  echo "[hint] Start it with ./run_server.sh or adjust SERVER_URL."
+  exit 7
+fi
+
+curl -sS -X POST \
+  --connect-timeout "$CONNECT_TIMEOUT" \
+  --max-time "$MAX_TIME" \
+  -H "Content-Type: application/json" \
+  "$SERVER_URL/v1/chat/completions" \
+  -d @- <<JSON
+{
+  "model": "${MODEL_ID}",
+  "messages": [
+    {"role": "user", "content": "${PROMPT}"}
+  ],
+  "max_tokens": ${MAX_TOKENS},
+  "stream": false
+}
+JSON
+
+echo