From d04340d9ac57f3a603d5541062789a482552fc7a Mon Sep 17 00:00:00 2001 From: geoffsee <> Date: Thu, 28 Aug 2025 12:54:09 -0400 Subject: [PATCH] update docs --- Cargo.toml | 5 +- README.md | 19 -- crates/embeddings-engine/README.md | 4 + .../helm-chart-tool}/Cargo.toml | 0 .../helm-chart-tool}/README.md | 0 .../helm-chart-tool}/src/main.rs | 0 crates/inference-engine/api_test.html | 295 ------------------ crates/inference-engine/openai-api-test.js | 176 ----------- crates/predict-otron-9000/README.md | 8 + integration/README.md | 1 + integration/package.json | 3 + package.json | 5 +- cli.ts => scripts/cli.ts | 0 scripts/run.sh | 3 - run_server.sh => scripts/run_server.sh | 0 test_predict_otron.sh => scripts/test.sh | 0 scripts/test_request.sh | 69 ---- test_repetition.ts | 85 ----- 18 files changed, 22 insertions(+), 651 deletions(-) create mode 100644 crates/embeddings-engine/README.md rename {helm-chart-tool => crates/helm-chart-tool}/Cargo.toml (100%) rename {helm-chart-tool => crates/helm-chart-tool}/README.md (100%) rename {helm-chart-tool => crates/helm-chart-tool}/src/main.rs (100%) delete mode 100644 crates/inference-engine/api_test.html delete mode 100644 crates/inference-engine/openai-api-test.js create mode 100644 crates/predict-otron-9000/README.md create mode 100644 integration/README.md rename cli.ts => scripts/cli.ts (100%) delete mode 100755 scripts/run.sh rename run_server.sh => scripts/run_server.sh (100%) rename test_predict_otron.sh => scripts/test.sh (100%) delete mode 100755 scripts/test_request.sh delete mode 100644 test_repetition.ts diff --git a/Cargo.toml b/Cargo.toml index 4b0627a..157334f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,13 +3,12 @@ members = [ "crates/predict-otron-9000", "crates/inference-engine", "crates/embeddings-engine", - "crates/leptos-app" + "crates/leptos-app", + "crates/helm-chart-tool" ] default-members = ["crates/predict-otron-9000"] resolver = "2" - - [[workspace.metadata.leptos]] # project name bin-package = "leptos-app" diff --git a/README.md b/README.md index ab76b80..a63904f 100644 --- a/README.md +++ b/README.md @@ -264,23 +264,6 @@ export RUST_LOG=trace export RUST_LOG=predict_otron_9000=debug,embeddings_engine=trace ``` -## Chat Interface - -The project includes a WebAssembly-based chat interface built with the Leptos framework. - -### Building the Chat Interface - -```shell -# Navigate to the leptos-app crate -cd crates/leptos-app - -# Build the WebAssembly package -cargo build --target wasm32-unknown-unknown - -# For development with trunk (if installed) -trunk serve -``` - ### Usage The chat interface connects to the inference engine API and provides a user-friendly way to interact with the AI models. To use: @@ -298,8 +281,6 @@ The interface supports: - **Inference Engine**: Currently provides a simplified implementation for chat completions. Full model loading and text generation capabilities from the inference-engine crate are not yet integrated into the unified server. - **Model Support**: Embeddings are limited to the Nomic Embed Text v1.5 model. -- **Scalability**: Single-threaded model loading may impact performance under heavy load. -- **Chat Interface**: The WebAssembly chat interface requires compilation to a static site before deployment. 
## Contributing
diff --git a/crates/embeddings-engine/README.md b/crates/embeddings-engine/README.md
new file mode 100644
index 0000000..c47ea5a
--- /dev/null
+++ b/crates/embeddings-engine/README.md
@@ -0,0 +1,4 @@
+# Embeddings Engine
+
+A high-performance text embeddings service that generates vector representations of text using state-of-the-art models.
+This crate wraps the fastembed crate to provide embeddings and partially implements the OpenAI embeddings API specification.
\ No newline at end of file
diff --git a/helm-chart-tool/Cargo.toml b/crates/helm-chart-tool/Cargo.toml
similarity index 100%
rename from helm-chart-tool/Cargo.toml
rename to crates/helm-chart-tool/Cargo.toml
diff --git a/helm-chart-tool/README.md b/crates/helm-chart-tool/README.md
similarity index 100%
rename from helm-chart-tool/README.md
rename to crates/helm-chart-tool/README.md
diff --git a/helm-chart-tool/src/main.rs b/crates/helm-chart-tool/src/main.rs
similarity index 100%
rename from helm-chart-tool/src/main.rs
rename to crates/helm-chart-tool/src/main.rs
diff --git a/crates/inference-engine/api_test.html b/crates/inference-engine/api_test.html
deleted file mode 100644
index f4654a5..0000000
--- a/crates/inference-engine/api_test.html
+++ /dev/null
@@ -1,295 +0,0 @@
- [295 lines deleted: a self-contained "OpenAI-Compatible API Tester" HTML page for exercising the local chat completions endpoint, with request settings, a request body editor, example requests, and a response viewer; the page markup was not preserved in this excerpt]

- - - - \ No newline at end of file diff --git a/crates/inference-engine/openai-api-test.js b/crates/inference-engine/openai-api-test.js deleted file mode 100644 index 51f2e3e..0000000 --- a/crates/inference-engine/openai-api-test.js +++ /dev/null @@ -1,176 +0,0 @@ -// Test requests for the OpenAI-compatible endpoint in the inference server -// This file contains IIFE (Immediately Invoked Function Expression) JavaScript requests -// to test the /v1/chat/completions endpoint - -// Basic chat completion request -(async function testBasicChatCompletion() { - console.log("Test 1: Basic chat completion request"); - try { - const response = await fetch('http://localhost:3777/v1/chat/completions', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - model: "gemma-2-2b-it", - messages: [ - { - role: "user", - content: "Who was the 16th president of the United States?" - } - ], - max_tokens: 100 - }) - }); - - const data = await response.json(); - console.log("Response:", JSON.stringify(data, null, 2)); - } catch (error) { - console.error("Error:", error); - } -})(); - -// Multi-turn conversation -(async function testMultiTurnConversation() { - console.log("\nTest 2: Multi-turn conversation"); - try { - const response = await fetch('http://localhost:3777/v1/chat/completions', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - model: "gemma-2-2b-it", - messages: [ - { - role: "system", - content: "You are a helpful assistant that provides concise answers." - }, - { - role: "user", - content: "What is machine learning?" - }, - { - role: "assistant", - content: "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed." - }, - { - role: "user", - content: "Give me an example of a machine learning algorithm." - } - ], - max_tokens: 150 - }) - }); - - const data = await response.json(); - console.log("Response:", JSON.stringify(data, null, 2)); - } catch (error) { - console.error("Error:", error); - } -})(); - -// Request with temperature and top_p parameters -(async function testTemperatureAndTopP() { - console.log("\nTest 3: Request with temperature and top_p parameters"); - try { - const response = await fetch('http://localhost:3777/v1/chat/completions', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - model: "gemma-2-2b-it", - messages: [ - { - role: "user", - content: "Write a short poem about artificial intelligence." - } - ], - max_tokens: 200, - temperature: 0.8, - top_p: 0.9 - }) - }); - - const data = await response.json(); - console.log("Response:", JSON.stringify(data, null, 2)); - } catch (error) { - console.error("Error:", error); - } -})(); - -// Request with streaming enabled -(async function testStreaming() { - console.log("\nTest 4: Request with streaming enabled"); - try { - const response = await fetch('http://localhost:3777/v1/chat/completions', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - model: "gemma-2-2b-it", - messages: [ - { - role: "user", - content: "Explain quantum computing in simple terms." - } - ], - max_tokens: 150, - stream: true - }) - }); - - // Note: Streaming might not be implemented yet, this is to test the API's handling of the parameter - if (response.headers.get('content-type')?.includes('text/event-stream')) { - console.log("Streaming response detected. 
Reading stream...");
-      const reader = response.body.getReader();
-      const decoder = new TextDecoder();
-
-      while (true) {
-        const { done, value } = await reader.read();
-        if (done) break;
-
-        const chunk = decoder.decode(value);
-        console.log("Chunk:", chunk);
-      }
-    } else {
-      const data = await response.json();
-      console.log("Non-streaming response:", JSON.stringify(data, null, 2));
-    }
-  } catch (error) {
-    console.error("Error:", error);
-  }
-})();
-
-// Request with a different model
-(async function testDifferentModel() {
-  console.log("\nTest 5: Request with a different model");
-  try {
-    const response = await fetch('http://localhost:3777/v1/chat/completions', {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json',
-      },
-      body: JSON.stringify({
-        model: "gemma-2-2b-it", // Using a different model if available
-        messages: [
-          {
-            role: "user",
-            content: "What are the benefits of renewable energy?"
-          }
-        ],
-        max_tokens: 150
-      })
-    });
-
-    const data = await response.json();
-    console.log("Response:", JSON.stringify(data, null, 2));
-  } catch (error) {
-    console.error("Error:", error);
-  }
-})();
-
-console.log("\nAll test requests have been sent. Check the server logs for more details.");
-console.log("To run the server, use: cargo run --bin inference-engine -- --server");
diff --git a/crates/predict-otron-9000/README.md b/crates/predict-otron-9000/README.md
new file mode 100644
index 0000000..066bf70
--- /dev/null
+++ b/crates/predict-otron-9000/README.md
@@ -0,0 +1,8 @@
+# predict-otron-9000
+
+An extensible Axum/Tokio hybrid server combining [embeddings-engine](../embeddings-engine), [inference-engine](../inference-engine), and [leptos-app](../leptos-app).
+
+
+## Notes
+- When `server_mode` is `Standalone` (the default), the instance contains all components necessary for inference.
+- When `server_mode` is `HighAvailability`, inference and embeddings scale automatically, and the server proxies to the inference and embeddings services via DNS.
diff --git a/integration/README.md b/integration/README.md
new file mode 100644
index 0000000..05bc836
--- /dev/null
+++ b/integration/README.md
@@ -0,0 +1 @@
+This package enables testing a live predict-otron-9000 instance directly with the OpenAI Node SDK.
\ No newline at end of file diff --git a/integration/package.json b/integration/package.json index d80a8fa..9002d42 100644 --- a/integration/package.json +++ b/integration/package.json @@ -1,5 +1,8 @@ { "name": "@predict-otron-9000/ingeration", + "scripts": { + "test": "bun test" + }, "dependencies": { "openai": "^5.16.0" } diff --git a/package.json b/package.json index c37756b..f7f926d 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,8 @@ { "dependencies": { "openai": "^5.16.0" + }, + "scripts": { + "cli": "./scripts/cli.ts" } -} +} \ No newline at end of file diff --git a/cli.ts b/scripts/cli.ts similarity index 100% rename from cli.ts rename to scripts/cli.ts diff --git a/scripts/run.sh b/scripts/run.sh deleted file mode 100755 index f1719cb..0000000 --- a/scripts/run.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -cargo run --bin ptron \ No newline at end of file diff --git a/run_server.sh b/scripts/run_server.sh similarity index 100% rename from run_server.sh rename to scripts/run_server.sh diff --git a/test_predict_otron.sh b/scripts/test.sh similarity index 100% rename from test_predict_otron.sh rename to scripts/test.sh diff --git a/scripts/test_request.sh b/scripts/test_request.sh deleted file mode 100755 index e5657f3..0000000 --- a/scripts/test_request.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -# Simple test script for inference-engine -# This script sends a single chat completion request - -echo "===== Inference Engine Test =====" - -# Test parameters -SERVER_URL="http://localhost:8080" # Changed from 8080 to 3777 to match main.rs default port -MAX_TOKENS=10 -PROMPT="What is the capital of France?" -MODEL="${MODEL_ID:-gemma-2-2b-it}" # Using gemma-2-2b-it as specified in the original test - -# Create a temp directory for test results -TEMP_DIR=$(mktemp -d) -echo "Storing test results in: $TEMP_DIR" - -# Prepare JSON payload -json_payload=$(cat < /dev/null; then - echo "Server doesn't appear to be running at $SERVER_URL" - echo "Please start the server with: ./run_server.sh" - exit 1 -fi - -echo "Sending request..." 
- -# Send request and measure time -start_time=$(date +%s.%N) - -# Send the chat completion request with 30 second timeout -# Note: The gemma-2-2b-it model takes ~12.57 seconds per token on average -# So even with MAX_TOKENS=10, the request might time out before completion -# The timeout ensures the script doesn't hang indefinitely -response=$(curl -s -X POST \ - -H "Content-Type: application/json" \ - -d "$json_payload" \ - --max-time 30 \ - "$SERVER_URL/v1/chat/completions") - -end_time=$(date +%s.%N) - -# Calculate elapsed time -elapsed=$(echo "$end_time - $start_time" | bc) - -# Extract response content length -content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c) - -# Check if we got an error -error_check=$(echo "$response" | grep -c "error") -if [ "$error_check" -gt 0 ]; then - echo "Error in response: $response" -fi - -# Log results -echo "Time: ${elapsed}s, Response size: $content_length bytes" -echo "Response: $response" - -echo -e "\nTest Complete" \ No newline at end of file diff --git a/test_repetition.ts b/test_repetition.ts deleted file mode 100644 index 115f5d9..0000000 --- a/test_repetition.ts +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env node - -// Test script to reproduce token repetition issue with special characters -const { fetch } = require('node-fetch'); - -async function testTokenRepetition() { - console.log("Testing token repetition with special characters..."); - - try { - const response = await fetch('http://localhost:8080/chat/stream', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - message: "Write a simple greeting with punctuation marks like: Hello! How are you? I'm fine, thanks." - }) - }); - - if (!response.ok) { - throw new Error(`HTTP error! status: ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) { - throw new Error('No reader available'); - } - - let fullResponse = ''; - let tokens = []; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = new TextDecoder().decode(value); - const lines = chunk.split('\n'); - - for (const line of lines) { - if (line.startsWith('data: ')) { - const data = line.slice(6); - if (data === '[DONE]') { - continue; - } - - try { - const parsed = JSON.parse(data); - if (parsed.token) { - tokens.push(parsed.token); - fullResponse += parsed.token; - console.log(`Token: "${parsed.token}"`); - } - } catch (e) { - console.log(`Non-JSON data: ${data}`); - } - } - } - } - - console.log('\n=== ANALYSIS ==='); - console.log('Full response:', fullResponse); - console.log('Total tokens:', tokens.length); - - // Check for repetition issues - const tokenCounts = {}; - let hasRepetition = false; - - for (const token of tokens) { - tokenCounts[token] = (tokenCounts[token] || 0) + 1; - if (tokenCounts[token] > 1 && token.match(/[!?,.;:]/)) { - console.log(`⚠️ Repetition detected: "${token}" appears ${tokenCounts[token]} times`); - hasRepetition = true; - } - } - - if (!hasRepetition) { - console.log('✅ No token repetition detected'); - } - - } catch (error) { - console.error('Error testing token repetition:', error); - } -} - -testTokenRepetition(); \ No newline at end of file
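
For reference, the new `integration` package drives a live predict-otron-9000 instance through the OpenAI Node SDK (`openai` ^5.16.0 is already a dependency in both `package.json` files). Below is a minimal sketch of such a test. The base URL and port carry over from the removed `openai-api-test.js`, the dummy API key is an assumption (a local server presumably ignores it), and the embeddings model id `nomic-embed-text-v1.5` is a hypothetical placeholder for the Nomic Embed Text v1.5 model mentioned in the README:

```ts
import OpenAI from "openai";

// Point the SDK at the local server instead of api.openai.com.
// Port 3777 matches the removed openai-api-test.js; adjust as needed.
const client = new OpenAI({
  baseURL: "http://localhost:3777/v1",
  apiKey: "sk-local-unused", // the SDK requires a key; assumed unused by the local server
});

// Chat completion against the inference engine.
const chat = await client.chat.completions.create({
  model: "gemma-2-2b-it",
  messages: [{ role: "user", content: "Who was the 16th president of the United States?" }],
  max_tokens: 100,
});
console.log(chat.choices[0]?.message?.content);

// Embeddings against the embeddings engine (model id is an assumption).
const embedding = await client.embeddings.create({
  model: "nomic-embed-text-v1.5",
  input: "predict-otron-9000 integration test",
});
console.log("dimensions:", embedding.data[0]?.embedding.length);
```

Run it with `bun test` from `integration/` (the `test` script added above), or execute the file directly with `bun`.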