update docs

geoffsee
2025-08-28 12:54:09 -04:00
parent 0488bddfdb
commit d04340d9ac
18 changed files with 22 additions and 651 deletions


@@ -3,13 +3,12 @@ members = [
     "crates/predict-otron-9000",
     "crates/inference-engine",
     "crates/embeddings-engine",
-    "crates/leptos-app"
+    "crates/leptos-app",
+    "crates/helm-chart-tool"
 ]
 default-members = ["crates/predict-otron-9000"]
 resolver = "2"
 [[workspace.metadata.leptos]]
 # project name
 bin-package = "leptos-app"


@@ -264,23 +264,6 @@ export RUST_LOG=trace
export RUST_LOG=predict_otron_9000=debug,embeddings_engine=trace
```
## Chat Interface
The project includes a WebAssembly-based chat interface built with the Leptos framework.
### Building the Chat Interface
```shell
# Navigate to the leptos-app crate
cd crates/leptos-app
# Build the WebAssembly package
cargo build --target wasm32-unknown-unknown
# For development with trunk (if installed)
trunk serve
```
### Usage
The chat interface connects to the inference engine API and provides a user-friendly way to interact with the AI models. To use:
@@ -298,8 +281,6 @@ The interface supports:
- **Inference Engine**: Currently provides a simplified implementation for chat completions. Full model loading and text generation capabilities from the inference-engine crate are not yet integrated into the unified server.
- **Model Support**: Embeddings are limited to the Nomic Embed Text v1.5 model.
- **Scalability**: Single-threaded model loading may impact performance under heavy load.
- **Chat Interface**: The WebAssembly chat interface requires compilation to a static site before deployment.
## Contributing


@@ -0,0 +1,4 @@
# Embeddings Engine
A high-performance text embeddings service that generates vector representations of text using state-of-the-art models.
This crate wraps the fastembed crate to provide embeddings and partially implements the OpenAI embeddings API specification.
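As a rough usage sketch (hypothetical: the port, the `/v1/embeddings` route, and the exact model id are assumptions, not confirmed by this README):

```ts
// Hypothetical sketch: port, route, and model id below are assumptions.
const res = await fetch("http://localhost:8080/v1/embeddings", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    model: "nomic-embed-text-v1.5",
    input: "predict-otron-9000 turns text into vectors",
  }),
});
const body = await res.json();
// An OpenAI-shaped response carries vectors under data[i].embedding.
console.log(body.data?.[0]?.embedding?.length);
```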


@@ -1,295 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenAI-Compatible API Tester</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
line-height: 1.6;
}
h1, h2 {
color: #333;
}
.container {
margin-bottom: 20px;
}
textarea {
width: 100%;
height: 150px;
padding: 10px;
margin-bottom: 10px;
border: 1px solid #ddd;
border-radius: 4px;
font-family: monospace;
}
button {
background-color: #4CAF50;
color: white;
padding: 10px 15px;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
}
button:hover {
background-color: #45a049;
}
pre {
background-color: #f5f5f5;
padding: 15px;
border-radius: 4px;
overflow-x: auto;
white-space: pre-wrap;
}
.response {
margin-top: 20px;
}
.error {
color: red;
}
.settings {
display: flex;
flex-wrap: wrap;
gap: 10px;
margin-bottom: 15px;
}
.settings div {
display: flex;
flex-direction: column;
}
label {
margin-bottom: 5px;
font-weight: bold;
}
input {
padding: 8px;
border: 1px solid #ddd;
border-radius: 4px;
}
.examples {
margin-top: 30px;
}
.example-btn {
background-color: #2196F3;
margin-right: 10px;
margin-bottom: 10px;
}
.example-btn:hover {
background-color: #0b7dda;
}
</style>
</head>
<body>
<h1>OpenAI-Compatible API Tester</h1>
<p>Use this page to test the OpenAI-compatible chat completions endpoint of the local inference engine.</p>
<div class="container">
<h2>Request Settings</h2>
<div class="settings">
<div>
<label for="serverUrl">Server URL:</label>
<input type="text" id="serverUrl" value="http://localhost:3777" />
</div>
<div>
<label for="model">Model:</label>
<input type="text" id="model" value="gemma-3-1b-it" />
</div>
<div>
<label for="maxTokens">Max Tokens:</label>
<input type="number" id="maxTokens" value="150" />
</div>
<div>
<label for="temperature">Temperature:</label>
<input type="number" id="temperature" value="0.7" step="0.1" min="0" max="2" />
</div>
<div>
<label for="topP">Top P:</label>
<input type="number" id="topP" value="0.9" step="0.1" min="0" max="1" />
</div>
</div>
<h2>Request Body</h2>
<textarea id="requestBody">{
"model": "gemma-3-1b-it",
"messages": [
{
"role": "user",
"content": "Hello, how are you today?"
}
],
"max_tokens": 150,
"temperature": 0.7,
"top_p": 0.9
}</textarea>
<button id="sendRequest">Send Request</button>
<div class="examples">
<h3>Example Requests</h3>
<button class="example-btn" id="example1">Basic Question</button>
<button class="example-btn" id="example2">Multi-turn Conversation</button>
<button class="example-btn" id="example3">Creative Writing</button>
<button class="example-btn" id="example4">Code Generation</button>
</div>
<div class="response">
<h2>Response</h2>
<pre id="responseOutput">Response will appear here...</pre>
</div>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
// Update request body when settings change
const serverUrlInput = document.getElementById('serverUrl');
const modelInput = document.getElementById('model');
const maxTokensInput = document.getElementById('maxTokens');
const temperatureInput = document.getElementById('temperature');
const topPInput = document.getElementById('topP');
const requestBodyTextarea = document.getElementById('requestBody');
const responseOutput = document.getElementById('responseOutput');
// Function to update request body from settings
function updateRequestBodyFromSettings() {
try {
const requestBody = JSON.parse(requestBodyTextarea.value);
requestBody.model = modelInput.value;
requestBody.max_tokens = parseInt(maxTokensInput.value);
requestBody.temperature = parseFloat(temperatureInput.value);
requestBody.top_p = parseFloat(topPInput.value);
requestBodyTextarea.value = JSON.stringify(requestBody, null, 2);
} catch (error) {
console.error("Error updating request body:", error);
}
}
// Update settings when request body changes
function updateSettingsFromRequestBody() {
try {
const requestBody = JSON.parse(requestBodyTextarea.value);
if (requestBody.model) modelInput.value = requestBody.model;
if (requestBody.max_tokens) maxTokensInput.value = requestBody.max_tokens;
if (requestBody.temperature) temperatureInput.value = requestBody.temperature;
if (requestBody.top_p) topPInput.value = requestBody.top_p;
} catch (error) {
console.error("Error updating settings:", error);
}
}
// Add event listeners for settings changes
modelInput.addEventListener('change', updateRequestBodyFromSettings);
maxTokensInput.addEventListener('change', updateRequestBodyFromSettings);
temperatureInput.addEventListener('change', updateRequestBodyFromSettings);
topPInput.addEventListener('change', updateRequestBodyFromSettings);
// Add event listener for request body changes
requestBodyTextarea.addEventListener('blur', updateSettingsFromRequestBody);
// Send request button
document.getElementById('sendRequest').addEventListener('click', async function() {
try {
responseOutput.textContent = "Sending request...";
const serverUrl = serverUrlInput.value;
const endpoint = '/v1/chat/completions';
const url = serverUrl + endpoint;
const requestBody = JSON.parse(requestBodyTextarea.value);
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(requestBody)
});
const data = await response.json();
responseOutput.textContent = JSON.stringify(data, null, 2);
} catch (error) {
responseOutput.textContent = "Error: " + error.message;
responseOutput.classList.add('error');
}
});
// Example requests
document.getElementById('example1').addEventListener('click', function() {
requestBodyTextarea.value = JSON.stringify({
model: modelInput.value,
messages: [
{
role: "user",
content: "Who was the 16th president of the United States?"
}
],
max_tokens: parseInt(maxTokensInput.value),
temperature: parseFloat(temperatureInput.value),
top_p: parseFloat(topPInput.value)
}, null, 2);
});
document.getElementById('example2').addEventListener('click', function() {
requestBodyTextarea.value = JSON.stringify({
model: modelInput.value,
messages: [
{
role: "system",
content: "You are a helpful assistant that provides concise answers."
},
{
role: "user",
content: "What is machine learning?"
},
{
role: "assistant",
content: "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed."
},
{
role: "user",
content: "Give me an example of a machine learning algorithm."
}
],
max_tokens: parseInt(maxTokensInput.value),
temperature: parseFloat(temperatureInput.value),
top_p: parseFloat(topPInput.value)
}, null, 2);
});
document.getElementById('example3').addEventListener('click', function() {
requestBodyTextarea.value = JSON.stringify({
model: modelInput.value,
messages: [
{
role: "user",
content: "Write a short poem about artificial intelligence."
}
],
max_tokens: parseInt(maxTokensInput.value),
temperature: 0.9, // Higher temperature for creative tasks
top_p: 0.9
}, null, 2);
temperatureInput.value = 0.9;
});
document.getElementById('example4').addEventListener('click', function() {
requestBodyTextarea.value = JSON.stringify({
model: modelInput.value,
messages: [
{
role: "user",
content: "Write a Python function to calculate the Fibonacci sequence up to n terms."
}
],
max_tokens: parseInt(maxTokensInput.value),
temperature: 0.3, // Lower temperature for code generation
top_p: 0.9
}, null, 2);
temperatureInput.value = 0.3;
});
});
</script>
</body>
</html>


@@ -1,176 +0,0 @@
// Test requests for the OpenAI-compatible endpoint in the inference server
// This file contains IIFE (Immediately Invoked Function Expression) JavaScript requests
// to test the /v1/chat/completions endpoint
// Basic chat completion request
(async function testBasicChatCompletion() {
console.log("Test 1: Basic chat completion request");
try {
const response = await fetch('http://localhost:3777/v1/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: "gemma-2-2b-it",
messages: [
{
role: "user",
content: "Who was the 16th president of the United States?"
}
],
max_tokens: 100
})
});
const data = await response.json();
console.log("Response:", JSON.stringify(data, null, 2));
} catch (error) {
console.error("Error:", error);
}
})();
// Multi-turn conversation
(async function testMultiTurnConversation() {
console.log("\nTest 2: Multi-turn conversation");
try {
const response = await fetch('http://localhost:3777/v1/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: "gemma-2-2b-it",
messages: [
{
role: "system",
content: "You are a helpful assistant that provides concise answers."
},
{
role: "user",
content: "What is machine learning?"
},
{
role: "assistant",
content: "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed."
},
{
role: "user",
content: "Give me an example of a machine learning algorithm."
}
],
max_tokens: 150
})
});
const data = await response.json();
console.log("Response:", JSON.stringify(data, null, 2));
} catch (error) {
console.error("Error:", error);
}
})();
// Request with temperature and top_p parameters
(async function testTemperatureAndTopP() {
console.log("\nTest 3: Request with temperature and top_p parameters");
try {
const response = await fetch('http://localhost:3777/v1/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: "gemma-2-2b-it",
messages: [
{
role: "user",
content: "Write a short poem about artificial intelligence."
}
],
max_tokens: 200,
temperature: 0.8,
top_p: 0.9
})
});
const data = await response.json();
console.log("Response:", JSON.stringify(data, null, 2));
} catch (error) {
console.error("Error:", error);
}
})();
// Request with streaming enabled
(async function testStreaming() {
console.log("\nTest 4: Request with streaming enabled");
try {
const response = await fetch('http://localhost:3777/v1/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: "gemma-2-2b-it",
messages: [
{
role: "user",
content: "Explain quantum computing in simple terms."
}
],
max_tokens: 150,
stream: true
})
});
// Note: Streaming might not be implemented yet, this is to test the API's handling of the parameter
if (response.headers.get('content-type')?.includes('text/event-stream')) {
console.log("Streaming response detected. Reading stream...");
const reader = response.body.getReader();
const decoder = new TextDecoder();
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value);
console.log("Chunk:", chunk);
}
} else {
const data = await response.json();
console.log("Non-streaming response:", JSON.stringify(data, null, 2));
}
} catch (error) {
console.error("Error:", error);
}
})();
// Request with a different model
(async function testDifferentModel() {
console.log("\nTest 5: Request with a different model");
try {
const response = await fetch('http://localhost:3777/v1/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: "gemma-2-2b-it", // Using a different model if available
messages: [
{
role: "user",
content: "What are the benefits of renewable energy?"
}
],
max_tokens: 150
})
});
const data = await response.json();
console.log("Response:", JSON.stringify(data, null, 2));
} catch (error) {
console.error("Error:", error);
}
})();
console.log("\nAll test requests have been sent. Check the server logs for more details.");
console.log("To run the server, use: cargo run --bin inference-engine -- --server");


@@ -0,0 +1,8 @@
# predict-otron-9000
This is an extensible axum/tokio hybrid combining [embeddings-engine](../embeddings-engine), [inference-engine](../inference-engine), and [leptos-app](../leptos-app).
# Notes
- When `server_mode` is Standalone (the default), the instance contains all components necessary for inference (see the sketch below).
- When `server_mode` is HighAvailability, the instance scales inference and embeddings automatically, proxying to the inference and embeddings services via DNS.
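A minimal smoke-test sketch against a Standalone instance (the port 3777 and the `gemma-3-1b-it` model id are borrowed from this repository's test tooling and may not match your configuration):

```ts
// Smoke-test sketch for a Standalone instance. Port and model id are
// assumptions taken from the repo's test tooling, not guaranteed defaults.
const res = await fetch("http://localhost:3777/v1/chat/completions", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    model: "gemma-3-1b-it",
    messages: [{ role: "user", content: "Say hello in one short sentence." }],
    max_tokens: 32,
  }),
});
console.log(JSON.stringify(await res.json(), null, 2));
```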

integration/README.md

@@ -0,0 +1 @@
This package enables testing a live instance of predict-otron-9000 directly with the OpenAI Node SDK.
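A sketch of what such a test might look like under `bun test` (the file name, base URL, and model id are assumptions; the SDK requires an API key even if the local server ignores it):

```ts
// integration/smoke.test.ts — hypothetical example; adjust baseURL and model
// to match your running instance.
import { test, expect } from "bun:test";
import OpenAI from "openai";

const client = new OpenAI({
  baseURL: "http://localhost:3777/v1", // assumed local predict-otron-9000 address
  apiKey: "unused-locally",            // required by the SDK, likely ignored by the server
});

test("chat completion round-trip", async () => {
  const completion = await client.chat.completions.create({
    model: "gemma-3-1b-it",
    messages: [{ role: "user", content: "ping" }],
    max_tokens: 8,
  });
  expect(completion.choices.length).toBeGreaterThan(0);
});
```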


@@ -1,5 +1,8 @@
{
"name": "@predict-otron-9000/ingeration",
"scripts": {
"test": "bun test"
},
"dependencies": {
"openai": "^5.16.0"
}


@@ -1,5 +1,8 @@
{
"dependencies": {
"openai": "^5.16.0"
},
"scripts": {
"cli": "./scripts/cli.ts"
}
}


@@ -1,3 +0,0 @@
#!/bin/bash
cargo run --bin ptron


@@ -1,69 +0,0 @@
#!/bin/bash
# Simple test script for inference-engine
# This script sends a single chat completion request
echo "===== Inference Engine Test ====="
# Test parameters
SERVER_URL="http://localhost:3777" # Default port from main.rs
MAX_TOKENS=10
PROMPT="What is the capital of France?"
MODEL="${MODEL_ID:-gemma-2-2b-it}" # Using gemma-2-2b-it as specified in the original test
# Create a temp directory for test results
TEMP_DIR=$(mktemp -d)
echo "Storing test results in: $TEMP_DIR"
# Prepare JSON payload
json_payload=$(cat <<EOF
{
"model": "$MODEL",
"messages": [{"role": "user", "content": "$PROMPT"}],
"max_tokens": $MAX_TOKENS
}
EOF
)
# Make sure the server is running
echo "Checking if the server is running..."
if ! curl -s "$SERVER_URL" > /dev/null; then
echo "Server doesn't appear to be running at $SERVER_URL"
echo "Please start the server with: ./run_server.sh"
exit 1
fi
echo "Sending request..."
# Send request and measure time
start_time=$(date +%s.%N)
# Send the chat completion request with 30 second timeout
# Note: The gemma-2-2b-it model takes ~12.57 seconds per token on average
# So even with MAX_TOKENS=10, the request might time out before completion
# The timeout ensures the script doesn't hang indefinitely
response=$(curl -s -X POST \
-H "Content-Type: application/json" \
-d "$json_payload" \
--max-time 30 \
"$SERVER_URL/v1/chat/completions")
end_time=$(date +%s.%N)
# Calculate elapsed time
elapsed=$(echo "$end_time - $start_time" | bc)
# Extract response content length
content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)
# Check if we got an error
error_check=$(echo "$response" | grep -c "error")
if [ "$error_check" -gt 0 ]; then
echo "Error in response: $response"
fi
# Log results
echo "Time: ${elapsed}s, Response size: $content_length bytes"
echo "Response: $response"
echo -e "\nTest Complete"


@@ -1,85 +0,0 @@
#!/usr/bin/env node
// Test script to reproduce token repetition issue with special characters
const fetch = require('node-fetch'); // node-fetch v2 exports fetch directly under CommonJS
async function testTokenRepetition() {
console.log("Testing token repetition with special characters...");
try {
const response = await fetch('http://localhost:8080/chat/stream', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
message: "Write a simple greeting with punctuation marks like: Hello! How are you? I'm fine, thanks."
})
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No reader available');
}
let fullResponse = '';
let tokens = [];
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = new TextDecoder().decode(value);
const lines = chunk.split('\n');
for (const line of lines) {
if (line.startsWith('data: ')) {
const data = line.slice(6);
if (data === '[DONE]') {
continue;
}
try {
const parsed = JSON.parse(data);
if (parsed.token) {
tokens.push(parsed.token);
fullResponse += parsed.token;
console.log(`Token: "${parsed.token}"`);
}
} catch (e) {
console.log(`Non-JSON data: ${data}`);
}
}
}
}
console.log('\n=== ANALYSIS ===');
console.log('Full response:', fullResponse);
console.log('Total tokens:', tokens.length);
// Check for repetition issues
const tokenCounts = {};
let hasRepetition = false;
for (const token of tokens) {
tokenCounts[token] = (tokenCounts[token] || 0) + 1;
if (tokenCounts[token] > 1 && token.match(/[!?,.;:]/)) {
console.log(`⚠️ Repetition detected: "${token}" appears ${tokenCounts[token]} times`);
hasRepetition = true;
}
}
if (!hasRepetition) {
console.log('✅ No token repetition detected');
}
} catch (error) {
console.error('Error testing token repetition:', error);
}
}
testTokenRepetition();