From d04340d9ac57f3a603d5541062789a482552fc7a Mon Sep 17 00:00:00 2001
From: geoffsee <>
Date: Thu, 28 Aug 2025 12:54:09 -0400
Subject: [PATCH] update docs
---
Cargo.toml | 5 +-
README.md | 19 --
crates/embeddings-engine/README.md | 4 +
.../helm-chart-tool}/Cargo.toml | 0
.../helm-chart-tool}/README.md | 0
.../helm-chart-tool}/src/main.rs | 0
crates/inference-engine/api_test.html | 295 ------------------
crates/inference-engine/openai-api-test.js | 176 -----------
crates/predict-otron-9000/README.md | 8 +
integration/README.md | 1 +
integration/package.json | 3 +
package.json | 5 +-
cli.ts => scripts/cli.ts | 0
scripts/run.sh | 3 -
run_server.sh => scripts/run_server.sh | 0
test_predict_otron.sh => scripts/test.sh | 0
scripts/test_request.sh | 69 ----
test_repetition.ts | 85 -----
18 files changed, 22 insertions(+), 651 deletions(-)
create mode 100644 crates/embeddings-engine/README.md
rename {helm-chart-tool => crates/helm-chart-tool}/Cargo.toml (100%)
rename {helm-chart-tool => crates/helm-chart-tool}/README.md (100%)
rename {helm-chart-tool => crates/helm-chart-tool}/src/main.rs (100%)
delete mode 100644 crates/inference-engine/api_test.html
delete mode 100644 crates/inference-engine/openai-api-test.js
create mode 100644 crates/predict-otron-9000/README.md
create mode 100644 integration/README.md
rename cli.ts => scripts/cli.ts (100%)
delete mode 100755 scripts/run.sh
rename run_server.sh => scripts/run_server.sh (100%)
rename test_predict_otron.sh => scripts/test.sh (100%)
delete mode 100755 scripts/test_request.sh
delete mode 100644 test_repetition.ts
diff --git a/Cargo.toml b/Cargo.toml
index 4b0627a..157334f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,13 +3,12 @@ members = [
"crates/predict-otron-9000",
"crates/inference-engine",
"crates/embeddings-engine",
- "crates/leptos-app"
+ "crates/leptos-app",
+ "crates/helm-chart-tool"
]
default-members = ["crates/predict-otron-9000"]
resolver = "2"
-
-
[[workspace.metadata.leptos]]
# project name
bin-package = "leptos-app"
diff --git a/README.md b/README.md
index ab76b80..a63904f 100644
--- a/README.md
+++ b/README.md
@@ -264,23 +264,6 @@ export RUST_LOG=trace
export RUST_LOG=predict_otron_9000=debug,embeddings_engine=trace
```
-## Chat Interface
-
-The project includes a WebAssembly-based chat interface built with the Leptos framework.
-
-### Building the Chat Interface
-
-```shell
-# Navigate to the leptos-app crate
-cd crates/leptos-app
-
-# Build the WebAssembly package
-cargo build --target wasm32-unknown-unknown
-
-# For development with trunk (if installed)
-trunk serve
-```
-
### Usage
The chat interface connects to the inference engine API and provides a user-friendly way to interact with the AI models. To use:
@@ -298,8 +281,6 @@ The interface supports:
- **Inference Engine**: Currently provides a simplified implementation for chat completions. Full model loading and text generation capabilities from the inference-engine crate are not yet integrated into the unified server.
- **Model Support**: Embeddings are limited to the Nomic Embed Text v1.5 model.
-- **Scalability**: Single-threaded model loading may impact performance under heavy load.
-- **Chat Interface**: The WebAssembly chat interface requires compilation to a static site before deployment.
## Contributing
diff --git a/crates/embeddings-engine/README.md b/crates/embeddings-engine/README.md
new file mode 100644
index 0000000..c47ea5a
--- /dev/null
+++ b/crates/embeddings-engine/README.md
@@ -0,0 +1,4 @@
+# Embeddings Engine
+
+A high-performance text embeddings service that generates vector representations of text using state-of-the-art models.
+This crate wraps the [fastembed](https://crates.io/crates/fastembed) crate to provide embeddings and partially implements the OpenAI embeddings API specification.
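+
+As a quick sketch, the service can be called with the OpenAI SDK; the base URL, port, placeholder API key, and model id below are assumptions, not values taken from this crate:
+
+```typescript
+import OpenAI from "openai";
+
+const client = new OpenAI({
+  baseURL: "http://localhost:8080/v1", // assumed host/port for a local deployment
+  apiKey: "sk-local", // placeholder; a local server may not validate keys
+});
+
+// The project README names Nomic Embed Text v1.5 as the supported model.
+const res = await client.embeddings.create({
+  model: "nomic-embed-text-v1.5", // assumed model id
+  input: "The quick brown fox jumps over the lazy dog",
+});
+
+console.log(res.data[0].embedding.length); // vector dimensionality
+```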
diff --git a/helm-chart-tool/Cargo.toml b/crates/helm-chart-tool/Cargo.toml
similarity index 100%
rename from helm-chart-tool/Cargo.toml
rename to crates/helm-chart-tool/Cargo.toml
diff --git a/helm-chart-tool/README.md b/crates/helm-chart-tool/README.md
similarity index 100%
rename from helm-chart-tool/README.md
rename to crates/helm-chart-tool/README.md
diff --git a/helm-chart-tool/src/main.rs b/crates/helm-chart-tool/src/main.rs
similarity index 100%
rename from helm-chart-tool/src/main.rs
rename to crates/helm-chart-tool/src/main.rs
diff --git a/crates/inference-engine/api_test.html b/crates/inference-engine/api_test.html
deleted file mode 100644
index f4654a5..0000000
--- a/crates/inference-engine/api_test.html
+++ /dev/null
@@ -1,295 +0,0 @@
-<!-- Markup not recoverable from this patch. The deleted page was a browser-based
-     "OpenAI-Compatible API Tester" for exercising the local inference engine's
-     OpenAI-compatible chat completions endpoint, with Request Settings,
-     Request Body, Example Requests, and Response sections. -->
\ No newline at end of file
diff --git a/crates/inference-engine/openai-api-test.js b/crates/inference-engine/openai-api-test.js
deleted file mode 100644
index 51f2e3e..0000000
--- a/crates/inference-engine/openai-api-test.js
+++ /dev/null
@@ -1,176 +0,0 @@
-// Test requests for the OpenAI-compatible endpoint in the inference server
-// This file contains IIFE (Immediately Invoked Function Expression) JavaScript requests
-// to test the /v1/chat/completions endpoint
-
-// Basic chat completion request
-(async function testBasicChatCompletion() {
- console.log("Test 1: Basic chat completion request");
- try {
- const response = await fetch('http://localhost:3777/v1/chat/completions', {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- },
- body: JSON.stringify({
- model: "gemma-2-2b-it",
- messages: [
- {
- role: "user",
- content: "Who was the 16th president of the United States?"
- }
- ],
- max_tokens: 100
- })
- });
-
- const data = await response.json();
- console.log("Response:", JSON.stringify(data, null, 2));
- } catch (error) {
- console.error("Error:", error);
- }
-})();
-
-// Multi-turn conversation
-(async function testMultiTurnConversation() {
- console.log("\nTest 2: Multi-turn conversation");
- try {
- const response = await fetch('http://localhost:3777/v1/chat/completions', {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- },
- body: JSON.stringify({
- model: "gemma-2-2b-it",
- messages: [
- {
- role: "system",
- content: "You are a helpful assistant that provides concise answers."
- },
- {
- role: "user",
- content: "What is machine learning?"
- },
- {
- role: "assistant",
- content: "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed."
- },
- {
- role: "user",
- content: "Give me an example of a machine learning algorithm."
- }
- ],
- max_tokens: 150
- })
- });
-
- const data = await response.json();
- console.log("Response:", JSON.stringify(data, null, 2));
- } catch (error) {
- console.error("Error:", error);
- }
-})();
-
-// Request with temperature and top_p parameters
-(async function testTemperatureAndTopP() {
- console.log("\nTest 3: Request with temperature and top_p parameters");
- try {
- const response = await fetch('http://localhost:3777/v1/chat/completions', {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- },
- body: JSON.stringify({
- model: "gemma-2-2b-it",
- messages: [
- {
- role: "user",
- content: "Write a short poem about artificial intelligence."
- }
- ],
- max_tokens: 200,
- temperature: 0.8,
- top_p: 0.9
- })
- });
-
- const data = await response.json();
- console.log("Response:", JSON.stringify(data, null, 2));
- } catch (error) {
- console.error("Error:", error);
- }
-})();
-
-// Request with streaming enabled
-(async function testStreaming() {
- console.log("\nTest 4: Request with streaming enabled");
- try {
- const response = await fetch('http://localhost:3777/v1/chat/completions', {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- },
- body: JSON.stringify({
- model: "gemma-2-2b-it",
- messages: [
- {
- role: "user",
- content: "Explain quantum computing in simple terms."
- }
- ],
- max_tokens: 150,
- stream: true
- })
- });
-
- // Note: Streaming might not be implemented yet, this is to test the API's handling of the parameter
- if (response.headers.get('content-type')?.includes('text/event-stream')) {
- console.log("Streaming response detected. Reading stream...");
- const reader = response.body.getReader();
- const decoder = new TextDecoder();
-
- while (true) {
- const { done, value } = await reader.read();
- if (done) break;
-
- const chunk = decoder.decode(value);
- console.log("Chunk:", chunk);
- }
- } else {
- const data = await response.json();
- console.log("Non-streaming response:", JSON.stringify(data, null, 2));
- }
- } catch (error) {
- console.error("Error:", error);
- }
-})();
-
-// Request with a different model
-(async function testDifferentModel() {
- console.log("\nTest 5: Request with a different model");
- try {
- const response = await fetch('http://localhost:3777/v1/chat/completions', {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- },
- body: JSON.stringify({
- model: "gemma-2-2b-it", // Using a different model if available
- messages: [
- {
- role: "user",
- content: "What are the benefits of renewable energy?"
- }
- ],
- max_tokens: 150
- })
- });
-
- const data = await response.json();
- console.log("Response:", JSON.stringify(data, null, 2));
- } catch (error) {
- console.error("Error:", error);
- }
-})();
-
-console.log("\nAll test requests have been sent. Check the server logs for more details.");
-console.log("To run the server, use: cargo run --bin inference-engine -- --server");
diff --git a/crates/predict-otron-9000/README.md b/crates/predict-otron-9000/README.md
new file mode 100644
index 0000000..066bf70
--- /dev/null
+++ b/crates/predict-otron-9000/README.md
@@ -0,0 +1,8 @@
+# predict-otron-9000
+
+An extensible axum/tokio-based server that combines [embeddings-engine](../embeddings-engine), [inference-engine](../inference-engine), and [leptos-app](../leptos-app).
+
+
+## Notes
+- When `server_mode` is Standalone (the default), the instance contains all components necessary for inference.
+- When `server_mode` is HighAvailability, inference and embeddings scale automatically, and the server proxies to the inference and embeddings services via DNS (see the sketch below).
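+
+As a hypothetical illustration (the `SERVER_MODE` variable and its values are assumptions; this patch only names the `server_mode` setting), mode selection might look like:
+
+```shell
+# Standalone (default): every component runs in one process
+./scripts/run_server.sh
+
+# HighAvailability: proxy to separately scaled inference and embeddings services
+SERVER_MODE=HighAvailability ./scripts/run_server.sh
+```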
diff --git a/integration/README.md b/integration/README.md
new file mode 100644
index 0000000..05bc836
--- /dev/null
+++ b/integration/README.md
@@ -0,0 +1 @@
+This package tests a live instance of predict-otron-9000 directly with the OpenAI Node SDK. Run the suite with `bun test`.
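+
+A minimal sketch of such a test, assuming the live instance listens on localhost:8080 and does not validate API keys (both assumptions):
+
+```typescript
+import { test, expect } from "bun:test";
+import OpenAI from "openai";
+
+const client = new OpenAI({
+  baseURL: "http://localhost:8080/v1", // assumed address of the live instance
+  apiKey: "sk-local", // placeholder; a local server may not check it
+});
+
+test("chat completion round-trip", async () => {
+  const completion = await client.chat.completions.create({
+    model: "gemma-2-2b-it",
+    messages: [{ role: "user", content: "Say hello." }],
+    max_tokens: 32,
+  });
+  expect(completion.choices[0]?.message?.content).toBeTruthy();
+});
+```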
diff --git a/integration/package.json b/integration/package.json
index d80a8fa..9002d42 100644
--- a/integration/package.json
+++ b/integration/package.json
@@ -1,5 +1,8 @@
{
"name": "@predict-otron-9000/ingeration",
+ "scripts": {
+ "test": "bun test"
+ },
"dependencies": {
"openai": "^5.16.0"
}
diff --git a/package.json b/package.json
index c37756b..f7f926d 100644
--- a/package.json
+++ b/package.json
@@ -1,5 +1,8 @@
{
"dependencies": {
"openai": "^5.16.0"
+ },
+ "scripts": {
+ "cli": "./scripts/cli.ts"
}
-}
+}
\ No newline at end of file
diff --git a/cli.ts b/scripts/cli.ts
similarity index 100%
rename from cli.ts
rename to scripts/cli.ts
diff --git a/scripts/run.sh b/scripts/run.sh
deleted file mode 100755
index f1719cb..0000000
--- a/scripts/run.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-cargo run --bin ptron
\ No newline at end of file
diff --git a/run_server.sh b/scripts/run_server.sh
similarity index 100%
rename from run_server.sh
rename to scripts/run_server.sh
diff --git a/test_predict_otron.sh b/scripts/test.sh
similarity index 100%
rename from test_predict_otron.sh
rename to scripts/test.sh
diff --git a/scripts/test_request.sh b/scripts/test_request.sh
deleted file mode 100755
index e5657f3..0000000
--- a/scripts/test_request.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# Simple test script for inference-engine
-# This script sends a single chat completion request
-
-echo "===== Inference Engine Test ====="
-
-# Test parameters
-SERVER_URL="http://localhost:8080" # Changed from 8080 to 3777 to match main.rs default port
-MAX_TOKENS=10
-PROMPT="What is the capital of France?"
-MODEL="${MODEL_ID:-gemma-2-2b-it}" # Using gemma-2-2b-it as specified in the original test
-
-# Create a temp directory for test results
-TEMP_DIR=$(mktemp -d)
-echo "Storing test results in: $TEMP_DIR"
-
-# Prepare JSON payload
-json_payload=$(cat <<EOF
-{
-  "model": "$MODEL",
-  "messages": [{"role": "user", "content": "$PROMPT"}],
-  "max_tokens": $MAX_TOKENS
-}
-EOF
-)
-
-# Check that the server is running before sending the request
-if ! curl -s "$SERVER_URL" > /dev/null; then
- echo "Server doesn't appear to be running at $SERVER_URL"
- echo "Please start the server with: ./run_server.sh"
- exit 1
-fi
-
-echo "Sending request..."
-
-# Send request and measure time
-start_time=$(date +%s.%N)
-
-# Send the chat completion request with 30 second timeout
-# Note: The gemma-2-2b-it model takes ~12.57 seconds per token on average
-# So even with MAX_TOKENS=10, the request might time out before completion
-# The timeout ensures the script doesn't hang indefinitely
-response=$(curl -s -X POST \
- -H "Content-Type: application/json" \
- -d "$json_payload" \
- --max-time 30 \
- "$SERVER_URL/v1/chat/completions")
-
-end_time=$(date +%s.%N)
-
-# Calculate elapsed time
-elapsed=$(echo "$end_time - $start_time" | bc)
-
-# Extract response content length
-content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)
-
-# Check if we got an error
-error_check=$(echo "$response" | grep -c "error")
-if [ "$error_check" -gt 0 ]; then
- echo "Error in response: $response"
-fi
-
-# Log results
-echo "Time: ${elapsed}s, Response size: $content_length bytes"
-echo "Response: $response"
-
-echo -e "\nTest Complete"
\ No newline at end of file
diff --git a/test_repetition.ts b/test_repetition.ts
deleted file mode 100644
index 115f5d9..0000000
--- a/test_repetition.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env node
-
-// Test script to reproduce token repetition issue with special characters
-const fetch = require('node-fetch'); // node-fetch v2 exports the fetch function directly
-
-async function testTokenRepetition() {
- console.log("Testing token repetition with special characters...");
-
- try {
- const response = await fetch('http://localhost:8080/chat/stream', {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- },
- body: JSON.stringify({
- message: "Write a simple greeting with punctuation marks like: Hello! How are you? I'm fine, thanks."
- })
- });
-
- if (!response.ok) {
- throw new Error(`HTTP error! status: ${response.status}`);
- }
-
- const reader = response.body?.getReader();
- if (!reader) {
- throw new Error('No reader available');
- }
-
- let fullResponse = '';
- let tokens = [];
-
- while (true) {
- const { done, value } = await reader.read();
- if (done) break;
-
- const chunk = new TextDecoder().decode(value);
- const lines = chunk.split('\n');
-
- for (const line of lines) {
- if (line.startsWith('data: ')) {
- const data = line.slice(6);
- if (data === '[DONE]') {
- continue;
- }
-
- try {
- const parsed = JSON.parse(data);
- if (parsed.token) {
- tokens.push(parsed.token);
- fullResponse += parsed.token;
- console.log(`Token: "${parsed.token}"`);
- }
- } catch (e) {
- console.log(`Non-JSON data: ${data}`);
- }
- }
- }
- }
-
- console.log('\n=== ANALYSIS ===');
- console.log('Full response:', fullResponse);
- console.log('Total tokens:', tokens.length);
-
- // Check for repetition issues
- const tokenCounts = {};
- let hasRepetition = false;
-
- for (const token of tokens) {
- tokenCounts[token] = (tokenCounts[token] || 0) + 1;
- if (tokenCounts[token] > 1 && token.match(/[!?,.;:]/)) {
- console.log(`⚠️ Repetition detected: "${token}" appears ${tokenCounts[token]} times`);
- hasRepetition = true;
- }
- }
-
- if (!hasRepetition) {
- console.log('✅ No token repetition detected');
- }
-
- } catch (error) {
- console.error('Error testing token repetition:', error);
- }
-}
-
-testTokenRepetition();
\ No newline at end of file