Refactor `apply_cached_repeat_penalty` for optimized caching and reuse, add extensive unit tests, and integrate special handling for Gemma-specific models.
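
The refactored function itself lives in the Rust inference crate and is not part of this diff. As a rough conceptual sketch of the caching idea only (the names, types, and penalty rule below are illustrative assumptions, not the crate's API):

// Conceptual sketch only; the committed apply_cached_repeat_penalty is Rust and its
// signature differs. Illustrates reusing a "seen" set so each repeated token id in
// the history is penalized once per decode step instead of once per occurrence.
function applyRepeatPenaltySketch(
  logits: Float32Array,      // next-token scores, one entry per vocabulary id
  generatedTokens: number[], // token ids produced so far
  penalty: number,           // e.g. 1.1; a value of 1.0 disables the penalty
): Float32Array {
  if (penalty === 1.0) return logits;
  const seen = new Set<number>();
  for (const tokenId of generatedTokens) {
    if (seen.has(tokenId)) continue; // cache hit: already penalized this step
    const score = logits[tokenId];
    // Standard repeat-penalty rule: shrink positive scores, inflate negative ones.
    logits[tokenId] = score > 0 ? score / penalty : score * penalty;
    seen.add(tokenId);
  }
  return logits;
}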

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing the inference engine and adjusted handling of non-streaming/streaming chat completions.
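
The committed `cli.ts` is not shown in this diff. A minimal sketch of what such a test CLI could look like against the local server (the base URL, default model, and argument handling below are assumptions mirroring the integration tests later in this commit):

#!/usr/bin/env bun
// Hypothetical sketch of a cli.ts-style test client; the committed file is not shown here.
import OpenAI from "openai";

const openai = new OpenAI({
  baseURL: "http://localhost:8080/v1", // assumed local server address, matching the tests below
  apiKey: "not used",
});

const prompt = process.argv.slice(2).join(" ") || "Who was the 16th president of the United States?";

const completion = await openai.chat.completions.create({
  model: "gemma-3-1b-it", // assumed default model
  max_tokens: 100,
  messages: [{ role: "user", content: prompt }],
});

console.log(completion.choices[0].message.content);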

- Add CPU fallback support for text generation when primary device is unsupported
- Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors (a conceptual sketch follows this list)
- Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations
- Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing
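
`execute_with_fallback` itself is on the Rust side and not shown here. A minimal conceptual sketch of the try-primary-then-CPU pattern it names (the error matching and device labels are assumptions):

// Conceptual sketch of a primary-device/CPU fallback wrapper; not the Rust API.
type Device = "gpu" | "cpu";

async function executeWithFallback<T>(
  run: (device: Device) => Promise<T>,
  primary: Device = "gpu",
): Promise<T> {
  try {
    return await run(primary);
  } catch (err) {
    const message = String(err);
    // Heuristic (assumed) error matching: retry on CPU for device or shape failures.
    if (message.includes("unsupported device") || message.includes("shape mismatch")) {
      console.warn(`Falling back to CPU after failure on ${primary}: ${message}`);
      return run("cpu");
    }
    throw err;
  }
}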

Chat completion endpoint functions with gemma3 (no streaming).
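
A non-streaming request against that endpoint might look like this (the port, path, and model name are assumptions consistent with the integration tests in this commit):

// Minimal non-streaming request against the local OpenAI-compatible endpoint.
// Port, path, and model name are assumptions taken from the integration tests below.
const res = await fetch("http://localhost:8080/v1/chat/completions", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    model: "gemma-3-1b-it",
    max_tokens: 100,
    stream: false, // the full assistant message arrives in a single response body
    messages: [{ role: "user", content: "Who was the 16th president of the United States?" }],
  }),
});

const body = await res.json();
console.log(body.choices?.[0]?.message?.content);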

Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
Author: geoffsee
Date: 2025-08-26 01:30:26 -04:00
Parent: 7dd23213c9
Commit: 8338750beb
64 changed files with 14997 additions and 220 deletions

integration/bun.lock (new file, +14)

@@ -0,0 +1,14 @@
{
  "lockfileVersion": 1,
  "workspaces": {
    "": {
      "name": "@predict-otron-9000/ingeration",
      "dependencies": {
        "openai": "^5.16.0",
      },
    },
  },
  "packages": {
    "openai": ["openai@5.16.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.23.8" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-hoEH8ZNvg1HXjU9mp88L/ZH8O082Z8r6FHCXGiWAzVRrEv443aI57qhch4snu07yQydj+AUAWLenAiBXhu89Tw=="],
  }
}

@@ -0,0 +1,32 @@
// #!/usr/bin/env bun
//
// import OpenAI from "openai";
// import {describe, test, expect} from "bun:test";
//
// async function requestActualOpenAI(userPrompt: string) {
//   const openai = new OpenAI();
//   return await openai.chat.completions.create({
//     model: "gpt-4o",
//     max_tokens: 100,
//     messages: [{name: "user_1", role: "user", content: userPrompt}]
//   }).then(result => result.choices[0].message);
// }
//
// // Exists as a smoke test.
// describe("Actual OpenAI Completions", () => {
//   test("Should return a valid message", async () => {
//     const userPrompt = "Who was the 16th president of the United States?";
//     const result = await requestActualOpenAI(userPrompt);
//
//     console.log({
//       test: "hitting actual openai to ensure basic functionality",
//       modelResponse: result.content,
//       userPrompt
//     });
//
//     expect(result.annotations).toEqual([])
//     expect(result.content).toBeDefined();
//     expect(result.refusal).toEqual(null);
//     expect(result.role).toEqual("assistant");
//   })
// })

@@ -0,0 +1,43 @@
import OpenAI from "openai";
import {describe, test, expect} from "bun:test";

const supportedModels = ["gemma-3-1b-it"];

async function requestLocalOpenAI(model: string, userPrompt: string) {
  const openai = new OpenAI({
    baseURL: "http://localhost:8080/v1",
    apiKey: "not used",
  });
  try {
    return await openai.chat.completions.create({
      model: model,
      max_tokens: 100,
      stream: true,
      messages: [
        {name: "assistant_1", role: "system", content: "I am a helpful assistant" },
        {name: "user_1", role: "user", content: userPrompt}
      ]
    });
  } catch (e) {
    console.error(e);
    throw e;
  }
}

describe("Local OpenAI Completions", () => {
  test("Should return a valid message", async () => {
    const model = supportedModels.pop();
    const userPrompt = "Who was the 16th president of the United States?";
    const response = await requestLocalOpenAI(model, userPrompt);

    const chunks = [];
    for await (const chunk of response) {
      console.log('Received chunk:', chunk);
      chunks.push(chunk);
    }

    expect(chunks.length).toBeGreaterThan(0);
  })
})

integration/package.json (new file, +6)

@@ -0,0 +1,6 @@
{
  "name": "@predict-otron-9000/ingeration",
  "dependencies": {
    "openai": "^5.16.0"
  }
}