#!/usr/bin/env bun
import OpenAI from "openai";
import { parseArgs } from "util";

// =====================
// Config
// =====================
const DEFAULT_MODEL = "gemma-3-1b-it";
const DEFAULT_MAX_TOKENS = 256;

// Toggle this to reduce log overhead during timing runs
const PRINT_CHUNK_DEBUG = false;

// How many rows to show in the timing tables
const SHOW_FIRST_N = 3;
const SHOW_SLOWEST_N = 3;

// =====================
// Helpers
// =====================
const now = () => performance.now();

type ChunkStat = {
  index: number;
  tSinceRequestStartMs: number;
  dtSincePrevMs: number;
  contentChars: number;
};

function printHelp() {
  console.log(`
./cli [options] [prompt]

Simple CLI tool for testing the local OpenAI-compatible API server.

Options:
  --model         Model to use (default: gemma-3-1b-it)
  --prompt        The prompt to send (can also be provided as a positional argument)
  --list-models   List all available models from the server
  --help          Show this help message

Examples:
  ./cli "What is the capital of France?"
  ./cli --model gemma-3-1b-it --prompt "Hello, world!"
  ./cli --prompt "Who was the 16th president of the United States?"
  ./cli --list-models

The server must be running at http://localhost:8080
`);
}

const { values, positionals } = parseArgs({
  args: process.argv.slice(2),
  options: {
    model: { type: "string" },
    prompt: { type: "string" },
    help: { type: "boolean" },
    "list-models": { type: "boolean" },
  },
  strict: false,
  allowPositionals: true,
});

async function requestLocalOpenAI(model: string, userPrompt: string) {
  const openai = new OpenAI({
    baseURL: "http://localhost:8080/v1",
    apiKey: "not used",
  });
  try {
    console.log("[DEBUG] Creating chat completion request...");
    // Await here so connection errors are caught by this try/catch
    // instead of surfacing later without the hints below.
    return await openai.chat.completions.create({
      model,
      max_tokens: DEFAULT_MAX_TOKENS,
      stream: true,
      messages: [
        {
          role: "system",
          content: "You are a helpful assistant who responds thoughtfully and concisely.",
        },
        { role: "user", content: userPrompt },
      ],
    });
  } catch (e: any) {
    console.error("[ERROR] Failed to connect to local OpenAI server:", e.message);
    console.error("[HINT] Make sure the server is running at http://localhost:8080");
    console.error("[HINT] Start it with: ./run_server.sh");
    throw e;
  }
}

async function listModels() {
  const openai = new OpenAI({
    baseURL: "http://localhost:8080/v1",
    apiKey: "not used",
  });
  try {
    const models = await openai.models.list();
    console.log(`[INFO] Available models from http://localhost:8080/v1:`);
    console.log("---");
    if (models.data && models.data.length > 0) {
      models.data.forEach((model, index) => {
        console.log(`${index + 1}. ${model.id}`);
        console.log(`   Owner: ${model.owned_by}`);
        console.log(`   Created: ${new Date(model.created * 1000).toISOString()}`);
        console.log("");
      });
      console.log(`Total: ${models.data.length} models available`);
    } else {
      console.log("No models found.");
    }
  } catch (e: any) {
    console.error("[ERROR] Failed to fetch models from local OpenAI server:", e.message);
    console.error("[HINT] Make sure the server is running at http://localhost:8080");
    console.error("[HINT] Start it with: ./run_server.sh");
    throw e;
  }
}

// =====================
// Timing math
// =====================
function median(nums: number[]) {
  if (nums.length === 0) return 0;
  const s = [...nums].sort((a, b) => a - b);
  const mid = Math.floor(s.length / 2);
  return s.length % 2 ? s[mid] : (s[mid - 1] + s[mid]) / 2;
}
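
// Quantile via linear interpolation: pos = (n - 1) * q, then interpolate
// between the two nearest order statistics.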
function quantile(nums: number[], q: number) {
  if (nums.length === 0) return 0;
  const s = [...nums].sort((a, b) => a - b);
  const pos = (s.length - 1) * q;
  const base = Math.floor(pos);
  const rest = pos - base;
  return s[base + 1] !== undefined ? s[base] + rest * (s[base + 1] - s[base]) : s[base];
}

function ms(n: number) {
  return `${n.toFixed(1)} ms`;
}

// =====================
// Main
// =====================
async function main() {
  const tProgramStart = now();

  if (values.help) {
    printHelp();
    process.exit(0);
  }

  if (values["list-models"]) {
    try {
      await listModels();
      process.exit(0);
    } catch (error: any) {
      console.error("\n[ERROR] Failed to list models:", error.message);
      process.exit(1);
    }
  }

  const prompt = values.prompt ?? positionals[0];
  if (!prompt) {
    console.error("[ERROR] No prompt provided!");
    printHelp();
    process.exit(1);
  }

  const model = values.model || DEFAULT_MODEL;
  console.log(`[INFO] Using model: ${model}`);
  console.log(`[INFO] Prompt: ${prompt}`);
  console.log(`[INFO] Connecting to: http://localhost:8080/v1`);
  console.log("---");

  const tBeforeRequest = now();

  try {
    console.log("[DEBUG] Initiating request to OpenAI server...");
    const response = await requestLocalOpenAI(model, prompt);
    const tAfterCreate = now();

    // Streaming handling + timing
    let fullResponse = "";
    let chunkCount = 0;
    const chunkStats: ChunkStat[] = [];
    let tFirstChunk: number | null = null;
    let tPrevChunk: number | null = null;

    console.log("[INFO] Waiting for model to generate response...");

    let loadingInterval: ReturnType<typeof setInterval> | undefined;
    if (!PRINT_CHUNK_DEBUG) {
      // Show loading animation only if not in debug mode
      const loadingChars = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
      let i = 0;
      process.stdout.write('\r[INFO] Thinking ');
      loadingInterval = setInterval(() => {
        process.stdout.write(`\r[INFO] Thinking ${loadingChars[i++ % loadingChars.length]} `);
      }, 80);
    } else {
      console.log("[DEBUG] Starting to receive streaming response...");
    }

    for await (const chunk of response) {
      // Clear the loading animation once, on the first chunk
      if (loadingInterval) {
        clearInterval(loadingInterval);
        loadingInterval = undefined;
        process.stdout.write('\r                      \r');
      }

      const tNow = now();
      chunkCount++;

      // Extract content (delta) if present
      const content = chunk.choices?.[0]?.delta?.content ?? "";

      if (PRINT_CHUNK_DEBUG) {
        console.log(`[DEBUG] Received chunk #${chunkCount}:`, JSON.stringify(chunk));
        if (content) console.log(`[DEBUG] Chunk content: "${content}"`);
      }

      if (content) {
        process.stdout.write(content);
        fullResponse += content;
      }

      if (tFirstChunk === null) tFirstChunk = tNow;
      const dtSincePrev = tPrevChunk === null ? 0 : tNow - tPrevChunk;

      chunkStats.push({
        index: chunkCount,
        tSinceRequestStartMs: tNow - tBeforeRequest,
        dtSincePrevMs: dtSincePrev,
        contentChars: content.length,
      });

      tPrevChunk = tNow;
    }

    // =========
    // Summary
    // =========
    const tStreamEnd = now();
    const totalChars = fullResponse.length;

    console.log("\n---");
    console.log(`[DEBUG] Stream completed after ${chunkCount} chunks`);
    console.log(`[INFO] Response completed. Total length: ${totalChars} characters`);

    // Build timing metrics
    const ttfbMs = (tFirstChunk ?? tStreamEnd) - tAfterCreate; // time from create() resolved → first chunk
    const createOverheadMs = tAfterCreate - tBeforeRequest; // time spent awaiting the create() promise
    const totalSinceRequestMs = tStreamEnd - tBeforeRequest; // from just before create() to last chunk
    const streamDurationMs = tFirstChunk === null ? 0 : tStreamEnd - tFirstChunk;
    const gaps = chunkStats
      .map((c) => c.dtSincePrevMs)
      // ignore the first "gap", which is 0 by construction
      .slice(1);

    const avgGapMs = gaps.length ? gaps.reduce((a, b) => a + b, 0) / gaps.length : 0;
    const medGapMs = median(gaps);
    const p95GapMs = quantile(gaps, 0.95);

    let maxGapMs = 0;
    let maxGapAtChunk = 0;
    for (let i = 0; i < gaps.length; i++) {
      if (gaps[i] > maxGapMs) {
        maxGapMs = gaps[i];
        maxGapAtChunk = i + 2; // +1 to move from 0-based, +1 because we sliced starting at the second chunk
      }
    }

    // Pretty-print summary
    console.log("\n=== Timing Summary ===");
    console.log(`create() await time: ${ms(createOverheadMs)}`);
    console.log(`TTFB (to 1st chunk): ${ms(ttfbMs)}`);
    console.log(`Stream duration: ${ms(streamDurationMs)}`);
    console.log(`End-to-end (req→last): ${ms(totalSinceRequestMs)}`);
    console.log(`Chunks: ${chunkCount}`);
    console.log(`Total content chars: ${totalChars}`);
    console.log(`Avg chars/chunk: ${(chunkCount ? totalChars / chunkCount : 0).toFixed(1)}`);
    console.log(`Inter-chunk gap (avg): ${ms(avgGapMs)}`);
    console.log(`Inter-chunk gap (median): ${ms(medGapMs)}`);
    console.log(`Inter-chunk gap (p95): ${ms(p95GapMs)}`);
    if (gaps.length > 0) {
      console.log(`Largest gap: ${ms(maxGapMs)} (before chunk #${maxGapAtChunk})`);
    }

    // Small tables: first N chunks and the slowest N gaps
    const firstRows = chunkStats.slice(0, SHOW_FIRST_N).map((c) => ({
      chunk: c.index,
      "t since request": `${c.tSinceRequestStartMs.toFixed(1)} ms`,
      "dt since prev": `${c.dtSincePrevMs.toFixed(1)} ms`,
      chars: c.contentChars,
    }));

    const slowestRows = chunkStats
      .slice(1) // skip first (no meaningful gap)
      .sort((a, b) => b.dtSincePrevMs - a.dtSincePrevMs)
      .slice(0, SHOW_SLOWEST_N)
      .map((c) => ({
        chunk: c.index,
        "t since request": `${c.tSinceRequestStartMs.toFixed(1)} ms`,
        "dt since prev": `${c.dtSincePrevMs.toFixed(1)} ms`,
        chars: c.contentChars,
      }));

    if (firstRows.length > 0) {
      console.log("\n--- First chunk timings ---");
      // @ts-ignore Bun/Node support console.table
      console.table(firstRows);
    }
    if (slowestRows.length > 0) {
      console.log(`\n--- Slowest ${SHOW_SLOWEST_N} gaps ---`);
      // @ts-ignore
      console.table(slowestRows);
    }

    const tProgramEnd = now();
    console.log("\n=== Program Overhead ===");
    console.log(`Total program runtime: ${ms(tProgramEnd - tProgramStart)}`);
  } catch (error: any) {
    console.error("\n[ERROR] Request failed:", error.message);
    process.exit(1);
  }
}

// Run the main function
main().catch((error) => {
  console.error("[FATAL ERROR]:", error);
  process.exit(1);
});