Introduce predict-otron-9000: Unified server combining embeddings and inference engines. Includes OpenAI-compatible APIs, full documentation, and example scripts.

2025-09-08 22:46:44 +00:00 · 2025-08-16 19:11:35 -04:00
commit 2aa6d4cdf8
28 changed files with 16595 additions and 0 deletions
--- a/crates/inference-engine/src/cli.rs
+++ b/crates/inference-engine/src/cli.rs
@@ -0,0 +1,72 @@
+use clap::Parser;
+use crate::model::Which;
+
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+pub struct Args {
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    pub cpu: bool,
+
+    /// Enable tracing (generates a trace-timestamp.json file).
+    #[arg(long)]
+    pub tracing: bool,
+
+    /// Run in server mode with OpenAI compatible API
+    #[arg(long)]
+    pub server: bool,
+
+    /// Port to use for the server
+    #[arg(long, default_value_t = 3777)]
+    pub port: u16,
+
+    /// Prompt for text generation (not used in server mode)
+    #[arg(long)]
+    pub prompt: Option<String>,
+
+    /// The temperature used to generate samples.
+    #[arg(long)]
+    pub temperature: Option<f64>,
+
+    /// Nucleus sampling probability cutoff.
+    #[arg(long)]
+    pub top_p: Option<f64>,
+
+    /// The seed to use when generating random samples.
+    #[arg(long, default_value_t = 299792458)]
+    pub seed: u64,
+
+    /// The length of the sample to generate (in tokens).
+    #[arg(long, short = 'n', default_value_t = 10000)]
+    pub sample_len: usize,
+
+    #[arg(long)]
+    pub model_id: Option<String>,
+
+    #[arg(long, default_value = "main")]
+    pub revision: String,
+
+    #[arg(long)]
+    pub tokenizer_file: Option<String>,
+
+    #[arg(long)]
+    pub config_file: Option<String>,
+
+    #[arg(long)]
+    pub weight_files: Option<String>,
+
+    /// Penalty to be applied for repeating tokens, 1. means no penalty.
+    #[arg(long, default_value_t = 1.1)]
+    pub repeat_penalty: f32,
+
+    /// The context size to consider for the repeat penalty.
+    #[arg(long, default_value_t = 64)]
+    pub repeat_last_n: usize,
+
+    /// The model to use.
+    #[arg(long, default_value = "3-1b-it")]
+    pub which: Which,
+
+    #[arg(long)]
+    pub use_flash_attn: bool,
+}