chat client only displays available models

Author: geoffsee
Date: 2025-09-01 22:29:54 -04:00
parent 545e0c9831
commit 2deecb5e51
20 changed files with 3314 additions and 484 deletions

Cargo.lock (generated, 902 lines changed)

File diff suppressed because it is too large.

View File

@@ -8,7 +8,7 @@ members = [
"crates/gemma-runner",
"crates/cli",
"crates/chat-ui"
]
, "crates/utils"]
default-members = ["crates/predict-otron-9000"]
resolver = "2"

View File

@@ -17,6 +17,11 @@ Stability is currently best effort. Many models require unique configuration. Wh
A comprehensive multi-service AI platform built around local LLM inference, embeddings, and web interfaces.
~~~shell
./scripts/run.sh
~~~
## Project Overview
The predict-otron-9000 is a flexible AI platform that provides:

View File

@@ -10,15 +10,15 @@ edition = "2021"
candle-core = { git = "https://github.com/huggingface/candle.git" }
candle-nn = { git = "https://github.com/huggingface/candle.git" }
candle-transformers = { git = "https://github.com/huggingface/candle.git" }
candle-examples = { git = "https://github.com/huggingface/candle.git" }
hf-hub = "0.4"
tokenizers = "0.21"
tokenizers = "0.22.0"
anyhow = "1.0"
clap = { version = "4.0", features = ["derive", "string"] }
serde_json = "1.0"
tracing = "0.1"
tracing-chrome = "0.7"
tracing-subscriber = "0.3"
utils = {path = "../utils"}
[target.'cfg(target_os = "macos")'.dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }

View File

@@ -10,16 +10,17 @@ use candle_transformers::models::gemma3::{Config as Config3, Model as Model3};
use clap::ValueEnum;
// Removed gemma_cli import as it's not needed for the API
use candle_core::{utils, DType, Device, Tensor};
use candle_examples::token_output_stream::TokenOutputStream;
use candle_core::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::generation::LogitsProcessor;
use hf_hub::{api::sync::Api, Repo, RepoType};
use std::io::Write;
use tokenizers::Tokenizer;
use std::sync::mpsc::{self, Receiver, Sender};
use std::thread;
use tokenizers::Tokenizer;
use utils::hub_load_safetensors;
use utils::token_output_stream::TokenOutputStream;
#[derive(Clone, Debug, Copy, PartialEq, Eq, ValueEnum)]
pub enum WhichModel {
@@ -85,9 +86,9 @@ pub struct TextGeneration {
fn device(cpu: bool) -> Result<Device> {
if cpu {
Ok(Device::Cpu)
} else if utils::cuda_is_available() {
} else if candle_core::utils::cuda_is_available() {
Ok(Device::new_cuda(0)?)
} else if utils::metal_is_available() {
} else if candle_core::utils::metal_is_available() {
Ok(Device::new_metal(0)?)
} else {
Ok(Device::Cpu)
@@ -98,7 +99,7 @@ impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: Model,
tokenizer: Tokenizer,
tokenizer: tokenizers::Tokenizer,
seed: u64,
temp: Option<f64>,
top_p: Option<f64>,
@@ -262,10 +263,10 @@ pub fn run_gemma_api(cfg: GemmaInferenceConfig) -> Result<Receiver<Result<String
println!(
"avx: {}, neon: {}, simd128: {}, f16c: {}",
utils::with_avx(),
utils::with_neon(),
utils::with_simd128(),
utils::with_f16c()
candle_core::utils::with_avx(),
candle_core::utils::with_neon(),
candle_core::utils::with_simd128(),
candle_core::utils::with_f16c()
);
let device = device(cfg.cpu)?;
@@ -318,7 +319,7 @@ pub fn run_gemma_api(cfg: GemmaInferenceConfig) -> Result<Receiver<Result<String
let config_filename = repo.get("config.json")?;
let filenames = match cfg.model {
WhichModel::BaseV3_1B | WhichModel::InstructV3_1B => vec![repo.get("model.safetensors")?],
_ => candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?,
_ => hub_load_safetensors(&repo, "model.safetensors.index.json")?,
};
println!("Retrieved files in {:?}", start.elapsed());

View File

@@ -31,8 +31,8 @@ utoipa = { version = "4.2.0", features = ["axum_extras"] }
uuid = { version = "1.7.0", features = ["v4"] }
reborrow = "0.5.5"
futures-util = "0.3.31"
gemma-runner = { path = "../gemma-runner" }
llama-runner = { path = "../llama-runner" }
gemma-runner = { path = "../gemma-runner", features = ["metal"] }
llama-runner = { path = "../llama-runner", features = ["metal"]}
[target.'cfg(target_os = "macos")'.dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }

View File

@@ -1,49 +1,9 @@
// use candle_core::Tensor;
use candle_transformers::models::csm::{LlamaConfig, LlamaModel};
use candle_transformers::models::gemma::{Config as Config1, Model as Model1};
use candle_transformers::models::gemma2::{Config as Config2, Model as Model2};
use candle_transformers::models::gemma3::{Config as Config3, Model as Model3};
#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)]
pub enum Which {
#[value(name = "2b")]
Base2B,
#[value(name = "7b")]
Base7B,
#[value(name = "2b-it")]
Instruct2B,
#[value(name = "7b-it")]
Instruct7B,
#[value(name = "1.1-2b-it")]
InstructV1_1_2B,
#[value(name = "1.1-7b-it")]
InstructV1_1_7B,
#[value(name = "code-2b")]
CodeBase2B,
#[value(name = "code-7b")]
CodeBase7B,
#[value(name = "code-2b-it")]
CodeInstruct2B,
#[value(name = "code-7b-it")]
CodeInstruct7B,
#[value(name = "2-2b")]
BaseV2_2B,
#[value(name = "2-2b-it")]
InstructV2_2B,
#[value(name = "2-9b")]
BaseV2_9B,
#[value(name = "2-9b-it")]
InstructV2_9B,
#[value(name = "3-1b")]
BaseV3_1B,
#[value(name = "3-1b-it")]
InstructV3_1B,
#[value(name = "llama-3.2-1b-it")]
LlamaInstruct3_2_1B,
#[value(name = "llama-3.2-3b-it")]
LlamaInstruct3_2_3B,
}
#[derive(Clone, Debug)]
pub enum Model {
V1(Model1),
V2(Model2),
@@ -66,48 +26,127 @@ impl Model {
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Family {
GemmaV1,
GemmaV2,
GemmaV3,
Llama,
}
#[derive(Clone, Copy, Debug)]
pub struct ModelMeta {
pub id: &'static str,
pub family: Family,
pub instruct: bool,
}
const fn m(id: &'static str, family: Family, instruct: bool) -> ModelMeta {
ModelMeta { id, family, instruct }
}
#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)]
pub enum Which {
// Gemma 1.x
#[value(name = "2b")]
Base2B,
#[value(name = "7b")]
Base7B,
#[value(name = "2b-it")]
Instruct2B,
#[value(name = "7b-it")]
Instruct7B,
#[value(name = "1.1-2b-it")]
InstructV1_1_2B,
#[value(name = "1.1-7b-it")]
InstructV1_1_7B,
// CodeGemma
#[value(name = "code-2b")]
CodeBase2B,
#[value(name = "code-7b")]
CodeBase7B,
#[value(name = "code-2b-it")]
CodeInstruct2B,
#[value(name = "code-7b-it")]
CodeInstruct7B,
// Gemma 2
#[value(name = "2-2b")]
BaseV2_2B,
#[value(name = "2-2b-it")]
InstructV2_2B,
#[value(name = "2-9b")]
BaseV2_9B,
#[value(name = "2-9b-it")]
InstructV2_9B,
// Gemma 3
#[value(name = "3-1b")]
BaseV3_1B,
#[value(name = "3-1b-it")]
InstructV3_1B,
// Llama 3.2 (use aliases instead of duplicate variants)
#[value(name = "llama-3.2-1b")]
Llama32_1B,
#[value(name = "llama-3.2-1b-it", alias = "llama-3.2-1b-instruct")]
Llama32_1BInstruct,
#[value(name = "llama-3.2-3b")]
Llama32_3B,
#[value(name = "llama-3.2-3b-it", alias = "llama-3.2-3b-instruct")]
Llama32_3BInstruct,
}
impl Which {
pub fn to_model_id(&self) -> String {
pub const fn meta(&self) -> ModelMeta {
use Family::*;
match self {
Self::InstructV1_1_2B => "google/gemma-1.1-2b-it".to_string(),
Self::InstructV1_1_7B => "google/gemma-1.1-7b-it".to_string(),
Self::Base2B => "google/gemma-2b".to_string(),
Self::Base7B => "google/gemma-7b".to_string(),
Self::Instruct2B => "google/gemma-2b-it".to_string(),
Self::Instruct7B => "google/gemma-7b-it".to_string(),
Self::CodeBase2B => "google/codegemma-2b".to_string(),
Self::CodeBase7B => "google/codegemma-7b".to_string(),
Self::CodeInstruct2B => "google/codegemma-2b-it".to_string(),
Self::CodeInstruct7B => "google/codegemma-7b-it".to_string(),
Self::BaseV2_2B => "google/gemma-2-2b".to_string(),
Self::InstructV2_2B => "google/gemma-2-2b-it".to_string(),
Self::BaseV2_9B => "google/gemma-2-9b".to_string(),
Self::InstructV2_9B => "google/gemma-2-9b-it".to_string(),
Self::BaseV3_1B => "google/gemma-3-1b-pt".to_string(),
Self::InstructV3_1B => "google/gemma-3-1b-it".to_string(),
Self::LlamaInstruct3_2_1B => "meta-llama/Llama-3.2-1B-Instruct".to_string(),
Self::LlamaInstruct3_2_3B => "meta-llama/Llama-3.2-3B-Instruct".to_string(),
// Gemma 1.x
Self::Base2B => m("google/gemma-2b", GemmaV1, false),
Self::Base7B => m("google/gemma-7b", GemmaV1, false),
Self::Instruct2B => m("google/gemma-2b-it", GemmaV1, true),
Self::Instruct7B => m("google/gemma-7b-it", GemmaV1, true),
Self::InstructV1_1_2B => m("google/gemma-1.1-2b-it", GemmaV1, true),
Self::InstructV1_1_7B => m("google/gemma-1.1-7b-it", GemmaV1, true),
// CodeGemma
Self::CodeBase2B => m("google/codegemma-2b", GemmaV1, false),
Self::CodeBase7B => m("google/codegemma-7b", GemmaV1, false),
Self::CodeInstruct2B => m("google/codegemma-2b-it", GemmaV1, true),
Self::CodeInstruct7B => m("google/codegemma-7b-it", GemmaV1, true),
// Gemma 2
Self::BaseV2_2B => m("google/gemma-2-2b", GemmaV2, false),
Self::InstructV2_2B => m("google/gemma-2-2b-it", GemmaV2, true),
Self::BaseV2_9B => m("google/gemma-2-9b", GemmaV2, false),
Self::InstructV2_9B => m("google/gemma-2-9b-it", GemmaV2, true),
// Gemma 3
Self::BaseV3_1B => m("google/gemma-3-1b-pt", GemmaV3, false),
Self::InstructV3_1B => m("google/gemma-3-1b-it", GemmaV3, true),
// Llama 3.2
Self::Llama32_1B => m("meta-llama/Llama-3.2-1B", Llama, false),
Self::Llama32_1BInstruct => m("meta-llama/Llama-3.2-1B-Instruct", Llama, true),
Self::Llama32_3B => m("meta-llama/Llama-3.2-3B", Llama, false),
Self::Llama32_3BInstruct => m("meta-llama/Llama-3.2-3B-Instruct", Llama, true),
}
}
pub fn to_model_id(&self) -> String {
self.meta().id.to_string()
}
pub fn is_instruct_model(&self) -> bool {
match self {
Self::Base2B
| Self::Base7B
| Self::CodeBase2B
| Self::CodeBase7B
| Self::BaseV2_2B
| Self::BaseV2_9B
| Self::BaseV3_1B => false,
_ => true,
}
self.meta().instruct
}
pub fn is_v3_model(&self) -> bool {
matches!(self, Self::BaseV3_1B | Self::InstructV3_1B)
matches!(self.meta().family, Family::GemmaV3)
}
pub fn is_llama_model(&self) -> bool {
matches!(self, Self::LlamaInstruct3_2_1B | Self::LlamaInstruct3_2_3B)
matches!(self.meta().family, Family::Llama)
}
}
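As a quick illustration of the metadata-driven refactor above, the facts the old per-variant match arms hard-coded can now all be read off `meta()`. A small sketch, assuming `Which` and `Family` are in scope from this module:

~~~rust
// Hypothetical helper showing how callers consume the ModelMeta table.
fn describe(which: Which) {
    let meta = which.meta();
    println!(
        "hub repo: {}, family: {:?}, instruct-tuned: {}, llama: {}",
        meta.id,
        meta.family,
        which.is_instruct_model(),
        which.is_llama_model(),
    );
}

fn main() {
    describe(Which::InstructV3_1B);      // google/gemma-3-1b-it, GemmaV3, true, false
    describe(Which::Llama32_1BInstruct); // meta-llama/Llama-3.2-1B-Instruct, Llama, true, true
}
~~~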

View File

@@ -42,13 +42,18 @@ pub struct AppState {
impl Default for AppState {
fn default() -> Self {
// Configure a default model to prevent 503 errors from the chat-ui
// This can be overridden by environment variables if needed
let default_model_id = std::env::var("DEFAULT_MODEL").unwrap_or_else(|_| "gemma-3-1b-it".to_string());
let gemma_config = GemmaInferenceConfig {
model: gemma_runner::WhichModel::InstructV3_1B,
..Default::default()
};
Self {
model_type: ModelType::Gemma,
model_id: "gemma-3-1b-it".to_string(),
model_id: default_model_id,
gemma_config: Some(gemma_config),
llama_config: None,
}
@@ -59,6 +64,34 @@ impl Default for AppState {
// Helper functions
// -------------------------
fn model_id_to_which(model_id: &str) -> Option<Which> {
let normalized = normalize_model_id(model_id);
match normalized.as_str() {
"gemma-2b" => Some(Which::Base2B),
"gemma-7b" => Some(Which::Base7B),
"gemma-2b-it" => Some(Which::Instruct2B),
"gemma-7b-it" => Some(Which::Instruct7B),
"gemma-1.1-2b-it" => Some(Which::InstructV1_1_2B),
"gemma-1.1-7b-it" => Some(Which::InstructV1_1_7B),
"codegemma-2b" => Some(Which::CodeBase2B),
"codegemma-7b" => Some(Which::CodeBase7B),
"codegemma-2b-it" => Some(Which::CodeInstruct2B),
"codegemma-7b-it" => Some(Which::CodeInstruct7B),
"gemma-2-2b" => Some(Which::BaseV2_2B),
"gemma-2-2b-it" => Some(Which::InstructV2_2B),
"gemma-2-9b" => Some(Which::BaseV2_9B),
"gemma-2-9b-it" => Some(Which::InstructV2_9B),
"gemma-3-1b" => Some(Which::BaseV3_1B),
"gemma-3-1b-it" => Some(Which::InstructV3_1B),
"llama-3.2-1b-instruct" => Some(Which::Llama32_1BInstruct),
"llama-3.2-3b-instruct" => Some(Which::Llama32_3BInstruct),
_ => None,
}
}
fn normalize_model_id(model_id: &str) -> String {
model_id.to_lowercase().replace("_", "-")
}
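This lookup is what ties client-supplied model names to runner variants. A hedged test sketch of the intended behavior, assuming the helpers above are visible to a test module in the same file:

~~~rust
#[cfg(test)]
mod model_lookup_tests {
    use super::*;

    #[test]
    fn model_ids_normalize_and_resolve() {
        // Case and underscores are normalized before lookup.
        assert_eq!(normalize_model_id("Gemma_3_1B_IT"), "gemma-3-1b-it");
        // Known ids resolve to Which variants...
        assert_eq!(model_id_to_which("gemma-3-1b-it"), Some(Which::InstructV3_1B));
        assert_eq!(
            model_id_to_which("llama-3.2-1b-instruct"),
            Some(Which::Llama32_1BInstruct)
        );
        // ...and unknown ids are rejected, which is what surfaces as the
        // "model_not_supported" error in the handlers below.
        assert_eq!(model_id_to_which("gpt-4"), None);
    }
}
~~~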
@@ -116,90 +149,76 @@ pub async fn chat_completions_non_streaming_proxy(
state: AppState,
request: ChatCompletionRequest,
) -> Result<impl IntoResponse, (StatusCode, Json<Value>)> {
// Enforce model selection behavior: reject if a different model is requested
let configured_model = state.model_id.clone();
let requested_model = request.model.clone();
if requested_model.to_lowercase() != "default" {
let normalized_requested = normalize_model_id(&requested_model);
let normalized_configured = normalize_model_id(&configured_model);
if normalized_requested != normalized_configured {
// Use the model specified in the request
let model_id = request.model.clone();
let which_model = model_id_to_which(&model_id);
// Validate that the requested model is supported
let which_model = match which_model {
Some(model) => model,
None => {
return Err((
StatusCode::BAD_REQUEST,
Json(serde_json::json!({
"error": {
"message": format!(
"Requested model '{}' is not available. This server is running '{}' only.",
requested_model, configured_model
),
"type": "model_mismatch"
"message": format!("Unsupported model: {}", model_id),
"type": "model_not_supported"
}
})),
));
}
}
let model_id = state.model_id.clone();
};
let max_tokens = request.max_tokens.unwrap_or(1000);
// Build prompt based on model type
let prompt = match state.model_type {
ModelType::Gemma => build_gemma_prompt(&request.messages),
ModelType::Llama => {
// For Llama, just use the last user message for now
request
.messages
.last()
.and_then(|m| m.content.as_ref())
.and_then(|c| match c {
MessageContent(Either::Left(text)) => Some(text.clone()),
_ => None,
})
.unwrap_or_default()
}
let prompt = if which_model.is_llama_model() {
// For Llama, just use the last user message for now
request
.messages
.last()
.and_then(|m| m.content.as_ref())
.and_then(|c| match c {
MessageContent(Either::Left(text)) => Some(text.clone()),
_ => None,
})
.unwrap_or_default()
} else {
build_gemma_prompt(&request.messages)
};
// Get streaming receiver based on model type
let rx =
match state.model_type {
ModelType::Gemma => {
if let Some(mut config) = state.gemma_config {
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
run_gemma_api(config).map_err(|e| (
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": format!("Error initializing Gemma model: {}", e) }
}))
))?
} else {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": "Gemma configuration not available" }
})),
));
}
}
ModelType::Llama => {
if let Some(mut config) = state.llama_config {
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
run_llama_inference(config).map_err(|e| (
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": format!("Error initializing Llama model: {}", e) }
}))
))?
} else {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": "Llama configuration not available" }
})),
));
}
}
let rx = if which_model.is_llama_model() {
// Create Llama configuration dynamically
let mut config = LlamaInferenceConfig::default();
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
run_llama_inference(config).map_err(|e| (
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": format!("Error initializing Llama model: {}", e) }
}))
))?
} else {
// Create Gemma configuration dynamically
let gemma_model = if which_model.is_v3_model() {
gemma_runner::WhichModel::InstructV3_1B
} else {
gemma_runner::WhichModel::InstructV3_1B // Default fallback
};
let mut config = GemmaInferenceConfig {
model: gemma_model,
..Default::default()
};
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
run_gemma_api(config).map_err(|e| (
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": format!("Error initializing Gemma model: {}", e) }
}))
))?
};
// Collect all tokens from the stream
let mut completion = String::new();
@@ -258,27 +277,25 @@ async fn handle_streaming_request(
state: AppState,
request: ChatCompletionRequest,
) -> Result<Sse<impl Stream<Item = Result<Event, Infallible>>>, (StatusCode, Json<Value>)> {
// Validate requested model vs configured model
let configured_model = state.model_id.clone();
let requested_model = request.model.clone();
if requested_model.to_lowercase() != "default" {
let normalized_requested = normalize_model_id(&requested_model);
let normalized_configured = normalize_model_id(&configured_model);
if normalized_requested != normalized_configured {
// Use the model specified in the request
let model_id = request.model.clone();
let which_model = model_id_to_which(&model_id);
// Validate that the requested model is supported
let which_model = match which_model {
Some(model) => model,
None => {
return Err((
StatusCode::BAD_REQUEST,
Json(serde_json::json!({
"error": {
"message": format!(
"Requested model '{}' is not available. This server is running '{}' only.",
requested_model, configured_model
),
"type": "model_mismatch"
"message": format!("Unsupported model: {}", model_id),
"type": "model_not_supported"
}
})),
));
}
}
};
// Generate a unique ID and metadata
let response_id = format!("chatcmpl-{}", Uuid::new_v4().to_string().replace('-', ""));
@@ -286,24 +303,22 @@ async fn handle_streaming_request(
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
let model_id = state.model_id.clone();
let max_tokens = request.max_tokens.unwrap_or(1000);
// Build prompt based on model type
let prompt = match state.model_type {
ModelType::Gemma => build_gemma_prompt(&request.messages),
ModelType::Llama => {
// For Llama, just use the last user message for now
request
.messages
.last()
.and_then(|m| m.content.as_ref())
.and_then(|c| match c {
MessageContent(Either::Left(text)) => Some(text.clone()),
_ => None,
})
.unwrap_or_default()
}
let prompt = if which_model.is_llama_model() {
// For Llama, just use the last user message for now
request
.messages
.last()
.and_then(|m| m.content.as_ref())
.and_then(|c| match c {
MessageContent(Either::Left(text)) => Some(text.clone()),
_ => None,
})
.unwrap_or_default()
} else {
build_gemma_prompt(&request.messages)
};
tracing::debug!("Formatted prompt: {}", prompt);
@@ -330,51 +345,43 @@ async fn handle_streaming_request(
}
// Get streaming receiver based on model type
let model_rx = match state.model_type {
ModelType::Gemma => {
if let Some(mut config) = state.gemma_config {
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
match run_gemma_api(config) {
Ok(rx) => rx,
Err(e) => {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": format!("Error initializing Gemma model: {}", e) }
})),
));
}
}
} else {
let model_rx = if which_model.is_llama_model() {
// Create Llama configuration dynamically
let mut config = LlamaInferenceConfig::default();
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
match run_llama_inference(config) {
Ok(rx) => rx,
Err(e) => {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": "Gemma configuration not available" }
"error": { "message": format!("Error initializing Llama model: {}", e) }
})),
));
}
}
ModelType::Llama => {
if let Some(mut config) = state.llama_config {
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
match run_llama_inference(config) {
Ok(rx) => rx,
Err(e) => {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": format!("Error initializing Llama model: {}", e) }
})),
));
}
}
} else {
} else {
// Create Gemma configuration dynamically
let gemma_model = if which_model.is_v3_model() {
gemma_runner::WhichModel::InstructV3_1B
} else {
gemma_runner::WhichModel::InstructV3_1B // Default fallback
};
let mut config = GemmaInferenceConfig {
model: gemma_model,
..Default::default()
};
config.prompt = prompt.clone();
config.max_tokens = max_tokens;
match run_gemma_api(config) {
Ok(rx) => rx,
Err(e) => {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({
"error": { "message": "Llama configuration not available" }
"error": { "message": format!("Error initializing Gemma model: {}", e) }
})),
));
}
@@ -500,172 +507,69 @@ pub fn create_router(app_state: AppState) -> Router {
/// Handler for GET /v1/models - returns list of available models
pub async fn list_models() -> Json<ModelListResponse> {
// Get all available model variants from the Which enum
let models = vec![
// Gemma models
let which_variants = vec![
Which::Base2B,
Which::Base7B,
Which::Instruct2B,
Which::Instruct7B,
Which::InstructV1_1_2B,
Which::InstructV1_1_7B,
Which::CodeBase2B,
Which::CodeBase7B,
Which::CodeInstruct2B,
Which::CodeInstruct7B,
Which::BaseV2_2B,
Which::InstructV2_2B,
Which::BaseV2_9B,
Which::InstructV2_9B,
Which::BaseV3_1B,
Which::InstructV3_1B,
Which::Llama32_1B,
Which::Llama32_1BInstruct,
Which::Llama32_3B,
Which::Llama32_3BInstruct,
];
let models: Vec<Model> = which_variants.into_iter().map(|which| {
let meta = which.meta();
let model_id = match which {
Which::Base2B => "gemma-2b",
Which::Base7B => "gemma-7b",
Which::Instruct2B => "gemma-2b-it",
Which::Instruct7B => "gemma-7b-it",
Which::InstructV1_1_2B => "gemma-1.1-2b-it",
Which::InstructV1_1_7B => "gemma-1.1-7b-it",
Which::CodeBase2B => "codegemma-2b",
Which::CodeBase7B => "codegemma-7b",
Which::CodeInstruct2B => "codegemma-2b-it",
Which::CodeInstruct7B => "codegemma-7b-it",
Which::BaseV2_2B => "gemma-2-2b",
Which::InstructV2_2B => "gemma-2-2b-it",
Which::BaseV2_9B => "gemma-2-9b",
Which::InstructV2_9B => "gemma-2-9b-it",
Which::BaseV3_1B => "gemma-3-1b",
Which::InstructV3_1B => "gemma-3-1b-it",
Which::Llama32_1B => "llama-3.2-1b",
Which::Llama32_1BInstruct => "llama-3.2-1b-instruct",
Which::Llama32_3B => "llama-3.2-3b",
Which::Llama32_3BInstruct => "llama-3.2-3b-instruct",
};
let owned_by = if meta.id.starts_with("google/") {
"google"
} else if meta.id.starts_with("meta-llama/") {
"meta"
} else {
"unknown"
};
Model {
id: "gemma-2b".to_string(),
id: model_id.to_string(),
object: "model".to_string(),
created: 1686935002, // Using same timestamp as OpenAI example
owned_by: "google".to_string(),
},
Model {
id: "gemma-7b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-2b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-7b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-1.1-2b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-1.1-7b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "codegemma-2b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "codegemma-7b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "codegemma-2b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "codegemma-7b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-2-2b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-2-2b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-2-9b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-2-9b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-3-1b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
Model {
id: "gemma-3-1b-it".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "google".to_string(),
},
// Llama models
Model {
id: "llama-3.2-1b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "meta".to_string(),
},
Model {
id: "llama-3.2-1b-instruct".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "meta".to_string(),
},
Model {
id: "llama-3.2-3b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "meta".to_string(),
},
Model {
id: "llama-3.2-3b-instruct".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "meta".to_string(),
},
Model {
id: "smollm2-135m".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "huggingface".to_string(),
},
Model {
id: "smollm2-135m-instruct".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "huggingface".to_string(),
},
Model {
id: "smollm2-360m".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "huggingface".to_string(),
},
Model {
id: "smollm2-360m-instruct".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "huggingface".to_string(),
},
Model {
id: "smollm2-1.7b".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "huggingface".to_string(),
},
Model {
id: "smollm2-1.7b-instruct".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "huggingface".to_string(),
},
Model {
id: "tinyllama-1.1b-chat".to_string(),
object: "model".to_string(),
created: 1686935002,
owned_by: "tinyllama".to_string(),
},
];
owned_by: owned_by.to_string(),
}
}).collect();
Json(ModelListResponse {
object: "list".to_string(),

View File

@@ -5,8 +5,8 @@ edition = "2021"
[dependencies]
candle-core = { git = "https://github.com/huggingface/candle.git" }
candle-nn = { git = "https://github.com/huggingface/candle.git" }
candle-transformers = { git = "https://github.com/huggingface/candle.git" }
candle-nn = { git = "https://github.com/huggingface/candle.git" }
candle-transformers = { git = "https://github.com/huggingface/candle.git"}
hf-hub = "0.3"
tokenizers = "0.20"
anyhow = "1.0"

View File

@@ -82,7 +82,7 @@ impl Default for LlamaInferenceConfig {
// Performance flags
no_kv_cache: false, // keep cache ON for speed
use_flash_attn: true, // great speed boost if supported
use_flash_attn: false, // disabled by default here; enabling it can be a large speed boost where supported
// Precision: bf16 is a good default on Ampere+; fallback to fp16 if needed.
dtype: Some("bf16".to_string()),
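A short sketch of exercising this runner directly while overriding the defaults this hunk changes. The field names and the channel-of-results return shape mirror the handler code earlier in this diff; the import paths, the prompt, and direct access to `use_flash_attn` are assumptions:

~~~rust
use anyhow::Result;
use llama_runner::{run_llama_inference, LlamaInferenceConfig};

fn main() -> Result<()> {
    let mut cfg = LlamaInferenceConfig::default();
    cfg.prompt = "Write a haiku about borrow checking.".to_string();
    cfg.max_tokens = 48;
    // Opt back into flash attention on hardware that supports it.
    cfg.use_flash_attn = true;

    for piece in run_llama_inference(cfg)? {
        print!("{}", piece?);
    }
    Ok(())
}
~~~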

View File

@@ -6,7 +6,8 @@ pub fn create_standalone_router(server_config: ServerConfig) -> Router {
// Create unified router by merging embeddings and inference routers (existing behavior)
let embeddings_router = embeddings_engine::create_embeddings_router();
// Create AppState with correct model configuration
// Create AppState; AppState::default() falls back to gemma-3-1b-it unless the
// DEFAULT_MODEL environment variable overrides it
let app_state = AppState::default();
// Get the inference router directly from the inference engine

crates/utils/Cargo.toml (new file, 88 lines)
View File

@@ -0,0 +1,88 @@
[package]
name = "utils"
[lib]
path = "src/lib.rs"
[dependencies]
accelerate-src = {version = "0.3.2", optional = true }
candle-nn = {version = "0.9.1" }
candle-transformers = {version = "0.9.1" }
candle-flash-attn = {version = "0.9.1", optional = true }
candle-onnx = {version = "0.9.1", optional = true }
candle-core="0.9.1"
csv = "1.3.0"
anyhow = "1.0.99"
cudarc = {version = "0.17.3", optional = true }
half = {version = "2.6.0", optional = true }
hf-hub = {version = "0.4.3", features = ["tokio"] }
image = {version = "0.25.6" }
intel-mkl-src = {version = "0.8.1", optional = true }
num-traits = {version = "0.2.19" }
palette = { version = "0.7.6", optional = true }
enterpolation = { version = "0.2.1", optional = true }
pyo3 = { version = "0.22.0", features = [
"auto-initialize",
"abi3-py311",
], optional = true }
rayon = {version = "1.11.0" }
rubato = { version = "0.15.0", optional = true }
safetensors = {version = "0.6.2" }
serde = {version = "1.0.219" }
serde_json = {version = "1.0.143" }
symphonia = { version = "0.5.3", features = ["all"], optional = true }
tokenizers = {version = "0.22.0", features = ["onig"] }
cpal = { version = "0.15.2", optional = true }
pdf2image = { version = "0.1.2", optional = true }
tekken-rs = { version = "0.1.1", optional = true }
[dev-dependencies]
anyhow = {version = "1.0.99" }
byteorder = {version = "1.5.0" }
clap = {version = "4.5.46" }
imageproc = {version = "0.25.0" }
memmap2 = {version = "0.9.8" }
rand = {version = "0.9.2" }
ab_glyph = {version = "0.2.31" }
tracing = {version = "0.1.41" }
tracing-chrome = {version = "0.7.2" }
tracing-subscriber = {version = "0.3.20" }
# Necessary to disambiguate with tokio in wasm examples which are 1.28.1
tokio = "1.43.0"
[build-dependencies]
anyhow = {version = "1.0.99" }
bindgen_cuda = { version = "0.1.1", optional = true }
#
[features]
default = []
accelerate = [
"dep:accelerate-src",
"candle-core/accelerate",
"candle-nn/accelerate",
"candle-transformers/accelerate",
]
cuda = [
"candle-core/cuda",
"candle-nn/cuda",
"candle-transformers/cuda",
"dep:bindgen_cuda",
]
cudnn = ["candle-core/cudnn", "candle-nn/cudnn", "candle-transformers/cudnn"]
flash-attn = ["cuda", "candle-transformers/flash-attn", "dep:candle-flash-attn"]
mkl = [
"dep:intel-mkl-src",
"candle-core/mkl",
"candle-nn/mkl",
"candle-transformers/mkl",
]
nccl = ["cuda", "cudarc/nccl", "dep:half"]
onnx = ["candle-onnx"]
metal = ["candle-core/metal", "candle-nn/metal"]
microphone = ["cpal", "rubato"]
encodec = ["cpal", "symphonia", "rubato"]
mimi = ["cpal", "symphonia", "rubato"]
snac = ["cpal", "symphonia", "rubato"]
depth_anything_v2 = ["palette", "enterpolation"]
tekken = ["tekken-rs"]

crates/utils/src/audio.rs (new file, 138 lines)
View File

@@ -0,0 +1,138 @@
use candle_core::{Result, Tensor};
// https://github.com/facebookresearch/audiocraft/blob/69fea8b290ad1b4b40d28f92d1dfc0ab01dbab85/audiocraft/data/audio_utils.py#L57
pub fn normalize_loudness(
wav: &Tensor,
sample_rate: u32,
loudness_compressor: bool,
) -> Result<Tensor> {
let energy = wav.sqr()?.mean_all()?.sqrt()?.to_vec0::<f32>()?;
if energy < 2e-3 {
return Ok(wav.clone());
}
let wav_array = wav.to_vec1::<f32>()?;
let mut meter = crate::bs1770::ChannelLoudnessMeter::new(sample_rate);
meter.push(wav_array.into_iter());
let power = meter.as_100ms_windows();
let loudness = match crate::bs1770::gated_mean(power) {
None => return Ok(wav.clone()),
Some(gp) => gp.loudness_lkfs() as f64,
};
let delta_loudness = -14. - loudness;
let gain = 10f64.powf(delta_loudness / 20.);
let wav = (wav * gain)?;
if loudness_compressor {
wav.tanh()
} else {
Ok(wav)
}
}
#[cfg(feature = "symphonia")]
pub fn pcm_decode<P: AsRef<std::path::Path>>(path: P) -> Result<(Vec<f32>, u32)> {
use symphonia::core::audio::{AudioBufferRef, Signal};
use symphonia::core::codecs::{DecoderOptions, CODEC_TYPE_NULL};
use symphonia::core::conv::FromSample;
fn conv<T>(
samples: &mut Vec<f32>,
data: std::borrow::Cow<symphonia::core::audio::AudioBuffer<T>>,
) where
T: symphonia::core::sample::Sample,
f32: symphonia::core::conv::FromSample<T>,
{
samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v)))
}
// Open the media source.
let src = std::fs::File::open(path).map_err(candle_core::Error::wrap)?;
// Create the media source stream.
let mss = symphonia::core::io::MediaSourceStream::new(Box::new(src), Default::default());
// Create a probe hint using the file's extension. [Optional]
let hint = symphonia::core::probe::Hint::new();
// Use the default options for metadata and format readers.
let meta_opts: symphonia::core::meta::MetadataOptions = Default::default();
let fmt_opts: symphonia::core::formats::FormatOptions = Default::default();
// Probe the media source.
let probed = symphonia::default::get_probe()
.format(&hint, mss, &fmt_opts, &meta_opts)
.map_err(candle_core::Error::wrap)?;
// Get the instantiated format reader.
let mut format = probed.format;
// Find the first audio track with a known (decodeable) codec.
let track = format
.tracks()
.iter()
.find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
.ok_or_else(|| candle_core::Error::Msg("no supported audio tracks".to_string()))?;
// Use the default options for the decoder.
let dec_opts: DecoderOptions = Default::default();
// Create a decoder for the track.
let mut decoder = symphonia::default::get_codecs()
.make(&track.codec_params, &dec_opts)
.map_err(|_| candle_core::Error::Msg("unsupported codec".to_string()))?;
let track_id = track.id;
let sample_rate = track.codec_params.sample_rate.unwrap_or(0);
let mut pcm_data = Vec::new();
// The decode loop.
while let Ok(packet) = format.next_packet() {
// Consume any new metadata that has been read since the last packet.
while !format.metadata().is_latest() {
format.metadata().pop();
}
// If the packet does not belong to the selected track, skip over it.
if packet.track_id() != track_id {
continue;
}
match decoder.decode(&packet).map_err(candle_core::Error::wrap)? {
AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)),
AudioBufferRef::U8(data) => conv(&mut pcm_data, data),
AudioBufferRef::U16(data) => conv(&mut pcm_data, data),
AudioBufferRef::U24(data) => conv(&mut pcm_data, data),
AudioBufferRef::U32(data) => conv(&mut pcm_data, data),
AudioBufferRef::S8(data) => conv(&mut pcm_data, data),
AudioBufferRef::S16(data) => conv(&mut pcm_data, data),
AudioBufferRef::S24(data) => conv(&mut pcm_data, data),
AudioBufferRef::S32(data) => conv(&mut pcm_data, data),
AudioBufferRef::F64(data) => conv(&mut pcm_data, data),
}
}
Ok((pcm_data, sample_rate))
}
#[cfg(feature = "rubato")]
pub fn resample(pcm_in: &[f32], sr_in: u32, sr_out: u32) -> Result<Vec<f32>> {
use rubato::Resampler;
let mut pcm_out =
Vec::with_capacity((pcm_in.len() as f64 * sr_out as f64 / sr_in as f64) as usize + 1024);
let mut resampler = rubato::FftFixedInOut::<f32>::new(sr_in as usize, sr_out as usize, 1024, 1)
.map_err(candle_core::Error::wrap)?;
let mut output_buffer = resampler.output_buffer_allocate(true);
let mut pos_in = 0;
while pos_in + resampler.input_frames_next() < pcm_in.len() {
let (in_len, out_len) = resampler
.process_into_buffer(&[&pcm_in[pos_in..]], &mut output_buffer, None)
.map_err(candle_core::Error::wrap)?;
pos_in += in_len;
pcm_out.extend_from_slice(&output_buffer[0][..out_len]);
}
if pos_in < pcm_in.len() {
let (_in_len, out_len) = resampler
.process_partial_into_buffer(Some(&[&pcm_in[pos_in..]]), &mut output_buffer, None)
.map_err(candle_core::Error::wrap)?;
pcm_out.extend_from_slice(&output_buffer[0][..out_len]);
}
Ok(pcm_out)
}
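A minimal sketch of the loudness-normalization path above; the 1-D tensor layout and the mono 44.1 kHz input are assumptions for the example, and `utils` is the new crate added by this commit:

~~~rust
use candle_core::{Device, Result, Tensor};
use utils::audio::normalize_loudness;

fn main() -> Result<()> {
    // One second of quiet 440 Hz mono audio as f32 samples in [-1.0, 1.0].
    let sr = 44_100u32;
    let samples: Vec<f32> = (0..sr)
        .map(|i| 0.1 * (2.0 * std::f32::consts::PI * 440.0 * i as f32 / sr as f32).sin())
        .collect();
    let wav = Tensor::from_vec(samples, sr as usize, &Device::Cpu)?;

    // Gain the signal toward roughly -14 LUFS (see delta_loudness above), compressor off.
    let normalized = normalize_loudness(&wav, sr, false)?;
    println!("normalized {} samples", normalized.dims1()?);
    Ok(())
}
~~~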

crates/utils/src/bs1770.rs (new file, 506 lines)
View File

@@ -0,0 +1,506 @@
// Copied from https://github.com/ruuda/bs1770/blob/master/src/lib.rs
// BS1770 -- Loudness analysis library conforming to ITU-R BS.1770
// Copyright 2020 Ruud van Asseldonk
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// A copy of the License has been included in the root of the repository.
//! Loudness analysis conforming to [ITU-R BS.1770-4][bs17704].
//!
//! This library offers the building blocks to perform BS.1770 loudness
//! measurements, but you need to put the pieces together yourself.
//!
//! [bs17704]: https://www.itu.int/rec/R-REC-BS.1770-4-201510-I/en
//!
//! # Stereo integrated loudness example
//!
//! ```ignore
//! # fn load_stereo_audio() -> [Vec<i16>; 2] {
//! # [vec![0; 48_000], vec![0; 48_000]]
//! # }
//! #
//! let sample_rate_hz = 44_100;
//! let bits_per_sample = 16;
//! let channel_samples: [Vec<i16>; 2] = load_stereo_audio();
//!
//! // When converting integer samples to float, note that the maximum amplitude
//! // is `1 << (bits_per_sample - 1)`, one bit is the sign bit.
//! let normalizer = 1.0 / (1_u64 << (bits_per_sample - 1)) as f32;
//!
//! let channel_power: Vec<_> = channel_samples.iter().map(|samples| {
//! let mut meter = bs1770::ChannelLoudnessMeter::new(sample_rate_hz);
//! meter.push(samples.iter().map(|&s| s as f32 * normalizer));
//! meter.into_100ms_windows()
//! }).collect();
//!
//! let stereo_power = bs1770::reduce_stereo(
//! channel_power[0].as_ref(),
//! channel_power[1].as_ref(),
//! );
//!
//! let gated_power = bs1770::gated_mean(
//! stereo_power.as_ref()
//! ).unwrap_or(bs1770::Power(0.0));
//! println!("Integrated loudness: {:.1} LUFS", gated_power.loudness_lkfs());
//! ```
use std::f32;
/// Coefficients for a 2nd-degree infinite impulse response filter.
///
/// Coefficient a0 is implicitly 1.0.
#[derive(Clone)]
struct Filter {
a1: f32,
a2: f32,
b0: f32,
b1: f32,
b2: f32,
// The past two input and output samples.
x1: f32,
x2: f32,
y1: f32,
y2: f32,
}
impl Filter {
/// Stage 1 of the BS.1770-4 pre-filter.
pub fn high_shelf(sample_rate_hz: f32) -> Filter {
// Coefficients taken from https://github.com/csteinmetz1/pyloudnorm/blob/
// 6baa64d59b7794bc812e124438692e7fd2e65c0c/pyloudnorm/meter.py#L135-L136.
let gain_db = 3.999_843_8;
let q = 0.707_175_25;
let center_hz = 1_681.974_5;
// Formula taken from https://github.com/csteinmetz1/pyloudnorm/blob/
// 6baa64d59b7794bc812e124438692e7fd2e65c0c/pyloudnorm/iirfilter.py#L134-L143.
let k = (f32::consts::PI * center_hz / sample_rate_hz).tan();
let vh = 10.0_f32.powf(gain_db / 20.0);
let vb = vh.powf(0.499_666_78);
let a0 = 1.0 + k / q + k * k;
Filter {
b0: (vh + vb * k / q + k * k) / a0,
b1: 2.0 * (k * k - vh) / a0,
b2: (vh - vb * k / q + k * k) / a0,
a1: 2.0 * (k * k - 1.0) / a0,
a2: (1.0 - k / q + k * k) / a0,
x1: 0.0,
x2: 0.0,
y1: 0.0,
y2: 0.0,
}
}
/// Stage 2 of the BS.1770-4 pre-filter.
pub fn high_pass(sample_rate_hz: f32) -> Filter {
// Coefficients taken from https://github.com/csteinmetz1/pyloudnorm/blob/
// 6baa64d59b7794bc812e124438692e7fd2e65c0c/pyloudnorm/meter.py#L135-L136.
let q = 0.500_327_05;
let center_hz = 38.135_47;
// Formula taken from https://github.com/csteinmetz1/pyloudnorm/blob/
// 6baa64d59b7794bc812e124438692e7fd2e65c0c/pyloudnorm/iirfilter.py#L145-L151
let k = (f32::consts::PI * center_hz / sample_rate_hz).tan();
Filter {
a1: 2.0 * (k * k - 1.0) / (1.0 + k / q + k * k),
a2: (1.0 - k / q + k * k) / (1.0 + k / q + k * k),
b0: 1.0,
b1: -2.0,
b2: 1.0,
x1: 0.0,
x2: 0.0,
y1: 0.0,
y2: 0.0,
}
}
/// Feed the next input sample, get the next output sample.
#[inline(always)]
pub fn apply(&mut self, x0: f32) -> f32 {
let y0 = 0.0 + self.b0 * x0 + self.b1 * self.x1 + self.b2 * self.x2
- self.a1 * self.y1
- self.a2 * self.y2;
self.x2 = self.x1;
self.x1 = x0;
self.y2 = self.y1;
self.y1 = y0;
y0
}
}
/// Compensated sum, for summing many values of different orders of magnitude
/// accurately.
#[derive(Copy, Clone, PartialEq)]
struct Sum {
sum: f32,
residue: f32,
}
impl Sum {
#[inline(always)]
fn zero() -> Sum {
Sum {
sum: 0.0,
residue: 0.0,
}
}
#[inline(always)]
fn add(&mut self, x: f32) {
let sum = self.sum + (self.residue + x);
self.residue = (self.residue + x) - (sum - self.sum);
self.sum = sum;
}
}
/// The mean of the squares of the K-weighted samples in a window of time.
///
/// K-weighted power is equivalent to K-weighted loudness, the only difference
/// is one of scale: power is quadratic in sample amplitudes, whereas loudness
/// units are logarithmic. `loudness_lkfs` and `from_lkfs` convert between power,
/// and K-weighted Loudness Units relative to nominal Full Scale (LKFS).
///
/// The term “LKFS” (Loudness Units, K-Weighted, relative to nominal Full Scale)
/// is used in BS.1770-4 to emphasize K-weighting, but the term is otherwise
/// interchangeable with the more widespread term “LUFS” (Loudness Units,
/// relative to Full Scale). Loudness units are related to decibels in the
/// following sense: boosting a signal that has a loudness of
/// -<var>L<sub>K</sub></var> LUFS by <var>L<sub>K</sub></var> dB (by
/// multiplying the amplitude by 10<sup><var>L<sub>K</sub></var>/20</sup>) will
/// bring the loudness to 0 LUFS.
///
/// K-weighting refers to a high-shelf and high-pass filter that model the
/// effect that humans perceive a certain amount of power in low frequencies to
/// be less loud than the same amount of power in higher frequencies. In this
/// library the `Power` type is used exclusively to refer to power after applying K-weighting.
///
/// The nominal “full scale” is the range [-1.0, 1.0]. Because the power is the
/// mean square of the samples, if no input samples exceeded the full scale, the
/// power will be in the range [0.0, 1.0]. However, the power delivered by
/// multiple channels, which is a weighted sum over individual channel powers,
/// can exceed this range, because the weighted sum is not normalized.
#[derive(Copy, Clone, PartialEq, PartialOrd)]
pub struct Power(pub f32);
impl Power {
/// Convert Loudness Units relative to Full Scale into a squared sample amplitude.
///
/// This is the inverse of `loudness_lkfs`.
pub fn from_lkfs(lkfs: f32) -> Power {
// The inverse of the formula below.
Power(10.0_f32.powf((lkfs + 0.691) * 0.1))
}
/// Return the loudness of this window in Loudness Units, K-weighted, relative to Full Scale.
///
/// This is the inverse of `from_lkfs`.
pub fn loudness_lkfs(&self) -> f32 {
// Equation 2 (p.5) of BS.1770-4.
-0.691 + 10.0 * self.0.log10()
}
}
/// A `T` value for non-overlapping windows of audio, 100ms in length.
///
/// The `ChannelLoudnessMeter` applies K-weighting and then produces the power
/// for non-overlapping windows of 100ms duration.
///
/// These non-overlapping 100ms windows can later be combined into overlapping
/// windows of 400ms, spaced 100ms apart, to compute instantaneous loudness or
/// to perform a gated measurement, or they can be combined into even larger
/// windows for a momentary loudness measurement.
#[derive(Copy, Clone, Debug)]
pub struct Windows100ms<T> {
pub inner: T,
}
impl<T> Windows100ms<T> {
/// Wrap a new empty vector.
pub fn new() -> Windows100ms<Vec<T>> {
Windows100ms { inner: Vec::new() }
}
/// Apply `as_ref` to the inner value.
pub fn as_ref(&self) -> Windows100ms<&[Power]>
where
T: AsRef<[Power]>,
{
Windows100ms {
inner: self.inner.as_ref(),
}
}
/// Apply `as_mut` to the inner value.
pub fn as_mut(&mut self) -> Windows100ms<&mut [Power]>
where
T: AsMut<[Power]>,
{
Windows100ms {
inner: self.inner.as_mut(),
}
}
#[allow(clippy::len_without_is_empty)]
/// Apply `len` to the inner value.
pub fn len(&self) -> usize
where
T: AsRef<[Power]>,
{
self.inner.as_ref().len()
}
}
/// Measures K-weighted power of non-overlapping 100ms windows of a single channel of audio.
///
/// # Output
///
/// The output of the meter is an intermediate result in the form of power for
/// 100ms non-overlapping windows. The windows need to be processed further to
/// get one of the instantaneous, momentary, and integrated loudness
/// measurements defined in BS.1770.
///
/// The windows can also be inspected directly; the data is meaningful
/// on its own (the K-weighted power delivered in that window of time), but it
/// is not something that BS.1770 defines a term for.
///
/// # Multichannel audio
///
/// To perform a loudness measurement of multichannel audio, construct a
/// `ChannelLoudnessMeter` per channel, and later combine the measured power
/// with e.g. `reduce_stereo`.
///
/// # Instantaneous loudness
///
/// The instantaneous loudness is the power over a 400ms window, so you can
/// average four 100ms windows. No special functionality is implemented to help
/// with that at this time. ([Pull requests would be accepted.][contribute])
///
/// # Momentary loudness
///
/// The momentary loudness is the power over a 3-second window, so you can
/// average thirty 100ms windows. No special functionality is implemented to
/// help with that at this time. ([Pull requests would be accepted.][contribute])
///
/// # Integrated loudness
///
/// Use `gated_mean` to perform an integrated loudness measurement:
///
/// ```ignore
/// # use std::iter;
/// # use bs1770::{ChannelLoudnessMeter, gated_mean};
/// # let sample_rate_hz = 44_100;
/// # let samples_per_100ms = sample_rate_hz / 10;
/// # let mut meter = ChannelLoudnessMeter::new(sample_rate_hz);
/// # meter.push((0..44_100).map(|i| (i as f32 * 0.01).sin()));
/// let integrated_loudness_lkfs = gated_mean(meter.as_100ms_windows())
/// .unwrap_or(bs1770::Power(0.0))
/// .loudness_lkfs();
/// ```
///
/// [contribute]: https://github.com/ruuda/bs1770/blob/master/CONTRIBUTING.md
#[derive(Clone)]
pub struct ChannelLoudnessMeter {
/// The number of samples that fit in 100ms of audio.
samples_per_100ms: u32,
/// Stage 1 filter (head effects, high shelf).
filter_stage1: Filter,
/// Stage 2 filter (high-pass).
filter_stage2: Filter,
/// Sum of the squares over non-overlapping windows of 100ms.
windows: Windows100ms<Vec<Power>>,
/// The number of samples in the current unfinished window.
count: u32,
/// The sum of the squares of the samples in the current unfinished window.
square_sum: Sum,
}
impl ChannelLoudnessMeter {
/// Construct a new loudness meter for the given sample rate.
pub fn new(sample_rate_hz: u32) -> ChannelLoudnessMeter {
ChannelLoudnessMeter {
samples_per_100ms: sample_rate_hz / 10,
filter_stage1: Filter::high_shelf(sample_rate_hz as f32),
filter_stage2: Filter::high_pass(sample_rate_hz as f32),
windows: Windows100ms::new(),
count: 0,
square_sum: Sum::zero(),
}
}
/// Feed input samples for loudness analysis.
///
/// # Full scale
///
/// Full scale for the input samples is the interval [-1.0, 1.0]. If your
/// input consists of signed integer samples, you can convert as follows:
///
/// ```ignore
/// # let mut meter = bs1770::ChannelLoudnessMeter::new(44_100);
/// # let bits_per_sample = 16_usize;
/// # let samples = &[0_i16];
/// // Note that the maximum amplitude is `1 << (bits_per_sample - 1)`,
/// // one bit is the sign bit.
/// let normalizer = 1.0 / (1_u64 << (bits_per_sample - 1)) as f32;
/// meter.push(samples.iter().map(|&s| s as f32 * normalizer));
/// ```
///
/// # Repeated calls
///
/// You can call `push` multiple times to feed multiple batches of samples.
/// This is equivalent to feeding a single chained iterator. The leftover of
/// samples that did not fill a full 100ms window is not discarded:
///
/// ```ignore
/// # use std::iter;
/// # use bs1770::ChannelLoudnessMeter;
/// let sample_rate_hz = 44_100;
/// let samples_per_100ms = sample_rate_hz / 10;
/// let mut meter = ChannelLoudnessMeter::new(sample_rate_hz);
///
/// meter.push(iter::repeat(0.0).take(samples_per_100ms as usize - 1));
/// assert_eq!(meter.as_100ms_windows().len(), 0);
///
/// meter.push(iter::once(0.0));
/// assert_eq!(meter.as_100ms_windows().len(), 1);
/// ```
pub fn push<I: Iterator<Item = f32>>(&mut self, samples: I) {
let normalizer = 1.0 / self.samples_per_100ms as f32;
// LLVM, if you could go ahead and inline those apply calls, and then
// unroll and vectorize the loop, that'd be terrific.
for x in samples {
let y = self.filter_stage1.apply(x);
let z = self.filter_stage2.apply(y);
self.square_sum.add(z * z);
self.count += 1;
// TODO: Should this branch be marked cold?
if self.count == self.samples_per_100ms {
let mean_squares = Power(self.square_sum.sum * normalizer);
self.windows.inner.push(mean_squares);
// We intentionally do not reset the residue. That way, leftover
// energy from this window is not lost, so for the file overall,
// the sum remains more accurate.
self.square_sum.sum = 0.0;
self.count = 0;
}
}
}
/// Return a reference to the 100ms windows analyzed so far.
pub fn as_100ms_windows(&self) -> Windows100ms<&[Power]> {
self.windows.as_ref()
}
/// Return all 100ms windows analyzed so far.
pub fn into_100ms_windows(self) -> Windows100ms<Vec<Power>> {
self.windows
}
}
/// Combine power for multiple channels by taking a weighted sum.
///
/// Note that BS.1770-4 defines power for a multi-channel signal as a weighted
/// sum over channels which is not normalized. This means that a stereo signal
/// is inherently louder than a mono signal. For a mono signal played back on
/// stereo speakers, you should therefore still apply `reduce_stereo`, passing
/// in the same signal for both channels.
pub fn reduce_stereo(
left: Windows100ms<&[Power]>,
right: Windows100ms<&[Power]>,
) -> Windows100ms<Vec<Power>> {
assert_eq!(
left.len(),
right.len(),
"Channels must have the same length."
);
let mut result = Vec::with_capacity(left.len());
for (l, r) in left.inner.iter().zip(right.inner) {
result.push(Power(l.0 + r.0));
}
Windows100ms { inner: result }
}
/// In-place version of `reduce_stereo` that stores the result in the former left channel.
pub fn reduce_stereo_in_place(left: Windows100ms<&mut [Power]>, right: Windows100ms<&[Power]>) {
assert_eq!(
left.len(),
right.len(),
"Channels must have the same length."
);
for (l, r) in left.inner.iter_mut().zip(right.inner) {
l.0 += r.0;
}
}
/// Perform gating and averaging for a BS.1770-4 integrated loudness measurement.
///
/// The integrated loudness measurement is not just the average power over the
/// entire signal. BS.1770-4 defines two stages of gating that exclude
/// parts of the signal, to ensure that silent parts do not contribute to the
/// loudness measurement. This function performs that gating, and returns the
/// average power over the windows that were not excluded.
///
/// The result of this function is the integrated loudness measurement.
///
/// When no signal remains after applying the gate, this function returns
/// `None`. In particular, this happens when all of the signal is softer than
/// -70 LKFS, including a signal that consists of pure silence.
pub fn gated_mean(windows_100ms: Windows100ms<&[Power]>) -> Option<Power> {
let mut gating_blocks = Vec::with_capacity(windows_100ms.len());
// Stage 1: an absolute threshold of -70 LKFS. (Equation 6, p.6.)
let absolute_threshold = Power::from_lkfs(-70.0);
// Iterate over all 400ms windows.
for window in windows_100ms.inner.windows(4) {
// Note that the sum over channels has already been performed at this point.
let gating_block_power = Power(0.25 * window.iter().map(|mean| mean.0).sum::<f32>());
if gating_block_power > absolute_threshold {
gating_blocks.push(gating_block_power);
}
}
if gating_blocks.is_empty() {
return None;
}
// Compute the loudness after applying the absolute gate, in order to
// determine the threshold for the relative gate.
let mut sum_power = Sum::zero();
for &gating_block_power in &gating_blocks {
sum_power.add(gating_block_power.0);
}
let absolute_gated_power = Power(sum_power.sum / (gating_blocks.len() as f32));
// Stage 2: Apply the relative gate.
let relative_threshold = Power::from_lkfs(absolute_gated_power.loudness_lkfs() - 10.0);
let mut sum_power = Sum::zero();
let mut n_blocks = 0_usize;
for &gating_block_power in &gating_blocks {
if gating_block_power > relative_threshold {
sum_power.add(gating_block_power.0);
n_blocks += 1;
}
}
if n_blocks == 0 {
return None;
}
let relative_gated_power = Power(sum_power.sum / n_blocks as f32);
Some(relative_gated_power)
}

View File

@@ -0,0 +1,82 @@
pub const NAMES: [&str; 80] = [
"person",
"bicycle",
"car",
"motorbike",
"aeroplane",
"bus",
"train",
"truck",
"boat",
"traffic light",
"fire hydrant",
"stop sign",
"parking meter",
"bench",
"bird",
"cat",
"dog",
"horse",
"sheep",
"cow",
"elephant",
"bear",
"zebra",
"giraffe",
"backpack",
"umbrella",
"handbag",
"tie",
"suitcase",
"frisbee",
"skis",
"snowboard",
"sports ball",
"kite",
"baseball bat",
"baseball glove",
"skateboard",
"surfboard",
"tennis racket",
"bottle",
"wine glass",
"cup",
"fork",
"knife",
"spoon",
"bowl",
"banana",
"apple",
"sandwich",
"orange",
"broccoli",
"carrot",
"hot dog",
"pizza",
"donut",
"cake",
"chair",
"sofa",
"pottedplant",
"bed",
"diningtable",
"toilet",
"tvmonitor",
"laptop",
"mouse",
"remote",
"keyboard",
"cell phone",
"microwave",
"oven",
"toaster",
"sink",
"refrigerator",
"book",
"clock",
"vase",
"scissors",
"teddy bear",
"hair drier",
"toothbrush",
];

crates/utils/src/imagenet.rs (new file, 1056 lines)

File diff suppressed because it is too large.

crates/utils/src/lib.rs (new file, 156 lines)
View File

@@ -0,0 +1,156 @@
extern crate candle_core;
extern crate candle_transformers;
extern crate tokenizers;
pub mod audio;
pub mod bs1770;
pub mod coco_classes;
pub mod imagenet;
pub mod token_output_stream;
pub mod wav;
use candle_core::{Device, Tensor, utils::{cuda_is_available, metal_is_available}};
pub fn device(cpu: bool) -> Result<Device, anyhow::Error> {
if cpu {
Ok(Device::Cpu)
} else if cuda_is_available() {
Ok(Device::new_cuda(0)?)
} else if metal_is_available() {
Ok(Device::new_metal(0)?)
} else {
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
{
println!(
"Running on CPU, to run on GPU(metal), build this example with `--features metal`"
);
}
#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
{
println!("Running on CPU, to run on GPU, build this example with `--features cuda`");
}
Ok(Device::Cpu)
}
}
pub fn load_image<P: AsRef<std::path::Path>>(
p: P,
resize_longest: Option<usize>,
) -> Result<(Tensor, usize, usize), anyhow::Error> {
let img = image::ImageReader::open(p)?
.decode()
.map_err(candle_core::Error::wrap)?;
let (initial_h, initial_w) = (img.height() as usize, img.width() as usize);
let img = match resize_longest {
None => img,
Some(resize_longest) => {
let (height, width) = (img.height(), img.width());
let resize_longest = resize_longest as u32;
let (height, width) = if height < width {
let h = (resize_longest * height) / width;
(h, resize_longest)
} else {
let w = (resize_longest * width) / height;
(resize_longest, w)
};
img.resize_exact(width, height, image::imageops::FilterType::CatmullRom)
}
};
let (height, width) = (img.height() as usize, img.width() as usize);
let img = img.to_rgb8();
let data = img.into_raw();
let data = Tensor::from_vec(data, (height, width, 3), &Device::Cpu)?.permute((2, 0, 1))?;
Ok((data, initial_h, initial_w))
}
pub fn load_image_and_resize<P: AsRef<std::path::Path>>(
p: P,
width: usize,
height: usize,
) -> candle_core::Result<Tensor> {
let img = image::ImageReader::open(p)?
.decode()
.map_err(candle_core::Error::wrap)?
.resize_to_fill(
width as u32,
height as u32,
image::imageops::FilterType::Triangle,
);
let img = img.to_rgb8();
let data = img.into_raw();
Tensor::from_vec(data, (width, height, 3), &Device::Cpu)?.permute((2, 0, 1))
}
/// Saves an image to disk using the image crate, this expects an input with shape
/// (c, height, width).
pub fn save_image<P: AsRef<std::path::Path>>(img: &Tensor, p: P) -> Result<(), anyhow::Error> {
let p = p.as_ref();
let (channel, height, width) = img.dims3()?;
if channel != 3 {
anyhow::bail!("save_image expects an input of shape (3, height, width)")
}
let img = img.permute((1, 2, 0))?.flatten_all()?;
let pixels = img.to_vec1::<u8>()?;
let image: image::ImageBuffer<image::Rgb<u8>, Vec<u8>> =
match image::ImageBuffer::from_raw(width as u32, height as u32, pixels) {
Some(image) => image,
None => anyhow::bail!("error saving image {p:?}"),
};
image.save(p).map_err(candle_core::Error::wrap)?;
Ok(())
}
/// Loads the safetensors files for a model from the hub based on a json index file.
pub fn hub_load_safetensors(
repo: &hf_hub::api::sync::ApiRepo,
json_file: &str,
) -> Result<Vec<std::path::PathBuf>, anyhow::Error> {
let json_file = repo.get(json_file).map_err(candle_core::Error::wrap)?;
let json_file = std::fs::File::open(json_file)?;
let json: serde_json::Value =
serde_json::from_reader(&json_file).map_err(candle_core::Error::wrap)?;
let weight_map = match json.get("weight_map") {
None => anyhow::bail!("no weight map in {json_file:?}"),
Some(serde_json::Value::Object(map)) => map,
Some(_) => anyhow::bail!("weight map in {json_file:?} is not a map"),
};
let mut safetensors_files = std::collections::HashSet::new();
for value in weight_map.values() {
if let Some(file) = value.as_str() {
safetensors_files.insert(file.to_string());
}
}
let safetensors_files = safetensors_files
.iter()
.map(|v| {
repo.get(v)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))
})
.collect::<Result<Vec<_>, std::io::Error>>()?;
Ok(safetensors_files)
}
pub fn hub_load_local_safetensors<P: AsRef<std::path::Path>>(
path: P,
json_file: &str,
) -> Result<Vec<std::path::PathBuf>, anyhow::Error> {
let path = path.as_ref();
let jsfile = std::fs::File::open(path.join(json_file))?;
let json: serde_json::Value = serde_json::from_reader(&jsfile).map_err(candle_core::Error::wrap)?;
let weight_map = match json.get("weight_map") {
None => anyhow::bail!("no weight map in {json_file:?}"),
Some(serde_json::Value::Object(map)) => map,
Some(_) => anyhow::bail!("weight map in {json_file:?} is not a map"),
};
let mut safetensors_files = std::collections::HashSet::new();
for value in weight_map.values() {
if let Some(file) = value.as_str() {
safetensors_files.insert(file);
}
}
let safetensors_files: Vec<_> = safetensors_files
.into_iter()
.map(|v| path.join(v))
.collect();
Ok(safetensors_files)
}
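A short sketch of how the gemma-runner change earlier in this commit consumes these helpers; the repo id and index filename mirror that call site, and error handling is collapsed into anyhow:

~~~rust
use anyhow::Result;
use hf_hub::api::sync::Api;
use utils::{device, hub_load_safetensors};

fn main() -> Result<()> {
    // Prefer CUDA or Metal when available, otherwise fall back to CPU.
    let dev = device(false)?;
    println!("running on {dev:?}");

    // Resolve the sharded weight files listed in the model's index json.
    let repo = Api::new()?.model("google/gemma-2-2b-it".to_string());
    let weights = hub_load_safetensors(&repo, "model.safetensors.index.json")?;
    for path in &weights {
        println!("{}", path.display());
    }
    Ok(())
}
~~~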

crates/utils/src/main.rs (new file, 3 lines)
View File

@@ -0,0 +1,3 @@
fn main() {
println!("Hello, world!");
}

View File

@@ -0,0 +1,85 @@
use candle_core::Result;
use tokenizers::Tokenizer;
pub struct TokenOutputStream {
tokenizer: tokenizers::Tokenizer,
tokens: Vec<u32>,
prev_index: usize,
current_index: usize,
}
impl TokenOutputStream {
pub fn new(tokenizer: tokenizers::Tokenizer) -> Self {
Self {
tokenizer,
tokens: Vec::new(),
prev_index: 0,
current_index: 0,
}
}
pub fn into_inner(self) -> tokenizers::Tokenizer {
self.tokenizer
}
fn decode(&self, tokens: &[u32]) -> Result<String> {
match self.tokenizer.decode(tokens, true) {
Ok(str) => Ok(str),
Err(err) => candle_core::bail!("cannot decode: {err}"),
}
}
// https://github.com/huggingface/text-generation-inference/blob/5ba53d44a18983a4de32d122f4cb46f4a17d9ef6/server/text_generation_server/models/model.py#L68
pub fn next_token(&mut self, token: u32) -> Result<Option<String>> {
let prev_text = if self.tokens.is_empty() {
String::new()
} else {
let tokens = &self.tokens[self.prev_index..self.current_index];
self.decode(tokens)?
};
self.tokens.push(token);
let text = self.decode(&self.tokens[self.prev_index..])?;
if text.len() > prev_text.len() && text.chars().last().unwrap().is_alphanumeric() {
let text = text.split_at(prev_text.len());
self.prev_index = self.current_index;
self.current_index = self.tokens.len();
Ok(Some(text.1.to_string()))
} else {
Ok(None)
}
}
pub fn decode_rest(&self) -> Result<Option<String>> {
let prev_text = if self.tokens.is_empty() {
String::new()
} else {
let tokens = &self.tokens[self.prev_index..self.current_index];
self.decode(tokens)?
};
let text = self.decode(&self.tokens[self.prev_index..])?;
if text.len() > prev_text.len() {
let text = text.split_at(prev_text.len());
Ok(Some(text.1.to_string()))
} else {
Ok(None)
}
}
pub fn decode_all(&self) -> Result<String> {
self.decode(&self.tokens)
}
pub fn get_token(&self, token_s: &str) -> Option<u32> {
self.tokenizer.get_vocab(true).get(token_s).copied()
}
pub fn tokenizer(&self) -> &tokenizers::Tokenizer {
&self.tokenizer
}
pub fn clear(&mut self) {
self.tokens.clear();
self.prev_index = 0;
self.current_index = 0;
}
}
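To make the streaming-decode contract concrete, a sketch of typical use; the tokenizer.json path and the prompt are placeholders, and anyhow is used only to keep error plumbing short:

~~~rust
use anyhow::{Error, Result};
use tokenizers::Tokenizer;
use utils::token_output_stream::TokenOutputStream;

fn main() -> Result<()> {
    let tokenizer = Tokenizer::from_file("tokenizer.json").map_err(Error::msg)?;
    let mut stream = TokenOutputStream::new(tokenizer);

    // In the runners these ids arrive one at a time from the sampler.
    let ids = stream
        .tokenizer()
        .encode("Streaming decode demo", true)
        .map_err(Error::msg)?
        .get_ids()
        .to_vec();

    for id in ids {
        // next_token only emits text once it ends on an alphanumeric boundary,
        // so multi-token pieces are never printed half-finished.
        if let Some(piece) = stream.next_token(id)? {
            print!("{piece}");
        }
    }
    // Flush whatever is still buffered after the final token.
    if let Some(rest) = stream.decode_rest()? {
        print!("{rest}");
    }
    println!();
    Ok(())
}
~~~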

crates/utils/src/wav.rs (new file, 56 lines)
View File

@@ -0,0 +1,56 @@
use std::io::prelude::*;
pub trait Sample {
fn to_i16(&self) -> i16;
}
impl Sample for f32 {
fn to_i16(&self) -> i16 {
(self.clamp(-1.0, 1.0) * 32767.0) as i16
}
}
impl Sample for f64 {
fn to_i16(&self) -> i16 {
(self.clamp(-1.0, 1.0) * 32767.0) as i16
}
}
impl Sample for i16 {
fn to_i16(&self) -> i16 {
*self
}
}
pub fn write_pcm_as_wav<W: Write, S: Sample>(
w: &mut W,
samples: &[S],
sample_rate: u32,
) -> std::io::Result<()> {
let len = 12u32; // header
let len = len + 24u32; // fmt
let len = len + samples.len() as u32 * 2 + 8; // data
let n_channels = 1u16;
let bytes_per_second = sample_rate * 2 * n_channels as u32;
w.write_all(b"RIFF")?;
w.write_all(&(len - 8).to_le_bytes())?; // total length minus 8 bytes
w.write_all(b"WAVE")?;
// Format block
w.write_all(b"fmt ")?;
w.write_all(&16u32.to_le_bytes())?; // block len minus 8 bytes
w.write_all(&1u16.to_le_bytes())?; // PCM
w.write_all(&n_channels.to_le_bytes())?; // one channel
w.write_all(&sample_rate.to_le_bytes())?;
w.write_all(&bytes_per_second.to_le_bytes())?;
w.write_all(&2u16.to_le_bytes())?; // 2 bytes of data per sample
w.write_all(&16u16.to_le_bytes())?; // bits per sample
// Data block
w.write_all(b"data")?;
w.write_all(&(samples.len() as u32 * 2).to_le_bytes())?;
for sample in samples.iter() {
w.write_all(&sample.to_i16().to_le_bytes())?
}
Ok(())
}
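Finally, a sketch of writing a tone to disk with the helper above; the output path and frequency are arbitrary:

~~~rust
use std::fs::File;
use utils::wav::write_pcm_as_wav;

fn main() -> std::io::Result<()> {
    // One second of a 440 Hz sine at 16 kHz, as f32 samples in [-1.0, 1.0].
    let sample_rate = 16_000u32;
    let samples: Vec<f32> = (0..sample_rate)
        .map(|i| (2.0 * std::f32::consts::PI * 440.0 * i as f32 / sample_rate as f32).sin())
        .collect();

    // write_pcm_as_wav converts to 16-bit mono PCM and writes the RIFF/fmt/data blocks.
    let mut out = File::create("tone.wav")?;
    write_pcm_as_wav(&mut out, &samples, sample_rate)?;
    Ok(())
}
~~~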