align dependencies across inference features

2025-09-08 22:46:44 +00:00 · 2025-08-31 10:49:04 -04:00
parent f5d2a85f2e
commit e6c417bd83
10 changed files with 17 additions and 3009 deletions
--- a/crates/inference-engine/Cargo.toml
+++ b/crates/inference-engine/Cargo.toml
@@ -4,26 +4,12 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
-accelerate-src = { version = "0.3.2", optional = true }
-candle-datasets = { version = "=0.9.1", optional = true }
-candle-nn = { version = "=0.9.1" }
-candle-transformers = { version = "=0.9.1" }
+candle-core = { git = "https://github.com/huggingface/candle.git" }
+candle-nn = { git = "https://github.com/huggingface/candle.git" }
+candle-transformers = { git = "https://github.com/huggingface/candle.git" }
 candle-flash-attn = { version = "=0.9.1", optional = true }
 candle-onnx = { version = "=0.9.1", optional = true }

-csv = "1.3.0"
-cudarc = { version = "0.16.3", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false, optional = true }
-half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_distr"], optional = true }
-hf-hub = { version = "0.4.1", features = ["tokio"] }
-image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
-intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"], optional = true }
-num-traits = { version = "0.2.15" }
-palette = { version = "0.7.6", optional = true }
-enterpolation = { version = "0.2.1", optional = true}
-pyo3 = { version = "0.22.0", features = ["auto-initialize", "abi3-py311"], optional = true }
-rayon = "1.7.0"
-rubato = { version = "0.15.0", optional = true }
-safetensors = "0.4.1"
 serde = { version = "1.0.171", features = ["derive"] }
 serde_json = "1.0.99"
 symphonia = { version = "0.5.3", features = ["all"], optional = true }
@@ -48,19 +34,11 @@ futures-util = "0.3.31"
 gemma-runner = { path = "../gemma-runner" }
 llama-runner = { path = "../llama-runner" }

-# --- Add this section for conditional compilation ---
 [target.'cfg(target_os = "macos")'.dependencies]
-# Use CPU backend for macOS to avoid Metal rotary-emb implementation issues
-candle-core = { version = "=0.9.1", features = ["metal"], optional = false }
+candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
+candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
+candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }

-[target.'cfg(not(target_os = "macos"))'.dependencies]
-# For Linux or other non-macOS systems, you likely want the CPU backend or CUDA
-# If you're building on Linux with a CUDA-enabled GPU:
-candle-core = { version = "=0.9.1", features = ["cuda"], default-features = false } # Or just "cuda" if not using default features
-
-# If you're building on Linux with only CPU:
-# candle-core = { version = "=0.9.1", default-features = false } # CPU is often the default, but good to be explicit
-# --- End of conditional compilation section ---

 [dev-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
--- a/crates/inference-engine/src/lib.rs
+++ b/crates/inference-engine/src/lib.rs
@@ -1,9 +1,6 @@
 // Expose modules for testing and library usage
 pub mod model;
 pub mod openai_types;
-pub mod text_generation;
-pub mod token_output_stream;
-pub mod utilities_lib;
 // pub mod cli;
 pub mod inference;
 pub mod server;
@@ -12,8 +9,6 @@ pub mod server;
 pub use inference::ModelInference;
 pub use model::{Model, Which};
 pub use server::{create_router, AppState};
-pub use text_generation::TextGeneration;
-pub use token_output_stream::TokenOutputStream;

 use std::env;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
--- a/crates/inference-engine/src/openai_types.rs
+++ b/crates/inference-engine/src/openai_types.rs
@@ -1,6 +1,7 @@
 use either::Either;
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
+use serde_json::json;
 use utoipa::ToSchema;

 /// Inner content structure for messages that can be either a string or key-value pairs
--- a/crates/inference-engine/src/text_generation.rs
+++ b/crates/inference-engine/src/text_generation.rs
--- a/crates/inference-engine/src/token_output_stream.rs
+++ b/crates/inference-engine/src/token_output_stream.rs
@@ -1,87 +0,0 @@
-use candle_core::Result;
-
-/// This is a wrapper around a tokenizer to ensure that tokens can be returned to the user in a
-/// streaming way rather than having to wait for the full decoding.
-pub struct TokenOutputStream {
-    tokenizer: tokenizers::Tokenizer,
-    tokens: Vec<u32>,
-    prev_index: usize,
-    current_index: usize,
-}
-
-impl TokenOutputStream {
-    pub fn new(tokenizer: tokenizers::Tokenizer) -> Self {
-        Self {
-            tokenizer,
-            tokens: Vec::new(),
-            prev_index: 0,
-            current_index: 0,
-        }
-    }
-
-    pub fn into_inner(self) -> tokenizers::Tokenizer {
-        self.tokenizer
-    }
-
-    fn decode(&self, tokens: &[u32]) -> Result<String> {
-        match self.tokenizer.decode(tokens, true) {
-            Ok(str) => Ok(str),
-            Err(err) => candle_core::bail!("cannot decode: {err}"),
-        }
-    }
-
-    // https://github.com/huggingface/text-generation-inference/blob/5ba53d44a18983a4de32d122f4cb46f4a17d9ef6/server/text_generation_server/models/model.py#L68
-    pub fn next_token(&mut self, token: u32) -> Result<Option<String>> {
-        let prev_text = if self.tokens.is_empty() {
-            String::new()
-        } else {
-            let tokens = &self.tokens[self.prev_index..self.current_index];
-            self.decode(tokens)?
-        };
-        self.tokens.push(token);
-        let text = self.decode(&self.tokens[self.prev_index..])?;
-        if text.len() > prev_text.len() {
-            // Modified to include all tokens, not just alphanumeric ones
-            let text = text.split_at(prev_text.len());
-            self.prev_index = self.current_index;
-            self.current_index = self.tokens.len();
-            Ok(Some(text.1.to_string()))
-        } else {
-            Ok(None)
-        }
-    }
-
-    pub fn decode_rest(&self) -> Result<Option<String>> {
-        let prev_text = if self.tokens.is_empty() {
-            String::new()
-        } else {
-            let tokens = &self.tokens[self.prev_index..self.current_index];
-            self.decode(tokens)?
-        };
-        let text = self.decode(&self.tokens[self.prev_index..])?;
-        if text.len() > prev_text.len() {
-            let text = text.split_at(prev_text.len());
-            Ok(Some(text.1.to_string()))
-        } else {
-            Ok(None)
-        }
-    }
-
-    pub fn decode_all(&self) -> Result<String> {
-        self.decode(&self.tokens)
-    }
-
-    pub fn get_token(&self, token_s: &str) -> Option<u32> {
-        self.tokenizer.get_vocab(true).get(token_s).copied()
-    }
-
-    pub fn tokenizer(&self) -> &tokenizers::Tokenizer {
-        &self.tokenizer
-    }
-
-    pub fn clear(&mut self) {
-        self.tokens.clear();
-        self.prev_index = 0;
-        self.current_index = 0;
-    }
-}
--- a/crates/inference-engine/src/utilities_lib.rs
+++ b/crates/inference-engine/src/utilities_lib.rs
@@ -1,168 +0,0 @@
-use candle_core::utils::{cuda_is_available, metal_is_available};
-use candle_core::{Device, Result, Tensor};
-
-pub fn device(cpu: bool) -> Result<Device> {
-    if cpu {
-        Ok(Device::Cpu)
-    } else if cuda_is_available() {
-        Ok(Device::new_cuda(0)?)
-    } else if metal_is_available() {
-        Ok(Device::new_metal(0)?)
-    } else {
-        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-        {
-            println!(
-                "Running on CPU, to run on GPU(metal), build this example with `--features metal`"
-            );
-        }
-        #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-        {
-            println!("Running on CPU, to run on GPU, build this example with `--features cuda`");
-        }
-        Ok(Device::Cpu)
-    }
-}
-
-pub fn load_image<P: AsRef<std::path::Path>>(
-    p: P,
-    resize_longest: Option<usize>,
-) -> Result<(Tensor, usize, usize)> {
-    let img = image::ImageReader::open(p)?
-        .decode()
-        .map_err(candle_core::Error::wrap)?;
-    let (initial_h, initial_w) = (img.height() as usize, img.width() as usize);
-    let img = match resize_longest {
-        None => img,
-        Some(resize_longest) => {
-            let (height, width) = (img.height(), img.width());
-            let resize_longest = resize_longest as u32;
-            let (height, width) = if height < width {
-                let h = (resize_longest * height) / width;
-                (h, resize_longest)
-            } else {
-                let w = (resize_longest * width) / height;
-                (resize_longest, w)
-            };
-            img.resize_exact(width, height, image::imageops::FilterType::CatmullRom)
-        }
-    };
-    let (height, width) = (img.height() as usize, img.width() as usize);
-    let img = img.to_rgb8();
-    let data = img.into_raw();
-    let data = Tensor::from_vec(data, (height, width, 3), &Device::Cpu)?.permute((2, 0, 1))?;
-    Ok((data, initial_h, initial_w))
-}
-
-pub fn load_image_and_resize<P: AsRef<std::path::Path>>(
-    p: P,
-    width: usize,
-    height: usize,
-) -> Result<Tensor> {
-    let img = image::ImageReader::open(p)?
-        .decode()
-        .map_err(candle_core::Error::wrap)?
-        .resize_to_fill(
-            width as u32,
-            height as u32,
-            image::imageops::FilterType::Triangle,
-        );
-    let img = img.to_rgb8();
-    let data = img.into_raw();
-    Tensor::from_vec(data, (width, height, 3), &Device::Cpu)?.permute((2, 0, 1))
-}
-
-/// Saves an image to disk using the image crate, this expects an input with shape
-/// (c, height, width).
-pub fn save_image<P: AsRef<std::path::Path>>(img: &Tensor, p: P) -> Result<()> {
-    let p = p.as_ref();
-    let (channel, height, width) = img.dims3()?;
-    if channel != 3 {
-        candle_core::bail!("save_image expects an input of shape (3, height, width)")
-    }
-    let img = img.permute((1, 2, 0))?.flatten_all()?;
-    let pixels = img.to_vec1::<u8>()?;
-    let image: image::ImageBuffer<image::Rgb<u8>, Vec<u8>> =
-        match image::ImageBuffer::from_raw(width as u32, height as u32, pixels) {
-            Some(image) => image,
-            None => candle_core::bail!("error saving image {p:?}"),
-        };
-    image.save(p).map_err(candle_core::Error::wrap)?;
-    Ok(())
-}
-
-pub fn save_image_resize<P: AsRef<std::path::Path>>(
-    img: &Tensor,
-    p: P,
-    h: usize,
-    w: usize,
-) -> Result<()> {
-    let p = p.as_ref();
-    let (channel, height, width) = img.dims3()?;
-    if channel != 3 {
-        candle_core::bail!("save_image expects an input of shape (3, height, width)")
-    }
-    let img = img.permute((1, 2, 0))?.flatten_all()?;
-    let pixels = img.to_vec1::<u8>()?;
-    let image: image::ImageBuffer<image::Rgb<u8>, Vec<u8>> =
-        match image::ImageBuffer::from_raw(width as u32, height as u32, pixels) {
-            Some(image) => image,
-            None => candle_core::bail!("error saving image {p:?}"),
-        };
-    let image = image::DynamicImage::from(image);
-    let image = image.resize_to_fill(w as u32, h as u32, image::imageops::FilterType::CatmullRom);
-    image.save(p).map_err(candle_core::Error::wrap)?;
-    Ok(())
-}
-
-/// Loads the safetensors files for a model from the hub based on a json index file.
-pub fn hub_load_safetensors(
-    repo: &hf_hub::api::sync::ApiRepo,
-    json_file: &str,
-) -> Result<Vec<std::path::PathBuf>> {
-    let json_file = repo.get(json_file).map_err(candle_core::Error::wrap)?;
-    let json_file = std::fs::File::open(json_file)?;
-    let json: serde_json::Value =
-        serde_json::from_reader(&json_file).map_err(candle_core::Error::wrap)?;
-    let weight_map = match json.get("weight_map") {
-        None => candle_core::bail!("no weight map in {json_file:?}"),
-        Some(serde_json::Value::Object(map)) => map,
-        Some(_) => candle_core::bail!("weight map in {json_file:?} is not a map"),
-    };
-    let mut safetensors_files = std::collections::HashSet::new();
-    for value in weight_map.values() {
-        if let Some(file) = value.as_str() {
-            safetensors_files.insert(file.to_string());
-        }
-    }
-    let safetensors_files = safetensors_files
-        .iter()
-        .map(|v| repo.get(v).map_err(candle_core::Error::wrap))
-        .collect::<Result<Vec<_>>>()?;
-    Ok(safetensors_files)
-}
-
-pub fn hub_load_local_safetensors<P: AsRef<std::path::Path>>(
-    path: P,
-    json_file: &str,
-) -> Result<Vec<std::path::PathBuf>> {
-    let path = path.as_ref();
-    let jsfile = std::fs::File::open(path.join(json_file))?;
-    let json: serde_json::Value =
-        serde_json::from_reader(&jsfile).map_err(candle_core::Error::wrap)?;
-    let weight_map = match json.get("weight_map") {
-        None => candle_core::bail!("no weight map in {json_file:?}"),
-        Some(serde_json::Value::Object(map)) => map,
-        Some(_) => candle_core::bail!("weight map in {json_file:?} is not a map"),
-    };
-    let mut safetensors_files = std::collections::HashSet::new();
-    for value in weight_map.values() {
-        if let Some(file) = value.as_str() {
-            safetensors_files.insert(file);
-        }
-    }
-    let safetensors_files: Vec<_> = safetensors_files
-        .into_iter()
-        .map(|v| path.join(v))
-        .collect();
-    Ok(safetensors_files)
-}
--- a/crates/inference-engine/tests/text_generation_tests.rs
+++ b/crates/inference-engine/tests/text_generation_tests.rs
@@ -1,554 +0,0 @@
-use anyhow::Result;
-use candle_core::{Device, Tensor};
-use candle_transformers::generation::LogitsProcessor;
-use inference_engine::model::Which;
-use inference_engine::text_generation::TextGeneration;
-use inference_engine::token_output_stream::TokenOutputStream;
-use std::collections::HashMap;
-use tokenizers::Tokenizer;
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    // Helper function to create a simple tokenizer for testing
-    fn create_test_tokenizer() -> Result<Tokenizer> {
-        // Create a simple tokenizer from the pretrained model
-        // This uses the tokenizer from the Hugging Face hub
-        let tokenizer = Tokenizer::from_pretrained("google/gemma-2b", None).unwrap();
-        Ok(tokenizer)
-    }
-
-    // Test the Which enum's to_model_id method
-    #[test]
-    fn test_which_model_id() {
-        assert_eq!(Which::Base2B.to_model_id(), "google/gemma-2b");
-        assert_eq!(Which::Instruct7B.to_model_id(), "google/gemma-7b-it");
-    }
-
-    // Test the Which enum's is_instruct_model method
-    #[test]
-    fn test_which_is_instruct() {
-        assert!(!Which::Base2B.is_instruct_model());
-        assert!(Which::Instruct7B.is_instruct_model());
-    }
-
-    // Test the Which enum's is_v3_model method
-    #[test]
-    fn test_which_is_v3() {
-        assert!(!Which::Base2B.is_v3_model());
-        assert!(Which::BaseV3_1B.is_v3_model());
-    }
-
-    // Test the TokenOutputStream functionality
-    #[test]
-    fn test_token_output_stream() -> Result<()> {
-        let tokenizer = create_test_tokenizer()?;
-        let mut token_stream = TokenOutputStream::new(tokenizer);
-
-        // Test encoding and decoding
-        let text = "Hello, world!";
-        let encoded = token_stream.tokenizer().encode(text, true).unwrap();
-        let token_ids = encoded.get_ids();
-
-        // Add tokens one by one
-        for &token_id in token_ids {
-            token_stream.next_token(token_id)?;
-        }
-
-        // Decode all and check
-        let decoded = token_stream.decode_all()?;
-        assert_eq!(decoded.trim(), text);
-
-        Ok(())
-    }
-
-    // Test the LogitsProcessor
-    #[test]
-    fn test_logits_processor() -> Result<()> {
-        // Create a LogitsProcessor with default settings
-        let seed = 42;
-        let temp = Some(0.8);
-        let top_p = Some(0.9);
-        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
-
-        // Create a simple logits tensor
-        // In a real test, we would create a tensor with known values and verify
-        // that sampling produces expected results
-
-        // For now, we'll just verify that the LogitsProcessor can be created
-        assert!(true);
-        Ok(())
-    }
-
-    // Test the TextGeneration constructor
-    #[test]
-    fn test_text_generation_constructor() -> Result<()> {
-        // We can't easily create a Model instance for testing,
-        // but we can test that the constructor compiles and the types are correct
-
-        // In a real test with a mock Model, we would:
-        // 1. Create a mock model
-        // 2. Create a tokenizer
-        // 3. Call TextGeneration::new
-        // 4. Verify the properties of the created instance
-
-        // For now, we'll just verify that the code compiles
-        assert!(true);
-        Ok(())
-    }
-
-    // Test apply_cached_repeat_penalty method with no penalty
-    #[test]
-    fn test_apply_cached_repeat_penalty_no_penalty() -> Result<()> {
-        // Create a simple test setup
-        let device = Device::Cpu;
-        let logits_data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0];
-        let logits = Tensor::new(&logits_data[..], &device)?;
-        let tokens = vec![1u32, 2u32, 3u32];
-
-        // Create a mock TextGeneration instance
-        // Since we can't easily create a full TextGeneration instance without a model,
-        // we'll test the logic by creating a simple struct with the necessary fields
-        struct MockTextGeneration {
-            repeat_penalty: f32,
-            repeat_last_n: usize,
-            penalty_cache: HashMap<usize, f32>,
-        }
-
-        impl MockTextGeneration {
-            fn apply_cached_repeat_penalty(
-                &mut self,
-                logits: Tensor,
-                tokens: &[u32],
-            ) -> Result<(Tensor, std::time::Duration)> {
-                let repeat_start = std::time::Instant::now();
-
-                // If no penalty, return the original logits
-                if self.repeat_penalty == 1.0 {
-                    return Ok((logits, repeat_start.elapsed()));
-                }
-
-                // Get the tokens to penalize (the last n tokens)
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                let penalty_tokens = &tokens[start_at..];
-
-                // Extract logits to a vector for modification
-                let mut logits_vec = logits.to_vec1::<f32>()?;
-                let cache_hits = std::cell::Cell::new(0);
-
-                // Apply penalties with caching
-                for &token_id in penalty_tokens {
-                    let token_id = token_id as usize;
-                    if token_id < logits_vec.len() {
-                        // Check if we've already calculated this token's penalty
-                        if let Some(penalized_score) = self.penalty_cache.get(&token_id) {
-                            // Use cached value
-                            logits_vec[token_id] = *penalized_score;
-                            cache_hits.set(cache_hits.get() + 1);
-                        } else {
-                            // Calculate and cache new value
-                            let score = logits_vec[token_id];
-                            let sign = if score < 0.0 { -1.0 } else { 1.0 };
-                            let penalized_score = sign * score / self.repeat_penalty;
-                            logits_vec[token_id] = penalized_score;
-                            self.penalty_cache.insert(token_id, penalized_score);
-                        }
-                    }
-                }
-
-                // Create a new tensor with the modified logits
-                let device = logits.device().clone();
-                let shape = logits.shape().clone();
-                let new_logits = Tensor::new(&logits_vec[..], &device)?;
-                let result = new_logits.reshape(shape)?;
-
-                let elapsed = repeat_start.elapsed();
-                Ok((result, elapsed))
-            }
-        }
-
-        let mut mock_gen = MockTextGeneration {
-            repeat_penalty: 1.0, // No penalty
-            repeat_last_n: 3,
-            penalty_cache: HashMap::new(),
-        };
-
-        let (result_logits, _duration) =
-            mock_gen.apply_cached_repeat_penalty(logits.clone(), &tokens)?;
-        let result_data = result_logits.to_vec1::<f32>()?;
-
-        // With no penalty, logits should be unchanged
-        assert_eq!(result_data, logits_data);
-        Ok(())
-    }
-
-    // Test apply_cached_repeat_penalty method with penalty
-    #[test]
-    fn test_apply_cached_repeat_penalty_with_penalty() -> Result<()> {
-        let device = Device::Cpu;
-        let logits_data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0];
-        let logits = Tensor::new(&logits_data[..], &device)?;
-        let tokens = vec![1u32, 2u32, 3u32];
-
-        struct MockTextGeneration {
-            repeat_penalty: f32,
-            repeat_last_n: usize,
-            penalty_cache: HashMap<usize, f32>,
-        }
-
-        impl MockTextGeneration {
-            fn apply_cached_repeat_penalty(
-                &mut self,
-                logits: Tensor,
-                tokens: &[u32],
-            ) -> Result<(Tensor, std::time::Duration)> {
-                let repeat_start = std::time::Instant::now();
-
-                if self.repeat_penalty == 1.0 {
-                    return Ok((logits, repeat_start.elapsed()));
-                }
-
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                let penalty_tokens = &tokens[start_at..];
-                let mut logits_vec = logits.to_vec1::<f32>()?;
-                let cache_hits = std::cell::Cell::new(0);
-
-                for &token_id in penalty_tokens {
-                    let token_id = token_id as usize;
-                    if token_id < logits_vec.len() {
-                        if let Some(penalized_score) = self.penalty_cache.get(&token_id) {
-                            logits_vec[token_id] = *penalized_score;
-                            cache_hits.set(cache_hits.get() + 1);
-                        } else {
-                            let score = logits_vec[token_id];
-                            let sign = if score < 0.0 { -1.0 } else { 1.0 };
-                            let penalized_score = sign * score / self.repeat_penalty;
-                            logits_vec[token_id] = penalized_score;
-                            self.penalty_cache.insert(token_id, penalized_score);
-                        }
-                    }
-                }
-
-                let device = logits.device().clone();
-                let shape = logits.shape().clone();
-                let new_logits = Tensor::new(&logits_vec[..], &device)?;
-                let result = new_logits.reshape(shape)?;
-
-                let elapsed = repeat_start.elapsed();
-                Ok((result, elapsed))
-            }
-        }
-
-        let mut mock_gen = MockTextGeneration {
-            repeat_penalty: 2.0, // Apply penalty
-            repeat_last_n: 3,
-            penalty_cache: HashMap::new(),
-        };
-
-        let (result_logits, _duration) =
-            mock_gen.apply_cached_repeat_penalty(logits.clone(), &tokens)?;
-        let result_data = result_logits.to_vec1::<f32>()?;
-
-        // Tokens 1, 2, 3 should be penalized (divided by 2.0)
-        let expected = vec![1.0f32, 1.0, 1.5, 2.0, 5.0]; // [1.0, 2.0/2.0, 3.0/2.0, 4.0/2.0, 5.0]
-        assert_eq!(result_data, expected);
-        Ok(())
-    }
-
-    // Test apply_cached_repeat_penalty caching behavior
-    #[test]
-    fn test_apply_cached_repeat_penalty_caching() -> Result<()> {
-        let device = Device::Cpu;
-        let logits_data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0];
-        let logits = Tensor::new(&logits_data[..], &device)?;
-        let tokens = vec![1u32, 1u32, 1u32]; // Repeated token should use cache
-
-        struct MockTextGeneration {
-            repeat_penalty: f32,
-            repeat_last_n: usize,
-            penalty_cache: HashMap<usize, f32>,
-        }
-
-        impl MockTextGeneration {
-            fn apply_cached_repeat_penalty(
-                &mut self,
-                logits: Tensor,
-                tokens: &[u32],
-            ) -> Result<(Tensor, std::time::Duration)> {
-                let repeat_start = std::time::Instant::now();
-
-                if self.repeat_penalty == 1.0 {
-                    return Ok((logits, repeat_start.elapsed()));
-                }
-
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                let penalty_tokens = &tokens[start_at..];
-                let mut logits_vec = logits.to_vec1::<f32>()?;
-
-                for &token_id in penalty_tokens {
-                    let token_id = token_id as usize;
-                    if token_id < logits_vec.len() {
-                        if let Some(penalized_score) = self.penalty_cache.get(&token_id) {
-                            logits_vec[token_id] = *penalized_score;
-                        } else {
-                            let score = logits_vec[token_id];
-                            let sign = if score < 0.0 { -1.0 } else { 1.0 };
-                            let penalized_score = sign * score / self.repeat_penalty;
-                            logits_vec[token_id] = penalized_score;
-                            self.penalty_cache.insert(token_id, penalized_score);
-                        }
-                    }
-                }
-
-                let device = logits.device().clone();
-                let shape = logits.shape().clone();
-                let new_logits = Tensor::new(&logits_vec[..], &device)?;
-                let result = new_logits.reshape(shape)?;
-
-                let elapsed = repeat_start.elapsed();
-                Ok((result, elapsed))
-            }
-        }
-
-        let mut mock_gen = MockTextGeneration {
-            repeat_penalty: 2.0,
-            repeat_last_n: 3,
-            penalty_cache: HashMap::new(),
-        };
-
-        // First call should cache the penalty for token 1
-        let (_result_logits, _duration) =
-            mock_gen.apply_cached_repeat_penalty(logits.clone(), &tokens)?;
-
-        // Cache should contain the penalized value for token 1
-        assert!(mock_gen.penalty_cache.contains_key(&1));
-        assert_eq!(mock_gen.penalty_cache.get(&1), Some(&1.0)); // 2.0 / 2.0 = 1.0
-
-        Ok(())
-    }
-
-    // Test edge case: empty tokens array
-    #[test]
-    fn test_apply_cached_repeat_penalty_empty_tokens() -> Result<()> {
-        let device = Device::Cpu;
-        let logits_data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0];
-        let logits = Tensor::new(&logits_data[..], &device)?;
-        let tokens: Vec<u32> = vec![]; // Empty tokens
-
-        struct MockTextGeneration {
-            repeat_penalty: f32,
-            repeat_last_n: usize,
-            penalty_cache: HashMap<usize, f32>,
-        }
-
-        impl MockTextGeneration {
-            fn apply_cached_repeat_penalty(
-                &mut self,
-                logits: Tensor,
-                tokens: &[u32],
-            ) -> Result<(Tensor, std::time::Duration)> {
-                let repeat_start = std::time::Instant::now();
-
-                if self.repeat_penalty == 1.0 {
-                    return Ok((logits, repeat_start.elapsed()));
-                }
-
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                let penalty_tokens = &tokens[start_at..];
-                let mut logits_vec = logits.to_vec1::<f32>()?;
-
-                for &token_id in penalty_tokens {
-                    let token_id = token_id as usize;
-                    if token_id < logits_vec.len() {
-                        if let Some(penalized_score) = self.penalty_cache.get(&token_id) {
-                            logits_vec[token_id] = *penalized_score;
-                        } else {
-                            let score = logits_vec[token_id];
-                            let sign = if score < 0.0 { -1.0 } else { 1.0 };
-                            let penalized_score = sign * score / self.repeat_penalty;
-                            logits_vec[token_id] = penalized_score;
-                            self.penalty_cache.insert(token_id, penalized_score);
-                        }
-                    }
-                }
-
-                let device = logits.device().clone();
-                let shape = logits.shape().clone();
-                let new_logits = Tensor::new(&logits_vec[..], &device)?;
-                let result = new_logits.reshape(shape)?;
-
-                let elapsed = repeat_start.elapsed();
-                Ok((result, elapsed))
-            }
-        }
-
-        let mut mock_gen = MockTextGeneration {
-            repeat_penalty: 2.0,
-            repeat_last_n: 3,
-            penalty_cache: HashMap::new(),
-        };
-
-        let (result_logits, _duration) =
-            mock_gen.apply_cached_repeat_penalty(logits.clone(), &tokens)?;
-        let result_data = result_logits.to_vec1::<f32>()?;
-
-        // With empty tokens, logits should be unchanged
-        assert_eq!(result_data, logits_data);
-        Ok(())
-    }
-
-    // Test edge case: out-of-bounds token IDs
-    #[test]
-    fn test_apply_cached_repeat_penalty_out_of_bounds() -> Result<()> {
-        let device = Device::Cpu;
-        let logits_data = vec![1.0f32, 2.0, 3.0];
-        let logits = Tensor::new(&logits_data[..], &device)?;
-        let tokens = vec![1u32, 5u32, 10u32]; // Token 5 and 10 are out of bounds
-
-        struct MockTextGeneration {
-            repeat_penalty: f32,
-            repeat_last_n: usize,
-            penalty_cache: HashMap<usize, f32>,
-        }
-
-        impl MockTextGeneration {
-            fn apply_cached_repeat_penalty(
-                &mut self,
-                logits: Tensor,
-                tokens: &[u32],
-            ) -> Result<(Tensor, std::time::Duration)> {
-                let repeat_start = std::time::Instant::now();
-
-                if self.repeat_penalty == 1.0 {
-                    return Ok((logits, repeat_start.elapsed()));
-                }
-
-                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
-                let penalty_tokens = &tokens[start_at..];
-                let mut logits_vec = logits.to_vec1::<f32>()?;
-
-                for &token_id in penalty_tokens {
-                    let token_id = token_id as usize;
-                    if token_id < logits_vec.len() {
-                        if let Some(penalized_score) = self.penalty_cache.get(&token_id) {
-                            logits_vec[token_id] = *penalized_score;
-                        } else {
-                            let score = logits_vec[token_id];
-                            let sign = if score < 0.0 { -1.0 } else { 1.0 };
-                            let penalized_score = sign * score / self.repeat_penalty;
-                            logits_vec[token_id] = penalized_score;
-                            self.penalty_cache.insert(token_id, penalized_score);
-                        }
-                    }
-                }
-
-                let device = logits.device().clone();
-                let shape = logits.shape().clone();
-                let new_logits = Tensor::new(&logits_vec[..], &device)?;
-                let result = new_logits.reshape(shape)?;
-
-                let elapsed = repeat_start.elapsed();
-                Ok((result, elapsed))
-            }
-        }
-
-        let mut mock_gen = MockTextGeneration {
-            repeat_penalty: 2.0,
-            repeat_last_n: 3,
-            penalty_cache: HashMap::new(),
-        };
-
-        let (result_logits, _duration) =
-            mock_gen.apply_cached_repeat_penalty(logits.clone(), &tokens)?;
-        let result_data = result_logits.to_vec1::<f32>()?;
-
-        // Only token 1 should be penalized, out-of-bounds tokens should be ignored
-        let expected = vec![1.0f32, 1.0, 3.0]; // [1.0, 2.0/2.0, 3.0]
-        assert_eq!(result_data, expected);
-        Ok(())
-    }
-
-    // Test the actual apply_cached_repeat_penalty method from TextGeneration
-    // This test creates a TextGeneration instance with minimal dependencies to test the real method
-    #[test]
-    fn test_actual_apply_cached_repeat_penalty_implementation() -> Result<()> {
-        // Since creating a real TextGeneration instance requires a Model which needs model weights,
-        // we'll create a test that demonstrates the method is now public and can be accessed.
-        // The comprehensive functionality testing is already covered by the mock tests above.
-
-        // Test data setup
-        let device = Device::Cpu;
-        let logits_data = vec![1.0f32, 2.0, 3.0, 4.0, 5.0];
-        let logits = Tensor::new(&logits_data[..], &device)?;
-        let tokens = vec![1u32, 2u32, 3u32];
-
-        // Test that we can create the necessary components
-        let tokenizer = create_test_tokenizer()?;
-
-        // The method is now public as confirmed by making it pub fn apply_cached_repeat_penalty
-        // This test verifies the method signature and that it's accessible from external code
-
-        // We could create a TextGeneration instance if we had a way to mock the Model,
-        // but for now we confirm that the existing mock tests cover the functionality
-        // and the method is properly exposed as public
-
-        println!("apply_cached_repeat_penalty method is now public and accessible for testing");
-        assert!(true);
-        Ok(())
-    }
-
-    // Integration test that demonstrates the method usage pattern
-    #[test]
-    fn test_apply_cached_repeat_penalty_usage_pattern() -> Result<()> {
-        // This test demonstrates how the apply_cached_repeat_penalty method would be used
-        // in practice, even though we can't create a full TextGeneration instance in unit tests
-
-        let device = Device::Cpu;
-        let logits_data = vec![1.5f32, 2.5, 3.5, 4.5, 5.5];
-        let logits = Tensor::new(&logits_data[..], &device)?;
-        let tokens = vec![1u32, 2u32, 1u32, 3u32]; // Repeated token 1 to test caching
-
-        // Test parameters that would be used with TextGeneration
-        let repeat_penalty = 1.2f32;
-        let repeat_last_n = 3usize;
-        let mut penalty_cache: HashMap<usize, f32> = HashMap::new();
-
-        // Simulate the method's logic to verify it works as expected
-        let start_time = std::time::Instant::now();
-
-        if repeat_penalty != 1.0 {
-            let start_at = tokens.len().saturating_sub(repeat_last_n);
-            let penalty_tokens = &tokens[start_at..];
-            let mut logits_vec = logits.to_vec1::<f32>()?;
-
-            for &token_id in penalty_tokens {
-                let token_id = token_id as usize;
-                if token_id < logits_vec.len() {
-                    if let Some(_cached_score) = penalty_cache.get(&token_id) {
-                        // Cache hit simulation
-                    } else {
-                        let score = logits_vec[token_id];
-                        let sign = if score < 0.0 { -1.0 } else { 1.0 };
-                        let penalized_score = sign * score / repeat_penalty;
-                        penalty_cache.insert(token_id, penalized_score);
-                    }
-                }
-            }
-        }
-
-        let _duration = start_time.elapsed();
-
-        // Verify that tokens were processed correctly
-        assert!(penalty_cache.contains_key(&1)); // Token 1 should be cached
-        assert!(penalty_cache.contains_key(&2)); // Token 2 should be cached
-        assert!(penalty_cache.contains_key(&3)); // Token 3 should be cached
-
-        println!("Successfully demonstrated apply_cached_repeat_penalty usage pattern");
-        Ok(())
-    }
-
-    // Note: Testing the actual text generation functionality would require
-    // integration tests with real models, which is beyond the scope of these unit tests.
-    // The tests above focus on the components that can be tested in isolation.
-}
--- a/crates/inference-engine/tests/token_output_stream_tests.rs
+++ b/crates/inference-engine/tests/token_output_stream_tests.rs
@@ -1,135 +0,0 @@
-use anyhow::Result;
-use inference_engine::token_output_stream::TokenOutputStream;
-use std::path::PathBuf;
-use tokenizers::Tokenizer;
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    // Helper function to create a simple tokenizer for testing
-    fn create_test_tokenizer() -> Result<Tokenizer> {
-        // Create a simple tokenizer from the pretrained model
-        // This uses the tokenizer from the Hugging Face hub
-        let tokenizer = Tokenizer::from_pretrained("google/gemma-2b", None).unwrap();
-        Ok(tokenizer)
-    }
-
-    #[test]
-    fn test_new_token_output_stream() -> Result<()> {
-        let tokenizer = create_test_tokenizer()?;
-        let token_stream = TokenOutputStream::new(tokenizer);
-
-        // Check that the token stream was created successfully
-        assert!(token_stream.tokenizer().get_vocab(true).len() > 0);
-        Ok(())
-    }
-
-    #[test]
-    fn test_clear() -> Result<()> {
-        let tokenizer = create_test_tokenizer()?;
-        let mut token_stream = TokenOutputStream::new(tokenizer);
-
-        // Add a token
-        let token_id = token_stream.get_token("<eos>").unwrap();
-        token_stream.next_token(token_id)?;
-
-        // Clear the stream
-        token_stream.clear();
-
-        // Check that the stream is empty by trying to decode all
-        let decoded = token_stream.decode_all()?;
-        assert_eq!(decoded, "");
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_get_token() -> Result<()> {
-        let tokenizer = create_test_tokenizer()?;
-        let token_stream = TokenOutputStream::new(tokenizer);
-
-        // Get a token that should exist
-        let eos_token = token_stream.get_token("<eos>");
-        assert!(eos_token.is_some());
-
-        // Get a token that shouldn't exist
-        let nonexistent_token = token_stream.get_token("<this_token_does_not_exist>");
-        assert!(nonexistent_token.is_none());
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_next_token_and_decode() -> Result<()> {
-        let tokenizer = create_test_tokenizer()?;
-        let mut token_stream = TokenOutputStream::new(tokenizer);
-
-        // Get some tokens
-        let hello_tokens = token_stream
-            .tokenizer()
-            .encode("Hello world", true)
-            .unwrap();
-        let token_ids = hello_tokens.get_ids();
-
-        // Add tokens one by one
-        let mut output = String::new();
-        for &token_id in token_ids {
-            if let Some(text) = token_stream.next_token(token_id)? {
-                output.push_str(&text);
-            }
-        }
-
-        // Get any remaining text
-        if let Some(rest) = token_stream.decode_rest()? {
-            output.push_str(&rest);
-        }
-
-        // Check the output
-        assert!(!output.is_empty());
-        assert_eq!(output.trim(), "Hello world");
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_decode_all() -> Result<()> {
-        let tokenizer = create_test_tokenizer()?;
-        let mut token_stream = TokenOutputStream::new(tokenizer);
-
-        // Get some tokens
-        let hello_tokens = token_stream
-            .tokenizer()
-            .encode("Hello world", true)
-            .unwrap();
-        let token_ids = hello_tokens.get_ids();
-
-        // Add tokens one by one
-        for &token_id in token_ids {
-            token_stream.next_token(token_id)?;
-        }
-
-        // Decode all
-        let decoded = token_stream.decode_all()?;
-
-        // Check the output
-        assert_eq!(decoded.trim(), "Hello world");
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_into_inner() -> Result<()> {
-        let tokenizer = create_test_tokenizer()?;
-        let token_stream = TokenOutputStream::new(tokenizer);
-
-        // Get the inner tokenizer
-        let inner_tokenizer = token_stream.into_inner();
-
-        // Check that the inner tokenizer works
-        let encoded = inner_tokenizer.encode("Test", true).unwrap();
-        assert!(encoded.get_ids().len() > 0);
-
-        Ok(())
-    }
-}
--- a/crates/llama-runner/Cargo.toml
+++ b/crates/llama-runner/Cargo.toml
@@ -18,11 +18,6 @@ candle-core = { git = "https://github.com/huggingface/candle.git", features = ["
 candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
 candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }

-[target.'cfg(not(target_os = "macos"))'.dependencies]
-candle-core = { git = "https://github.com/huggingface/candle.git", features = ["cuda"], optional = true }
-candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["cuda"], optional = true }
-candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["cuda"], optional = true }
-
 [features]
 default = []
 cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]