supports small llama and gemma models

Refactor inference into dedicated crates for llama and gemma inferencing (not integrated)
2025-09-08 22:46:44 +00:00 · 2025-08-29 18:15:29 -04:00
parent d06b16bb12
commit 315ef17605
26 changed files with 2136 additions and 1402 deletions
--- a/crates/llama-runner/Cargo.toml
+++ b/crates/llama-runner/Cargo.toml
@@ -0,0 +1,24 @@
+[package]  # manifest for the crate at crates/llama-runner
+name = "llama-runner"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]  # sorted alphabetically; the candle git entries name no rev/tag/branch
+anyhow = "1.0"
+candle-core = { git = "https://github.com/huggingface/candle.git" }
+candle-nn = { git = "https://github.com/huggingface/candle.git" }
+candle-transformers = { git = "https://github.com/huggingface/candle.git" }
+clap = { version = "4.0", features = ["derive", "string"] }
+hf-hub = "0.3"
+serde_json = "1.0"
+tokenizers = "0.20"
+
+[target.'cfg(target_os = "macos")'.dependencies]  # macOS only: same candle crates, with their "metal" feature enabled
+candle-core = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
+candle-nn = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
+candle-transformers = { git = "https://github.com/huggingface/candle.git", features = ["metal"] }
+
+[features]  # opt-in features; each one forwards to the same-named feature of all three candle crates
+default = []  # nothing is enabled by default
+cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
+metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]  # NOTE(review): looks redundant on macOS, where the target table already requests "metal" — confirm