chat client only displays available models

2025-09-08 22:46:44 +00:00 · 2025-09-01 22:29:54 -04:00
parent 545e0c9831
commit 2deecb5e51
20 changed files with 3314 additions and 484 deletions
--- a/crates/llama-runner/Cargo.toml
+++ b/crates/llama-runner/Cargo.toml
@@ -5,8 +5,8 @@ edition = "2021"

 [dependencies]
 candle-core = { git = "https://github.com/huggingface/candle.git" }
-candle-nn = { git = "https://github.com/huggingface/candle.git" }  
-candle-transformers = { git = "https://github.com/huggingface/candle.git" }
+candle-nn = { git = "https://github.com/huggingface/candle.git" }
+candle-transformers = { git = "https://github.com/huggingface/candle.git"}
 hf-hub = "0.3"
 tokenizers = "0.20"
 anyhow = "1.0"
--- a/crates/llama-runner/src/llama_api.rs
+++ b/crates/llama-runner/src/llama_api.rs
@@ -82,7 +82,7 @@ impl Default for LlamaInferenceConfig {

            // Performance flags
            no_kv_cache: false,   // keep cache ON for speed
-            use_flash_attn: true, // great speed boost if supported
+            use_flash_attn: false, // off by default; enabling it is a great speed boost if supported

            // Precision: bf16 is a good default on Ampere+; fallback to fp16 if needed.
            dtype: Some("bf16".to_string()),