Refactor inference

Supports small Llama and Gemma models. Llama and Gemma inference now live in dedicated runner crates; they are not yet integrated.
geoffsee
2025-08-29 18:15:29 -04:00
parent d06b16bb12
commit 315ef17605
26 changed files with 2136 additions and 1402 deletions
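
Since inference now lives in per-model runner crates, each runner can also be exercised directly through Cargo. A minimal sketch, assuming the crate name llama-runner taken from the build command in the script below, with flags mirroring what the script passes (a gemma-runner counterpart is implied by the commit message but not shown in this hunk):

# run the Llama runner crate directly; flags mirror scripts/run_llama.sh
cargo run -p llama-runner --release -- \
  --model-id meta-llama/Llama-3.2-1B-Instruct \
  --prompt "Say hello in one short sentence." \
  --max-new-tokens 64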

scripts/run_llama.sh (new file, 30 lines)

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
set -euo pipefail

# Positional arguments with defaults: prompt, Hugging Face model ID, max new tokens.
PROMPT=${1:-"Say hello in one short sentence."}
MODEL=${2:-"meta-llama/Llama-3.2-1B-Instruct"}
MAX_NEW=${3:-64}
FORCE_CPU=${FORCE_CPU:-0}

# Optional: keep HF cache local to repo if not already set
export HF_HOME=${HF_HOME:-"$PWD/.hf-cache"}

# Build the runner binary if it is missing or not executable.
BIN="$(dirname "$0")/../target/release/llama_infer"
if [[ ! -x "$BIN" ]]; then
  echo "Building llama-runner (release)..." >&2
  cargo build -p llama-runner --release
fi

echo "Running llama inference..." >&2
ARGS=(
  --model-id "$MODEL"
  --prompt "$PROMPT"
  --max-new-tokens "$MAX_NEW"
)
if [[ "$FORCE_CPU" == "1" || "$FORCE_CPU" == "true" ]]; then
  ARGS+=( --force-cpu )
fi
"$BIN" "${ARGS[@]}"