Refactor inference

Supports small Llama and Gemma models. Llama and Gemma inference now live in dedicated runner crates; they are not yet integrated.
geoffsee
2025-08-29 18:15:29 -04:00
parent d06b16bb12
commit 315ef17605
26 changed files with 2136 additions and 1402 deletions
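
Since inference now lives in per-model runner crates, each runner can also be exercised directly through Cargo. A minimal sketch, assuming the crate name llama-runner taken from the build command in the script below, with flags mirroring what the script passes (a gemma-runner counterpart is implied by the commit message but not shown in this hunk):

# run the Llama runner crate directly; flags mirror scripts/run_llama.sh
cargo run -p llama-runner --release -- \
  --model-id meta-llama/Llama-3.2-1B-Instruct \
  --prompt "Say hello in one short sentence." \
  --max-new-tokens 64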

scripts/run_llama.sh (new file, 30 lines)

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
set -euo pipefail

# Positional arguments with defaults: prompt, Hugging Face model ID, max new tokens.
PROMPT=${1:-"Say hello in one short sentence."}
MODEL=${2:-"meta-llama/Llama-3.2-1B-Instruct"}
MAX_NEW=${3:-64}
FORCE_CPU=${FORCE_CPU:-0}

# Optional: keep HF cache local to repo if not already set
export HF_HOME=${HF_HOME:-"$PWD/.hf-cache"}

# Build the runner binary if it is missing or not executable.
BIN="$(dirname "$0")/../target/release/llama_infer"
if [[ ! -x "$BIN" ]]; then
  echo "Building llama-runner (release)..." >&2
  cargo build -p llama-runner --release
fi

echo "Running llama inference..." >&2
ARGS=(
  --model-id "$MODEL"
  --prompt "$PROMPT"
  --max-new-tokens "$MAX_NEW"
)
if [[ "$FORCE_CPU" == "1" || "$FORCE_CPU" == "true" ]]; then
  ARGS+=( --force-cpu )
fi
"$BIN" "${ARGS[@]}"