mirror of
https://github.com/geoffsee/predict-otron-9001.git
synced 2025-09-08 22:46:44 +00:00
supports small llama and gemma models
Refactor inference into dedicated crates for llama and gemma inference (not yet integrated)
This commit is contained in:
30
scripts/run_llama.sh
Normal file
30
scripts/run_llama.sh
Normal file
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
# Run a one-off llama inference via the llama-runner release binary.
#
# Usage: run_llama.sh [PROMPT] [MODEL_ID] [MAX_NEW_TOKENS]
# Env:
#   FORCE_CPU=1|true  append --force-cpu to the inference args
#   HF_HOME           Hugging Face cache dir (defaults to $PWD/.hf-cache)
set -euo pipefail

PROMPT=${1:-"Say hello in one short sentence."}
MODEL=${2:-"meta-llama/Llama-3.2-1B-Instruct"}
MAX_NEW=${3:-64}
FORCE_CPU=${FORCE_CPU:-0}

# Optional: keep HF cache local to repo if not already set.
export HF_HOME=${HF_HOME:-"$PWD/.hf-cache"}

BIN="$(dirname "$0")/../target/release/llama_infer"

if [[ ! -x "$BIN" ]]; then
  # Diagnostics go to stderr so stdout stays clean for model output.
  echo "Building llama-runner (release)..." >&2
  cargo build -p llama-runner --release
  # Fail fast with a clear message if the build did not produce the
  # expected binary (e.g. the cargo target name diverged from $BIN).
  if [[ ! -x "$BIN" ]]; then
    echo "error: build succeeded but $BIN was not produced" >&2
    exit 1
  fi
fi

echo "Running llama inference..." >&2
ARGS=(
  --model-id "$MODEL"
  --prompt "$PROMPT"
  --max-new-tokens "$MAX_NEW"
)

if [[ "$FORCE_CPU" == "1" || "$FORCE_CPU" == "true" ]]; then
  ARGS+=( --force-cpu )
fi

"$BIN" "${ARGS[@]}"
Reference in New Issue
Block a user