My goal has always been to be productive with commodity hardware. So far my workhorses have been the MoE editions of Gemma 4 and Qwen 3.6 on an old desktop with a single 9060 XT with 16GB of RAM.
The problem has always been that every source is vague about VRAM/RAM requirements. Models are trained at 16 bits, many guides suggest the fast but obviously gimped Q4, while most people tend to get good results at Q6 or Q8, and this makes RAM requirements hard to predict. So I decided to build a script that parses the verbose output of llama.cpp and gives an easy-to-review summary.
You can see the output on the attached image. It reads all buffer allocations, groups them by function and backend, and provides useful sums to help you realize what is going on in your setup and plan accordingly.
I also get a few easy-to-grok stats that everyone should appreciate, like t/s or MTP performance.
Below is the actual script. It expects Linux, and that your llama.cpp command sits in a script called run.sh that includes the -v flag for verbose output.
The script was vibe coded with ChatGPT and probably still needs some work to help with a more graceful shutdown.
I hope you guys find it useful.
#!/usr/bin/env bash
#
# Live monitor for llama.cpp.
#
# Launches RUN_SCRIPT (a wrapper around your llama.cpp command) with "-v"
# appended, copies everything it prints into LOG_FILE, scrapes the stderr
# stream for buffer allocations, model metadata and throughput figures, and
# redraws a summary every INTERVAL seconds until the wrapper exits.
#
# Usage:
#   ./this-script [arguments forwarded to RUN_SCRIPT]
#
# Environment (all optional):
#   RUN_SCRIPT  command to launch                  (default ./run.sh)
#   LOG_FILE    combined stdout/stderr log         (default /tmp/llama-run.log)
#   MEM_FILE    scraped "backend:type<TAB>MiB"     (default /tmp/llama-mem.tsv)
#   STAT_FILE   scraped "key<TAB>value" statistics (default /tmp/llama-stats.tsv)
#   INFO_FILE   scraped "key<TAB>value" model info (default /tmp/llama-info.tsv)
#   INTERVAL    seconds between screen refreshes   (default 2)
#
# The four files are truncated on start-up. The defaults are fixed names under
# /tmp; point the variables elsewhere on a shared machine.
#
# Exit status: that of RUN_SCRIPT, 127 if RUN_SCRIPT cannot be executed,
# 130/143 when interrupted by SIGINT/SIGTERM.

set -euo pipefail

RUN_SCRIPT="${RUN_SCRIPT:-./run.sh}"
LOG_FILE="${LOG_FILE:-/tmp/llama-run.log}"
MEM_FILE="${MEM_FILE:-/tmp/llama-mem.tsv}"
STAT_FILE="${STAT_FILE:-/tmp/llama-stats.tsv}"
INFO_FILE="${INFO_FILE:-/tmp/llama-info.tsv}"
INTERVAL="${INTERVAL:-2}"

# PID of the background RUN_SCRIPT job; empty until it has been launched.
LLAMA_PID=""

#######################################
# Print a message on stderr and exit.
# Arguments: $1 - exit status, $2... - message
#######################################
die() {
  local code=$1
  shift
  printf '%s\n' "$*" >&2
  exit "$code"
}

#######################################
# Filter: turn "<prefix> <backend> <type> buffer size = <n> MiB" lines into
# "<backend>:<type><TAB><n>". Lines that do not match produce no output.
# Inputs:  log lines on stdin
# Outputs: TSV records on stdout
#######################################
parse_buffer_line() {
  # A literal tab is spliced in because "\t" in a sed replacement is a GNU
  # extension.
  local tab=$'\t'
  sed -nE "s/.* ([A-Za-z0-9_]+)[[:space:]]+([A-Za-z]+) buffer size =[[:space:]]*([0-9.]+) MiB.*/\\1:\\2${tab}\\3/p"
}

#######################################
# Filter: extract the model name and quantisation.
# Recognises the loader "general.name" line, the "model ftype" line and a
# JSON fragment of the form "model":"<name>[:<quant>]".
# Inputs:  log lines on stdin
# Outputs: "model_name<TAB>..." / "model_quant<TAB>..." records on stdout
#######################################
parse_info_line() {
  awk '
    /llama_model_loader:/ && /general.name/ {
      line = $0
      sub(/.*general.name[[:space:]]+str[[:space:]]*=[[:space:]]*/, "", line)
      if (line != "") print "model_name\t" line
    }

    /llm_load_print_meta:/ && /model ftype/ {
      line = $0
      sub(/.*model ftype[[:space:]]*=[[:space:]]*/, "", line)
      if (line != "") print "model_quant\t" line
    }

    /"model":/ {
      line = $0
      if (match(line, /"model":"[^"]+"/)) {
        # Skip the 9 characters of the key prefix and drop the closing quote.
        model = substr(line, RSTART + 9, RLENGTH - 10)
        print "model_name\t" model
        # A trailing ":<tag>" on the model id is reported as the quant.
        if (match(model, /:([^:]+)$/)) {
          q = substr(model, RSTART + 1, RLENGTH - 1)
          print "model_quant\t" q
        }
      }
    }
  '
}

#######################################
# Filter: extract runtime statistics (prompt/generation tokens per second,
# context size and usage, draft/MTP acceptance figures).
# Inputs:  log lines on stdin
# Outputs: "key<TAB>value" records on stdout
#######################################
parse_stat_line() {
  awk '
    # Reduce "... ( 1.23 ms per token, 81.00 tokens per second)" to "81.00".
    function tps(text) {
      sub(/.*\(/, "", text)
      sub(/[[:space:]]*tokens per second.*/, "", text)
      sub(/.*,[[:space:]]*/, "", text)
      return text
    }

    /prompt eval time/ && /tokens per second/ {
      print "pp_tps\t" tps($0)
    }

    /eval time/ && !/prompt eval time/ && /tokens per second/ {
      print "tg_tps\t" tps($0)
    }

    /prompt_per_second/ {
      line = $0
      if (match(line, /"prompt_per_second":[0-9.]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*:/, "", v)
        print "pp_tps\t" v
      }
    }

    /predicted_per_second/ {
      line = $0
      if (match(line, /"predicted_per_second":[0-9.]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*:/, "", v)
        print "tg_tps\t" v
      }
    }

    /n_ctx[[:space:]]*=/ {
      line = $0
      sub(/.*n_ctx[[:space:]]*=[[:space:]]*/, "", line)
      sub(/[^0-9].*/, "", line)
      if (line != "") print "n_ctx\t" line
    }

    /n_tokens[[:space:]]*=/ {
      line = $0
      sub(/.*n_tokens[[:space:]]*=[[:space:]]*/, "", line)
      sub(/[^0-9].*/, "", line)
      if (line != "") print "ctx_used\t" line
    }

    /draft acceptance[[:space:]]*=/ {
      line = $0
      sub(/.*draft acceptance[[:space:]]*=[[:space:]]*/, "", line)
      sub(/[[:space:]].*/, "", line)
      print "mtp_acceptance\t" line
    }

    /accepted[[:space:]]+[0-9]+\/[0-9]+ draft tokens/ {
      line = $0
      sub(/.*accepted[[:space:]]+/, "", line)
      sub(/[[:space:]]+draft tokens.*/, "", line)
      print "mtp_last_accept\t" line
    }

    /statistics[[:space:]]+draft-mtp:/ {
      line = $0
      if (match(line, /#gen tokens =[[:space:]]*[0-9]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*=[[:space:]]*/, "", v)
        print "mtp_gen_tokens\t" v
      }
      if (match(line, /#acc tokens =[[:space:]]*[0-9]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*=[[:space:]]*/, "", v)
        print "mtp_acc_tokens\t" v
      }
      if (match(line, /#mean acc len =[[:space:]]*[0-9.]+/)) {
        v = substr(line, RSTART, RLENGTH)
        sub(/.*=[[:space:]]*/, "", v)
        print "mtp_mean_len\t" v
      }
    }
  '
}

#######################################
# Read the stderr stream line by line and append whatever the three parsers
# extract to MEM_FILE, INFO_FILE and STAT_FILE.
# Globals: MEM_FILE, INFO_FILE, STAT_FILE (appended to)
# Inputs:  log lines on stdin
#######################################
scrape_stream() {
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Verbose logs are chatty and every parser call forks a process, so a
    # cheap builtin match decides first whether a parser could emit anything.
    # Each glob is a superset of what the corresponding parser accepts.
    # Parser failures are deliberately ignored: scraping is best-effort and
    # must never take the reader (and with it the stderr pipe) down.
    if [[ "$line" == *' buffer size ='* ]]; then
      printf '%s\n' "$line" | parse_buffer_line >>"$MEM_FILE" || true
    fi

    if [[ "$line" == *general?name* \
      || "$line" == *'model ftype'* \
      || "$line" == *'"model":'* ]]; then
      printf '%s\n' "$line" | parse_info_line >>"$INFO_FILE" || true
    fi

    if [[ "$line" == *'tokens per second'* \
      || "$line" == *prompt_per_second* \
      || "$line" == *predicted_per_second* \
      || "$line" == *n_ctx* \
      || "$line" == *n_tokens* \
      || "$line" == *'draft acceptance'* \
      || "$line" == *' draft tokens'* \
      || "$line" == *draft-mtp:* ]]; then
      printf '%s\n' "$line" | parse_stat_line >>"$STAT_FILE" || true
    fi
  done
}

#######################################
# Print the model name and quantisation; the last record per key wins.
# Globals: INFO_FILE (read)
#######################################
render_info() {
  awk -F '\t' '
    # Presence, not truthiness, decides between the value and "-".
    function show(label, key,    value) {
      value = "-"
      if ((key in info) && info[key] != "") value = info[key]
      printf "%-20s %s\n", label, value
    }

    { info[$1] = $2 }

    END {
      show("Name", "model_name")
      show("Quant", "model_quant")
    }
  ' "$INFO_FILE"
}

#######################################
# Print the runtime statistics; the last record per key wins.
# Globals: STAT_FILE (read)
#######################################
render_stats() {
  awk -F '\t' '
    # Presence, not truthiness, decides between the value and "-", so that
    # legitimate zeros (0 tokens used, 0.00 acceptance) are still displayed.
    function have(key) {
      return (key in stat) && stat[key] != ""
    }

    function show(label, key,    value) {
      value = "-"
      if (have(key)) value = stat[key]
      printf "%-20s %s\n", label, value
    }

    { stat[$1] = $2 }

    END {
      show("Prompt eval t/s", "pp_tps")
      show("Token gen t/s", "tg_tps")
      show("Context used", "ctx_used")
      show("Context size", "n_ctx")
      show("MTP acceptance", "mtp_acceptance")

      accepted = "-"
      if (have("mtp_acc_tokens") && have("mtp_gen_tokens")) {
        accepted = stat["mtp_acc_tokens"] "/" stat["mtp_gen_tokens"]
      }
      printf "%-20s %s\n", "MTP accepted", accepted

      show("MTP mean len", "mtp_mean_len")
      show("MTP last accept", "mtp_last_accept")
    }
  ' "$STAT_FILE"
}

#######################################
# Print every buffer plus totals per backend, per allocation type and overall.
# Only the most recent size reported for each "backend:type" pair is counted.
# NOTE(review): if llama.cpp reports several distinct buffers under the same
# backend and type, only the last one is counted - confirm against your log.
# Globals: MEM_FILE (read)
#######################################
render_memory() {
  if [[ ! -s "$MEM_FILE" ]]; then
    echo "Waiting for buffer allocation lines..."
    return 0
  fi

  awk -F '\t' '
    # Fill out[1..n] with the indices of src in ascending string order and
    # return n. Hand-written insertion sort because asorti() exists only in
    # gawk, while "awk" is frequently mawk or busybox.
    function sort_keys(src, out,    n, k, i, j, held) {
      n = 0
      for (k in src) out[++n] = k
      for (i = 2; i <= n; i++) {
        held = out[i]
        for (j = i - 1; j >= 1 && out[j] > held; j--) out[j + 1] = out[j]
        out[j + 1] = held
      }
      return n
    }

    function table_header(title, underline) {
      printf "%-40s %12s\n", title, "MiB"
      printf "%-40s %12s\n", underline, "---"
    }

    { latest[$1] = $2 }

    END {
      grand = 0

      n = sort_keys(latest, keys)
      table_header("Buffer", "------")
      for (i = 1; i <= n; i++) {
        key = keys[i]
        mib = latest[key] + 0
        split(key, parts, ":")
        backend_total[parts[1]] += mib
        type_total[parts[2]] += mib
        grand += mib
        printf "%-40s %12.2f\n", key, mib
      }

      printf "\n"
      table_header("Backend totals", "--------------")
      m = sort_keys(backend_total, backend_keys)
      for (i = 1; i <= m; i++) {
        printf "%-40s %12.2f\n", backend_keys[i], backend_total[backend_keys[i]]
      }

      printf "\n"
      table_header("Allocation totals", "-----------------")
      t = sort_keys(type_total, type_keys)
      for (i = 1; i <= t; i++) {
        printf "%-40s %12.2f\n", type_keys[i], type_total[type_keys[i]]
      }

      printf "\n"
      printf "%-40s %12.2f MiB\n", "Grand total explicit", grand
      printf "%-40s %12.2f GiB\n", "Grand total explicit", grand / 1024
    }
  ' "$MEM_FILE"
}

#######################################
# Redraw the whole summary screen.
# Globals: LLAMA_PID, LOG_FILE (read)
#######################################
render_screen() {
  # clear fails without a usable terminal (TERM unset, output redirected);
  # that must not abort the monitor under "set -e".
  clear 2>/dev/null || true

  echo "llama.cpp monitor"
  echo "PID: $LLAMA_PID"
  echo "Log: $LOG_FILE"
  echo

  echo "Model info"
  echo "----------"
  render_info
  echo

  echo "Runtime stats"
  echo "-------------"
  render_stats
  echo

  echo "Memory buffers"
  echo "--------------"
  render_memory
}

#######################################
# EXIT handler: ask the launched job to terminate and wait for it.
# Globals: LLAMA_PID (read)
#######################################
stop_llama() {
  [[ -n "$LLAMA_PID" ]] || return 0

  if kill -0 "$LLAMA_PID" 2>/dev/null; then
    # When RUN_SCRIPT does not "exec" the server, the server is a child of
    # the PID we hold; signal the children too so it is not left behind.
    if command -v pkill >/dev/null 2>&1; then
      pkill -TERM -P "$LLAMA_PID" 2>/dev/null || true
    fi
    kill -TERM "$LLAMA_PID" 2>/dev/null || true
  fi

  # Collect the job so the server gets the chance to shut down cleanly. The
  # status is irrelevant here (it may already have been collected).
  wait "$LLAMA_PID" 2>/dev/null || true
}

#######################################
# Launch RUN_SCRIPT in the background and refresh the summary until it exits.
# Arguments: forwarded to RUN_SCRIPT, followed by "-v"
# Returns:   the exit status of RUN_SCRIPT
#######################################
main() {
  # Everything the child prints goes to the log, so a missing script would
  # otherwise fail without any visible message.
  command -v "$RUN_SCRIPT" >/dev/null 2>&1 \
    || die 127 "cannot execute RUN_SCRIPT: $RUN_SCRIPT"

  : >"$LOG_FILE"
  : >"$MEM_FILE"
  : >"$STAT_FILE"
  : >"$INFO_FILE"

  # Signals are funnelled through exit so that clean-up lives in one place.
  trap stop_llama EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  # stdout is only logged; stderr is logged and scraped.
  "$RUN_SCRIPT" "$@" -v \
    > >(tee -a "$LOG_FILE" >/dev/null) \
    2> >(tee -a "$LOG_FILE" | scrape_stream) &
  LLAMA_PID=$!

  while kill -0 "$LLAMA_PID" 2>/dev/null; do
    render_screen
    sleep "$INTERVAL"
  done

  local status=0
  wait "$LLAMA_PID" || status=$?

  # One last refresh so figures logged during the final interval are shown.
  render_screen
  if ((status != 0)); then
    printf '\n%s exited with status %d; see %s\n' \
      "$RUN_SCRIPT" "$status" "$LOG_FILE" >&2
  fi
  return "$status"
}

main "$@"
submitted by
Discussion (0)
Sign in to join the discussion. Free account, 30 seconds — email code or GitHub.
Sign in →No comments yet. Sign in and be the first to say something.