Support -fa auto in llama-bench (#23714)

* Support `-fa auto` in llama-bench

Make the default value of `-ngl` -1, similar to other tools.

Update README with latest usage and examples

* Address review comments
This commit is contained in:
Gaurav Garg 2026-05-31 02:03:57 +05:30 committed by GitHub
parent d6588daa80
commit aa46bda89b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 108 additions and 51 deletions

View File

@ -26,17 +26,28 @@ options:
-h, --help
--numa <distribute|isolate|numactl> numa mode (default: disabled)
-r, --repetitions <n> number of times to repeat each test (default: 5)
--prio <0|1|2|3> process/thread priority (default: 0)
--prio <-1|0|1|2|3> process/thread priority (default: 0)
--delay <0...N> (seconds) delay between each test (default: 0)
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
--list-devices list available devices and exit
-v, --verbose verbose output
--progress print test progress indicators
--no-warmup skip warmup runs before benchmarking
-fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)
-fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)
-rpc, --rpc <rpc_servers> register RPC devices (comma separated)
test parameters:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive
default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
-hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo
(default: unused)
-hft, --hf-token <token> Hugging Face access token
(default: value from HF_TOKEN environment variable)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: )
@ -49,21 +60,21 @@ test parameters:
-C, --cpu-mask <hex,hex> (default: 0x0)
--cpu-strict <0|1> (default: 0)
--poll <0...100> (default: 50)
-ngl, --n-gpu-layers <n> (default: 99)
-ngl, --n-gpu-layers <n> (default: -1)
-ncmoe, --n-cpu-moe <n> (default: 0)
-sm, --split-mode <none|layer|row> (default: layer)
-sm, --split-mode <none|layer|row|tensor> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-fa, --flash-attn <on|off|auto> (default: auto)
-dev, --device <dev0/dev1/...> (default: auto)
-mmp, --mmap <0|1> (default: 1)
-dio, --direct-io <0|1> (default: 0)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-ot --override-tensors <tensor name pattern>=<buffer type>;...
-ot --override-tensor <tensor name pattern>=<buffer type>;...
(default: disabled)
-nopo, --no-op-offload <0|1> (default: 0)
-fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)
-fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)
--no-host <0|1> (default: 0)
Multiple values can be given for each parameter by separating them with ','
or by specifying the parameter multiple times. Ranges can be given as
@ -97,12 +108,12 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 132.19 ± 0.55 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 256 | 129.37 ± 0.54 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 512 | 123.83 ± 0.25 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 128 | 82.17 ± 0.31 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 256 | 80.74 ± 0.23 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 512 | 78.08 ± 0.07 |
### Prompt processing with different batch sizes
@ -112,10 +123,10 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
| model | size | params | backend | ngl | n_batch | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 128 | pp 1024 | 1436.51 ± 3.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 256 | pp 1024 | 1932.43 ± 23.48 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 512 | pp 1024 | 2254.45 ± 15.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 1024 | pp 1024 | 2498.61 ± 13.58 |
### Different numbers of threads
@ -171,10 +182,10 @@ $ ./llama-bench -d 0,512
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 | 7340.20 ± 23.45 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 | 120.60 ± 0.59 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 @ d512 | 6425.91 ± 18.88 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 @ d512 | 116.71 ± 0.60 |
## Output formats
@ -188,8 +199,8 @@ $ ./llama-bench -o md
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | pp 512 | 2368.80 ± 93.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 131.42 ± 0.59 |
### CSV
@ -198,9 +209,9 @@ $ ./llama-bench -o csv
```
```csv
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,n_cpu_moe,split_mode,main_gpu,no_kv_offload,flash_attn,devices,tensor_split,tensor_buft_overrides,use_mmap,use_direct_io,embeddings,no_op_offload,no_host,fit_target,fit_min_ctx,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
```
### JSON
@ -229,14 +240,22 @@ $ ./llama-bench -o json
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"n_gpu_layers": 99,
"n_gpu_layers": -1,
"n_cpu_moe": 0,
"split_mode": "layer",
"main_gpu": 0,
"no_kv_offload": false,
"flash_attn": false,
"flash_attn": -1,
"devices": "auto",
"tensor_split": "0.00",
"tensor_buft_overrides": "none",
"use_mmap": true,
"use_direct_io": false,
"embeddings": false,
"no_op_offload": 0,
"no_host": false,
"fit_target": 0,
"fit_min_ctx": 0,
"n_prompt": 512,
"n_gen": 0,
"n_depth": 0,
@ -266,14 +285,22 @@ $ ./llama-bench -o json
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"n_gpu_layers": 99,
"n_gpu_layers": -1,
"n_cpu_moe": 0,
"split_mode": "layer",
"main_gpu": 0,
"no_kv_offload": false,
"flash_attn": false,
"flash_attn": -1,
"devices": "auto",
"tensor_split": "0.00",
"tensor_buft_overrides": "none",
"use_mmap": true,
"use_direct_io": false,
"embeddings": false,
"no_op_offload": 0,
"no_host": false,
"fit_target": 0,
"fit_min_ctx": 0,
"n_prompt": 0,
"n_gen": 128,
"n_depth": 0,
@ -296,8 +323,8 @@ $ ./llama-bench -o jsonl
```
```json lines
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
```
@ -310,7 +337,7 @@ $ ./llama-bench -o sql
```
```sql
CREATE TABLE IF NOT EXISTS test (
CREATE TABLE IF NOT EXISTS llama_bench (
build_commit TEXT,
build_number INTEGER,
cpu_info TEXT,
@ -329,13 +356,21 @@ CREATE TABLE IF NOT EXISTS test (
type_k TEXT,
type_v TEXT,
n_gpu_layers INTEGER,
n_cpu_moe INTEGER,
split_mode TEXT,
main_gpu INTEGER,
no_kv_offload INTEGER,
flash_attn INTEGER,
devices TEXT,
tensor_split TEXT,
tensor_buft_overrides TEXT,
use_mmap INTEGER,
use_direct_io INTEGER,
embeddings INTEGER,
no_op_offload INTEGER,
no_host INTEGER,
fit_target INTEGER,
fit_min_ctx INTEGER,
n_prompt INTEGER,
n_gen INTEGER,
n_depth INTEGER,
@ -346,6 +381,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
```

View File

@ -19,6 +19,7 @@
#include <vector>
#include <unordered_set>
#include "arg.h"
#include "build-info.h"
#include "common.h"
#include "download.h"
@ -275,9 +276,11 @@ static std::string pair_str(const std::pair<int, int> & p) {
return buf;
}
static std::vector<int> parse_int_range(const std::string & s) {
static std::vector<int> parse_int_range(const std::string & s, bool allow_negative = false) {
// first[-last[(+|*)step]]
std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
std::regex range_regex(allow_negative
? R"(^(-?\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"
: R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
std::smatch match;
std::string::const_iterator search_start(s.cbegin());
@ -337,7 +340,7 @@ struct cmd_params {
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<llama_flash_attn_type> flash_attn;
std::vector<std::vector<ggml_backend_dev_t>> devices;
std::vector<std::vector<float>> tensor_split;
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
@ -376,12 +379,12 @@ static const cmd_params cmd_params_defaults = {
/* cpu_mask */ { "0x0" },
/* cpu_strict */ { false },
/* poll */ { 50 },
/* n_gpu_layers */ { 99 },
/* n_gpu_layers */ { -1 },
/* n_cpu_moe */ { 0 },
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
/* main_gpu */ { 0 },
/* no_kv_offload */ { false },
/* flash_attn */ { false },
/* flash_attn */ { LLAMA_FLASH_ATTN_TYPE_AUTO },
/* devices */ { {} },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
@ -451,7 +454,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -sm, --split-mode <none|layer|row|tensor> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -fa, --flash-attn <on|off|auto> (default: %s)\n", join(transform_to_str(cmd_params_defaults.flash_attn, llama_flash_attn_type_name), ",").c_str());
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
@ -710,7 +713,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = parse_int_range(argv[i]);
auto p = parse_int_range(argv[i], /*allow_negative=*/true);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
if (++i >= argc) {
@ -793,8 +796,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<llama_flash_attn_type> types;
for (const auto & v : p) {
llama_flash_attn_type type;
if (common_arg_utils::is_truthy(v)) {
type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
} else if (common_arg_utils::is_falsey(v)) {
type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
} else if (common_arg_utils::is_autoy(v)) {
type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
invalid_param = true;
break;
}
types.push_back(type);
}
if (invalid_param) {
break;
}
params.flash_attn.insert(params.flash_attn.end(), types.begin(), types.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
@ -1138,7 +1160,7 @@ struct cmd_params_instance {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
llama_flash_attn_type flash_attn;
std::vector<ggml_backend_dev_t> devices;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
@ -1222,7 +1244,7 @@ struct cmd_params_instance {
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
cparams.flash_attn_type = flash_attn;
cparams.embeddings = embeddings;
cparams.op_offload = !no_op_offload;
cparams.swa_full = false;
@ -1400,7 +1422,7 @@ struct test {
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
llama_flash_attn_type flash_attn;
std::vector<ggml_backend_dev_t> devices;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
@ -1522,10 +1544,10 @@ struct test {
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" ||
field == "fit_target" || field == "fit_min_ctx") {
field == "fit_target" || field == "fit_min_ctx" || field == "flash_attn") {
return INT;
}
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" ||
field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
return BOOL;
}
@ -1594,7 +1616,7 @@ struct test {
split_mode_str(split_mode),
std::to_string(main_gpu),
std::to_string(no_kv_offload),
std::to_string(flash_attn),
std::to_string((int) flash_attn),
devices_to_string(devices),
tensor_split_str,
tensor_buft_overrides_str,
@ -1779,7 +1801,7 @@ struct markdown_printer : public printer {
return 6;
}
if (field == "flash_attn") {
return 2;
return 3;
}
if (field == "devices") {
return -12;