mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
Support -fa auto in llama-bench (#23714)
* Support `-fa auto` in llama-bench Make the default value of `-ngl` -1, similar to other tools. Update README with latest usage and examples * Address review comments
This commit is contained in:
parent
d6588daa80
commit
aa46bda89b
@ -26,17 +26,28 @@ options:
|
||||
-h, --help
|
||||
--numa <distribute|isolate|numactl> numa mode (default: disabled)
|
||||
-r, --repetitions <n> number of times to repeat each test (default: 5)
|
||||
--prio <0|1|2|3> process/thread priority (default: 0)
|
||||
--prio <-1|0|1|2|3> process/thread priority (default: 0)
|
||||
--delay <0...N> (seconds) delay between each test (default: 0)
|
||||
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
|
||||
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
|
||||
--list-devices list available devices and exit
|
||||
-v, --verbose verbose output
|
||||
--progress print test progress indicators
|
||||
--no-warmup skip warmup runs before benchmarking
|
||||
-fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)
|
||||
-fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)
|
||||
-rpc, --rpc <rpc_servers> register RPC devices (comma separated)
|
||||
|
||||
test parameters:
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive
|
||||
default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
|
||||
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
|
||||
(default: unused)
|
||||
-hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo
|
||||
(default: unused)
|
||||
-hft, --hf-token <token> Hugging Face access token
|
||||
(default: value from HF_TOKEN environment variable)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-pg <pp,tg> (default: )
|
||||
@ -49,21 +60,21 @@ test parameters:
|
||||
-C, --cpu-mask <hex,hex> (default: 0x0)
|
||||
--cpu-strict <0|1> (default: 0)
|
||||
--poll <0...100> (default: 50)
|
||||
-ngl, --n-gpu-layers <n> (default: 99)
|
||||
-ngl, --n-gpu-layers <n> (default: -1)
|
||||
-ncmoe, --n-cpu-moe <n> (default: 0)
|
||||
-sm, --split-mode <none|layer|row> (default: layer)
|
||||
-sm, --split-mode <none|layer|row|tensor> (default: layer)
|
||||
-mg, --main-gpu <i> (default: 0)
|
||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||
-fa, --flash-attn <0|1> (default: 0)
|
||||
-fa, --flash-attn <on|off|auto> (default: auto)
|
||||
-dev, --device <dev0/dev1/...> (default: auto)
|
||||
-mmp, --mmap <0|1> (default: 1)
|
||||
-dio, --direct-io <0|1> (default: 0)
|
||||
-embd, --embeddings <0|1> (default: 0)
|
||||
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
||||
-ot --override-tensors <tensor name pattern>=<buffer type>;...
|
||||
-ot --override-tensor <tensor name pattern>=<buffer type>;...
|
||||
(default: disabled)
|
||||
-nopo, --no-op-offload <0|1> (default: 0)
|
||||
-fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)
|
||||
-fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)
|
||||
--no-host <0|1> (default: 0)
|
||||
|
||||
Multiple values can be given for each parameter by separating them with ','
|
||||
or by specifying the parameter multiple times. Ranges can be given as
|
||||
@ -97,12 +108,12 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 132.19 ± 0.55 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 256 | 129.37 ± 0.54 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 512 | 123.83 ± 0.25 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 128 | 82.17 ± 0.31 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 256 | 80.74 ± 0.23 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 512 | 78.08 ± 0.07 |
|
||||
|
||||
### Prompt processing with different batch sizes
|
||||
|
||||
@ -112,10 +123,10 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
|
||||
|
||||
| model | size | params | backend | ngl | n_batch | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 128 | pp 1024 | 1436.51 ± 3.66 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 256 | pp 1024 | 1932.43 ± 23.48 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 512 | pp 1024 | 2254.45 ± 15.59 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 1024 | pp 1024 | 2498.61 ± 13.58 |
|
||||
|
||||
### Different numbers of threads
|
||||
|
||||
@ -171,10 +182,10 @@ $ ./llama-bench -d 0,512
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 | 7340.20 ± 23.45 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 | 120.60 ± 0.59 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 @ d512 | 6425.91 ± 18.88 |
|
||||
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 @ d512 | 116.71 ± 0.60 |
|
||||
|
||||
## Output formats
|
||||
|
||||
@ -188,8 +199,8 @@ $ ./llama-bench -o md
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | pp 512 | 2368.80 ± 93.24 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 131.42 ± 0.59 |
|
||||
|
||||
### CSV
|
||||
|
||||
@ -198,9 +209,9 @@ $ ./llama-bench -o csv
|
||||
```
|
||||
|
||||
```csv
|
||||
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
|
||||
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,n_cpu_moe,split_mode,main_gpu,no_kv_offload,flash_attn,devices,tensor_split,tensor_buft_overrides,use_mmap,use_direct_io,embeddings,no_op_offload,no_host,fit_target,fit_min_ctx,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
|
||||
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
|
||||
```
|
||||
|
||||
### JSON
|
||||
@ -229,14 +240,22 @@ $ ./llama-bench -o json
|
||||
"poll": 50,
|
||||
"type_k": "f16",
|
||||
"type_v": "f16",
|
||||
"n_gpu_layers": 99,
|
||||
"n_gpu_layers": -1,
|
||||
"n_cpu_moe": 0,
|
||||
"split_mode": "layer",
|
||||
"main_gpu": 0,
|
||||
"no_kv_offload": false,
|
||||
"flash_attn": false,
|
||||
"flash_attn": -1,
|
||||
"devices": "auto",
|
||||
"tensor_split": "0.00",
|
||||
"tensor_buft_overrides": "none",
|
||||
"use_mmap": true,
|
||||
"use_direct_io": false,
|
||||
"embeddings": false,
|
||||
"no_op_offload": 0,
|
||||
"no_host": false,
|
||||
"fit_target": 0,
|
||||
"fit_min_ctx": 0,
|
||||
"n_prompt": 512,
|
||||
"n_gen": 0,
|
||||
"n_depth": 0,
|
||||
@ -266,14 +285,22 @@ $ ./llama-bench -o json
|
||||
"poll": 50,
|
||||
"type_k": "f16",
|
||||
"type_v": "f16",
|
||||
"n_gpu_layers": 99,
|
||||
"n_gpu_layers": -1,
|
||||
"n_cpu_moe": 0,
|
||||
"split_mode": "layer",
|
||||
"main_gpu": 0,
|
||||
"no_kv_offload": false,
|
||||
"flash_attn": false,
|
||||
"flash_attn": -1,
|
||||
"devices": "auto",
|
||||
"tensor_split": "0.00",
|
||||
"tensor_buft_overrides": "none",
|
||||
"use_mmap": true,
|
||||
"use_direct_io": false,
|
||||
"embeddings": false,
|
||||
"no_op_offload": 0,
|
||||
"no_host": false,
|
||||
"fit_target": 0,
|
||||
"fit_min_ctx": 0,
|
||||
"n_prompt": 0,
|
||||
"n_gen": 128,
|
||||
"n_depth": 0,
|
||||
@ -296,8 +323,8 @@ $ ./llama-bench -o jsonl
|
||||
```
|
||||
|
||||
```json lines
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
|
||||
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
|
||||
```
|
||||
|
||||
|
||||
@ -310,7 +337,7 @@ $ ./llama-bench -o sql
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS test (
|
||||
CREATE TABLE IF NOT EXISTS llama_bench (
|
||||
build_commit TEXT,
|
||||
build_number INTEGER,
|
||||
cpu_info TEXT,
|
||||
@ -329,13 +356,21 @@ CREATE TABLE IF NOT EXISTS test (
|
||||
type_k TEXT,
|
||||
type_v TEXT,
|
||||
n_gpu_layers INTEGER,
|
||||
n_cpu_moe INTEGER,
|
||||
split_mode TEXT,
|
||||
main_gpu INTEGER,
|
||||
no_kv_offload INTEGER,
|
||||
flash_attn INTEGER,
|
||||
devices TEXT,
|
||||
tensor_split TEXT,
|
||||
tensor_buft_overrides TEXT,
|
||||
use_mmap INTEGER,
|
||||
use_direct_io INTEGER,
|
||||
embeddings INTEGER,
|
||||
no_op_offload INTEGER,
|
||||
no_host INTEGER,
|
||||
fit_target INTEGER,
|
||||
fit_min_ctx INTEGER,
|
||||
n_prompt INTEGER,
|
||||
n_gen INTEGER,
|
||||
n_depth INTEGER,
|
||||
@ -346,6 +381,6 @@ CREATE TABLE IF NOT EXISTS test (
|
||||
stddev_ts REAL
|
||||
);
|
||||
|
||||
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
|
||||
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
|
||||
INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
|
||||
INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
|
||||
```
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "arg.h"
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
@ -275,9 +276,11 @@ static std::string pair_str(const std::pair<int, int> & p) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
static std::vector<int> parse_int_range(const std::string & s) {
|
||||
static std::vector<int> parse_int_range(const std::string & s, bool allow_negative = false) {
|
||||
// first[-last[(+|*)step]]
|
||||
std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
|
||||
std::regex range_regex(allow_negative
|
||||
? R"(^(-?\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"
|
||||
: R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
|
||||
|
||||
std::smatch match;
|
||||
std::string::const_iterator search_start(s.cbegin());
|
||||
@ -337,7 +340,7 @@ struct cmd_params {
|
||||
std::vector<llama_split_mode> split_mode;
|
||||
std::vector<int> main_gpu;
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> flash_attn;
|
||||
std::vector<llama_flash_attn_type> flash_attn;
|
||||
std::vector<std::vector<ggml_backend_dev_t>> devices;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
||||
@ -376,12 +379,12 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* cpu_mask */ { "0x0" },
|
||||
/* cpu_strict */ { false },
|
||||
/* poll */ { 50 },
|
||||
/* n_gpu_layers */ { 99 },
|
||||
/* n_gpu_layers */ { -1 },
|
||||
/* n_cpu_moe */ { 0 },
|
||||
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
|
||||
/* main_gpu */ { 0 },
|
||||
/* no_kv_offload */ { false },
|
||||
/* flash_attn */ { false },
|
||||
/* flash_attn */ { LLAMA_FLASH_ATTN_TYPE_AUTO },
|
||||
/* devices */ { {} },
|
||||
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
||||
@ -451,7 +454,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -sm, --split-mode <none|layer|row|tensor> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||
printf(" -fa, --flash-attn <on|off|auto> (default: %s)\n", join(transform_to_str(cmd_params_defaults.flash_attn, llama_flash_attn_type_name), ",").c_str());
|
||||
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
|
||||
@ -710,7 +713,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = parse_int_range(argv[i]);
|
||||
auto p = parse_int_range(argv[i], /*allow_negative=*/true);
|
||||
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
|
||||
if (++i >= argc) {
|
||||
@ -793,8 +796,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
|
||||
std::vector<llama_flash_attn_type> types;
|
||||
for (const auto & v : p) {
|
||||
llama_flash_attn_type type;
|
||||
if (common_arg_utils::is_truthy(v)) {
|
||||
type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
||||
} else if (common_arg_utils::is_falsey(v)) {
|
||||
type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
} else if (common_arg_utils::is_autoy(v)) {
|
||||
type = LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
types.push_back(type);
|
||||
}
|
||||
if (invalid_param) {
|
||||
break;
|
||||
}
|
||||
params.flash_attn.insert(params.flash_attn.end(), types.begin(), types.end());
|
||||
} else if (arg == "-mmp" || arg == "--mmap") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@ -1138,7 +1160,7 @@ struct cmd_params_instance {
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
llama_flash_attn_type flash_attn;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
std::vector<float> tensor_split;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
@ -1222,7 +1244,7 @@ struct cmd_params_instance {
|
||||
cparams.type_k = type_k;
|
||||
cparams.type_v = type_v;
|
||||
cparams.offload_kqv = !no_kv_offload;
|
||||
cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
cparams.flash_attn_type = flash_attn;
|
||||
cparams.embeddings = embeddings;
|
||||
cparams.op_offload = !no_op_offload;
|
||||
cparams.swa_full = false;
|
||||
@ -1400,7 +1422,7 @@ struct test {
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool flash_attn;
|
||||
llama_flash_attn_type flash_attn;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
std::vector<float> tensor_split;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
@ -1522,10 +1544,10 @@ struct test {
|
||||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
|
||||
field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" ||
|
||||
field == "fit_target" || field == "fit_min_ctx") {
|
||||
field == "fit_target" || field == "fit_min_ctx" || field == "flash_attn") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" ||
|
||||
field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
|
||||
return BOOL;
|
||||
}
|
||||
@ -1594,7 +1616,7 @@ struct test {
|
||||
split_mode_str(split_mode),
|
||||
std::to_string(main_gpu),
|
||||
std::to_string(no_kv_offload),
|
||||
std::to_string(flash_attn),
|
||||
std::to_string((int) flash_attn),
|
||||
devices_to_string(devices),
|
||||
tensor_split_str,
|
||||
tensor_buft_overrides_str,
|
||||
@ -1779,7 +1801,7 @@ struct markdown_printer : public printer {
|
||||
return 6;
|
||||
}
|
||||
if (field == "flash_attn") {
|
||||
return 2;
|
||||
return 3;
|
||||
}
|
||||
if (field == "devices") {
|
||||
return -12;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user