54 lines
3.7 KiB
Plaintext
54 lines
3.7 KiB
Plaintext
Running ubergarm's models:
|
|
|
|
--ctx-size $((1024*16)) Context Size
|
|
-ctk q8_0 KV cache data type for K (default: f16)
|
|
-mla 3 enable MLA
|
|
-fa enable Flash Attention
|
|
-amb 512 max batch size for attention computations (default: 0)
|
|
-fmoe enable fused MoE (default: disabled)
|
|
--n-gpu-layers 63 number of layers to store in VRAM
|
|
-b 4096 logical maximum batch size (default: 2048)
|
|
-ub 4096 physical maximum batch size (default: 512)
|
|
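--override-tensor exps=CPU override the buffer type for tensors whose names match "exps" (the MoE expert tensors) so they stay in CPU memory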
|
|
--parallel 1 number of parallel sequences to decode (default: 1)
|
|
--threads 32 number of threads to use during generation (default: 32)
|
|
--threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
|
|
-cnv run in conversation mode
|
|
|
|
-ser 6,1 smart expert reduction (only works for DeepSeek models)
|
|
|
|
|
|
DS-R1-528
|
|
./build/bin/llama-cli --model /models/DS-R1-528-IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf --ctx-size 65536 -ctk q8_0 -mla 3 -fa -amb 512 -fmoe --n-gpu-layers 63 -b 4096 -ub 4096 --override-tensor exps=CPU --parallel 1 --threads 32 -ser 6,1 -cnv
|
|
|
|
|
|
Kimi-K2-Instruct-GGUF
|
|
./build/bin/llama-cli --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 63 -cnv
|
|
|
|
|
|
GLM-4
|
|
./build/bin/llama-cli --model /models/GLM-4/GLM-4.5-IQ4_K-00001-of-00005.gguf --ctx-size $((1024*16)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 --threads-batch 64 -ngl 93 -cnv -p 'You are a helpful assistant'
|
|
To disable thinking, append /nothink (spelled exactly like that, with no underscore) to the end of your prompt.
|
|
|
|
|
|
Packages: git cmake vim python3 python3-venv
|
|
|
|
|
|
Docker Containers:
|
|
DS-R1-528
|
|
docker run --name llama-DS-R1-ser --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/DS-R1-528-IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf --ctx-size 65536 -ctk q8_0 -mla 3 -fa -amb 512 -fmoe --n-gpu-layers 63 -b 4096 -ub 4096 --override-tensor exps=CPU --parallel 1 --threads 32 -ser 6,1 -cnv -rtr --host 0.0.0.0 --alias "DS-R1-ser"
|
|
|
|
|
|
GLM-4-IQ4-K
|
|
docker run --name llama-glm4-IQ4-K --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/GLM-4/GLM-4.5-IQ4_K-00001-of-00005.gguf --ctx-size $((1024*32)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 93 -rtr -cnv --host 0.0.0.0 --alias "GLM4-IQ4-K"
|
|
|
|
|
|
GLM-4-IQ2-KS
|
|
docker run --name llama-glm4-IQ2-KS --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/GLM-4/GLM-4.5-IQ2_KS-00001-of-00003.gguf --ctx-size $((1024*32)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 93 -rtr -cnv --host 0.0.0.0 --alias "GLM4-IQ2-KS"
|
|
|
|
|
|
./build/bin/llama-cli --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 61 -rtr -cnv
|
|
|
|
Kimi-K2-Inst-IQ2-KS
|
|
docker run --name llama-Kimi-K2-Inst-IQ2-KS --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 61 -rtr -cnv --host 0.0.0.0 --alias "Kimi-K2-Inst-IQ2-KS"
|