ik_llama.cpp/docker/ik_llama-cuda-swap.config.yaml
mcm007 bbc07002f7
Update docker and its configs (#1497)
* Update ik_llama-cuda.Containerfile

- DGGML_NATIVE=ON, as most users are building images locally

- Latest llama-swap version

* Update ik_llama-cpu.Containerfile

- DGGML_NATIVE=ON, as most users are building images locally

- Latest llama-swap version

* Update README.md

- DGGML_NATIVE=ON is now default

* Update ik_llama-cuda-swap.config.yaml

- Adapt to the latest version

* Update ik_llama-cpu-swap.config.yaml

- Adapt to the latest version

* Update ik_llama-cuda-swap.config.yaml

- Add model with aliases

* Update ik_llama-cpu-swap.config.yaml

- Add model with aliases
2026-03-24 07:52:54 +01:00

85 lines
2.1 KiB
YAML

# Global llama-swap proxy settings.
# NOTE(review): exact semantics of each key are defined by the llama-swap
# configuration reference — confirm there before changing values.

# Model start-up / loading behaviour.
# (healthCheckTimeout is presumably expressed in seconds — TODO confirm.)
healthCheckTimeout: 1800
sendLoadingState: true

# Logging and metrics retention.
logLevel: info
metricsMaxInMemory: 1000

# Model listing: also expose each model's aliases.
includeAliasesInList: true
# Model catalogue. Each entry starts /app/llama-server with its own flags.
# Every entry uses port 9999 both in `proxy` and in `--port`; keep the two in
# sync when editing. Each `cmd` is a folded scalar (`>`), so the option lines
# below are joined with single spaces into one command line.
models:
  # Qwen3 0.6B; the GGUF file has to be present under /models beforehand.
  'qwen3 (you need to download .gguf first)':
    proxy: 'http://127.0.0.1:9999'
    cmd: >
      /app/llama-server
      --model /models/Qwen_Qwen3-0.6B-Q6_K.gguf
      --alias qwen3
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999
      --threads-batch 1
      -ctk q8_0
      -ctv q8_0

  # gpt-oss 120B MoE (split GGUF, first shard referenced); the files have to
  # be present under /models beforehand.
  'oss-moe (you need to download .gguf first)':
    proxy: 'http://127.0.0.1:9999'
    cmd: >
      /app/llama-server
      --model /models/kldzj_gpt-oss-120b-heretic-MXFP4_MOE-00001-of-00002.gguf
      --alias gpt-oss
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999
      --n-cpu-moe 30
      -ctk q8_0
      -ctv q8_0
      --grouped-expert-routing
      --reasoning-format auto
      --chat-template-kwargs '{"reasoning_effort": "medium"}'

  # Qwen3.5 35B-A3B; the GGUF file has to be present under /models beforehand.
  # This is the only entry that declares llama-swap aliases and request filters.
  'qwen3.5 (you need to download .gguf first)':
    proxy: 'http://127.0.0.1:9999'
    cmd: >
      /app/llama-server
      --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
      --alias qwen3.5
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999
      --threads-batch 1
      --temp 1.0
      --top-p 0.95
      --top-k 20
      --min-p 0
      --presence-penalty 1.5
      --repeat-penalty 1
    aliases:
      - 'qwen3.5'
    filters:
      # Parameter overrides keyed by requested model ID.
      # NOTE(review): presumably ${MODEL_ID} expands to this entry's key and
      # the listed parameters are set on matching requests — confirm against
      # the llama-swap documentation.
      setParamsByID:
        '${MODEL_ID}:thinking-coding':
          temperature: 0.6
          presence_penalty: 0.0
        '${MODEL_ID}:instruct':
          temperature: 0.7
          top_p: 0.8
          chat_template_kwargs:
            enable_thinking: false

  # SmolLM2 135M; referenced by Hugging Face repo/file instead of a local path.
  'smollm2 (will be downloaded automatically from huggingface.co)':
    proxy: 'http://127.0.0.1:9999'
    cmd: >
      /app/llama-server
      --hf-repo mradermacher/SmolLM2-135M-i1-GGUF
      --hf-file SmolLM2-135M.i1-IQ4_NL.gguf
      --alias smollm2
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999
      --threads-batch 1