mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
* Update ik_llama-cuda.Containerfile
  - DGGML_NATIVE=ON, as most users are building images locally
  - Latest llama-swap version
* Update ik_llama-cpu.Containerfile
  - DGGML_NATIVE=ON, as most users are building images locally
  - Latest llama-swap version
* Update README.md
  - DGGML_NATIVE=ON is now default
* Update ik_llama-cuda-swap.config.yaml
  - Adapt to the latest version
* Update ik_llama-cpu-swap.config.yaml
  - Adapt to the latest version
* Update ik_llama-cuda-swap.config.yaml
  - Add model with aliases
* Update ik_llama-cpu-swap.config.yaml
  - Add model with aliases
85 lines
2.1 KiB
YAML
85 lines
2.1 KiB
YAML
# llama-swap configuration: global proxy settings followed by the model
# entries that llama-swap can start on demand.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been flattened; confirm it against the llama-swap
# configuration schema before relying on it.

# Global settings.
# NOTE(review): healthCheckTimeout is presumably in seconds -- confirm.
healthCheckTimeout: 1800
logLevel: info
metricsMaxInMemory: 1000
sendLoadingState: true
includeAliasesInList: true

# Every entry below launches /app/llama-server with `--port 9999` and is
# proxied at http://127.0.0.1:9999. Each `cmd` is a folded block scalar, so
# its lines are joined with single spaces into one command line.
models:
  # Local GGUF file: /models/Qwen_Qwen3-0.6B-Q6_K.gguf must already exist.
  "qwen3 (you need to download .gguf first)":
    proxy: "http://127.0.0.1:9999"
    cmd: >
      /app/llama-server
      --model /models/Qwen_Qwen3-0.6B-Q6_K.gguf
      --alias qwen3
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999 --threads-batch 1
      -ctk q8_0 -ctv q8_0

  # Local GGUF file (first shard of two, per the file name); additionally
  # passes --n-cpu-moe 30 and --grouped-expert-routing.
  "oss-moe (you need to download .gguf first)":
    proxy: "http://127.0.0.1:9999"
    cmd: >
      /app/llama-server
      --model /models/kldzj_gpt-oss-120b-heretic-MXFP4_MOE-00001-of-00002.gguf
      --alias gpt-oss
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999
      --n-cpu-moe 30
      -ctk q8_0 -ctv q8_0
      --grouped-expert-routing
      --reasoning-format auto --chat-template-kwargs '{"reasoning_effort": "medium"}'

  # Local GGUF file. Sampling defaults are set on the command line; the
  # filters below set request parameters per requested model ID.
  "qwen3.5 (you need to download .gguf first)":
    proxy: "http://127.0.0.1:9999"
    cmd: >
      /app/llama-server
      --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
      --alias qwen3.5
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999 --threads-batch 1
      --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
    aliases:
      - "qwen3.5"
    filters:
      # NOTE(review): ${MODEL_ID} is presumably expanded by llama-swap to this
      # entry's key, yielding IDs "<key>:thinking-coding" and
      # "<key>:instruct" -- confirm against the llama-swap documentation.
      setParamsByID:
        "${MODEL_ID}:thinking-coding":
          temperature: 0.6
          presence_penalty: 0.0
        "${MODEL_ID}:instruct":
          temperature: 0.7
          top_p: 0.8
          chat_template_kwargs:
            enable_thinking: false

  # No local file needed: the model is referenced via --hf-repo / --hf-file.
  "smollm2 (will be downloaded automatically from huggingface.co)":
    proxy: "http://127.0.0.1:9999"
    cmd: >
      /app/llama-server
      --hf-repo mradermacher/SmolLM2-135M-i1-GGUF --hf-file SmolLM2-135M.i1-IQ4_NL.gguf
      --alias smollm2
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999 --threads-batch 1