# ik_llama.cpp/docker/ik_llama-cuda-swap.config.yaml

# Proxy-wide settings. Meanings below follow the key names; confirm exact
# semantics and units against the proxy's configuration reference.

# Verbosity of the proxy's own log output.
logLevel: info

# Upper bound on how long a freshly started model may take to become healthy
# (presumably seconds, i.e. 30 minutes — large models load slowly).
healthCheckTimeout: 1800

# Cap on the number of metric records retained in memory.
metricsMaxInMemory: 1000

# Surface model-loading progress to the client while a model is starting.
sendLoadingState: true

# Also expose each model's `aliases` in the model listing.
includeAliasesInList: true

# Model catalogue. Each key is the model ID a client requests; `cmd` is the
# server command line and `proxy` is the address that command listens on.
# Every entry in this file binds port 9999.
models:
  # Qwen3 0.6B (Q6_K) read from a local GGUF under /models. The file is not
  # downloaded by this command, so it must be placed there beforehand.
  "qwen3 (you need to download .gguf first)":
    # Short request name mirroring the server's own --alias, so clients do not
    # have to send the long descriptive ID above.
    aliases:
      - "qwen3"
    proxy: "http://127.0.0.1:9999"
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. Do not add comments inside it — they would become arguments.
    cmd: >
      /app/llama-server
      --model /models/Qwen_Qwen3-0.6B-Q6_K.gguf
      --alias qwen3
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999 --threads-batch 1
      -ctk q8_0 -ctv q8_0

  # gpt-oss 120B MoE (MXFP4) read from a local split GGUF under /models; only
  # the first shard (00001-of-00002) is named on the command line. The files
  # are not downloaded by this command, so they must be placed there beforehand.
  "oss-moe (you need to download .gguf first)":
    # Short request name mirroring the server's own --alias, so clients do not
    # have to send the long descriptive ID above.
    aliases:
      - "gpt-oss"
    proxy: "http://127.0.0.1:9999"
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. Do not add comments inside it — they would become arguments.
    # The single-quoted JSON after --chat-template-kwargs must stay on one line
    # so it reaches the server as a single argument.
    cmd: >
      /app/llama-server
      --model /models/kldzj_gpt-oss-120b-heretic-MXFP4_MOE-00001-of-00002.gguf
      --alias gpt-oss
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999
      --n-cpu-moe 30
      -ctk q8_0 -ctv q8_0
      --grouped-expert-routing
      --reasoning-format auto --chat-template-kwargs '{"reasoning_effort": "medium"}'

  # Qwen3.5 35B-A3B (IQ4_NL) read from a local GGUF under /models. The file is
  # not downloaded by this command, so it must be placed there beforehand.
  'qwen3.5 (you need to download .gguf first)':
    # Short request name, identical to the server's own --alias.
    aliases:
      - 'qwen3.5'
    proxy: 'http://127.0.0.1:9999'
    # Per-ID request parameter overrides. ${MODEL_ID} is a macro; presumably it
    # expands to this entry's key, giving the request names
    # "<id>:thinking-coding" and "<id>:instruct" — confirm against the proxy's
    # filter documentation.
    filters:
      setParamsByID:
        # Lower temperature and no presence penalty for the coding variant.
        '${MODEL_ID}:thinking-coding':
          temperature: 0.6
          presence_penalty: 0.0
        # Variant with thinking switched off through the chat template.
        '${MODEL_ID}:instruct':
          temperature: 0.7
          top_p: 0.8
          chat_template_kwargs:
            enable_thinking: false
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. Do not add comments inside it — they would become arguments.
    # The last two lines set the server-side default sampling parameters.
    cmd: >
      /app/llama-server
      --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf --alias qwen3.5
      --port 9999 --parallel 1 --webui llamacpp --jinja
      --ctx-size 12288 -fa on --merge-qkv
      -ngl 999 --threads-batch 1
      --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0
      --presence-penalty 1.5 --repeat-penalty 1

  # SmolLM2 135M (IQ4_NL). Unlike the entries that pass --model with a local
  # path, this one names a Hugging Face repo and file via --hf-repo/--hf-file,
  # so the server fetches the weights itself.
  "smollm2 (will be downloaded automatically from huggingface.co)":
    # Short request name mirroring the server's own --alias, so clients do not
    # have to send the long descriptive ID above.
    aliases:
      - "smollm2"
    proxy: "http://127.0.0.1:9999"
    # Folded scalar: the lines below are joined with single spaces into one
    # command line. Do not add comments inside it — they would become arguments.
    cmd: >
      /app/llama-server
      --hf-repo mradermacher/SmolLM2-135M-i1-GGUF --hf-file SmolLM2-135M.i1-IQ4_NL.gguf
      --alias smollm2
      --port 9999
      --parallel 1
      --webui llamacpp
      --jinja
      --ctx-size 12288
      -fa on
      --merge-qkv
      -ngl 999 --threads-batch 1