Update docker and its configs (#1497)

* Update ik_llama-cuda.Containerfile - DGGML_NATIVE=ON, as most users are building images locally - Latest llama-swap version * Update ik_llama-cpu.Containerfile - DGGML_NATIVE=ON, as most users are building images locally - Latest llama-swap version * Update README.md - DGGML_NATIVE=ON is now default * Update ik_llama-cuda-swap.config.yaml - Adapt to the latest version * Update ik_llama-cpu-swap.config.yaml - Adapt to the latest version * Update ik_llama-cuda-swap.config.yaml - Add model with aliases * Update ik_llama-cpu-swap.config.yaml - Add model with aliases
2026-06-28 04:30:15 -05:00 · 2026-03-24 08:52:54 +02:00 · 2026-03-24 08:52:54 +02:00 · bbc07002f7
commit bbc07002f7
parent 094f76ee86
5 changed files with 65 additions and 7 deletions
--- a/docker/README.md
+++ b/docker/README.md
@ -123,7 +123,7 @@ docker run  -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --r

 - Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` locally (e.g. under `/my_local_files/`) then map it to `/app/config.yaml` inside the container by appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your `podman run ...` or `docker run ...`.
 - To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`.
- If you build the image on the same machine where will be used, change `-DGGML_NATIVE=OFF` to `-DGGML_NATIVE=ON` in the `.Containerfile`.
+- If you build the image on a different machine than the one it will run on, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`.
 - If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`.
 - If you experiment with several `CUDA_VERSION`, remember to identify with `podman image ls` or `docker image ls` then delete (e.g. `podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` or `docker image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && docker image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04`) the unused images, as they take up several GB.
 - If you want to build without `llama-swap`, change `--target swap` to `--target server` in `ik_llama Containerfiles`, e.g. `docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target server --tag ik_llama-cuda:server .`
--- a/docker/ik_llama-cpu-swap.config.yaml
+++ b/docker/ik_llama-cpu-swap.config.yaml
@ -1,6 +1,8 @@
 healthCheckTimeout: 1800
-logRequests: true
+logLevel: info
 metricsMaxInMemory: 1000
+sendLoadingState: true
+includeAliasesInList: true

 models:
  "qwen3 (you need to download .gguf first)":
@ -30,6 +32,32 @@ models:
      --ctx-size 12288
      -fa on

+  "qwen3.5 (you need to download .gguf first)":
+    proxy: "http://127.0.0.1:9999"
+    cmd: >
+      /app/llama-server
+      --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
+      --alias qwen3.5
+      --port 9999
+      --parallel 1
+      --webui llamacpp
+      --jinja
+      --ctx-size 12288
+      -fa on
+      --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
+    aliases:
+      - "qwen3.5"
+    filters:
+      setParamsByID:
+        "${MODEL_ID}:thinking-coding":
+          temperature: 0.6
+          presence_penalty: 0.0
+        "${MODEL_ID}:instruct":
+          temperature: 0.7
+          top_p: 0.8
+          chat_template_kwargs:
+            enable_thinking: false
+
  "smollm2 (will be downloaded automatically from huggingface.co)":
    proxy: "http://127.0.0.1:9999"
    cmd: >
--- a/docker/ik_llama-cpu.Containerfile
+++ b/docker/ik_llama-cpu.Containerfile
@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-d
 RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app
 WORKDIR /app
 RUN if [ -n "$CUSTOM_COMMIT" ]; then git switch --detach "$CUSTOM_COMMIT"; fi
-RUN cmake -B build -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
+RUN cmake -B build -DGGML_NATIVE=ON -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
    cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
@ -64,7 +64,7 @@ ENTRYPOINT [ "/app/llama-server" ]
 # Stage 5: Swap
 FROM server AS swap
 ARG LS_REPO=mostlygeek/llama-swap
-ARG LS_VER=189
+ARG LS_VER=198
 RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
    && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
    && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"
--- a/docker/ik_llama-cuda-swap.config.yaml
+++ b/docker/ik_llama-cuda-swap.config.yaml
@ -1,6 +1,8 @@
 healthCheckTimeout: 1800
-logRequests: true
+logLevel: info
 metricsMaxInMemory: 1000
+sendLoadingState: true
+includeAliasesInList: true

 models:
  "qwen3 (you need to download .gguf first)":
@ -38,6 +40,34 @@ models:
      --grouped-expert-routing
      --reasoning-format auto --chat-template-kwargs '{"reasoning_effort": "medium"}'

+  "qwen3.5 (you need to download .gguf first)":
+    proxy: "http://127.0.0.1:9999"
+    cmd: >
+      /app/llama-server
+      --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
+      --alias qwen3.5
+      --port 9999
+      --parallel 1
+      --webui llamacpp
+      --jinja
+      --ctx-size 12288
+      -fa on
+      --merge-qkv
+      -ngl 999 --threads-batch 1
+      --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
+    aliases:
+      - "qwen3.5"
+    filters:
+      setParamsByID:
+        "${MODEL_ID}:thinking-coding":
+          temperature: 0.6
+          presence_penalty: 0.0
+        "${MODEL_ID}:instruct":
+          temperature: 0.7
+          top_p: 0.8
+          chat_template_kwargs:
+            enable_thinking: false
+
  "smollm2 (will be downloaded automatically from huggingface.co)":
    proxy: "http://127.0.0.1:9999"
    cmd: >
--- a/docker/ik_llama-cuda.Containerfile
+++ b/docker/ik_llama-cuda.Containerfile
@ -13,7 +13,7 @@ WORKDIR /app
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;
@ -67,7 +67,7 @@ ENTRYPOINT [ "/app/llama-server" ]
 # Stage 5: Swap
 FROM server AS swap
 ARG LS_REPO=mostlygeek/llama-swap
-ARG LS_VER=189
+ARG LS_VER=198
 RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
    && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
    && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"