From bbc07002f7f2e7275cab6a17f49cdf2ee481cf08 Mon Sep 17 00:00:00 2001
From: mcm007 <mcm007@users.noreply.github.com>
Date: Tue, 24 Mar 2026 08:52:54 +0200
Subject: [PATCH] Update docker and its configs (#1497)

* Update ik_llama-cuda.Containerfile

- DGGML_NATIVE=ON, as most users are building images locally

- Latest llama-swap version

* Update ik_llama-cpu.Containerfile

- DGGML_NATIVE=ON, as most users are building images locally

- Latest llama-swap version

* Update README.md

- DGGML_NATIVE=ON is now default

* Update ik_llama-cuda-swap.config.yaml

- Adapt to the latest version

* Update ik_llama-cpu-swap.config.yaml

- Adapt to the latest version

* Update ik_llama-cuda-swap.config.yaml

- Add model with aliases

* Update ik_llama-cpu-swap.config.yaml

- Add model with aliases
---
 docker/README.md                      |  2 +-
 docker/ik_llama-cpu-swap.config.yaml  | 30 ++++++++++++++++++++++++-
 docker/ik_llama-cpu.Containerfile     |  4 ++--
 docker/ik_llama-cuda-swap.config.yaml | 32 ++++++++++++++++++++++++++-
 docker/ik_llama-cuda.Containerfile    |  4 ++--
 5 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index af1b51fe..d6501d01 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -123,7 +123,7 @@ docker run  -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --r
 
 - Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` localy  (e.g. under `/my_local_files/`) then map it to `/app/config.yaml` inside the container appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your`podman run ...` or `docker run ...`.
 - To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`.
-- If you build the image on the same machine where will be used, change `-DGGML_NATIVE=OFF` to `-DGGML_NATIVE=ON` in the `.Containerfile`.
+- If you build the image on a different machine than the one where it will be used, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`.
 - If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`.
 - If you experiment with several `CUDA_VERSION`, remember to identify with `podman image ls` or `docker image ls` then delete (e.g.`podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` or `docker image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && docker image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` the unused images as they have several GB.
 - If you want to build without `llama-swap`, change `--target swap` to `--target server` in `ik_llama Containerfiles`, e.g. `docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target server --tag ik_llama-cuda:server .`
diff --git a/docker/ik_llama-cpu-swap.config.yaml b/docker/ik_llama-cpu-swap.config.yaml
index 1ff1b445..bdf3a4c5 100644
--- a/docker/ik_llama-cpu-swap.config.yaml
+++ b/docker/ik_llama-cpu-swap.config.yaml
@@ -1,6 +1,8 @@
 healthCheckTimeout: 1800
-logRequests: true
+logLevel: info
 metricsMaxInMemory: 1000
+sendLoadingState: true
+includeAliasesInList: true
 
 models:
   "qwen3 (you need to download .gguf first)":
@@ -30,6 +32,32 @@ models:
       --ctx-size 12288
       -fa on
 
+  "qwen3.5 (you need to download .gguf first)":
+    proxy: "http://127.0.0.1:9999"
+    cmd: >
+      /app/llama-server
+      --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
+      --alias qwen3.5
+      --port 9999
+      --parallel 1
+      --webui llamacpp
+      --jinja
+      --ctx-size 12288
+      -fa on
+      --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
+    aliases:
+      - "qwen3.5"
+    filters:
+      setParamsByID:
+        "${MODEL_ID}:thinking-coding":
+          temperature: 0.6
+          presence_penalty: 0.0
+        "${MODEL_ID}:instruct":
+          temperature: 0.7
+          top_p: 0.8
+          chat_template_kwargs:
+            enable_thinking: false
+
   "smollm2 (will be downloaded automatically from huggingface.co)":
     proxy: "http://127.0.0.1:9999"
     cmd: >
diff --git a/docker/ik_llama-cpu.Containerfile b/docker/ik_llama-cpu.Containerfile
index 5d76dbb2..3aa23f3d 100644
--- a/docker/ik_llama-cpu.Containerfile
+++ b/docker/ik_llama-cpu.Containerfile
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-d
 RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app
 WORKDIR /app
 RUN if [ -n "$CUSTOM_COMMIT" ]; then git switch --detach "$CUSTOM_COMMIT"; fi
-RUN cmake -B build -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
+RUN cmake -B build -DGGML_NATIVE=ON -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
     cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
@@ -64,7 +64,7 @@ ENTRYPOINT [ "/app/llama-server" ]
 # Stage 5: Swap
 FROM server AS swap
 ARG LS_REPO=mostlygeek/llama-swap
-ARG LS_VER=189
+ARG LS_VER=198
 RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
     && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
     && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"
diff --git a/docker/ik_llama-cuda-swap.config.yaml b/docker/ik_llama-cuda-swap.config.yaml
index 6fd0b3b1..a9570cb6 100644
--- a/docker/ik_llama-cuda-swap.config.yaml
+++ b/docker/ik_llama-cuda-swap.config.yaml
@@ -1,6 +1,8 @@
 healthCheckTimeout: 1800
-logRequests: true
+logLevel: info
 metricsMaxInMemory: 1000
+sendLoadingState: true
+includeAliasesInList: true
 
 models:
   "qwen3 (you need to download .gguf first)":
@@ -38,6 +40,34 @@ models:
       --grouped-expert-routing
       --reasoning-format auto --chat-template-kwargs '{"reasoning_effort": "medium"}'
 
+  "qwen3.5 (you need to download .gguf first)":
+    proxy: "http://127.0.0.1:9999"
+    cmd: >
+      /app/llama-server
+      --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
+      --alias qwen3.5
+      --port 9999
+      --parallel 1
+      --webui llamacpp
+      --jinja
+      --ctx-size 12288
+      -fa on
+      --merge-qkv
+      -ngl 999 --threads-batch 1
+      --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
+    aliases:
+      - "qwen3.5"
+    filters:
+      setParamsByID:
+        "${MODEL_ID}:thinking-coding":
+          temperature: 0.6
+          presence_penalty: 0.0
+        "${MODEL_ID}:instruct":
+          temperature: 0.7
+          top_p: 0.8
+          chat_template_kwargs:
+            enable_thinking: false
+
   "smollm2 (will be downloaded automatically from huggingface.co)":
     proxy: "http://127.0.0.1:9999"
     cmd: >
diff --git a/docker/ik_llama-cuda.Containerfile b/docker/ik_llama-cuda.Containerfile
index 8ff92e52..0ac04236 100644
--- a/docker/ik_llama-cuda.Containerfile
+++ b/docker/ik_llama-cuda.Containerfile
@@ -13,7 +13,7 @@ WORKDIR /app
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
     export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc)
 RUN mkdir -p /app/lib && \
     find build -name "*.so" -exec cp {} /app/lib \;
@@ -67,7 +67,7 @@ ENTRYPOINT [ "/app/llama-server" ]
 # Stage 5: Swap
 FROM server AS swap
 ARG LS_REPO=mostlygeek/llama-swap
-ARG LS_VER=189
+ARG LS_VER=198
 RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
     && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
     && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"