From bbc07002f7f2e7275cab6a17f49cdf2ee481cf08 Mon Sep 17 00:00:00 2001 From: mcm007 Date: Tue, 24 Mar 2026 08:52:54 +0200 Subject: [PATCH] Update docker and it's configs (#1497) * Update ik_llama-cuda.Containerfile - DGGML_NATIVE=ON, as most users are building images locally - Latest llama-swap version * Update ik_llama-cpu.Containerfile - DGGML_NATIVE=ON, as most users are building images locally - Latest llama-swap version * Update README.md - DGGML_NATIVE=ON is now default * Update ik_llama-cuda-swap.config.yaml - Adapt to the latest version * Update ik_llama-cpu-swap.config.yaml - Adapt to the latest version * Update ik_llama-cuda-swap.config.yaml - Add model with aliases * Update ik_llama-cpu-swap.config.yaml - Add model with aliases --- docker/README.md | 2 +- docker/ik_llama-cpu-swap.config.yaml | 30 ++++++++++++++++++++++++- docker/ik_llama-cpu.Containerfile | 4 ++-- docker/ik_llama-cuda-swap.config.yaml | 32 ++++++++++++++++++++++++++- docker/ik_llama-cuda.Containerfile | 4 ++-- 5 files changed, 65 insertions(+), 7 deletions(-) diff --git a/docker/README.md b/docker/README.md index af1b51fe..d6501d01 100644 --- a/docker/README.md +++ b/docker/README.md @@ -123,7 +123,7 @@ docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --r - Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` localy (e.g. under `/my_local_files/`) then map it to `/app/config.yaml` inside the container appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your`podman run ...` or `docker run ...`. - To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`. -- If you build the image on the same machine where will be used, change `-DGGML_NATIVE=OFF` to `-DGGML_NATIVE=ON` in the `.Containerfile`. 
+- If you build the image on a different machine from the one where it will be used, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`. - If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`. - If you experiment with several `CUDA_VERSION`, remember to identify with `podman image ls` or `docker image ls` then delete (e.g.`podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` or `docker image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && docker image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` the unused images as they have several GB. - If you want to build without `llama-swap`, change `--target swap` to `--target server` in `ik_llama Containerfiles`, e.g. `docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target server --tag ik_llama-cuda:server .` diff --git a/docker/ik_llama-cpu-swap.config.yaml b/docker/ik_llama-cpu-swap.config.yaml index 1ff1b445..bdf3a4c5 100644 --- a/docker/ik_llama-cpu-swap.config.yaml +++ b/docker/ik_llama-cpu-swap.config.yaml @@ -1,6 +1,8 @@ healthCheckTimeout: 1800 -logRequests: true +logLevel: info metricsMaxInMemory: 1000 +sendLoadingState: true +includeAliasesInList: true models: "qwen3 (you need to download .gguf first)": @@ -30,6 +32,32 @@ models: --ctx-size 12288 -fa on + "qwen3.5 (you need to download .gguf first)": + proxy: "http://127.0.0.1:9999" + cmd: > + /app/llama-server + --model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf + --alias qwen3.5 + --port 9999 + --parallel 1 + --webui llamacpp + --jinja + --ctx-size 12288 + -fa on + --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1 + aliases: + - "qwen3.5" + filters: + setParamsByID: + "${MODEL_ID}:thinking-coding": + temperature: 0.6 + presence_penalty: 0.0 +
"${MODEL_ID}:instruct": + temperature: 0.7 + top_p: 0.8 + chat_template_kwargs: + enable_thinking: false + "smollm2 (will be downloaded automatically from huggingface.co)": proxy: "http://127.0.0.1:9999" cmd: > diff --git a/docker/ik_llama-cpu.Containerfile b/docker/ik_llama-cpu.Containerfile index 5d76dbb2..3aa23f3d 100644 --- a/docker/ik_llama-cpu.Containerfile +++ b/docker/ik_llama-cpu.Containerfile @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-d RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app WORKDIR /app RUN if [ -n "$CUSTOM_COMMIT" ]; then git switch --detach "$CUSTOM_COMMIT"; fi -RUN cmake -B build -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \ +RUN cmake -B build -DGGML_NATIVE=ON -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; @@ -64,7 +64,7 @@ ENTRYPOINT [ "/app/llama-server" ] # Stage 5: Swap FROM server AS swap ARG LS_REPO=mostlygeek/llama-swap -ARG LS_VER=189 +ARG LS_VER=198 RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \ && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz" diff --git a/docker/ik_llama-cuda-swap.config.yaml b/docker/ik_llama-cuda-swap.config.yaml index 6fd0b3b1..a9570cb6 100644 --- a/docker/ik_llama-cuda-swap.config.yaml +++ b/docker/ik_llama-cuda-swap.config.yaml @@ -1,6 +1,8 @@ healthCheckTimeout: 1800 -logRequests: true +logLevel: info metricsMaxInMemory: 1000 +sendLoadingState: true +includeAliasesInList: true models: "qwen3 (you need to download .gguf first)": @@ -38,6 +40,34 @@ models: --grouped-expert-routing --reasoning-format auto --chat-template-kwargs '{"reasoning_effort": "medium"}' + "qwen3.5 (you need to download .gguf first)": + proxy: "http://127.0.0.1:9999" + cmd: > + /app/llama-server + --model 
/models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf + --alias qwen3.5 + --port 9999 + --parallel 1 + --webui llamacpp + --jinja + --ctx-size 12288 + -fa on + --merge-qkv + -ngl 999 --threads-batch 1 + --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1 + aliases: + - "qwen3.5" + filters: + setParamsByID: + "${MODEL_ID}:thinking-coding": + temperature: 0.6 + presence_penalty: 0.0 + "${MODEL_ID}:instruct": + temperature: 0.7 + top_p: 0.8 + chat_template_kwargs: + enable_thinking: false + "smollm2 (will be downloaded automatically from huggingface.co)": proxy: "http://127.0.0.1:9999" cmd: > diff --git a/docker/ik_llama-cuda.Containerfile b/docker/ik_llama-cuda.Containerfile index 8ff92e52..0ac04236 100644 --- a/docker/ik_llama-cuda.Containerfile +++ b/docker/ik_llama-cuda.Containerfile @@ -13,7 +13,7 @@ WORKDIR /app RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) RUN mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; @@ -67,7 +67,7 @@ ENTRYPOINT [ "/app/llama-server" ] # Stage 5: Swap FROM server AS swap ARG LS_REPO=mostlygeek/llama-swap -ARG LS_VER=189 +ARG LS_VER=198 RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \ && tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \ && rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"