Update docker and its configs (#1497)

* Update ik_llama-cuda.Containerfile

- DGGML_NATIVE=ON, as most users are building images locally

- Latest llama-swap version

* Update ik_llama-cpu.Containerfile

- DGGML_NATIVE=ON, as most users are building images locally

- Latest llama-swap version

* Update README.md

- DGGML_NATIVE=ON is now default

* Update ik_llama-cuda-swap.config.yaml

- Adapt to the latest version

* Update ik_llama-cpu-swap.config.yaml

- Adapt to the latest version

* Update ik_llama-cuda-swap.config.yaml

- Add model with aliases

* Update ik_llama-cpu-swap.config.yaml

- Add model with aliases
This commit is contained in:
mcm007 2026-03-24 08:52:54 +02:00 committed by GitHub
parent 094f76ee86
commit bbc07002f7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 65 additions and 7 deletions

View File

@ -123,7 +123,7 @@ docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --r
- Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` locally (e.g. under `/my_local_files/`), then map it to `/app/config.yaml` inside the container by appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your `podman run ...` or `docker run ...`.
- To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`.
- If you build the image on the same machine where it will be used, change `-DGGML_NATIVE=OFF` to `-DGGML_NATIVE=ON` in the `.Containerfile`.
- If you build the image on a different machine, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`.
- If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`.
- If you experiment with several `CUDA_VERSION` values, remember to identify the unused images with `podman image ls` or `docker image ls` and then delete them (e.g. `podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` or `docker image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && docker image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04`), as they take up several GB.
- If you want to build without `llama-swap`, change `--target swap` to `--target server` in `ik_llama Containerfiles`, e.g. `docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target server --tag ik_llama-cuda:server .`

View File

@ -1,6 +1,8 @@
healthCheckTimeout: 1800
logRequests: true
logLevel: info
metricsMaxInMemory: 1000
sendLoadingState: true
includeAliasesInList: true
models:
"qwen3 (you need to download .gguf first)":
@ -30,6 +32,32 @@ models:
--ctx-size 12288
-fa on
"qwen3.5 (you need to download .gguf first)":
proxy: "http://127.0.0.1:9999"
cmd: >
/app/llama-server
--model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
--alias qwen3.5
--port 9999
--parallel 1
--webui llamacpp
--jinja
--ctx-size 12288
-fa on
--temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
aliases:
- "qwen3.5"
filters:
setParamsByID:
"${MODEL_ID}:thinking-coding":
temperature: 0.6
presence_penalty: 0.0
"${MODEL_ID}:instruct":
temperature: 0.7
top_p: 0.8
chat_template_kwargs:
enable_thinking: false
"smollm2 (will be downloaded automatically from huggingface.co)":
proxy: "http://127.0.0.1:9999"
cmd: >

View File

@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-d
RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app
WORKDIR /app
RUN if [ -n "$CUSTOM_COMMIT" ]; then git switch --detach "$CUSTOM_COMMIT"; fi
RUN cmake -B build -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
RUN cmake -B build -DGGML_NATIVE=ON -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
@ -64,7 +64,7 @@ ENTRYPOINT [ "/app/llama-server" ]
# Stage 5: Swap
FROM server AS swap
ARG LS_REPO=mostlygeek/llama-swap
ARG LS_VER=189
ARG LS_VER=198
RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
&& tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
&& rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"

View File

@ -1,6 +1,8 @@
healthCheckTimeout: 1800
logRequests: true
logLevel: info
metricsMaxInMemory: 1000
sendLoadingState: true
includeAliasesInList: true
models:
"qwen3 (you need to download .gguf first)":
@ -38,6 +40,34 @@ models:
--grouped-expert-routing
--reasoning-format auto --chat-template-kwargs '{"reasoning_effort": "medium"}'
"qwen3.5 (you need to download .gguf first)":
proxy: "http://127.0.0.1:9999"
cmd: >
/app/llama-server
--model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
--alias qwen3.5
--port 9999
--parallel 1
--webui llamacpp
--jinja
--ctx-size 12288
-fa on
--merge-qkv
-ngl 999 --threads-batch 1
--temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
aliases:
- "qwen3.5"
filters:
setParamsByID:
"${MODEL_ID}:thinking-coding":
temperature: 0.6
presence_penalty: 0.0
"${MODEL_ID}:instruct":
temperature: 0.7
top_p: 0.8
chat_template_kwargs:
enable_thinking: false
"smollm2 (will be downloaded automatically from huggingface.co)":
proxy: "http://127.0.0.1:9999"
cmd: >

View File

@ -13,7 +13,7 @@ WORKDIR /app
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
@ -67,7 +67,7 @@ ENTRYPOINT [ "/app/llama-server" ]
# Stage 5: Swap
FROM server AS swap
ARG LS_REPO=mostlygeek/llama-swap
ARG LS_VER=189
ARG LS_VER=198
RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
&& tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
&& rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"