mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Update docker and its configs (#1497)
* Update ik_llama-cuda.Containerfile - DGGML_NATIVE=ON, as most users are building images locally - Latest llama-swap version * Update ik_llama-cpu.Containerfile - DGGML_NATIVE=ON, as most users are building images locally - Latest llama-swap version * Update README.md - DGGML_NATIVE=ON is now default * Update ik_llama-cuda-swap.config.yaml - Adapt to the latest version * Update ik_llama-cpu-swap.config.yaml - Adapt to the latest version * Update ik_llama-cuda-swap.config.yaml - Add model with aliases * Update ik_llama-cpu-swap.config.yaml - Add model with aliases
This commit is contained in:
parent
094f76ee86
commit
bbc07002f7
@ -123,7 +123,7 @@ docker run -it --name ik_llama_full --rm -v /my_local_files/gguf:/models:ro --r
|
||||
|
||||
- Customize `llama-swap` config: save the `ik_llama-cpu-swap.config.yaml` or `ik_llama-cuda-swap.config.yaml` locally (e.g. under `/my_local_files/`), then map it to `/app/config.yaml` inside the container by appending `-v /my_local_files/ik_llama-cpu-swap.config.yaml:/app/config.yaml:ro` to your `podman run ...` or `docker run ...`.
|
||||
- To run the container in background, replace `-it` with `-d`: `podman run -d ...` or `docker run -d ...`. To stop it: `podman stop ik_llama` or `docker stop ik_llama`.
|
||||
- If you build the image on the same machine where it will be used, change `-DGGML_NATIVE=OFF` to `-DGGML_NATIVE=ON` in the `.Containerfile`.
|
||||
- If you build the image on a different machine, change `-DGGML_NATIVE=ON` to `-DGGML_NATIVE=OFF` in the `.Containerfile`.
|
||||
- If you build only for your GPU architecture and want to make use of more KV quantization types, build with `-DGGML_IQK_FA_ALL_QUANTS=ON`.
|
||||
- If you experiment with several `CUDA_VERSION`, remember to identify the unused images with `podman image ls` or `docker image ls`, then delete them (e.g. `podman image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && podman image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04` or `docker image rm docker.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 && docker image rm docker.io/nvidia/cuda:12.4.0-devel-ubuntu22.04`), as they take up several GB.
|
||||
- If you want to build without `llama-swap`, change `--target swap` to `--target server` in `ik_llama Containerfiles`, e.g. `docker image build --file ik_llama-cuda.Containerfile --target full --tag ik_llama-cuda:full . && docker image build --file ik_llama-cuda.Containerfile --target server --tag ik_llama-cuda:server .`
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
healthCheckTimeout: 1800
|
||||
logRequests: true
|
||||
logLevel: info
|
||||
metricsMaxInMemory: 1000
|
||||
sendLoadingState: true
|
||||
includeAliasesInList: true
|
||||
|
||||
models:
|
||||
"qwen3 (you need to download .gguf first)":
|
||||
@ -30,6 +32,32 @@ models:
|
||||
--ctx-size 12288
|
||||
-fa on
|
||||
|
||||
"qwen3.5 (you need to download .gguf first)":
|
||||
proxy: "http://127.0.0.1:9999"
|
||||
cmd: >
|
||||
/app/llama-server
|
||||
--model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
|
||||
--alias qwen3.5
|
||||
--port 9999
|
||||
--parallel 1
|
||||
--webui llamacpp
|
||||
--jinja
|
||||
--ctx-size 12288
|
||||
-fa on
|
||||
--temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
|
||||
aliases:
|
||||
- "qwen3.5"
|
||||
filters:
|
||||
setParamsByID:
|
||||
"${MODEL_ID}:thinking-coding":
|
||||
temperature: 0.6
|
||||
presence_penalty: 0.0
|
||||
"${MODEL_ID}:instruct":
|
||||
temperature: 0.7
|
||||
top_p: 0.8
|
||||
chat_template_kwargs:
|
||||
enable_thinking: false
|
||||
|
||||
"smollm2 (will be downloaded automatically from huggingface.co)":
|
||||
proxy: "http://127.0.0.1:9999"
|
||||
cmd: >
|
||||
|
||||
@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -yq build-essential git libcurl4-openssl-d
|
||||
RUN git clone https://github.com/ikawrakow/ik_llama.cpp.git /app
|
||||
WORKDIR /app
|
||||
RUN if [ -n "$CUSTOM_COMMIT" ]; then git switch --detach "$CUSTOM_COMMIT"; fi
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
|
||||
RUN cmake -B build -DGGML_NATIVE=ON -DLLAMA_CURL=ON -DGGML_IQK_FA_ALL_QUANTS=ON && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
@ -64,7 +64,7 @@ ENTRYPOINT [ "/app/llama-server" ]
|
||||
# Stage 5: Swap
|
||||
FROM server AS swap
|
||||
ARG LS_REPO=mostlygeek/llama-swap
|
||||
ARG LS_VER=189
|
||||
ARG LS_VER=198
|
||||
RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
|
||||
&& tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
|
||||
&& rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
healthCheckTimeout: 1800
|
||||
logRequests: true
|
||||
logLevel: info
|
||||
metricsMaxInMemory: 1000
|
||||
sendLoadingState: true
|
||||
includeAliasesInList: true
|
||||
|
||||
models:
|
||||
"qwen3 (you need to download .gguf first)":
|
||||
@ -38,6 +40,34 @@ models:
|
||||
--grouped-expert-routing
|
||||
--reasoning-format auto --chat-template-kwargs '{"reasoning_effort": "medium"}'
|
||||
|
||||
"qwen3.5 (you need to download .gguf first)":
|
||||
proxy: "http://127.0.0.1:9999"
|
||||
cmd: >
|
||||
/app/llama-server
|
||||
--model /models/Qwen_Qwen3.5-35B-A3B-IQ4_NL.gguf
|
||||
--alias qwen3.5
|
||||
--port 9999
|
||||
--parallel 1
|
||||
--webui llamacpp
|
||||
--jinja
|
||||
--ctx-size 12288
|
||||
-fa on
|
||||
--merge-qkv
|
||||
-ngl 999 --threads-batch 1
|
||||
--temp 1.0 --top-p 0.95 --top-k 20 --min-p 0 --presence-penalty 1.5 --repeat-penalty 1
|
||||
aliases:
|
||||
- "qwen3.5"
|
||||
filters:
|
||||
setParamsByID:
|
||||
"${MODEL_ID}:thinking-coding":
|
||||
temperature: 0.6
|
||||
presence_penalty: 0.0
|
||||
"${MODEL_ID}:instruct":
|
||||
temperature: 0.7
|
||||
top_p: 0.8
|
||||
chat_template_kwargs:
|
||||
enable_thinking: false
|
||||
|
||||
"smollm2 (will be downloaded automatically from huggingface.co)":
|
||||
proxy: "http://127.0.0.1:9999"
|
||||
cmd: >
|
||||
|
||||
@ -13,7 +13,7 @@ WORKDIR /app
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
@ -67,7 +67,7 @@ ENTRYPOINT [ "/app/llama-server" ]
|
||||
# Stage 5: Swap
|
||||
FROM server AS swap
|
||||
ARG LS_REPO=mostlygeek/llama-swap
|
||||
ARG LS_VER=189
|
||||
ARG LS_VER=198
|
||||
RUN curl -LO "https://github.com/${LS_REPO}/releases/download/v${LS_VER}/llama-swap_${LS_VER}_linux_amd64.tar.gz" \
|
||||
&& tar -zxf "llama-swap_${LS_VER}_linux_amd64.tar.gz" \
|
||||
&& rm "llama-swap_${LS_VER}_linux_amd64.tar.gz"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user