diff --git a/Notes b/Notes
new file mode 100644
index 0000000..ca489dc
--- /dev/null
+++ b/Notes
@@ -0,0 +1,53 @@
+Running ubergarm's models:
+
+--ctx-size $((1024*16)) Context Size
+-ctk q8_0 KV cache data type for K (default: f16)
+-mla 3 enable MLA
+-fa enable Flash Attention
+-amb 512 max batch size for attention computations (default: 0)
+-fmoe enable fused MoE (default: disabled)
+--n-gpu-layers 63 number of layers to store in VRAM
+-b 4096 logical maximum batch size (default: 2048)
+-ub 4096 physical maximum batch size (default: 512)
+--override-tensor exps=CPU
+--parallel 1
+--threads 32 number of threads to use during generation (default: 32)
+--threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
+-cnv
+
+-ser 6,1 smart expert reduction; only works for DeepSeek (DS) models
+
+
+DS-R1-528
+./build/bin/llama-cli --model /models/DS-R1-528-IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf --ctx-size 65536 -ctk q8_0 -mla 3 -fa -amb 512 -fmoe --n-gpu-layers 63 -b 4096 -ub 4096 --override-tensor exps=CPU --parallel 1 --threads 32 -ser 6,1 -cnv
+
+
+Kimi-K2-Instruct-GGUF
+./build/bin/llama-cli --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 63 -cnv
+
+
+GLM-4
+./build/bin/llama-cli --model /models/GLM-4/GLM-4.5-IQ4_K-00001-of-00005.gguf --ctx-size $((1024*16)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 --threads-batch 64 -ngl 93 -cnv -p 'You are a helpful assistant'
+If you want to disable thinking, add /nothink (spelled without an underscore) at the end of your prompt.
+
+
+git cmake vim python3 python3-venv
+
+
+Docker Containers:
+DS-R1-528
+docker run --name llama-DS-R1-ser --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/DS-R1-528-IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf --ctx-size 65536 -ctk q8_0 -mla 3 -fa -amb 512 -fmoe --n-gpu-layers 63 -b 4096 -ub 4096 --override-tensor exps=CPU --parallel 1 --threads 32 -ser 6,1 -cnv -rtr --host 0.0.0.0 --alias "DS-R1-ser"
+
+
+GLM-4-IQ4-K
+docker run --name llama-glm4-IQ4-K --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/GLM-4/GLM-4.5-IQ4_K-00001-of-00005.gguf --ctx-size $((1024*32)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 93 -rtr -cnv --host 0.0.0.0 --alias "GLM4-IQ4-K"
+
+
+GLM-4-IQ2-KS
+docker run --name llama-glm4-IQ2-KS --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/GLM-4/GLM-4.5-IQ2_KS-00001-of-00003.gguf --ctx-size $((1024*32)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 93 -rtr -cnv --host 0.0.0.0 --alias "GLM4-IQ2-KS"
+
+
+./build/bin/llama-cli --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 61 -rtr -cnv
+
+Kimi-K2-Inst-IQ2-KS
+docker run --name llama-Kimi-K2-Inst-IQ2-KS --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 61 -rtr -cnv --host 0.0.0.0 --alias "Kimi-K2-Inst-IQ2-KS"
diff --git a/create_new_image.sh b/create_new_image.sh
new file mode 100755
index 0000000..12d6318
--- /dev/null
+++ b/create_new_image.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+#
+# Build a dated ik_llama.cpp CUDA image and record its tag in env.sh.
+#
+# A throwaway container is started from BASE_IMAGE, ik_llama.cpp is cloned and
+# compiled inside it, and the stopped container is committed as
+# ik_llama_dw:DATE. env.sh is rewritten so that launcher scripts pick up the
+# new tag through IK_IMAGE.
+#
+# Requires: docker with NVIDIA GPU support and network access for apt and git.
+set -euo pipefail
+
+export BASE_IMAGE="nvcr.io/nvidia/cuda:13.2.0-cudnn-devel-ubuntu24.04"
+
+# Take the date exactly once. The build can run past midnight, and calling
+# date again for commit and rm would then name a container that does not exist.
+build_date=$(date +%Y-%m-%d)
+readonly CONTAINER="ik_llama_dw_${build_date}"
+readonly IMAGE="ik_llama_dw:${build_date}"
+
+# env.sh lives next to this script, wherever the script is invoked from.
+script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+
+# Drop the build container on every exit path so a failed build does not block
+# the next run with a name conflict. Cleanup is best effort, hence the true.
+trap 'docker rm "$CONTAINER" >/dev/null 2>&1 || true' EXIT
+
+# set -e inside the container stops at the first failing step, which makes
+# docker run fail and keeps a broken build from being committed below.
+docker run --name "$CONTAINER" --gpus=all "$BASE_IMAGE" /bin/bash -c \
+'set -e
+apt-get update
+apt-get --yes upgrade
+apt-get --yes install build-essential git libcurl4-openssl-dev curl libgomp1 cmake
+git clone https://github.com/ikawrakow/ik_llama.cpp
+cd ik_llama.cpp
+cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON
+cmake --build build --config Release -j$(nproc)'
+docker commit "$CONTAINER" "$IMAGE"
+
+# Publish the new tag for the other scripts.
+cat > "${script_dir}/env.sh" << EOF
+#!/bin/bash
+#updated from create_new_image.sh
+export IK_IMAGE=$IMAGE
+EOF
diff --git a/env.sh b/env.sh
new file mode 100755
index 0000000..efcaafc
--- /dev/null
+++ b/env.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+#updated from create_new_image.sh
+export IK_IMAGE=ik_llama_dw:2026-03-18
diff --git a/qwen3.5-coder-next.sh b/qwen3.5-coder-next.sh
new file mode 100755
index 0000000..0a84d19
--- /dev/null
+++ b/qwen3.5-coder-next.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+#
+# Serve Qwen3-Coder-Next through llama-server from the image named in env.sh.
+#
+# MODEL_FOLDER is mounted at /model in the container, which uses the host
+# network and is asked to listen on port 8654.
+set -euo pipefail
+
+MODEL_NAME="Qwen3-Coder-Next"
+MODEL="Qwen3-Coder-Next-IQ4_KSS.gguf"
+MODEL_FOLDER="/sam4t/ENC/LLM/Qwen3-Coder-Next"
+
+# Grab the container image name. env.sh sits next to this script and is
+# rewritten by create_new_image.sh, so resolve it relative to the script.
+script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=env.sh
+source "${script_dir}/env.sh"
+: "${IK_IMAGE:?IK_IMAGE is not set, run create_new_image.sh first}"
+printf '%s\n' "$IK_IMAGE"
+
+docker run --name "ik_${MODEL_NAME}" --network host -v "${MODEL_FOLDER}:/model" --gpus=all "$IK_IMAGE" /ik_llama.cpp/build/bin/llama-server \
+  --model "/model/${MODEL}" \
+  -ctk q8_0 -ctv q8_0 \
+  --parallel 1 --threads 32 \
+  --port 8654 -ngl 99 \
+  --alias "$MODEL_NAME" \
+  --no-mmap --jinja \
+  -b 4096 -ub 4096 \
+  -ns 1 -np 1 -cram 32000 \
+  -cram-n-min 128 -crs .9 \
+  --ctx-checkpoints 128