First major rewrite

2026-03-18 17:08:46 -05:00 · 2026-03-18 17:08:46 -05:00 · 38d8f2eccc
commit 38d8f2eccc
parent 072dd57439
4 changed files with 98 additions and 0 deletions
--- a/53
+++ b/53
@ -0,0 +1,53 @@
+Running ubergarm's models:
+
+--ctx-size $((1024*16))       Context Size
+-ctk q8_0                     KV cache data type for K (default: f16)
+-mla 3                        enable MLA
+-fa                           enable Flash Attention
+-amb 512                      max batch size for attention computations (default: 0)
+-fmoe                         enable fused MoE (default: disabled)
+--n-gpu-layers 63             number of layers to store in VRAM
+-b 4096                       logical maximum batch size (default: 2048)
+-ub 4096                      physical maximum batch size (default: 512)
+--override-tensor exps=CPU 
+--parallel 1 
+--threads 32                  number of threads to use during generation (default: 32)
+--threads-batch N             number of threads to use during batch and prompt processing (default: same as --threads) 
+-cnv
+
+-ser 6,1                      smart expert reduction (only works for DeepSeek models)
+
+
+DS-R1-528
+./build/bin/llama-cli --model /models/DS-R1-528-IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf --ctx-size 65536 -ctk q8_0 -mla 3 -fa -amb 512 -fmoe --n-gpu-layers 63 -b 4096 -ub 4096 --override-tensor exps=CPU --parallel 1 --threads 32 -ser 6,1 -cnv
+
+
+Kimi-K2-Instruct-GGUF
+./build/bin/llama-cli --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 63 -cnv
+
+
+GLM-4 
+./build/bin/llama-cli --model /models/GLM-4/GLM-4.5-IQ4_K-00001-of-00005.gguf --ctx-size $((1024*16)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 --threads-batch 64 -ngl 93 -cnv -p 'You are a helpful assistant'
+To disable thinking, append /nothink (spelled without an underscore) to the end of your prompt.
+
+
+git cmake vim python3 python3-venv
+
+
+Docker Containers:
+DS-R1-528
+docker run --name llama-DS-R1-ser --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/DS-R1-528-IQ3_KS/DeepSeek-R1-0528-IQ3_KS-00001-of-00007.gguf --ctx-size 65536 -ctk q8_0 -mla 3 -fa -amb 512 -fmoe --n-gpu-layers 63 -b 4096 -ub 4096 --override-tensor exps=CPU --parallel 1 --threads 32 -ser 6,1 -cnv -rtr --host 0.0.0.0 --alias "DS-R1-ser"
+
+
+GLM-4-IQ4-K
+docker run --name llama-glm4-IQ4-K --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/GLM-4/GLM-4.5-IQ4_K-00001-of-00005.gguf --ctx-size $((1024*32)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 93 -rtr -cnv --host 0.0.0.0 --alias "GLM4-IQ4-K"
+
+
+GLM-4-IQ2-KS
+docker run --name llama-glm4-IQ2-KS --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/GLM-4/GLM-4.5-IQ2_KS-00001-of-00003.gguf --ctx-size $((1024*32)) -ctk q8_0 -ctv q8_0 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 93 -rtr -cnv --host 0.0.0.0 --alias "GLM4-IQ2-KS"
+
+
+./build/bin/llama-cli --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 61 -rtr -cnv
+
+Kimi-K2-Inst-IQ2-KS
+docker run --name llama-Kimi-K2-Inst-IQ2-KS --network llms -v /sam4t/LLM:/models --device=nvidia.com/gpu=all -w /ik_llama.cpp --entrypoint ./build/bin/llama-server my-llama:1.0 --model /models/Kimi-K2-Instruct-GGUF/Kimi-K2-Instruct-IQ2_KS-00001-of-00007.gguf --ctx-size $((1024*32)) -ctk q8_0 -mla 3 -fa -fmoe --override-tensor exps=CPU --parallel 1 --threads 32 -ngl 61 -rtr -cnv --host 0.0.0.0 --alias "Kimi-K2-Inst-IQ2-KS"
--- a/create_new_image.sh
+++ b/create_new_image.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+# Build ik_llama.cpp inside a CUDA container, commit it as ik_llama_dw:<date>, then write the tag to env.sh.
+set -euo pipefail  # with "bash -e" below: stop at the first failure so a broken build is never committed
+export BASE_IMAGE="nvcr.io/nvidia/cuda:13.2.0-cudnn-devel-ubuntu24.04"
+BUILD_DATE=$(date +%Y-%m-%d)  # read the clock once: a build that crosses midnight must keep one name
+CONTAINER="ik_llama_dw_$BUILD_DATE"
+IMAGE="ik_llama_dw:$BUILD_DATE"
+trap 'docker rm -f "$CONTAINER" >/dev/null 2>&1 || true' EXIT  # remove the build container on any exit
+docker run --name "$CONTAINER" --gpus=all "$BASE_IMAGE" /bin/bash -ec \
+'apt-get update
+apt-get --yes upgrade
+apt-get --yes install build-essential git libcurl4-openssl-dev curl libgomp1 cmake
+git clone https://github.com/ikawrakow/ik_llama.cpp
+cd ik_llama.cpp
+cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON
+cmake --build build --config Release -j$(nproc)'
+docker commit "$CONTAINER" "$IMAGE"
+cat > env.sh << EOF
+#!/bin/bash
+#updated from create_new_image.sh
+export IK_IMAGE=$IMAGE
+EOF
--- a/env.sh
+++ b/env.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+# Generated by create_new_image.sh (overwritten on each run); sourced by qwen3.5-coder-next.sh for the image tag.
+export IK_IMAGE=ik_llama_dw:2026-03-18
--- a/qwen3.5-coder-next.sh
+++ b/qwen3.5-coder-next.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+# Start llama-server for Qwen3-Coder-Next in a container created from the image named by IK_IMAGE in ./env.sh.
+set -euo pipefail  # fail fast if env.sh is missing instead of handing docker a mangled command line
+MODEL_NAME="Qwen3-Coder-Next"
+MODEL="Qwen3-Coder-Next-IQ4_KSS.gguf"
+MODEL_FOLDER="/sam4t/ENC/LLM/Qwen3-Coder-Next"
+source ./env.sh  # sets IK_IMAGE; env.sh is written by create_new_image.sh
+: "${IK_IMAGE:?not set - run create_new_image.sh}"  # empty image => docker would use the server path as image
+echo "$IK_IMAGE"
+docker run --name "ik_$MODEL_NAME" --network host -v "$MODEL_FOLDER:/model" --gpus=all "$IK_IMAGE" /ik_llama.cpp/build/bin/llama-server \
+	--model "/model/$MODEL" \
+	-ctk q8_0 -ctv q8_0 \
+	--parallel 1 --threads 32 \
+	--port 8654 -ngl 99 \
+	--alias "$MODEL_NAME" \
+	--no-mmap --jinja \
+	-b 4096 -ub 4096 \
+	-ns 1 -np 1 -cram 32000 \
+	-cram-n-min 128 -crs .9 \
+	--ctx-checkpoints 128