From 9b7db9bc3f2a3d56384a5d53850dd6f1bd9d5242 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Thu, 19 Mar 2026 06:57:01 +0100
Subject: [PATCH] Better --n-cpu-moe (#1464)

---
 common/common.cpp          |  11 ++--
 common/common.h            |   1 +
 include/llama.h            |   1 +
 src/llama-load-tensors.cpp | 116 +++++++++++++++++++++++++++++++++----
 src/llama-model-loader.cpp |   3 +-
 src/llama-model-loader.h   |   4 +-
 src/llama-quantize.cpp     |   2 +-
 src/llama.cpp              |   3 +-
 8 files changed, 123 insertions(+), 18 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 1f9d2b32..68d77295 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1493,10 +1493,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        for (int32_t l = 0; l < n_layers; ++l) {
-            std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
-            params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
-        }
+        params.ncmoe = n_layers;
+        //for (int32_t l = 0; l < n_layers; ++l) {
+        //    std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
+        //    params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
+        //}
         return true;
     }
     if (arg == "--no-mmap") {
@@ -3242,6 +3243,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
     mparams.rpc_servers     = params.rpc_servers.c_str();
     mparams.main_gpu        = params.main_gpu;
     mparams.max_gpu         = params.max_gpu;
+    mparams.ncmoe           = params.ncmoe;
     mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
@@ -4319,6 +4321,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
+    fprintf(stream, "ncmoe: %d # default: 0\n", params.ncmoe);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
diff --git a/common/common.h b/common/common.h
index 0a75a59d..3ae18ceb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -225,6 +225,7 @@ struct gpt_params {
     int32_t n_gpu_layers          =      -1; // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu              =       0; // the GPU that is used for scratch and small tensors
     int32_t max_gpu               =       0; // max number of GPUs to use at a time for split mode "graph"
+    int32_t ncmoe                 =       0; // number of layers whose MoE expert tensors are overridden to CPU buffers (0 = none)
     float   tensor_split[128]     =     {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n            =       1; // group-attention factor
     int32_t grp_attn_w            =     512; // group-attention width
diff --git a/include/llama.h b/include/llama.h
index 69a4707e..9fb7e77d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -370,6 +370,7 @@ extern "C" {
         // LLAMA_SPLIT_LAYER: ignored
         int32_t main_gpu;
         int32_t max_gpu;
+        int32_t ncmoe;
 
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index f950143f..cb382194 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -183,6 +183,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
 
     std::unordered_set<ggml_tensor *> split_tensors;
 
+    std::vector<std::pair<std::regex, ggml_backend_buffer_type_t>> overrides;
+
     inline ggml_context * ctx_for_buft(ggml_backend_buffer_type_t buft) {
         if (auto it = ctx_map.find(buft); it != ctx_map.end()) return it->second;
 
@@ -213,6 +215,91 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
         buft_layer_count[model.buft_layer[i].buft_matrix]++;
     }
 
+    if (ml.tensor_buft_overrides) {
+        for (const auto * o = ml.tensor_buft_overrides; o->pattern != nullptr; ++o) {
+            overrides.emplace_back(std::make_pair(std::regex(o->pattern), o->buft));
+        }
+    }
+
+    if (ml.ncmoe > 0) {
+        auto buft = ggml_backend_cpu_buffer_type();
+        if (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH || ml.ncmoe >= n_layer || model.devices.size() < 2) {
+            int nmax = std::min(ml.ncmoe, n_layer);
+            for (int i = 0; i < nmax; ++i) {
+                std::string pattern = "blk\\." + std::to_string(i) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
+                this->overrides.emplace_back(std::make_pair(std::regex(pattern), buft));
+            }
+        }
+        else if (model.split_mode == LLAMA_SPLIT_MODE_LAYER) {
+            std::vector<int> counts(model.devices.size(), 0);
+            int nbad = 0;
+            for (int i = 0; i < n_layer; ++i) {
+                if (model.default_layer_device[i] >= 0 && model.default_layer_device[i] < (int)model.devices.size()) {
+                    ++counts[model.default_layer_device[i]];
+                } else {
+                    LLAMA_LOG_WARN("%s: default device for layer %d is %d?\n", __func__, i, model.default_layer_device[i]);
+                    ++nbad;
+                }
+            }
+            if (nbad > 0) {
+                throw std::runtime_error("Unexpected device configuration");
+            }
+            std::vector<int> n_override(counts.size());
+            printf("================= %s: split mode layer with ncmoe = %d, %d devices\n", __func__, ml.ncmoe, (int)model.devices.size());
+            int ntot = 0;
+            for (int i = 0; i < int(counts.size()); ++i) {
+                float fraction = 1.f*counts[i]/n_layer;
+                n_override[i] = std::roundf(fraction*ml.ncmoe);
+                ntot += n_override[i];
+            }
+            while (ntot > ml.ncmoe) {
+                float best_err = -1e30; int ibest = -1;
+                for (int i = 0; i < int(counts.size()); ++i) {
+                    if (n_override[i] == 0) continue;
+                    float n_want = 1.f*counts[i]*ml.ncmoe/n_layer;
+                    float err = n_override[i] - 1 - n_want;
+                    if (err > best_err) {
+                        best_err = err; ibest = i;
+                    }
+                }
+                if (ibest < 0) { // shouldn't happen
+                    break;
+                }
+                --n_override[ibest];
+                --ntot;
+            }
+            while (ntot < ml.ncmoe) {
+                float best_err = 1e30; int ibest = -1;
+                for (int i = 0; i < int(counts.size()); ++i) {
+                    if (n_override[i] >= counts[i]) continue;
+                    float n_want = 1.f*counts[i]*ml.ncmoe/n_layer;
+                    float err = n_override[i] + 1 - n_want;
+                    if (err < best_err) {
+                        best_err = err; ibest = i;
+                    }
+                }
+                if (ibest < 0) { // shouldn't happen
+                    break;
+                }
+                ++n_override[ibest];
+                ++ntot;
+            }
+            for (int i = 0; i < int(counts.size()); ++i) {
+                printf("    device %d: %d layers -> %d overrides\n", i, counts[i], n_override[i]);
+            }
+            // it is better to go backwards to avoid (or at least reduce) issues when there are layers without MoE tensors
+            for (int i = n_layer-1; i >= 0; --i) {
+                int id = model.default_layer_device[i];
+                if (n_override[id] > 0) {
+                    std::string pattern = "blk\\." + std::to_string(i) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
+                    printf("Adding override %s=%s\n", pattern.c_str(), ggml_backend_buft_name(buft));
+                    this->overrides.emplace_back(std::make_pair(std::regex(pattern), buft));
+                    --n_override[id];
+                }
+            }
+        }
+    }
+
     auto n_tensors = ml.n_tensors;
     if (ml.merge_qkv) n_tensors += n_layer;
     if (ml.merge_up_gate_exps) n_tensors += n_layer;
@@ -310,18 +397,27 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
 }
 
 ggml_context * create_tensors_helper::get_context_for_tensor(ggml_context * ctx, const std::string & name) {
-    if (ml.tensor_buft_overrides) {
-        for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
-            std::regex pattern(overrides->pattern);
-            if (std::regex_search(name, pattern)) {
-                const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
-                const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
-                LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(overrides->buft));
-                ctx = ctx_for_buft(overrides->buft);
-                break;
-            }
+    for (auto & o : overrides) {
+        if (std::regex_search(name, o.first)) {
+            const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
+            const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
+            LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(o.second));
+            ctx = ctx_for_buft(o.second);
+            break;
         }
     }
+    //if (ml.tensor_buft_overrides) {
+    //    for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+    //        std::regex pattern(overrides->pattern);
+    //        if (std::regex_search(name, pattern)) {
+    //            const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
+    //            const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
+    //            LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(overrides->buft));
+    //            ctx = ctx_for_buft(overrides->buft);
+    //            break;
+    //        }
+    //    }
+    //}
     return ctx;
 }
 
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 4437c1ec..39b9fd50 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -204,7 +204,7 @@ namespace GGUFMeta {
     };
 }
 
-llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors,
+llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors,
         bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
@@ -493,6 +493,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
         use_mmap = false;
     }
 
+    this->ncmoe = ncmoe;
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
     this->repack_tensors = repack_tensors;
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index e9d72a43..21b4050e 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -40,6 +40,8 @@ struct llama_model_loader {
     int64_t n_elements = 0;
     size_t  n_bytes    = 0;
 
+    int     ncmoe      = 0;
+
     bool use_mmap = false;
     bool check_tensors;
     bool repack_tensors = false;
@@ -80,7 +82,7 @@ struct llama_model_loader {
     std::string arch_name;
     LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
 
-    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
+    llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
             bool merge_qkv, bool merge_up_gate_exps,
             const llama_model_kv_override * param_overrides_p,
             const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp
index e0d5383b..aa8b235b 100644
--- a/src/llama-quantize.cpp
+++ b/src/llama-quantize.cpp
@@ -1022,7 +1022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
-    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
+    llama_model_loader ml(fname_inp, 0, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
             /* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
diff --git a/src/llama.cpp b/src/llama.cpp
index d4f7f637..db099631 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2378,7 +2378,7 @@ static bool llm_load_tensors(
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.check_tensors,
+        llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors,
                 params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps,
                 params.kv_overrides, params.tensor_buft_overrides);
 
@@ -4394,6 +4394,7 @@ struct llama_model_params llama_model_default_params() {
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.max_gpu                     =*/ 0,
+        /*.ncmoe                       =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.rpc_servers                 =*/ nullptr,
         /*.progress_callback           =*/ nullptr,