From 9b7db9bc3f2a3d56384a5d53850dd6f1bd9d5242 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Thu, 19 Mar 2026 06:57:01 +0100 Subject: [PATCH] Better --n-cpu-moe (#1464) --- common/common.cpp | 11 ++-- common/common.h | 1 + include/llama.h | 1 + src/llama-load-tensors.cpp | 116 +++++++++++++++++++++++++++++++++---- src/llama-model-loader.cpp | 3 +- src/llama-model-loader.h | 4 +- src/llama-quantize.cpp | 2 +- src/llama.cpp | 3 +- 8 files changed, 123 insertions(+), 18 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1f9d2b32..68d77295 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1493,10 +1493,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - for (int32_t l = 0; l < n_layers; ++l) { - std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)"; - params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()}); - } + params.ncmoe = n_layers; + //for (int32_t l = 0; l < n_layers; ++l) { + // std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)"; + // params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()}); + //} return true; } if (arg == "--no-mmap") { @@ -3242,6 +3243,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params mparams.rpc_servers = params.rpc_servers.c_str(); mparams.main_gpu = params.main_gpu; mparams.max_gpu = params.max_gpu; + mparams.ncmoe = params.ncmoe; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; @@ -4319,6 +4321,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? 
"true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu); + fprintf(stream, "ncmoe: %d # default: 0\n", params.ncmoe); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); diff --git a/common/common.h b/common/common.h index 0a75a59d..3ae18ceb 100644 --- a/common/common.h +++ b/common/common.h @@ -225,6 +225,7 @@ struct gpt_params { int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph" + int32_t ncmoe = 0; // number of layers whose MoE expert tensors are kept in CPU memory instead of VRAM float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width diff --git a/include/llama.h b/include/llama.h index 69a4707e..9fb7e77d 100644 --- a/include/llama.h +++ b/include/llama.h @@ -370,6 +370,7 @@ extern "C" { // LLAMA_SPLIT_LAYER: ignored int32_t main_gpu; int32_t max_gpu; + int32_t ncmoe; // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index f950143f..cb382194 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -183,6 +183,8 @@ struct create_tensors_helper : public create_tensors_helper_interface { std::unordered_set split_tensors; + std::vector> overrides; + inline ggml_context * ctx_for_buft(ggml_backend_buffer_type_t buft) { if (auto it = ctx_map.find(buft); it != ctx_map.end()) return it->second; @@ -213,6 +215,91 @@
create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod buft_layer_count[model.buft_layer[i].buft_matrix]++; } + if (ml.tensor_buft_overrides) { + for (const auto * o = ml.tensor_buft_overrides; o->pattern != nullptr; ++o) { + overrides.emplace_back(std::make_pair(std::regex(o->pattern), o->buft)); + } + } + + if (ml.ncmoe > 0) { + auto buft = ggml_backend_cpu_buffer_type(); + if (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH || ml.ncmoe >= n_layer || model.devices.size() < 2) { + int nmax = std::min(ml.ncmoe, n_layer); + for (int i = 0; i < nmax; ++i) { + std::string pattern = "blk\\." + std::to_string(i) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)"; + this->overrides.emplace_back(std::make_pair(std::regex(pattern), buft)); + } + } + else if (model.split_mode == LLAMA_SPLIT_MODE_LAYER) { + std::vector counts(model.devices.size(), 0); + int nbad = 0; + for (int i = 0; i < n_layer; ++i) { + if (model.default_layer_device[i] >= 0 && model.default_layer_device[i] < (int)model.devices.size()) { + ++counts[model.default_layer_device[i]]; + } else { + LLAMA_LOG_WARN("%s: default device for layer %d is %d?\n", __func__, i, model.default_layer_device[i]); + ++nbad; + } + } + if (nbad > 0) { + throw std::runtime_error("Unexpected device configuration"); + } + std::vector n_override(counts.size()); + printf("================= %s: split mode layer with ncmoe = %d, %d devices\n", __func__, ml.ncmoe, (int)model.devices.size()); + int ntot = 0; + for (int i = 0; i < int(counts.size()); ++i) { + float fraction = 1.f*counts[i]/n_layer; + n_override[i] = std::roundf(fraction*ml.ncmoe); + ntot += n_override[i]; + } + while (ntot > ml.ncmoe) { + float best_err = -1e30; int ibest = -1; + for (int i = 0; i < int(counts.size()); ++i) { + if (n_override[i] == 0) continue; + float n_want = 1.f*counts[i]*ml.ncmoe/n_layer; + float err = n_override[i] - 1 - n_want; + if (err > best_err) { + best_err = err; ibest = i; + } 
+ } + if (ibest < 0) { // shouldn't happen + break; + } + --n_override[ibest]; + --ntot; + } + while (ntot < ml.ncmoe) { + float best_err = 1e30; int ibest = -1; + for (int i = 0; i < int(counts.size()); ++i) { + if (n_override[i] >= counts[i]) continue; + float n_want = 1.f*counts[i]*ml.ncmoe/n_layer; + float err = n_override[i] + 1 - n_want; + if (err < best_err) { + best_err = err; ibest = i; + } + } + if (ibest < 0) { // shouldn't happen + break; + } + ++n_override[ibest]; + ++ntot; + } + for (int i = 0; i < int(counts.size()); ++i) { + printf(" device %d: %d layers -> %d overrides\n", i, counts[i], n_override[i]); + } + // it is better to go backwards to avoid (or at least reduce) issues when there are layers without MoE tensors + for (int i = n_layer-1; i >= 0; --i) { + int id = model.default_layer_device[i]; + if (n_override[id] > 0) { + std::string pattern = "blk\\." + std::to_string(i) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)"; + printf("Adding override %s=%s\n", pattern.c_str(), ggml_backend_buft_name(buft)); + this->overrides.emplace_back(std::make_pair(std::regex(pattern), buft)); + --n_override[id]; + } + } + } + } + auto n_tensors = ml.n_tensors; if (ml.merge_qkv) n_tensors += n_layer; if (ml.merge_up_gate_exps) n_tensors += n_layer; @@ -310,18 +397,27 @@ static std::vector create_split(int nr, int granularity, const std::vector< } ggml_context * create_tensors_helper::get_context_for_tensor(ggml_context * ctx, const std::string & name) { - if (ml.tensor_buft_overrides) { - for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { - std::regex pattern(overrides->pattern); - if (std::regex_search(name, pattern)) { - const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str()); - const size_t nbytes = cur ? 
ggml_nbytes(cur) : 0; - LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(overrides->buft)); - ctx = ctx_for_buft(overrides->buft); - break; - } + for (auto & o : overrides) { + if (std::regex_search(name, o.first)) { + const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str()); + const size_t nbytes = cur ? ggml_nbytes(cur) : 0; + LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(o.second)); + ctx = ctx_for_buft(o.second); + break; } } + //if (ml.tensor_buft_overrides) { + // for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { + // std::regex pattern(overrides->pattern); + // if (std::regex_search(name, pattern)) { + // const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str()); + // const size_t nbytes = cur ? ggml_nbytes(cur) : 0; + // LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(overrides->buft)); + // ctx = ctx_for_buft(overrides->buft); + // break; + // } + // } + //} return ctx; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 4437c1ec..39b9fd50 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -204,7 +204,7 @@ namespace GGUFMeta { }; } -llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, +llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { @@ -493,6 +493,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, use_mmap = false; } + this->ncmoe = ncmoe; 
this->use_mmap = use_mmap; this->check_tensors = check_tensors; this->repack_tensors = repack_tensors; diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index e9d72a43..21b4050e 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -40,6 +40,8 @@ struct llama_model_loader { int64_t n_elements = 0; size_t n_bytes = 0; + int ncmoe = 0; + bool use_mmap = false; bool check_tensors; bool repack_tensors = false; @@ -80,7 +82,7 @@ struct llama_model_loader { std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp, + llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index e0d5383b..aa8b235b 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1022,7 +1022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s auto v = (std::vector*)params->kv_overrides; kv_overrides = v->data(); } - llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false, + llama_model_loader ml(fname_inp, 0, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false, /* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching diff --git a/src/llama.cpp b/src/llama.cpp index d4f7f637..db099631 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2378,7 +2378,7 @@ static bool llm_load_tensors( // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { try 
{ - llama_model_loader ml(fname, params.use_mmap, params.check_tensors, + llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors, params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps, params.kv_overrides, params.tensor_buft_overrides); @@ -4394,6 +4394,7 @@ struct llama_model_params llama_model_default_params() { /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.max_gpu =*/ 0, + /*.ncmoe =*/ 0, /*.tensor_split =*/ nullptr, /*.rpc_servers =*/ nullptr, /*.progress_callback =*/ nullptr,