Better --n-cpu-moe (#1464)

This commit is contained in:
Kawrakow 2026-03-19 06:57:01 +01:00 committed by GitHub
parent b8fa7936bf
commit 9b7db9bc3f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 123 additions and 18 deletions

View File

@ -1493,10 +1493,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
for (int32_t l = 0; l < n_layers; ++l) {
std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
}
params.ncmoe = n_layers;
//for (int32_t l = 0; l < n_layers; ++l) {
// std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
// params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
//}
return true;
}
if (arg == "--no-mmap") {
@ -3242,6 +3243,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.max_gpu = params.max_gpu;
mparams.ncmoe = params.ncmoe;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
@ -4319,6 +4321,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
fprintf(stream, "ncmoe: %d # default: 0\n", params.ncmoe);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);

View File

@ -225,6 +225,7 @@ struct gpt_params {
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
int32_t ncmoe = 0; // number of layers whose MoE expert tensors (ffn_*_exps) are kept in CPU buffers (0 = no override)
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width

View File

@ -370,6 +370,7 @@ extern "C" {
// LLAMA_SPLIT_LAYER: ignored
int32_t main_gpu;
int32_t max_gpu;
// number of layers whose MoE expert tensors are overridden to the CPU buffer type (0 = no override)
int32_t ncmoe;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;

View File

@ -183,6 +183,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
std::unordered_set<ggml_tensor *> split_tensors;
std::vector<std::pair<std::regex, ggml_backend_buffer_type_t>> overrides;
inline ggml_context * ctx_for_buft(ggml_backend_buffer_type_t buft) {
if (auto it = ctx_map.find(buft); it != ctx_map.end()) return it->second;
@ -213,6 +215,91 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
buft_layer_count[model.buft_layer[i].buft_matrix]++;
}
if (ml.tensor_buft_overrides) {
for (const auto * o = ml.tensor_buft_overrides; o->pattern != nullptr; ++o) {
overrides.emplace_back(std::make_pair(std::regex(o->pattern), o->buft));
}
}
// --n-cpu-moe handling: append regex overrides that force the MoE expert tensors
// (ffn_{up,down,gate,gate_up}_exps.weight) of ml.ncmoe layers into the CPU buffer type.
// These are appended after any user-supplied tensor overrides, so on a name that matches
// both, the user-supplied override wins (first match is used when overrides are applied).
if (ml.ncmoe > 0) {
    auto buft = ggml_backend_cpu_buffer_type();

    // Regex matching the expert weight tensors of layer il.
    auto exps_pattern = [](int il) {
        return "blk\\." + std::to_string(il) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
    };

    if (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH || ml.ncmoe >= n_layer || model.devices.size() < 2) {
        // Attn/graph split, fewer than two devices, or every layer requested:
        // simply override the first min(ncmoe, n_layer) layers.
        const int nmax = std::min(ml.ncmoe, n_layer);
        for (int il = 0; il < nmax; ++il) {
            this->overrides.emplace_back(std::regex(exps_pattern(il)), buft);
        }
    }
    else if (model.split_mode == LLAMA_SPLIT_MODE_LAYER) {
        // Layer split across several devices: spread the ncmoe overrides over the devices
        // in proportion to the number of layers each device holds by default.
        const int n_dev = (int)model.devices.size();

        // counts[d] = number of layers whose default device is d
        std::vector<int> counts(n_dev, 0);
        int nbad = 0;
        for (int il = 0; il < n_layer; ++il) {
            const int dev = model.default_layer_device[il];
            if (dev >= 0 && dev < n_dev) {
                ++counts[dev];
            } else {
                LLAMA_LOG_WARN("%s: default device for layer %d is %d?\n", __func__, il, dev);
                ++nbad;
            }
        }
        // All offending layers are reported above before giving up.
        if (nbad > 0) {
            throw std::runtime_error("Unexpected device configuration");
        }

        LLAMA_LOG_INFO("================= %s: split mode layer with ncmoe = %d, %d devices\n", __func__, ml.ncmoe, n_dev);

        // First guess: round each device's proportional share to the nearest integer.
        std::vector<int> n_override(n_dev);
        int ntot = 0;
        for (int d = 0; d < n_dev; ++d) {
            const float fraction = 1.f*counts[d]/n_layer;
            n_override[d] = static_cast<int>(std::roundf(fraction*ml.ncmoe));
            ntot += n_override[d];
        }

        // Rounding gave too many overrides: repeatedly take one away from the device that
        // remains the furthest above its ideal share after the removal.
        while (ntot > ml.ncmoe) {
            float best_err = -1e30f;
            int   ibest    = -1;
            for (int d = 0; d < n_dev; ++d) {
                if (n_override[d] == 0) continue;
                const float n_want = 1.f*counts[d]*ml.ncmoe/n_layer;
                const float err    = n_override[d] - 1 - n_want;
                if (err > best_err) {
                    best_err = err; ibest = d;
                }
            }
            if (ibest < 0) { // shouldn't happen
                break;
            }
            --n_override[ibest];
            --ntot;
        }

        // Rounding gave too few overrides: repeatedly give one to the device that remains
        // the furthest below its ideal share after the addition, never exceeding the number
        // of layers the device actually holds.
        while (ntot < ml.ncmoe) {
            float best_err = 1e30f;
            int   ibest    = -1;
            for (int d = 0; d < n_dev; ++d) {
                if (n_override[d] >= counts[d]) continue;
                const float n_want = 1.f*counts[d]*ml.ncmoe/n_layer;
                const float err    = n_override[d] + 1 - n_want;
                if (err < best_err) {
                    best_err = err; ibest = d;
                }
            }
            if (ibest < 0) { // shouldn't happen
                break;
            }
            ++n_override[ibest];
            ++ntot;
        }

        for (int d = 0; d < n_dev; ++d) {
            LLAMA_LOG_INFO(" device %d: %d layers -> %d overrides\n", d, counts[d], n_override[d]);
        }

        // it is better to go backwards to avoid (or at least reduce) issues when there are layers without MoE tensors
        for (int il = n_layer-1; il >= 0; --il) {
            // dev is a valid index here: out-of-range devices caused the throw above
            const int dev = model.default_layer_device[il];
            if (n_override[dev] > 0) {
                const std::string pattern = exps_pattern(il);
                LLAMA_LOG_INFO("Adding override %s=%s\n", pattern.c_str(), ggml_backend_buft_name(buft));
                this->overrides.emplace_back(std::regex(pattern), buft);
                --n_override[dev];
            }
        }
    }
}
auto n_tensors = ml.n_tensors;
if (ml.merge_qkv) n_tensors += n_layer;
if (ml.merge_up_gate_exps) n_tensors += n_layer;
@ -310,18 +397,27 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
}
ggml_context * create_tensors_helper::get_context_for_tensor(ggml_context * ctx, const std::string & name) {
if (ml.tensor_buft_overrides) {
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
if (std::regex_search(name, pattern)) {
const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(overrides->buft));
ctx = ctx_for_buft(overrides->buft);
break;
}
for (auto & o : overrides) {
if (std::regex_search(name, o.first)) {
const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(o.second));
ctx = ctx_for_buft(o.second);
break;
}
}
//if (ml.tensor_buft_overrides) {
// for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
// std::regex pattern(overrides->pattern);
// if (std::regex_search(name, pattern)) {
// const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
// const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
// LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(overrides->buft));
// ctx = ctx_for_buft(overrides->buft);
// break;
// }
// }
//}
return ctx;
}

View File

@ -204,7 +204,7 @@ namespace GGUFMeta {
};
}
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors,
llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors,
bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
@ -493,6 +493,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
use_mmap = false;
}
this->ncmoe = ncmoe;
this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
this->repack_tensors = repack_tensors;

View File

@ -40,6 +40,8 @@ struct llama_model_loader {
int64_t n_elements = 0;
size_t n_bytes = 0;
// number of layers whose MoE expert tensors get a CPU buffer override (0 = no override)
int ncmoe = 0;
bool use_mmap = false;
bool check_tensors;
bool repack_tensors = false;
@ -80,7 +82,7 @@ struct llama_model_loader {
std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
bool merge_qkv, bool merge_up_gate_exps,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

View File

@ -1022,7 +1022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
kv_overrides = v->data();
}
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
llama_model_loader ml(fname_inp, 0, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
/* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching

View File

@ -2378,7 +2378,7 @@ static bool llm_load_tensors(
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
try {
llama_model_loader ml(fname, params.use_mmap, params.check_tensors,
llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors,
params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps,
params.kv_overrides, params.tensor_buft_overrides);
@ -4394,6 +4394,7 @@ struct llama_model_params llama_model_default_params() {
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.max_gpu =*/ 0,
/*.ncmoe =*/ 0,
/*.tensor_split =*/ nullptr,
/*.rpc_servers =*/ nullptr,
/*.progress_callback =*/ nullptr,