mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Better --n-cpu-moe (#1464)
This commit is contained in:
parent
b8fa7936bf
commit
9b7db9bc3f
@ -1493,10 +1493,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
for (int32_t l = 0; l < n_layers; ++l) {
|
||||
std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
|
||||
params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
|
||||
}
|
||||
params.ncmoe = n_layers;
|
||||
//for (int32_t l = 0; l < n_layers; ++l) {
|
||||
// std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
|
||||
// params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
|
||||
//}
|
||||
return true;
|
||||
}
|
||||
if (arg == "--no-mmap") {
|
||||
@ -3242,6 +3243,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
|
||||
mparams.rpc_servers = params.rpc_servers.c_str();
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.max_gpu = params.max_gpu;
|
||||
mparams.ncmoe = params.ncmoe;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
@ -4319,6 +4321,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
|
||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||
fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
|
||||
fprintf(stream, "ncmoe: %d # default: 0\n", params.ncmoe);
|
||||
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||
|
||||
@ -225,6 +225,7 @@ struct gpt_params {
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
|
||||
int32_t ncmoe = 0; // number of layers whose MoE expert tensors (ffn_*_exps) are overridden to the CPU buffer type instead of being placed in VRAM
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
|
||||
@ -370,6 +370,7 @@ extern "C" {
|
||||
// LLAMA_SPLIT_LAYER: ignored
|
||||
int32_t main_gpu;
|
||||
int32_t max_gpu;
|
||||
int32_t ncmoe;
|
||||
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
@ -183,6 +183,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
|
||||
std::unordered_set<ggml_tensor *> split_tensors;
|
||||
|
||||
std::vector<std::pair<std::regex, ggml_backend_buffer_type_t>> overrides;
|
||||
|
||||
inline ggml_context * ctx_for_buft(ggml_backend_buffer_type_t buft) {
|
||||
if (auto it = ctx_map.find(buft); it != ctx_map.end()) return it->second;
|
||||
|
||||
@ -213,6 +215,91 @@ create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_mod
|
||||
buft_layer_count[model.buft_layer[i].buft_matrix]++;
|
||||
}
|
||||
|
||||
if (ml.tensor_buft_overrides) {
|
||||
for (const auto * o = ml.tensor_buft_overrides; o->pattern != nullptr; ++o) {
|
||||
overrides.emplace_back(std::make_pair(std::regex(o->pattern), o->buft));
|
||||
}
|
||||
}
|
||||
|
||||
// Apply ncmoe: append regex overrides that place the MoE expert tensors
// (blk.N.ffn_{up,down,gate,gate_up}_exps.weight) of ml.ncmoe layers in the CPU buffer type.
// The overrides are appended to `overrides`, i.e. after any entries already stored there.
if (ml.ncmoe > 0) {
    auto buft = ggml_backend_cpu_buffer_type();
    const int n_devices = (int)model.devices.size();

    // Builds the regex source matching the expert tensors of layer `il`.
    auto expert_pattern = [](int il) {
        return "blk\\." + std::to_string(il) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.weight)";
    };

    if (model.split_mode == LLAMA_SPLIT_MODE_LAYER && ml.ncmoe < n_layer && n_devices >= 2) {
        // Layer split over several devices: distribute the ncmoe overrides over the devices
        // in proportion to the number of layers assigned to each device.

        // counts[d] = number of layers whose default device is d.
        std::vector<int> counts(n_devices, 0);
        int nbad = 0;
        for (int il = 0; il < n_layer; ++il) {
            const int id = model.default_layer_device[il];
            if (id >= 0 && id < n_devices) {
                ++counts[id];
            } else {
                LLAMA_LOG_WARN("%s: default device for layer %d is %d?\n", __func__, il, id);
                ++nbad;
            }
        }
        if (nbad > 0) {
            throw std::runtime_error("Unexpected device configuration");
        }

        LLAMA_LOG_INFO("================= %s: split mode layer with ncmoe = %d, %d devices\n", __func__, ml.ncmoe, n_devices);

        // First guess: round each device's proportional share to the nearest integer.
        std::vector<int> n_override(counts.size());
        int ntot = 0;
        for (int id = 0; id < n_devices; ++id) {
            float fraction = 1.f*counts[id]/n_layer;
            n_override[id] = std::roundf(fraction*ml.ncmoe);
            ntot += n_override[id];
        }

        // Rounding may overshoot the requested total: repeatedly take one override away from
        // the device that stays furthest above its ideal share after the removal.
        while (ntot > ml.ncmoe) {
            float best_err = -1e30; int ibest = -1;
            for (int id = 0; id < n_devices; ++id) {
                if (n_override[id] == 0) continue;
                float n_want = 1.f*counts[id]*ml.ncmoe/n_layer;
                float err = n_override[id] - 1 - n_want;
                if (err > best_err) {
                    best_err = err; ibest = id;
                }
            }
            if (ibest < 0) { // shouldn't happen
                break;
            }
            --n_override[ibest];
            --ntot;
        }

        // ... or undershoot it: repeatedly give one more override to the device that stays
        // furthest below its ideal share after the addition (never more than its layer count).
        while (ntot < ml.ncmoe) {
            float best_err = 1e30; int ibest = -1;
            for (int id = 0; id < n_devices; ++id) {
                if (n_override[id] >= counts[id]) continue;
                float n_want = 1.f*counts[id]*ml.ncmoe/n_layer;
                float err = n_override[id] + 1 - n_want;
                if (err < best_err) {
                    best_err = err; ibest = id;
                }
            }
            if (ibest < 0) { // shouldn't happen
                break;
            }
            ++n_override[ibest];
            ++ntot;
        }

        for (int id = 0; id < n_devices; ++id) {
            LLAMA_LOG_INFO(" device %d: %d layers -> %d overrides\n", id, counts[id], n_override[id]);
        }

        // it is better to go backwards to avoid (or at least reduce) issues when there are layers without MoE tensors
        for (int il = n_layer-1; il >= 0; --il) {
            const int id = model.default_layer_device[il];
            if (n_override[id] > 0) {
                const std::string pattern = expert_pattern(il);
                LLAMA_LOG_INFO("Adding override %s=%s\n", pattern.c_str(), ggml_backend_buft_name(buft));
                this->overrides.emplace_back(std::make_pair(std::regex(pattern), buft));
                --n_override[id];
            }
        }
    } else {
        // Every other configuration (attn/graph split, fewer than two devices, ncmoe covering
        // all layers) overrides the first min(ncmoe, n_layer) layers. This is also the fallback
        // for any split mode not handled above, so that ncmoe is never silently ignored.
        const int nmax = std::min(ml.ncmoe, n_layer);
        for (int il = 0; il < nmax; ++il) {
            this->overrides.emplace_back(std::make_pair(std::regex(expert_pattern(il)), buft));
        }
    }
}
|
||||
|
||||
auto n_tensors = ml.n_tensors;
|
||||
if (ml.merge_qkv) n_tensors += n_layer;
|
||||
if (ml.merge_up_gate_exps) n_tensors += n_layer;
|
||||
@ -310,18 +397,27 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
|
||||
}
|
||||
|
||||
// Returns the ggml context in which the tensor called `name` should be created.
//
// The tensor name is matched against the precompiled (regex, buffer type) pairs stored in
// `overrides`, in order. On the first match the override is logged and the context belonging
// to that buffer type (via ctx_for_buft) is returned; later entries are not consulted.
// When nothing matches, the caller-supplied `ctx` is returned unchanged.
ggml_context * create_tensors_helper::get_context_for_tensor(ggml_context * ctx, const std::string & name) {
    for (const auto & [pattern, buft] : overrides) {
        if (!std::regex_search(name, pattern)) {
            continue;
        }
        // The tensor size is looked up for the log message only; a missing meta entry is reported as 0 bytes.
        const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
        const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
        LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(buft));
        ctx = ctx_for_buft(buft);
        break;
    }
    return ctx;
}
|
||||
|
||||
|
||||
@ -204,7 +204,7 @@ namespace GGUFMeta {
|
||||
};
|
||||
}
|
||||
|
||||
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors,
|
||||
llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors,
|
||||
bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
@ -493,6 +493,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
|
||||
use_mmap = false;
|
||||
}
|
||||
|
||||
this->ncmoe = ncmoe;
|
||||
this->use_mmap = use_mmap;
|
||||
this->check_tensors = check_tensors;
|
||||
this->repack_tensors = repack_tensors;
|
||||
|
||||
@ -40,6 +40,8 @@ struct llama_model_loader {
|
||||
int64_t n_elements = 0;
|
||||
size_t n_bytes = 0;
|
||||
|
||||
int ncmoe = 0;
|
||||
|
||||
bool use_mmap = false;
|
||||
bool check_tensors;
|
||||
bool repack_tensors = false;
|
||||
@ -80,7 +82,7 @@ struct llama_model_loader {
|
||||
std::string arch_name;
|
||||
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
||||
|
||||
llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
|
||||
llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
|
||||
bool merge_qkv, bool merge_up_gate_exps,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
|
||||
|
||||
@ -1022,7 +1022,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
|
||||
kv_overrides = v->data();
|
||||
}
|
||||
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
|
||||
llama_model_loader ml(fname_inp, 0, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
|
||||
/* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
|
||||
@ -2378,7 +2378,7 @@ static bool llm_load_tensors(
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
||||
try {
|
||||
llama_model_loader ml(fname, params.use_mmap, params.check_tensors,
|
||||
llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors,
|
||||
params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps,
|
||||
params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
@ -4394,6 +4394,7 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.max_gpu =*/ 0,
|
||||
/*.ncmoe =*/ 0,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.rpc_servers =*/ nullptr,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user