mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Add --defer-experts flag to defer expert mmap residency on Linux (#1634)
* Add --defer-experts flag to defer expert mmap residency on Linux * Disable warmup when defer-experts is enabled
This commit is contained in:
parent
0b81212dea
commit
4f4bcfbe67
@ -1559,6 +1559,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.fit = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--defer-experts") {
|
||||
params.defer_experts = true;
|
||||
params.warmup = false;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--fit-margin") {
|
||||
CHECK_ARG;
|
||||
int32_t margin = std::stoi(argv[i]);
|
||||
@ -2595,6 +2600,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", " --run-time-repack", "repack tensors if interleaved variant is available"});
|
||||
options.push_back({ "*", " --cpu-moe", "keep all MoE weights in CPU memory"});
|
||||
options.push_back({ "*", " --n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory"});
|
||||
options.push_back({ "*", " --defer-experts", "defer expert mmap residency on Linux to reduce model load time"});
|
||||
options.push_back({ "*", " --fit-margin N", "safety margin in MiB when auto-fitting model offloading"});
|
||||
options.push_back({ "*", "-wgt, --worst-graph-tokens N", "number of tokens to use for worst-case graph"});
|
||||
options.push_back({ "*", " --fit", "automatically determine which tensors to offload to the GPU(s)"});
|
||||
@ -3460,6 +3466,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
|
||||
mparams.merge_up_gate_exps = params.merge_up_gate_exps;
|
||||
mparams.mtp = params.has_mtp;
|
||||
mparams.flash_attn = params.flash_attn;
|
||||
mparams.defer_experts = params.defer_experts;
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
@ -4527,6 +4534,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
|
||||
fprintf(stream, "merge_qkv: %s # default: false\n", params.merge_qkv ? "true" : "false");
|
||||
fprintf(stream, "merge_up_gate_exps: %s # default: false\n", params.merge_up_gate_exps ? "true" : "false");
|
||||
fprintf(stream, "defer_experts: %s # default: false\n", params.defer_experts ? "true" : "false");
|
||||
fprintf(stream, "max_extra_alloc: %d # default: 256\n", params.max_extra_alloc_MiB);
|
||||
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
|
||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||
|
||||
@ -370,6 +370,7 @@ struct gpt_params {
|
||||
bool only_active_exps = true; // if true, offload only active experts (relevant only for hybrid CPU/GPU)
|
||||
bool merge_qkv = false; // if true, merge separate Q, K, V tensors into a single, contiguous tensor
|
||||
bool merge_up_gate_exps= false; // if true, merge ffn_up_exps and ffn_gate_exps into a single, contiguous tensor
|
||||
bool defer_experts = false; // if true, defer expert mmap residency to speed up model loading (Linux only)
|
||||
bool k_cache_hadamard = false; // if true, use Hadamard transform for the K-cache (only makes sense with quantized cache)
|
||||
bool v_cache_hadamard = false; // if true, use Hadamard transform for the V-cache (only makes sense with quantized cache, which requires FA)
|
||||
bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling
|
||||
|
||||
@ -269,6 +269,7 @@ struct cmd_params {
|
||||
bool no_ooae = false;
|
||||
bool mqkv = false;
|
||||
bool muge = false;
|
||||
bool defer_experts = false;
|
||||
bool rcache = false;
|
||||
bool sas = false;
|
||||
int max_gpu = 0;
|
||||
@ -317,6 +318,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* no_ooae */ false,
|
||||
/* mqkv */ false,
|
||||
/* muge */ false,
|
||||
/* defer_experts */ false,
|
||||
/* rcache */ false,
|
||||
/* sas */ false,
|
||||
/* max_gpu */ 0,
|
||||
@ -367,6 +369,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -cuda, --cuda-params <string> (default: %s)\n", cmd_params_defaults.cuda_params.c_str());
|
||||
printf(" -mqkv, --merge-qkv (default: %s)\n", cmd_params_defaults.mqkv ? "1" : "0");
|
||||
printf(" -muge, --merge-up-gate-experts (default: %s)\n", cmd_params_defaults.muge ? "1" : "0");
|
||||
printf(" --defer-experts (Linux only, default: %s)\n", cmd_params_defaults.defer_experts ? "1" : "0");
|
||||
printf(" -rcache, --rope-cache (default: %s)\n", cmd_params_defaults.rcache ? "1" : "0");
|
||||
printf(" -thp, --transparent-huge-pages <0|1> (default: %s)\n", cmd_params_defaults.use_thp? "1" : "0");
|
||||
printf(" -ot, --override-tensor pattern (default: none)\n");
|
||||
@ -813,6 +816,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.muge = std::stoi(argv[i]);
|
||||
} else if (arg == "--defer-experts") {
|
||||
params.defer_experts = true;
|
||||
} else if (arg == "-sas" || arg == "--scheduler-async") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@ -981,6 +986,7 @@ struct cmd_params_instance {
|
||||
bool no_ooae = false;
|
||||
bool mqkv = false;
|
||||
bool muge = false;
|
||||
bool defer_experts = false;
|
||||
bool rcache = false;
|
||||
bool sas = false;
|
||||
int max_gpu = 0;
|
||||
@ -1003,6 +1009,7 @@ struct cmd_params_instance {
|
||||
mparams.use_thp = use_thp;
|
||||
mparams.merge_qkv = mqkv;
|
||||
mparams.merge_up_gate_exps = muge;
|
||||
mparams.defer_experts = defer_experts;
|
||||
mparams.tensor_buft_overrides = buft_overrides;
|
||||
mparams.mla = mla_attn;
|
||||
mparams.max_gpu = max_gpu;
|
||||
@ -1024,6 +1031,7 @@ struct cmd_params_instance {
|
||||
repack == other.repack &&
|
||||
mqkv == other.mqkv &&
|
||||
muge == other.muge &&
|
||||
defer_experts == other.defer_experts &&
|
||||
use_thp == other.use_thp &&
|
||||
sas == other.sas &&
|
||||
fit == other.fit &&
|
||||
@ -1119,6 +1127,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_ooae = */ params.no_ooae,
|
||||
/* .mqkv = */ params.mqkv,
|
||||
/* .muge = */ params.muge,
|
||||
/* .defer_experts= */ params.defer_experts,
|
||||
/* .rcache = */ params.rcache,
|
||||
/* .sas = */ params.sas,
|
||||
/* .max_gpu = */ params.max_gpu,
|
||||
@ -1165,6 +1174,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_ooae = */ params.no_ooae,
|
||||
/* .mqkv = */ params.mqkv,
|
||||
/* .muge = */ params.muge,
|
||||
/* .defer_experts= */ params.defer_experts,
|
||||
/* .rcache = */ params.rcache,
|
||||
/* .sas = */ params.sas,
|
||||
/* .max_gpu = */ params.max_gpu,
|
||||
@ -1211,6 +1221,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_ooae = */ params.no_ooae,
|
||||
/* .mqkv = */ params.mqkv,
|
||||
/* .muge = */ params.muge,
|
||||
/* .defer_experts= */ params.defer_experts,
|
||||
/* .rcache = */ params.rcache,
|
||||
/* .sas = */ params.sas,
|
||||
/* .max_gpu = */ params.max_gpu,
|
||||
@ -1257,6 +1268,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_ooae = */ params.no_ooae,
|
||||
/* .mqkv = */ params.mqkv,
|
||||
/* .muge = */ params.muge,
|
||||
/* .defer_experts= */ params.defer_experts,
|
||||
/* .rcache = */ params.rcache,
|
||||
/* .sas = */ params.sas,
|
||||
/* .max_gpu = */ params.max_gpu,
|
||||
@ -1314,6 +1326,7 @@ struct test {
|
||||
bool no_ooae = false;
|
||||
bool mqkv = false;
|
||||
bool muge = false;
|
||||
bool defer_experts = false;
|
||||
bool rcache = false;
|
||||
bool sas = false;
|
||||
bool max_gpu = 0;
|
||||
@ -1356,6 +1369,7 @@ struct test {
|
||||
repack = inst.repack;
|
||||
mqkv = inst.mqkv;
|
||||
muge = inst.muge;
|
||||
defer_experts = inst.defer_experts;
|
||||
fmoe = inst.fmoe;
|
||||
ger = inst.ger;
|
||||
rcache = inst.rcache;
|
||||
@ -1474,7 +1488,7 @@ struct test {
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" || field == "no_kv_offload" ||
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" ||
|
||||
field == "fused_moe" || field == "grouped_er" || field == "no_fused_up_gate" || field == "no_ooae" || field == "mqkv" ||
|
||||
field == "rcache" || field == "reuse" || field == "muge" || field == "sas") {
|
||||
field == "rcache" || field == "reuse" || field == "muge" || field == "defer_experts" || field == "sas") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts") {
|
||||
@ -1517,7 +1531,7 @@ struct test {
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
||||
std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser), std::to_string(reuse),
|
||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
||||
std::to_string(repack), std::to_string(mqkv), std::to_string(muge), std::to_string(fmoe), std::to_string(ger),
|
||||
std::to_string(repack), std::to_string(mqkv), std::to_string(muge), std::to_string(defer_experts), std::to_string(fmoe), std::to_string(ger),
|
||||
std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae), std::to_string(rcache), std::to_string(sas),
|
||||
std::to_string(max_gpu),
|
||||
cuda_params, override_tensor,
|
||||
@ -1539,7 +1553,7 @@ struct test {
|
||||
"n_threads", "type_k", "type_v",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser", "reuse",
|
||||
"tensor_split", "use_mmap", "embeddings", "repack", "mqkv", "muge", "fused_moe", "grouped_er",
|
||||
"tensor_split", "use_mmap", "embeddings", "repack", "mqkv", "muge", "defer_experts", "fused_moe", "grouped_er",
|
||||
"no_fused_up_gate", "use_thp", "no_ooae", "rcache", "sas", "max_gpu", "cuda_params", "override_tensor",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
@ -1727,6 +1741,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "muge") {
|
||||
return 4;
|
||||
}
|
||||
if (field == "defer_experts") {
|
||||
return 5;
|
||||
}
|
||||
if (field == "sas") {
|
||||
return 3;
|
||||
}
|
||||
@ -1803,6 +1820,9 @@ struct markdown_printer : public printer {
|
||||
if (field == "muge") {
|
||||
return "muge";
|
||||
}
|
||||
if (field == "defer_experts") {
|
||||
return "defer";
|
||||
}
|
||||
if (field == "sas") {
|
||||
return "sas";
|
||||
}
|
||||
@ -1925,6 +1945,9 @@ struct markdown_printer : public printer {
|
||||
if (params.muge != cmd_params_defaults.muge) {
|
||||
fields.emplace_back("muge");
|
||||
}
|
||||
if (params.defer_experts != cmd_params_defaults.defer_experts) {
|
||||
fields.emplace_back("defer_experts");
|
||||
}
|
||||
if (params.use_thp != cmd_params_defaults.use_thp) {
|
||||
fields.emplace_back("use_thp");
|
||||
}
|
||||
|
||||
@ -426,6 +426,7 @@ extern "C" {
|
||||
bool mtp; // if true, load MTP layers if present
|
||||
bool dry_run; // skip loading tensors
|
||||
bool flash_attn;
|
||||
bool defer_experts; // defer expert mmap residency to speed up model loading (Linux only)
|
||||
};
|
||||
|
||||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
|
||||
24
src/llama-expert-io.h
Normal file
24
src/llama-expert-io.h
Normal file
@ -0,0 +1,24 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <vector>
|
||||
|
||||
// A span of bytes described by two offsets. The range covers [first, last):
// `first` is the offset of the first byte, `last` is one past the final byte.
struct llama_file_range {
    size_t first = 0; // inclusive start offset
    size_t last  = 0; // exclusive end offset

    // True when the range holds no bytes; an inverted range (first > last)
    // is treated as empty as well.
    bool empty() const {
        return !(first < last);
    }
};
|
||||
|
||||
struct llama_expert_tensor_index {
|
||||
size_t deferred_bytes = 0;
|
||||
size_t dense_bytes = 0;
|
||||
|
||||
std::vector<std::vector<llama_file_range>> file_ranges;
|
||||
|
||||
bool empty() const {
|
||||
return deferred_bytes == 0;
|
||||
}
|
||||
};
|
||||
@ -366,6 +366,26 @@ struct llama_mmap::impl {
|
||||
}
|
||||
}
|
||||
|
||||
void dontneed_fragment(size_t first, size_t last) {
|
||||
int page_size = mapped_page_size > 0 ? mapped_page_size : sysconf(_SC_PAGESIZE);
|
||||
align_range(&first, &last, page_size);
|
||||
size_t len = last - first;
|
||||
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(first % page_size == 0);
|
||||
GGML_ASSERT(last % page_size == 0);
|
||||
GGML_ASSERT(last >= first);
|
||||
|
||||
#ifdef __linux__
|
||||
if (madvise((uint8_t *) addr + first, len, MADV_DONTNEED)) {
|
||||
LLAMA_LOG_WARN("warning: madvise(..., MADV_DONTNEED) failed: %s\n", strerror(errno));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void unmap_fragment(size_t first, size_t last) {
|
||||
int page_size = mapped_page_size > 0 ? mapped_page_size : sysconf(_SC_PAGESIZE);
|
||||
align_range(&first, &last, page_size);
|
||||
@ -454,6 +474,11 @@ struct llama_mmap::impl {
|
||||
}
|
||||
}
|
||||
|
||||
// No-op variant of dontneed_fragment(): in this build configuration the hint
// is not implemented, so both offsets are deliberately ignored.
void dontneed_fragment(size_t first, size_t last) {
    (void) first;
    (void) last;
}
|
||||
|
||||
void unmap_fragment(size_t first, size_t last) {
|
||||
GGML_UNUSED(first);
|
||||
GGML_UNUSED(last);
|
||||
@ -474,6 +499,13 @@ struct llama_mmap::impl {
|
||||
throw std::runtime_error("mmap not supported");
|
||||
}
|
||||
|
||||
// Variant of dontneed_fragment() for builds without mmap support: the
// arguments are ignored and the call always throws std::runtime_error.
void dontneed_fragment(size_t first, size_t last) {
    (void) first;
    (void) last;

    throw std::runtime_error("mmap not supported");
}
|
||||
|
||||
void unmap_fragment(size_t first, size_t last) {
|
||||
GGML_UNUSED(first);
|
||||
GGML_UNUSED(last);
|
||||
@ -494,6 +526,7 @@ llama_mmap::~llama_mmap() = default;
|
||||
size_t llama_mmap::size() const { return pimpl->size; }
|
||||
void * llama_mmap::addr() const { return pimpl->addr; }
|
||||
|
||||
// Public entry point: hands the request to the implementation object behind pimpl.
void llama_mmap::dontneed_fragment(size_t first, size_t last) {
    pimpl->dontneed_fragment(first, last);
}
|
||||
void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
|
||||
|
||||
#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
|
||||
|
||||
@ -42,6 +42,8 @@ struct llama_mmap {
|
||||
size_t size() const;
|
||||
void * addr() const;
|
||||
|
||||
void dontneed_fragment(size_t first, size_t last);
|
||||
|
||||
void unmap_fragment(size_t first, size_t last);
|
||||
|
||||
static const bool SUPPORTED;
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <array>
|
||||
#include <charconv>
|
||||
#include <future>
|
||||
#include <regex>
|
||||
#include <algorithm>
|
||||
@ -204,8 +205,100 @@ namespace GGUFMeta {
|
||||
};
|
||||
}
|
||||
|
||||
// Extracts the layer number N from a tensor name of the form "blk.N.<rest>".
//
// name  - tensor name to inspect
// layer - receives N on success; left untouched when the function returns false
//
// Returns true only when the name starts with "blk.", is followed by a
// non-empty run of decimal digits that fits in uint32_t, and that run is
// terminated by a '.'.
static bool parse_tensor_layer_index(const std::string & name, uint32_t & layer) {
    static const char   block_prefix[]   = "blk.";
    static const size_t block_prefix_len = sizeof(block_prefix) - 1;

    if (name.compare(0, block_prefix_len, block_prefix) != 0) {
        return false;
    }

    // the layer field runs from just after the prefix up to the next '.'
    const size_t field_end = name.find('.', block_prefix_len);
    if (field_end == std::string::npos || field_end == block_prefix_len) {
        return false;
    }

    const char * first = name.data() + block_prefix_len;
    const char * last  = name.data() + field_end;

    // parse into a temporary so a partially numeric field such as "3x" cannot
    // leak a half-parsed value into the caller's variable
    uint32_t parsed = 0;
    const auto result = std::from_chars(first, last, parsed);
    if (result.ec != std::errc() || result.ptr != last) {
        return false;
    }

    layer = parsed;
    return true;
}
|
||||
|
||||
// Recognizes per-expert ("split") FFN tensors named
// "blk.<layer>.<ffn_gate|ffn_down|ffn_up>.<expert>.<rest>".
//
// name   - tensor name to inspect
// expert - receives the expert number on success; left untouched when the
//          function returns false
//
// Returns true only when the name starts with "blk.", the component after the
// layer field is one of the three FFN names, and the component after that is
// a non-empty decimal number that fits in uint32_t and is followed by a '.'.
// The layer field itself is not validated here.
static bool is_split_expert_tensor(const std::string & name, uint32_t & expert) {
    static const char * const prefixes[] = { "ffn_gate.", "ffn_down.", "ffn_up." };

    // do not rely on the caller having checked the block prefix already
    if (name.compare(0, 4, "blk.") != 0) {
        return false;
    }

    const size_t layer_end = name.find('.', 4);
    if (layer_end == std::string::npos) {
        return false;
    }

    const size_t prefix_begin = layer_end + 1;

    for (const char * prefix : prefixes) {
        const size_t prefix_len = std::char_traits<char>::length(prefix);
        if (name.compare(prefix_begin, prefix_len, prefix) != 0) {
            continue;
        }

        const size_t expert_begin = prefix_begin + prefix_len;
        const size_t expert_end   = name.find('.', expert_begin);
        if (expert_end == std::string::npos || expert_end == expert_begin) {
            continue;
        }

        const char * digits_first = name.data() + expert_begin;
        const char * digits_last  = name.data() + expert_end;

        // parse into a temporary: a field like "3x" must not modify `expert`
        uint32_t parsed = 0;
        const auto result = std::from_chars(digits_first, digits_last, parsed);
        if (result.ec == std::errc() && result.ptr == digits_last) {
            expert = parsed;
            return true;
        }
    }

    return false;
}
|
||||
|
||||
// Reports whether `tensor_type` is one of the tensor kinds this file treats as
// a merged (all experts in one tensor) expert tensor. Any kind not listed
// below yields false.
static bool is_merged_expert_tensor(llm_tensor tensor_type) {
    return tensor_type == LLM_TENSOR_FFN_NORM_EXPS    ||
           tensor_type == LLM_TENSOR_FFN_DOWN_EXPS    ||
           tensor_type == LLM_TENSOR_FFN_GATE_EXPS    ||
           tensor_type == LLM_TENSOR_FFN_UP_EXPS      ||
           tensor_type == LLM_TENSOR_FFN_GATE_UP_EXPS ||
           tensor_type == LLM_TENSOR_FFN_EXP_PROBS_B;
}
|
||||
|
||||
static void coalesce_ranges(std::vector<llama_file_range> & ranges) {
|
||||
ranges.erase(std::remove_if(ranges.begin(), ranges.end(), [](const llama_file_range & range) {
|
||||
return range.empty();
|
||||
}), ranges.end());
|
||||
|
||||
std::sort(ranges.begin(), ranges.end(), [](const llama_file_range & lhs, const llama_file_range & rhs) {
|
||||
if (lhs.first != rhs.first) {
|
||||
return lhs.first < rhs.first;
|
||||
}
|
||||
return lhs.last < rhs.last;
|
||||
});
|
||||
|
||||
std::vector<llama_file_range> merged;
|
||||
merged.reserve(ranges.size());
|
||||
|
||||
for (const auto & range : ranges) {
|
||||
if (merged.empty() || range.first > merged.back().last) {
|
||||
merged.push_back(range);
|
||||
continue;
|
||||
}
|
||||
merged.back().last = std::max(merged.back().last, range.last);
|
||||
}
|
||||
|
||||
ranges = std::move(merged);
|
||||
}
|
||||
|
||||
llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors,
|
||||
bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps,
|
||||
bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps, bool defer_experts,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
int trace = 0;
|
||||
@ -500,6 +593,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, boo
|
||||
this->use_thp = use_thp;
|
||||
this->merge_qkv = merge_qkv;
|
||||
this->merge_up_gate_exps = merge_up_gate_exps;
|
||||
this->defer_experts = defer_experts;
|
||||
}
|
||||
|
||||
llama_model_loader::~llama_model_loader() {
|
||||
@ -511,6 +605,74 @@ llama_model_loader::~llama_model_loader() {
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_loader::build_expert_tensor_index(const llama_hparams & hparams) {
|
||||
expert_tensor_index = {};
|
||||
|
||||
if (hparams.n_expert == 0 || hparams.n_layer == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
expert_tensor_index.file_ranges.resize(files.size());
|
||||
|
||||
size_t deferred_bytes = 0;
|
||||
const llm_arch arch = get_arch();
|
||||
|
||||
for (const auto & weight : weights) {
|
||||
const std::string name(weight.tensor->name);
|
||||
uint32_t layer = 0;
|
||||
if (!parse_tensor_layer_index(name, layer)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (layer >= hparams.n_layer) {
|
||||
throw std::runtime_error(format("expert tensor '%s' has invalid layer index %u", name.c_str(), layer));
|
||||
}
|
||||
|
||||
// check for split expert tensors (blk.N.ffn_gate.E.weight) by name pattern,
|
||||
// since llm_tensor_type can't resolve these (two %d in the format string)
|
||||
uint32_t expert = 0;
|
||||
if (is_split_expert_tensor(name, expert)) {
|
||||
if (expert >= hparams.n_expert) {
|
||||
throw std::runtime_error(format("expert tensor '%s' has invalid expert index %u", name.c_str(), expert));
|
||||
}
|
||||
} else {
|
||||
const llm_tensor tensor_type = llm_tensor_type(arch, name, int(layer));
|
||||
if (!is_merged_expert_tensor(tensor_type)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t tensor_bytes = ggml_nbytes(weight.tensor);
|
||||
deferred_bytes += tensor_bytes;
|
||||
expert_tensor_index.file_ranges.at(weight.idx).push_back({ weight.offs, weight.offs + tensor_bytes });
|
||||
}
|
||||
|
||||
for (auto & ranges : expert_tensor_index.file_ranges) {
|
||||
coalesce_ranges(ranges);
|
||||
}
|
||||
|
||||
expert_tensor_index.deferred_bytes = deferred_bytes;
|
||||
expert_tensor_index.dense_bytes = n_bytes > deferred_bytes ? n_bytes - deferred_bytes : 0;
|
||||
}
|
||||
|
||||
bool llama_model_loader::should_defer_expert_mmaps() const {
|
||||
return defer_experts && use_mmap && !expert_tensor_index.empty();
|
||||
}
|
||||
|
||||
void llama_model_loader::drop_mmap_expert_pages() const {
|
||||
if (!use_mmap || mappings.empty() || expert_tensor_index.file_ranges.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n_range_sets = std::min(mappings.size(), expert_tensor_index.file_ranges.size());
|
||||
for (size_t idx = 0; idx < n_range_sets; ++idx) {
|
||||
const auto & ranges = expert_tensor_index.file_ranges[idx];
|
||||
for (const auto & range : ranges) {
|
||||
mappings[idx]->dontneed_fragment(range.first, range.last);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
||||
llama_model_loader::get_arr_n(const std::string & key, T & result, const bool required) {
|
||||
|
||||
@ -2,8 +2,10 @@
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-expert-io.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-hparams.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstddef>
|
||||
@ -48,6 +50,7 @@ struct llama_model_loader {
|
||||
bool use_thp = false;
|
||||
bool merge_qkv = false;
|
||||
bool merge_up_gate_exps = false;
|
||||
bool defer_experts = false;
|
||||
|
||||
llama_files files;
|
||||
llama_ftype ftype;
|
||||
@ -81,9 +84,10 @@ struct llama_model_loader {
|
||||
|
||||
std::string arch_name;
|
||||
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
||||
llama_expert_tensor_index expert_tensor_index;
|
||||
|
||||
llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp,
|
||||
bool merge_qkv, bool merge_up_gate_exps,
|
||||
bool merge_qkv, bool merge_up_gate_exps, bool defer_experts,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
|
||||
|
||||
@ -158,6 +162,12 @@ struct llama_model_loader {
|
||||
|
||||
void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr, bool use_thp = false);
|
||||
|
||||
void build_expert_tensor_index(const llama_hparams & hparams);
|
||||
|
||||
bool should_defer_expert_mmaps() const;
|
||||
|
||||
void drop_mmap_expert_pages() const;
|
||||
|
||||
void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
|
||||
|
||||
// for backwards compatibility, does not support ggml-backend
|
||||
|
||||
@ -1037,7 +1037,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
kv_overrides = v->data();
|
||||
}
|
||||
llama_model_loader ml(fname_inp, 0, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false,
|
||||
/* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, kv_overrides, nullptr);
|
||||
/* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false,
|
||||
/* defer_experts */ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model;
|
||||
|
||||
@ -2665,9 +2665,19 @@ static bool llm_load_tensors(
|
||||
ml.use_mmap = false;
|
||||
}
|
||||
|
||||
bool defer_expert_mmap = ml.should_defer_expert_mmaps();
|
||||
if (defer_expert_mmap && use_mlock) {
|
||||
LLAMA_LOG_WARN("%s: deferred expert loading disabled because mlock keeps mmap ranges resident\n", __func__);
|
||||
defer_expert_mmap = false;
|
||||
}
|
||||
if (defer_expert_mmap && (ml.check_tensors || validate_quants)) {
|
||||
LLAMA_LOG_WARN("%s: deferred expert loading disabled because tensor validation would fault expert pages eagerly\n", __func__);
|
||||
defer_expert_mmap = false;
|
||||
}
|
||||
|
||||
ml.done_getting_tensors();
|
||||
|
||||
ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr, ml.use_thp);
|
||||
ml.init_mappings(!defer_expert_mmap, use_mlock ? &model.mlock_mmaps : nullptr, ml.use_thp);
|
||||
model.mappings.reserve(ml.mappings.size());
|
||||
|
||||
// create the backend buffers
|
||||
@ -2804,6 +2814,10 @@ static bool llm_load_tensors(
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (defer_expert_mmap) {
|
||||
ml.drop_mmap_expert_pages();
|
||||
}
|
||||
}
|
||||
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA || model.arch == LLM_ARCH_MISTRAL4) {
|
||||
@ -2842,6 +2856,14 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
|
||||
if (defer_expert_mmap && !dry_run) {
|
||||
LLAMA_LOG_INFO("%s: dense parameters loaded in %.2fs (%.2f GiB), expert parameters deferred (%.2f GiB)\n",
|
||||
__func__,
|
||||
(ggml_time_us() - model.t_start_us) / 1000000.0,
|
||||
ml.expert_tensor_index.dense_bytes / 1024.0 / 1024.0 / 1024.0,
|
||||
ml.expert_tensor_index.deferred_bytes / 1024.0 / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
if (!ml.use_mmap && ml.repack_tensors) {
|
||||
int n_repacked = 0;
|
||||
for (auto& it : model.tensors_by_name) {
|
||||
@ -2899,6 +2921,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
try {
|
||||
llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors,
|
||||
params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps,
|
||||
params.defer_experts,
|
||||
params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
model.hparams.vocab_only = params.vocab_only;
|
||||
@ -2913,6 +2936,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
} catch(const std::exception & e) {
|
||||
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
|
||||
}
|
||||
if (params.defer_experts && params.use_mmap) {
|
||||
#ifdef __linux__
|
||||
ml.build_expert_tensor_index(model.hparams);
|
||||
#else
|
||||
LLAMA_LOG_WARN("%s: deferred expert loading is only supported on Linux; ignoring defer_experts\n", __func__);
|
||||
#endif
|
||||
}
|
||||
try {
|
||||
LLM_KV kv(model.arch);
|
||||
model.vocab.load(ml, kv);
|
||||
@ -4973,6 +5003,7 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.mtp =*/ false,
|
||||
/*.dry_run =*/ false,
|
||||
/*.flash_attn =*/ true,
|
||||
/*.defer_experts =*/ false,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user