From 4f4bcfbe67127e17d33e51ca5e7a230a7bdbb44d Mon Sep 17 00:00:00 2001 From: dmaivel Date: Thu, 16 Apr 2026 02:54:44 -0400 Subject: [PATCH] Add --defer-experts flag to defer expert mmap residency on Linux (#1634) * Add --defer-experts flag to defer expert mmap residency on Linux * Disable warmup when defer-experts is enabled --- common/common.cpp | 8 ++ common/common.h | 1 + examples/llama-bench/llama-bench.cpp | 29 ++++- include/llama.h | 1 + src/llama-expert-io.h | 24 ++++ src/llama-mmap.cpp | 33 ++++++ src/llama-mmap.h | 2 + src/llama-model-loader.cpp | 164 ++++++++++++++++++++++++++- src/llama-model-loader.h | 12 +- src/llama-quantize.cpp | 3 +- src/llama.cpp | 33 +++++- 11 files changed, 303 insertions(+), 7 deletions(-) create mode 100644 src/llama-expert-io.h diff --git a/common/common.cpp b/common/common.cpp index d95e1008..7aaff9a5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1559,6 +1559,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.fit = true; return true; } + if (arg == "--defer-experts") { + params.defer_experts = true; + params.warmup = false; + return true; + } if (arg == "--fit-margin") { CHECK_ARG; int32_t margin = std::stoi(argv[i]); @@ -2595,6 +2600,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --run-time-repack", "repack tensors if interleaved variant is available"}); options.push_back({ "*", " --cpu-moe", "keep all MoE weights in CPU memory"}); options.push_back({ "*", " --n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory"}); + options.push_back({ "*", " --defer-experts", "defer expert mmap residency on Linux to reduce model load time"}); options.push_back({ "*", " --fit-margin N", "safety margin in MiB when auto-fitting model offloading"}); options.push_back({ "*", "-wgt, --worst-graph-tokens N", "number of tokens to use for worst-case graph"}); options.push_back({ "*", " --fit", "automatically determine which tensors to offload to the GPU(s)"}); @@ -3460,6 +3466,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params mparams.merge_up_gate_exps = params.merge_up_gate_exps; mparams.mtp = params.has_mtp; mparams.flash_attn = params.flash_attn; + mparams.defer_experts = params.defer_experts; if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -4527,6 +4534,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false"); fprintf(stream, "merge_qkv: %s # default: false\n", params.merge_qkv ? "true" : "false"); fprintf(stream, "merge_up_gate_exps: %s # default: false\n", params.merge_up_gate_exps ? "true" : "false"); + fprintf(stream, "defer_experts: %s # default: false\n", params.defer_experts ? "true" : "false"); fprintf(stream, "max_extra_alloc: %d # default: 256\n", params.max_extra_alloc_MiB); fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); diff --git a/common/common.h b/common/common.h index 3616da8a..326c1b6e 100644 --- a/common/common.h +++ b/common/common.h @@ -370,6 +370,7 @@ struct gpt_params { bool only_active_exps = true; // if true, offload only active experts (relevant only for hybrid CPU/GPU) bool merge_qkv = false; // if true, merge separate Q, K, V tensors into a single, contiguous tensor bool merge_up_gate_exps= false; // if true, merge ffn_up_exps and ffn_gate_exps into a single, contiguous tensor + bool defer_experts = false; // if true, defer expert mmap residency to speed up model loading (Linux only) bool k_cache_hadamard = false; // if true, use Hadamard transform for the K-cache (only makes sense with quantized cache) bool v_cache_hadamard = false; // if true, use Hadamard transform for the V-cache (only makes sense with quantized cache, which requires FA) bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index ed1b1aa9..89b34508 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -269,6 +269,7 @@ struct cmd_params { bool no_ooae = false; bool mqkv = false; bool muge = false; + bool defer_experts = false; bool rcache = false; bool sas = false; int max_gpu = 0; @@ -317,6 +318,7 @@ static const cmd_params cmd_params_defaults = { /* no_ooae */ false, /* mqkv */ false, /* muge */ false, + /* defer_experts */ false, /* rcache */ false, /* sas */ false, /* max_gpu */ 0, @@ -367,6 +369,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -cuda, --cuda-params (default: %s)\n", cmd_params_defaults.cuda_params.c_str()); printf(" -mqkv, --merge-qkv (default: %s)\n", cmd_params_defaults.mqkv ? "1" : "0"); printf(" -muge, --merge-up-gate-experts (default: %s)\n", cmd_params_defaults.muge ? "1" : "0"); + printf(" --defer-experts (Linux only, default: %s)\n", cmd_params_defaults.defer_experts ? "1" : "0"); printf(" -rcache, --rope-cache (default: %s)\n", cmd_params_defaults.rcache ? "1" : "0"); printf(" -thp, --transparent-huge-pages <0|1> (default: %s)\n", cmd_params_defaults.use_thp? "1" : "0"); printf(" -ot, --override-tensor pattern (default: none)\n"); @@ -813,6 +816,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.muge = std::stoi(argv[i]); + } else if (arg == "--defer-experts") { + params.defer_experts = true; } else if (arg == "-sas" || arg == "--scheduler-async") { if (++i >= argc) { invalid_param = true; @@ -981,6 +986,7 @@ struct cmd_params_instance { bool no_ooae = false; bool mqkv = false; bool muge = false; + bool defer_experts = false; bool rcache = false; bool sas = false; int max_gpu = 0; @@ -1003,6 +1009,7 @@ struct cmd_params_instance { mparams.use_thp = use_thp; mparams.merge_qkv = mqkv; mparams.merge_up_gate_exps = muge; + mparams.defer_experts = defer_experts; mparams.tensor_buft_overrides = buft_overrides; mparams.mla = mla_attn; mparams.max_gpu = max_gpu; @@ -1024,6 +1031,7 @@ struct cmd_params_instance { repack == other.repack && mqkv == other.mqkv && muge == other.muge && + defer_experts == other.defer_experts && use_thp == other.use_thp && sas == other.sas && fit == other.fit && @@ -1119,6 +1127,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_ooae = */ params.no_ooae, /* .mqkv = */ params.mqkv, /* .muge = */ params.muge, + /* .defer_experts= */ params.defer_experts, /* .rcache = */ params.rcache, /* .sas = */ params.sas, /* .max_gpu = */ params.max_gpu, @@ -1165,6 +1174,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_ooae = */ params.no_ooae, /* .mqkv = */ params.mqkv, /* .muge = */ params.muge, + /* .defer_experts= */ params.defer_experts, /* .rcache = */ params.rcache, /* .sas = */ params.sas, /* .max_gpu = */ params.max_gpu, @@ -1211,6 +1221,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_ooae = */ params.no_ooae, /* .mqkv = */ params.mqkv, /* .muge = */ params.muge, + /* .defer_experts= */ params.defer_experts, /* .rcache = */ params.rcache, /* .sas = */ params.sas, /* .max_gpu = */ params.max_gpu, @@ -1257,6 +1268,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_ooae = */ params.no_ooae, /* .mqkv = */ params.mqkv, /* .muge = */ params.muge, + /* .defer_experts= */ params.defer_experts, /* .rcache = */ params.rcache, /* .sas = */ params.sas, /* .max_gpu = */ params.max_gpu, @@ -1314,6 +1326,7 @@ struct test { bool no_ooae = false; bool mqkv = false; bool muge = false; + bool defer_experts = false; bool rcache = false; bool sas = false; bool max_gpu = 0; @@ -1356,6 +1369,7 @@ struct test { repack = inst.repack; mqkv = inst.mqkv; muge = inst.muge; + defer_experts = inst.defer_experts; fmoe = inst.fmoe; ger = inst.ger; rcache = inst.rcache; @@ -1474,7 +1488,7 @@ struct test { field == "gpu_blas" || field == "blas" || field == "sycl" || field == "no_kv_offload" || field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" || field == "fused_moe" || field == "grouped_er" || field == "no_fused_up_gate" || field == "no_ooae" || field == "mqkv" || - field == "rcache" || field == "reuse" || field == "muge" || field == "sas") { + field == "rcache" || field == "reuse" || field == "muge" || field == "defer_experts" || field == "sas") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -1517,7 +1531,7 @@ struct test { std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), std::to_string(mla_attn), std::to_string(attn_max_batch), ser_to_string(ser), std::to_string(reuse), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), - std::to_string(repack), std::to_string(mqkv), std::to_string(muge), std::to_string(fmoe), std::to_string(ger), + std::to_string(repack), std::to_string(mqkv), std::to_string(muge), std::to_string(defer_experts), std::to_string(fmoe), std::to_string(ger), std::to_string(no_fug), std::to_string(use_thp), std::to_string(no_ooae), std::to_string(rcache), std::to_string(sas), std::to_string(max_gpu), cuda_params, override_tensor, @@ -1539,7 +1553,7 @@ struct test { "n_threads", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "mla_attn", "attn_max_batch", "ser", "reuse", - "tensor_split", "use_mmap", "embeddings", "repack", "mqkv", "muge", "fused_moe", "grouped_er", + "tensor_split", "use_mmap", "embeddings", "repack", "mqkv", "muge", "defer_experts", "fused_moe", "grouped_er", "no_fused_up_gate", "use_thp", "no_ooae", "rcache", "sas", "max_gpu", "cuda_params", "override_tensor", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", @@ -1727,6 +1741,9 @@ struct markdown_printer : public printer { if (field == "muge") { return 4; } + if (field == "defer_experts") { + return 5; + } if (field == "sas") { return 3; } @@ -1803,6 +1820,9 @@ struct markdown_printer : public printer { if (field == "muge") { return "muge"; } + if (field == "defer_experts") { + return "defer"; + } if (field == "sas") { return "sas"; } @@ -1925,6 +1945,9 @@ struct markdown_printer : public printer { if (params.muge != cmd_params_defaults.muge) { fields.emplace_back("muge"); } + if (params.defer_experts != cmd_params_defaults.defer_experts) { + fields.emplace_back("defer_experts"); + } if (params.use_thp != cmd_params_defaults.use_thp) { fields.emplace_back("use_thp"); } diff --git a/include/llama.h b/include/llama.h index 388caff8..13d05a38 100644 --- a/include/llama.h +++ b/include/llama.h @@ -426,6 +426,7 @@ extern "C" { bool mtp; // if true, load MTP layers if present bool dry_run; // skip loading tensors bool flash_attn; + bool defer_experts; // defer expert mmap residency to speed up model loading (Linux only) }; // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations diff --git a/src/llama-expert-io.h b/src/llama-expert-io.h new file mode 100644 index 00000000..8542501d --- /dev/null +++ b/src/llama-expert-io.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + +struct llama_file_range { + size_t first = 0; + size_t last = 0; + + bool empty() const { + return first >= last; + } +}; + +struct llama_expert_tensor_index { + size_t deferred_bytes = 0; + size_t dense_bytes = 0; + + std::vector> file_ranges; + + bool empty() const { + return deferred_bytes == 0; + } +}; diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 2fa0db9a..9bdbd998 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -366,6 +366,26 @@ struct llama_mmap::impl { } } + void dontneed_fragment(size_t first, size_t last) { + int page_size = mapped_page_size > 0 ? mapped_page_size : sysconf(_SC_PAGESIZE); + align_range(&first, &last, page_size); + size_t len = last - first; + + if (len == 0) { + return; + } + + GGML_ASSERT(first % page_size == 0); + GGML_ASSERT(last % page_size == 0); + GGML_ASSERT(last >= first); + +#ifdef __linux__ + if (madvise((uint8_t *) addr + first, len, MADV_DONTNEED)) { + LLAMA_LOG_WARN("warning: madvise(..., MADV_DONTNEED) failed: %s\n", strerror(errno)); + } +#endif + } + void unmap_fragment(size_t first, size_t last) { int page_size = mapped_page_size > 0 ? mapped_page_size : sysconf(_SC_PAGESIZE); align_range(&first, &last, page_size); @@ -454,6 +474,11 @@ struct llama_mmap::impl { } } + void dontneed_fragment(size_t first, size_t last) { + GGML_UNUSED(first); + GGML_UNUSED(last); + } + void unmap_fragment(size_t first, size_t last) { GGML_UNUSED(first); GGML_UNUSED(last); @@ -474,6 +499,13 @@ struct llama_mmap::impl { throw std::runtime_error("mmap not supported"); } + void dontneed_fragment(size_t first, size_t last) { + GGML_UNUSED(first); + GGML_UNUSED(last); + + throw std::runtime_error("mmap not supported"); + } + void unmap_fragment(size_t first, size_t last) { GGML_UNUSED(first); GGML_UNUSED(last); @@ -494,6 +526,7 @@ llama_mmap::~llama_mmap() = default; size_t llama_mmap::size() const { return pimpl->size; } void * llama_mmap::addr() const { return pimpl->addr; } +void llama_mmap::dontneed_fragment(size_t first, size_t last) { pimpl->dontneed_fragment(first, last); } void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); } #if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32) diff --git a/src/llama-mmap.h b/src/llama-mmap.h index a1efa068..04432880 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -42,6 +42,8 @@ struct llama_mmap { size_t size() const; void * addr() const; + void dontneed_fragment(size_t first, size_t last); + void unmap_fragment(size_t first, size_t last); static const bool SUPPORTED; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 39b9fd50..b986ae6c 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -204,8 +205,100 @@ namespace GGUFMeta { }; } +static bool parse_tensor_layer_index(const std::string & name, uint32_t & layer) { + if (name.rfind("blk.", 0) != 0) { + return false; + } + + const char * first = name.data() + 4; + const char * last = first; + const char * end = name.data() + name.size(); + + while (last < end && *last != '.') { + ++last; + } + + if (last == first || last == end) { + return false; + } + + auto result = std::from_chars(first, last, layer); + return result.ec == std::errc() && result.ptr == last; +} + +static bool is_split_expert_tensor(const std::string & name, uint32_t & expert) { + static const char * prefixes[] = { "ffn_gate.", "ffn_down.", "ffn_up." }; + + const size_t layer_end = name.find('.', 4); + if (layer_end == std::string::npos) { + return false; + } + + const size_t prefix_begin = layer_end + 1; + + for (const char * prefix : prefixes) { + const size_t prefix_len = std::char_traits::length(prefix); + if (name.compare(prefix_begin, prefix_len, prefix) != 0) { + continue; + } + + const size_t expert_begin = prefix_begin + prefix_len; + const size_t expert_end = name.find('.', expert_begin); + if (expert_end == std::string::npos || expert_end == expert_begin) { + continue; + } + + auto result = std::from_chars(name.data() + expert_begin, name.data() + expert_end, expert); + if (result.ec == std::errc() && result.ptr == name.data() + expert_end) { + return true; + } + } + + return false; +} + +static bool is_merged_expert_tensor(llm_tensor tensor_type) { + switch (tensor_type) { + case LLM_TENSOR_FFN_NORM_EXPS: + case LLM_TENSOR_FFN_DOWN_EXPS: + case LLM_TENSOR_FFN_GATE_EXPS: + case LLM_TENSOR_FFN_UP_EXPS: + case LLM_TENSOR_FFN_GATE_UP_EXPS: + case LLM_TENSOR_FFN_EXP_PROBS_B: + return true; + default: + return false; + } +} + +static void coalesce_ranges(std::vector & ranges) { + ranges.erase(std::remove_if(ranges.begin(), ranges.end(), [](const llama_file_range & range) { + return range.empty(); + }), ranges.end()); + + std::sort(ranges.begin(), ranges.end(), [](const llama_file_range & lhs, const llama_file_range & rhs) { + if (lhs.first != rhs.first) { + return lhs.first < rhs.first; + } + return lhs.last < rhs.last; + }); + + std::vector merged; + merged.reserve(ranges.size()); + + for (const auto & range : ranges) { + if (merged.empty() || range.first > merged.back().last) { + merged.push_back(range); + continue; + } + merged.back().last = std::max(merged.back().last, range.last); + } + + ranges = std::move(merged); +} + llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, - bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps, + bool repack_tensors, bool use_thp, bool merge_qkv, bool merge_up_gate_exps, bool defer_experts, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; @@ -500,6 +593,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, int ncmoe, boo this->use_thp = use_thp; this->merge_qkv = merge_qkv; this->merge_up_gate_exps = merge_up_gate_exps; + this->defer_experts = defer_experts; } llama_model_loader::~llama_model_loader() { @@ -511,6 +605,74 @@ llama_model_loader::~llama_model_loader() { } } +void llama_model_loader::build_expert_tensor_index(const llama_hparams & hparams) { + expert_tensor_index = {}; + + if (hparams.n_expert == 0 || hparams.n_layer == 0) { + return; + } + + expert_tensor_index.file_ranges.resize(files.size()); + + size_t deferred_bytes = 0; + const llm_arch arch = get_arch(); + + for (const auto & weight : weights) { + const std::string name(weight.tensor->name); + uint32_t layer = 0; + if (!parse_tensor_layer_index(name, layer)) { + continue; + } + + if (layer >= hparams.n_layer) { + throw std::runtime_error(format("expert tensor '%s' has invalid layer index %u", name.c_str(), layer)); + } + + // check for split expert tensors (blk.N.ffn_gate.E.weight) by name pattern, + // since llm_tensor_type can't resolve these (two %d in the format string) + uint32_t expert = 0; + if (is_split_expert_tensor(name, expert)) { + if (expert >= hparams.n_expert) { + throw std::runtime_error(format("expert tensor '%s' has invalid expert index %u", name.c_str(), expert)); + } + } else { + const llm_tensor tensor_type = llm_tensor_type(arch, name, int(layer)); + if (!is_merged_expert_tensor(tensor_type)) { + continue; + } + } + + const size_t tensor_bytes = ggml_nbytes(weight.tensor); + deferred_bytes += tensor_bytes; + expert_tensor_index.file_ranges.at(weight.idx).push_back({ weight.offs, weight.offs + tensor_bytes }); + } + + for (auto & ranges : expert_tensor_index.file_ranges) { + coalesce_ranges(ranges); + } + + expert_tensor_index.deferred_bytes = deferred_bytes; + expert_tensor_index.dense_bytes = n_bytes > deferred_bytes ? n_bytes - deferred_bytes : 0; +} + +bool llama_model_loader::should_defer_expert_mmaps() const { + return defer_experts && use_mmap && !expert_tensor_index.empty(); +} + +void llama_model_loader::drop_mmap_expert_pages() const { + if (!use_mmap || mappings.empty() || expert_tensor_index.file_ranges.empty()) { + return; + } + + const size_t n_range_sets = std::min(mappings.size(), expert_tensor_index.file_ranges.size()); + for (size_t idx = 0; idx < n_range_sets; ++idx) { + const auto & ranges = expert_tensor_index.file_ranges[idx]; + for (const auto & range : ranges) { + mappings[idx]->dontneed_fragment(range.first, range.last); + } + } +} + template typename std::enable_if::value, bool>::type llama_model_loader::get_arr_n(const std::string & key, T & result, const bool required) { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 21b4050e..175c2072 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -2,8 +2,10 @@ #include "llama.h" #include "llama-impl.h" +#include "llama-expert-io.h" #include "llama-mmap.h" #include "llama-arch.h" +#include "llama-hparams.h" #include #include @@ -48,6 +50,7 @@ struct llama_model_loader { bool use_thp = false; bool merge_qkv = false; bool merge_up_gate_exps = false; + bool defer_experts = false; llama_files files; llama_ftype ftype; @@ -81,9 +84,10 @@ struct llama_model_loader { std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + llama_expert_tensor_index expert_tensor_index; llama_model_loader(const std::string & fname, int ncmoe, bool use_mmap, bool check_tensors, bool repack_tensors, bool use_thp, - bool merge_qkv, bool merge_up_gate_exps, + bool merge_qkv, bool merge_up_gate_exps, bool defer_experts, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); @@ -158,6 +162,12 @@ struct llama_model_loader { void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr, bool use_thp = false); + void build_expert_tensor_index(const llama_hparams & hparams); + + bool should_defer_expert_mmaps() const; + + void drop_mmap_expert_pages() const; + void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const; // for backwards compatibility, does not support ggml-backend diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index 56a94c39..ca5853e2 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1037,7 +1037,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s kv_overrides = v->data(); } llama_model_loader ml(fname_inp, 0, use_mmap, /*check_tensors*/ true, /* repack_tensors */ false, - /* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, kv_overrides, nullptr); + /* use_thp */ false, /* merge_qkv */ false, /* merge_up_gate_exps */ false, + /* defer_experts */ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model; diff --git a/src/llama.cpp b/src/llama.cpp index c4d8e79f..de62849b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2665,9 +2665,19 @@ static bool llm_load_tensors( ml.use_mmap = false; } + bool defer_expert_mmap = ml.should_defer_expert_mmaps(); + if (defer_expert_mmap && use_mlock) { + LLAMA_LOG_WARN("%s: deferred expert loading disabled because mlock keeps mmap ranges resident\n", __func__); + defer_expert_mmap = false; + } + if (defer_expert_mmap && (ml.check_tensors || validate_quants)) { + LLAMA_LOG_WARN("%s: deferred expert loading disabled because tensor validation would fault expert pages eagerly\n", __func__); + defer_expert_mmap = false; + } + ml.done_getting_tensors(); - ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr, ml.use_thp); + ml.init_mappings(!defer_expert_mmap, use_mlock ? &model.mlock_mmaps : nullptr, ml.use_thp); model.mappings.reserve(ml.mappings.size()); // create the backend buffers @@ -2804,6 +2814,10 @@ static bool llm_load_tensors( return false; } } + + if (defer_expert_mmap) { + ml.drop_mmap_expert_pages(); + } } if (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA || model.arch == LLM_ARCH_MISTRAL4) { @@ -2842,6 +2856,14 @@ static bool llm_load_tensors( } } + if (defer_expert_mmap && !dry_run) { + LLAMA_LOG_INFO("%s: dense parameters loaded in %.2fs (%.2f GiB), expert parameters deferred (%.2f GiB)\n", + __func__, + (ggml_time_us() - model.t_start_us) / 1000000.0, + ml.expert_tensor_index.dense_bytes / 1024.0 / 1024.0 / 1024.0, + ml.expert_tensor_index.deferred_bytes / 1024.0 / 1024.0 / 1024.0); + } + if (!ml.use_mmap && ml.repack_tensors) { int n_repacked = 0; for (auto& it : model.tensors_by_name) { @@ -2899,6 +2921,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam try { llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors, params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps, + params.defer_experts, params.kv_overrides, params.tensor_buft_overrides); model.hparams.vocab_only = params.vocab_only; @@ -2913,6 +2936,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam } catch(const std::exception & e) { throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); } + if (params.defer_experts && params.use_mmap) { +#ifdef __linux__ + ml.build_expert_tensor_index(model.hparams); +#else + LLAMA_LOG_WARN("%s: deferred expert loading is only supported on Linux; ignoring defer_experts\n", __func__); +#endif + } try { LLM_KV kv(model.arch); model.vocab.load(ml, kv); @@ -4973,6 +5003,7 @@ struct llama_model_params llama_model_default_params() { /*.mtp =*/ false, /*.dry_run =*/ false, /*.flash_attn =*/ true, + /*.defer_experts =*/ false, }; #ifdef GGML_USE_METAL