Enable split mode 'graph' for MLA models with partial offload

This commit is contained in:
Kawrakow 2026-05-19 12:34:03 +00:00
parent 27d7a74389
commit 2575143637
3 changed files with 37 additions and 58 deletions

View File

@ -687,7 +687,9 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
for (int il = 0; il < n_active_layers; ++il) {
struct ggml_tensor * inpSA = inpL;
if (tp_mode) {
bool is_tp_layer = tp_mode && model.layers[il].wo && model.layers[il].wo->extra;
if (is_tp_layer) {
cur = build_deepseek2_tp_attention(gf, il, inpL, KQ_mask, inp_pos, rope_cache,
kq_scale, attn_factor_scaled,
use_f32_attn_precision, is_lite);
@ -709,14 +711,14 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
// TP path folds residual inside the per-rank FFN reduce; layer mode adds it here.
struct ggml_tensor * ffn_inp;
if (tp_mode) {
if (is_tp_layer) {
ffn_inp = cur;
} else {
ffn_inp = ggml_add(ctx0, cur, inpSA);
}
cb(ffn_inp, "ffn_inp", il);
if (tp_mode) {
if (is_tp_layer) {
cur = ffn_inp;
} else {
cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
@ -725,16 +727,16 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
if ((uint32_t) il < hparams.n_layer_dense_lead) {
cur = llm_build_ffn(ctx0, lctx,
tp_mode ? model.layers[il].ffn_norm : nullptr, cur,
is_tp_layer ? model.layers[il].ffn_norm : nullptr, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il,
tp_mode ? gf : nullptr,
/*add_input=*/tp_mode);
gf,
/*add_input=*/is_tp_layer);
cb(cur, "ffn_out", il);
} else if (tp_mode) {
} else if (is_tp_layer) {
cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
model.layers[il].ffn_gate_inp, nullptr,
model.layers[il].ffn_up_exps, nullptr,
@ -767,21 +769,19 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
cb(moe_out, "ffn_moe_out", il);
// FFN shared expert
{
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(ffn_shexp, "ffn_shexp", il);
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
if (!tp_mode) {
if (!is_tp_layer) {
cur = ggml_add(ctx0, cur, ffn_inp);
}
cur = lctx.cvec.apply_to(ctx0, cur, il);

View File

@ -1400,7 +1400,7 @@ llm_expert_gating_func_type gating_op,
up_shexp, up_b_shexp, nullptr,
gate_shexp, gate_b_shexp, nullptr,
down_shexp, down_b_shexp, nullptr,
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il, graph);
cb(shared_out, "ffn_shexp_out", il);
if (shexp_gate) {
auto shared_gate = llm_build_lora_mm(lctx, ctx, shexp_gate, cur);

View File

@ -936,30 +936,25 @@ static bool llama_kv_cache_init(
cache.v_l.push_back(kvt);
}
// Per-device replicas of the compressed latent KV cache (n_device from wo's split).
if (replicate_mla && !is_mtp_tail_layer) {
if (replicate_mla && !is_mtp_tail_layer && model.layers[i].wo && model.layers[i].wo->extra) {
auto wo = model.layers[i].wo;
if (wo && wo->extra) {
auto extra_wo = (const ggml_split_tensor_t *)wo->extra;
int n_device = extra_wo->n_device;
auto & repl_k_l = cache.replicated_k_l.emplace_back();
repl_k_l.tensor_splits.resize(n_device, nullptr);
for (int is = 0; is < n_device; ++is) {
if (!extra_wo->splits[is]) continue;
ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type,
kv_lora_rank + n_embd_head_qk_rope, kv_size);
auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is);
ggml_set_name(rkv, split_name.c_str());
repl_k_l.tensor_splits[is] = rkv;
mem_split[is] += ggml_nbytes(rkv);
}
repl_k_l.ggml.n_device = n_device;
repl_k_l.ggml.split_dim = -1;
repl_k_l.ggml.splits = repl_k_l.tensor_splits.data();
kv->extra = (void *)&repl_k_l.ggml;
} else {
GGML_ABORT("MLA layer %d: wo lacks split metadata under -sm graph "
"(distribute_mla_tensors_for_split_mode_graph not run?)", i);
auto extra_wo = (const ggml_split_tensor_t *)wo->extra;
int n_device = extra_wo->n_device;
auto & repl_k_l = cache.replicated_k_l.emplace_back();
repl_k_l.tensor_splits.resize(n_device, nullptr);
for (int is = 0; is < n_device; ++is) {
if (!extra_wo->splits[is]) continue;
ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type,
kv_lora_rank + n_embd_head_qk_rope, kv_size);
auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is);
ggml_set_name(rkv, split_name.c_str());
repl_k_l.tensor_splits[is] = rkv;
mem_split[is] += ggml_nbytes(rkv);
}
repl_k_l.ggml.n_device = n_device;
repl_k_l.ggml.split_dim = -1;
repl_k_l.ggml.splits = repl_k_l.tensor_splits.data();
kv->extra = (void *)&repl_k_l.ggml;
}
n_mla++;
}
@ -2975,12 +2970,6 @@ static bool llm_load_tensors(
const bool unsupported_gemma_split =
model.arch == LLM_ARCH_GEMMA4_MTP ||
(model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);
const bool is_mla_arch =
model.arch == LLM_ARCH_DEEPSEEK2 ||
model.arch == LLM_ARCH_GLM_DSA ||
model.arch == LLM_ARCH_MISTRAL4;
const bool incompatible_loader_opts = is_mla_arch &&
(ml.ncmoe > 0 || ml.repack_tensors || ml.merge_up_gate_exps || ml.tensor_buft_overrides);
if (unsupported_gemma_split) {
LLAMA_LOG_WARN("\n=========================================================\n");
@ -2990,16 +2979,6 @@ static bool llm_load_tensors(
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
LLAMA_LOG_WARN("===========================================================\n\n");
split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (incompatible_loader_opts) {
const char * bad_flag = ml.ncmoe > 0 ? "-ncmoe | --n-cpu-moe"
: ml.repack_tensors ? "-rtr | --run-time-repack"
: ml.merge_up_gate_exps ? "-muge | --merge-up-gate-experts"
: "-ot | --override-tensor";
LLAMA_LOG_WARN("\n=======================================================\n");
LLAMA_LOG_WARN("Split mode 'graph' is not compatible with %s\n", bad_flag);
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
LLAMA_LOG_WARN("=======================================================\n\n");
split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (!is_model_split_supported(model)) {
LLAMA_LOG_WARN("\n=======================================================\n");
LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n");