From 6bb3ee3a32c29dc427a069be5c483a8f9102ce79 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 20 May 2026 07:13:55 +0300 Subject: [PATCH] Enable split mode graph for MLA models and partial offload (#1835) --- src/graphs/build_deepseek2.cpp | 38 +++++++++++------------ src/llama-build-context.cpp | 2 +- src/llama.cpp | 55 +++++++++++----------------------- 3 files changed, 37 insertions(+), 58 deletions(-) diff --git a/src/graphs/build_deepseek2.cpp b/src/graphs/build_deepseek2.cpp index 162a00bb..63e6ae21 100644 --- a/src/graphs/build_deepseek2.cpp +++ b/src/graphs/build_deepseek2.cpp @@ -687,7 +687,9 @@ ggml_cgraph * llm_build_context::build_deepseek2() { for (int il = 0; il < n_active_layers; ++il) { struct ggml_tensor * inpSA = inpL; - if (tp_mode) { + bool is_tp_layer = tp_mode && model.layers[il].wo && model.layers[il].wo->extra; + + if (is_tp_layer) { cur = build_deepseek2_tp_attention(gf, il, inpL, KQ_mask, inp_pos, rope_cache, kq_scale, attn_factor_scaled, use_f32_attn_precision, is_lite); @@ -709,14 +711,14 @@ ggml_cgraph * llm_build_context::build_deepseek2() { // TP path folds residual inside the per-rank FFN reduce; layer mode adds it here. struct ggml_tensor * ffn_inp; - if (tp_mode) { + if (is_tp_layer) { ffn_inp = cur; } else { ffn_inp = ggml_add(ctx0, cur, inpSA); } cb(ffn_inp, "ffn_inp", il); - if (tp_mode) { + if (is_tp_layer) { cur = ffn_inp; } else { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); @@ -725,16 +727,16 @@ ggml_cgraph * llm_build_context::build_deepseek2() { if ((uint32_t) il < hparams.n_layer_dense_lead) { cur = llm_build_ffn(ctx0, lctx, - tp_mode ? model.layers[il].ffn_norm : nullptr, cur, + is_tp_layer ? model.layers[il].ffn_norm : nullptr, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il, - tp_mode ? gf : nullptr, - /*add_input=*/tp_mode); + gf, + /*add_input=*/is_tp_layer); cb(cur, "ffn_out", il); - } else if (tp_mode) { + } else if (is_tp_layer) { cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur, model.layers[il].ffn_gate_inp, nullptr, model.layers[il].ffn_up_exps, nullptr, @@ -767,21 +769,19 @@ ggml_cgraph * llm_build_context::build_deepseek2() { cb(moe_out, "ffn_moe_out", il); // FFN shared expert - { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(ffn_shexp, "ffn_shexp", il); + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); + cb(ffn_shexp, "ffn_shexp", il); - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); } - if (!tp_mode) { + if (!is_tp_layer) { cur = ggml_add(ctx0, cur, ffn_inp); } cur = lctx.cvec.apply_to(ctx0, cur, il); diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index f4d0f61e..04fe1911 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -1400,7 +1400,7 @@ llm_expert_gating_func_type gating_op, up_shexp, up_b_shexp, nullptr, gate_shexp, gate_b_shexp, nullptr, down_shexp, down_b_shexp, nullptr, - nullptr, type_op_shexp, LLM_FFN_PAR, cb, il); + nullptr, type_op_shexp, LLM_FFN_PAR, cb, il, graph); cb(shared_out, "ffn_shexp_out", il); if (shexp_gate) { auto shared_gate = llm_build_lora_mm(lctx, ctx, shexp_gate, cur); diff --git a/src/llama.cpp b/src/llama.cpp index f462b225..69ee1066 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -936,30 +936,25 @@ static bool llama_kv_cache_init( cache.v_l.push_back(kvt); } // Per-device replicas of the compressed latent KV cache (n_device from wo's split). - if (replicate_mla && !is_mtp_tail_layer) { + if (replicate_mla && !is_mtp_tail_layer && model.layers[i].wo && model.layers[i].wo->extra) { auto wo = model.layers[i].wo; - if (wo && wo->extra) { - auto extra_wo = (const ggml_split_tensor_t *)wo->extra; - int n_device = extra_wo->n_device; - auto & repl_k_l = cache.replicated_k_l.emplace_back(); - repl_k_l.tensor_splits.resize(n_device, nullptr); - for (int is = 0; is < n_device; ++is) { - if (!extra_wo->splits[is]) continue; - ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type, - kv_lora_rank + n_embd_head_qk_rope, kv_size); - auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is); - ggml_set_name(rkv, split_name.c_str()); - repl_k_l.tensor_splits[is] = rkv; - mem_split[is] += ggml_nbytes(rkv); - } - repl_k_l.ggml.n_device = n_device; - repl_k_l.ggml.split_dim = -1; - repl_k_l.ggml.splits = repl_k_l.tensor_splits.data(); - kv->extra = (void *)&repl_k_l.ggml; - } else { - GGML_ABORT("MLA layer %d: wo lacks split metadata under -sm graph " - "(distribute_mla_tensors_for_split_mode_graph not run?)", i); + auto extra_wo = (const ggml_split_tensor_t *)wo->extra; + int n_device = extra_wo->n_device; + auto & repl_k_l = cache.replicated_k_l.emplace_back(); + repl_k_l.tensor_splits.resize(n_device, nullptr); + for (int is = 0; is < n_device; ++is) { + if (!extra_wo->splits[is]) continue; + ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type, + kv_lora_rank + n_embd_head_qk_rope, kv_size); + auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is); + ggml_set_name(rkv, split_name.c_str()); + repl_k_l.tensor_splits[is] = rkv; + mem_split[is] += ggml_nbytes(rkv); } + repl_k_l.ggml.n_device = n_device; + repl_k_l.ggml.split_dim = -1; + repl_k_l.ggml.splits = repl_k_l.tensor_splits.data(); + kv->extra = (void *)&repl_k_l.ggml; } n_mla++; } @@ -2975,12 +2970,6 @@ static bool llm_load_tensors( const bool unsupported_gemma_split = model.arch == LLM_ARCH_GEMMA4_MTP || (model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0); - const bool is_mla_arch = - model.arch == LLM_ARCH_DEEPSEEK2 || - model.arch == LLM_ARCH_GLM_DSA || - model.arch == LLM_ARCH_MISTRAL4; - const bool incompatible_loader_opts = is_mla_arch && - (ml.ncmoe > 0 || ml.repack_tensors || ml.merge_up_gate_exps || ml.tensor_buft_overrides); if (unsupported_gemma_split) { LLAMA_LOG_WARN("\n=========================================================\n"); @@ -2990,16 +2979,6 @@ static bool llm_load_tensors( LLAMA_LOG_WARN(" => changing split mode to 'layer'\n"); LLAMA_LOG_WARN("===========================================================\n\n"); split_mode = LLAMA_SPLIT_MODE_LAYER; - } else if (incompatible_loader_opts) { - const char * bad_flag = ml.ncmoe > 0 ? "-ncmoe | --n-cpu-moe" - : ml.repack_tensors ? "-rtr | --run-time-repack" - : ml.merge_up_gate_exps ? "-muge | --merge-up-gate-experts" - : "-ot | --override-tensor"; - LLAMA_LOG_WARN("\n=======================================================\n"); - LLAMA_LOG_WARN("Split mode 'graph' is not compatible with %s\n", bad_flag); - LLAMA_LOG_WARN(" => changing split mode to 'layer'\n"); - LLAMA_LOG_WARN("=======================================================\n\n"); - split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (!is_model_split_supported(model)) { LLAMA_LOG_WARN("\n=======================================================\n"); LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n");