mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Enable split mode graph for MLA models and partial offload (#1835)
This commit is contained in:
parent
9ae0fb7b2f
commit
6bb3ee3a32
@ -687,7 +687,9 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
|
||||
for (int il = 0; il < n_active_layers; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
if (tp_mode) {
|
||||
bool is_tp_layer = tp_mode && model.layers[il].wo && model.layers[il].wo->extra;
|
||||
|
||||
if (is_tp_layer) {
|
||||
cur = build_deepseek2_tp_attention(gf, il, inpL, KQ_mask, inp_pos, rope_cache,
|
||||
kq_scale, attn_factor_scaled,
|
||||
use_f32_attn_precision, is_lite);
|
||||
@ -709,14 +711,14 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
|
||||
|
||||
// TP path folds residual inside the per-rank FFN reduce; layer mode adds it here.
|
||||
struct ggml_tensor * ffn_inp;
|
||||
if (tp_mode) {
|
||||
if (is_tp_layer) {
|
||||
ffn_inp = cur;
|
||||
} else {
|
||||
ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
}
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
if (tp_mode) {
|
||||
if (is_tp_layer) {
|
||||
cur = ffn_inp;
|
||||
} else {
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
@ -725,16 +727,16 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
|
||||
|
||||
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
||||
cur = llm_build_ffn(ctx0, lctx,
|
||||
tp_mode ? model.layers[il].ffn_norm : nullptr, cur,
|
||||
is_tp_layer ? model.layers[il].ffn_norm : nullptr, cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il,
|
||||
tp_mode ? gf : nullptr,
|
||||
/*add_input=*/tp_mode);
|
||||
gf,
|
||||
/*add_input=*/is_tp_layer);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else if (tp_mode) {
|
||||
} else if (is_tp_layer) {
|
||||
cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
|
||||
model.layers[il].ffn_gate_inp, nullptr,
|
||||
model.layers[il].ffn_up_exps, nullptr,
|
||||
@ -767,21 +769,19 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
// FFN shared expert
|
||||
{
|
||||
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, nullptr, cur,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
if (!tp_mode) {
|
||||
if (!is_tp_layer) {
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
}
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
|
||||
@ -1400,7 +1400,7 @@ llm_expert_gating_func_type gating_op,
|
||||
up_shexp, up_b_shexp, nullptr,
|
||||
gate_shexp, gate_b_shexp, nullptr,
|
||||
down_shexp, down_b_shexp, nullptr,
|
||||
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
|
||||
nullptr, type_op_shexp, LLM_FFN_PAR, cb, il, graph);
|
||||
cb(shared_out, "ffn_shexp_out", il);
|
||||
if (shexp_gate) {
|
||||
auto shared_gate = llm_build_lora_mm(lctx, ctx, shexp_gate, cur);
|
||||
|
||||
@ -936,30 +936,25 @@ static bool llama_kv_cache_init(
|
||||
cache.v_l.push_back(kvt);
|
||||
}
|
||||
// Per-device replicas of the compressed latent KV cache (n_device from wo's split).
|
||||
if (replicate_mla && !is_mtp_tail_layer) {
|
||||
if (replicate_mla && !is_mtp_tail_layer && model.layers[i].wo && model.layers[i].wo->extra) {
|
||||
auto wo = model.layers[i].wo;
|
||||
if (wo && wo->extra) {
|
||||
auto extra_wo = (const ggml_split_tensor_t *)wo->extra;
|
||||
int n_device = extra_wo->n_device;
|
||||
auto & repl_k_l = cache.replicated_k_l.emplace_back();
|
||||
repl_k_l.tensor_splits.resize(n_device, nullptr);
|
||||
for (int is = 0; is < n_device; ++is) {
|
||||
if (!extra_wo->splits[is]) continue;
|
||||
ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type,
|
||||
kv_lora_rank + n_embd_head_qk_rope, kv_size);
|
||||
auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is);
|
||||
ggml_set_name(rkv, split_name.c_str());
|
||||
repl_k_l.tensor_splits[is] = rkv;
|
||||
mem_split[is] += ggml_nbytes(rkv);
|
||||
}
|
||||
repl_k_l.ggml.n_device = n_device;
|
||||
repl_k_l.ggml.split_dim = -1;
|
||||
repl_k_l.ggml.splits = repl_k_l.tensor_splits.data();
|
||||
kv->extra = (void *)&repl_k_l.ggml;
|
||||
} else {
|
||||
GGML_ABORT("MLA layer %d: wo lacks split metadata under -sm graph "
|
||||
"(distribute_mla_tensors_for_split_mode_graph not run?)", i);
|
||||
auto extra_wo = (const ggml_split_tensor_t *)wo->extra;
|
||||
int n_device = extra_wo->n_device;
|
||||
auto & repl_k_l = cache.replicated_k_l.emplace_back();
|
||||
repl_k_l.tensor_splits.resize(n_device, nullptr);
|
||||
for (int is = 0; is < n_device; ++is) {
|
||||
if (!extra_wo->splits[is]) continue;
|
||||
ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type,
|
||||
kv_lora_rank + n_embd_head_qk_rope, kv_size);
|
||||
auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is);
|
||||
ggml_set_name(rkv, split_name.c_str());
|
||||
repl_k_l.tensor_splits[is] = rkv;
|
||||
mem_split[is] += ggml_nbytes(rkv);
|
||||
}
|
||||
repl_k_l.ggml.n_device = n_device;
|
||||
repl_k_l.ggml.split_dim = -1;
|
||||
repl_k_l.ggml.splits = repl_k_l.tensor_splits.data();
|
||||
kv->extra = (void *)&repl_k_l.ggml;
|
||||
}
|
||||
n_mla++;
|
||||
}
|
||||
@ -2975,12 +2970,6 @@ static bool llm_load_tensors(
|
||||
const bool unsupported_gemma_split =
|
||||
model.arch == LLM_ARCH_GEMMA4_MTP ||
|
||||
(model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);
|
||||
const bool is_mla_arch =
|
||||
model.arch == LLM_ARCH_DEEPSEEK2 ||
|
||||
model.arch == LLM_ARCH_GLM_DSA ||
|
||||
model.arch == LLM_ARCH_MISTRAL4;
|
||||
const bool incompatible_loader_opts = is_mla_arch &&
|
||||
(ml.ncmoe > 0 || ml.repack_tensors || ml.merge_up_gate_exps || ml.tensor_buft_overrides);
|
||||
|
||||
if (unsupported_gemma_split) {
|
||||
LLAMA_LOG_WARN("\n=========================================================\n");
|
||||
@ -2990,16 +2979,6 @@ static bool llm_load_tensors(
|
||||
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
|
||||
LLAMA_LOG_WARN("===========================================================\n\n");
|
||||
split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (incompatible_loader_opts) {
|
||||
const char * bad_flag = ml.ncmoe > 0 ? "-ncmoe | --n-cpu-moe"
|
||||
: ml.repack_tensors ? "-rtr | --run-time-repack"
|
||||
: ml.merge_up_gate_exps ? "-muge | --merge-up-gate-experts"
|
||||
: "-ot | --override-tensor";
|
||||
LLAMA_LOG_WARN("\n=======================================================\n");
|
||||
LLAMA_LOG_WARN("Split mode 'graph' is not compatible with %s\n", bad_flag);
|
||||
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
|
||||
LLAMA_LOG_WARN("=======================================================\n\n");
|
||||
split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (!is_model_split_supported(model)) {
|
||||
LLAMA_LOG_WARN("\n=======================================================\n");
|
||||
LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user