Fix MTP warmup for GLM models (#1992)

2026-06-28 04:30:15 -05:00 · 2026-06-19 08:59:55 +02:00 · 2026-06-19 08:59:55 +02:00 · 0d59973e4a
commit 0d59973e4a
parent b3dfb7858c
2 changed files with 3 additions and 3 deletions
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -2347,8 +2347,6 @@ void common_speculative_checkpoint_restore(
    common_speculative_checkpoint_discard(ckpt, ctx);
 }

-static bool mtp_model_uses_recurrent_conditioning(const common_speculative_state_mtp & state);
-
 void common_speculative_commit(
        common_speculative * spec,
        llama_context * ctx,
@@ -2559,6 +2557,7 @@ static bool mtp_model_uses_recurrent_conditioning(const common_speculative_state
    if (state.ctx_mtp == nullptr) {
        return false;
    }
+    return true;

    const llama_model * model = llama_get_model(state.ctx_mtp);
    if (!llama_model_has_recurrent(model)) {
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@@ -519,8 +519,8 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
    }

    cur = llm_build_context::build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);
-    cb(cur, "almost_result", -1);
    if (hparams.f_final_logit_softcapping > 0) {
+        cb(cur, "almost_result", -1);
        cur = ggml_softcap(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping, hparams.f_final_logit_softcapping);
    }
    cb(cur, "result_output", -1);
@@ -666,6 +666,7 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {

    ggml_tensor * mtp_embd = llm_build_lora_mm(lctx, ctx0, model.mtp_post_proj, cur);
    cb(mtp_embd, "result_mtp_embd", -1);
+    ggml_set_output(mtp_embd);
    ggml_build_forward_expand(gf, mtp_embd);

    ggml_tensor * logits;