From 2c1dc8781b97963d095b9697a08febefafea22d2 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Thu, 18 Jun 2026 13:15:10 +0000
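Subject: [PATCH] Fix MTP warmup for GLM models

common/speculative.cpp:
- Drop the forward declaration of mtp_model_uses_recurrent_conditioning()
  that preceded common_speculative_commit().
- Make mtp_model_uses_recurrent_conditioning() return true whenever
  state.ctx_mtp is non-null. The model checks that follow the new
  return statement are now unreachable (NOTE(review): confirm this is
  intended and not a leftover debugging shortcut).

src/graphs/build_gemma4.cpp:
- build_gemma4_graph_parallel(): only emit the "almost_result" callback
  when final logit softcapping is enabled (f_final_logit_softcapping > 0).
- build_gemma4_mtp(): call ggml_set_output() on the "result_mtp_embd"
  tensor before expanding it into the graph.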
---
 common/speculative.cpp      | 3 +--
 src/graphs/build_gemma4.cpp | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 9b956c10..d91e0296 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -2347,8 +2347,6 @@ void common_speculative_checkpoint_restore(
     common_speculative_checkpoint_discard(ckpt, ctx);
 }
 
-static bool mtp_model_uses_recurrent_conditioning(const common_speculative_state_mtp & state);
-
 void common_speculative_commit(
         common_speculative * spec,
         llama_context * ctx,
@@ -2559,6 +2557,7 @@ static bool mtp_model_uses_recurrent_conditioning(const common_speculative_state
     if (state.ctx_mtp == nullptr) {
         return false;
     }
+    return true;
 
     const llama_model * model = llama_get_model(state.ctx_mtp);
     if (!llama_model_has_recurrent(model)) {
diff --git a/src/graphs/build_gemma4.cpp b/src/graphs/build_gemma4.cpp
index a0c555df..7e4c655a 100644
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@@ -519,8 +519,8 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
     }
 
     cur = llm_build_context::build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);
-    cb(cur, "almost_result", -1);
     if (hparams.f_final_logit_softcapping > 0) {
+        cb(cur, "almost_result", -1);
         cur = ggml_softcap(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping, hparams.f_final_logit_softcapping);
     }
     cb(cur, "result_output", -1);
@@ -666,6 +666,7 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
 
     ggml_tensor * mtp_embd = llm_build_lora_mm(lctx, ctx0, model.mtp_post_proj, cur);
     cb(mtp_embd, "result_mtp_embd", -1);
+    ggml_set_output(mtp_embd);
     ggml_build_forward_expand(gf, mtp_embd);
 
     ggml_tensor * logits;