Fix GLM MTP with split mode graph (#1887)

* Fix crash with GLM and MTP * Fix GLM MTP with split mode graph
2026-06-28 04:30:15 -05:00 · 2026-05-27 07:24:28 +03:00 · 2026-05-27 07:24:28 +03:00 · d503b046f7
commit d503b046f7
parent 1f66f9912f
1 changed files with 8 additions and 2 deletions
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@ -174,7 +174,9 @@ struct create_tensors_helper : public create_tensors_helper_interface {
        return ctx_map.at(model.buft_layer[i].buft);
    }
    inline ggml_context * ctx_for_layer_split(int i) const {
-        return ctx_map.at(model.buft_layer[i].buft_matrix);
+        const bool is_mtp_layer = model.hparams.nextn_predict_layers > 0 &&
+                                  static_cast<uint32_t>(i) >= model.hparams.n_layer - model.hparams.nextn_predict_layers;
+        return is_mtp_layer ? ctx_map.at(model.buft_layer[i].buft) : ctx_map.at(model.buft_layer[i].buft_matrix);
    }

    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
@ -2729,6 +2731,10 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
        const bool is_mtp_layer = hparams.nextn_predict_layers > 0 &&
                                  static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;

+        if (is_mtp_layer) {
+            ctx_split = ctx_layer;
+        }
+
        int flags = 0;
        // Skip loading MTP layers if the feature is disabled
        if (!model.mtp) {
@ -2780,7 +2786,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
            layer.ffn_exp_probs_b = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);

            // MoE branch
-            use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, flags);
+            use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, flags, 0, ffn_ctx);

            // Shared expert
            if (n_expert_shared > 0) {