glm4_moe: use the per-layer context for MTP layers

A layer i is treated as an MTP layer when nextn_predict_layers > 0 and
i >= n_layer - nextn_predict_layers.

* ctx_for_layer_split(): for an MTP layer, look up the context keyed by
  buft_layer[i].buft instead of buft_layer[i].buft_matrix; every other
  layer keeps using buft_matrix.
* create_glm4_moe_tensors(): for an MTP layer, set ctx_split = ctx_layer
  right after is_mtp_layer is computed.
* create_glm4_moe_tensors(): call create_std_ffn_exps() with two extra
  arguments, (0, ffn_ctx). Presumably the last one selects the context the
  expert tensors are created in; verify against the callee's signature.

NOTE(review): this patch was recovered from a copy whose line breaks,
indentation and template arguments had been stripped. The layout below is
consistent with the hunk line counts, but the template arguments
(static_cast<uint32_t>, std::map<ggml_backend_buffer_type_t, int>) and the
exact leading whitespace are reconstructions; confirm against the
repository, or apply with --ignore-whitespace.

diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 3a882dd2..5fbfd010 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -174,7 +174,9 @@ struct create_tensors_helper : public create_tensors_helper_interface {
         return ctx_map.at(model.buft_layer[i].buft);
     }
     inline ggml_context * ctx_for_layer_split(int i) const {
-        return ctx_map.at(model.buft_layer[i].buft_matrix);
+        const bool is_mtp_layer = model.hparams.nextn_predict_layers > 0 &&
+            static_cast<uint32_t>(i) >= model.hparams.n_layer - model.hparams.nextn_predict_layers;
+        return is_mtp_layer ? ctx_map.at(model.buft_layer[i].buft) : ctx_map.at(model.buft_layer[i].buft_matrix);
     }
 
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
@@ -2729,6 +2731,10 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
         const bool is_mtp_layer = hparams.nextn_predict_layers > 0 &&
             static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers;
 
+        if (is_mtp_layer) {
+            ctx_split = ctx_layer;
+        }
+
         int flags = 0;
         // Skip loading MTP layers if the feature is disabled
         if (!model.mtp) {
@@ -2780,7 +2786,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
             layer.ffn_exp_probs_b = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
 
             // MoE branch
-            use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, flags);
+            use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, flags, 0, ffn_ctx);
 
             // Shared expert
             if (n_expert_shared > 0) {