mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Fix split mode graph for Qwen35-MoE + MTP (#1861)
This commit is contained in:
parent
b26521b9ef
commit
b3d39cff8b
@ -4433,7 +4433,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
auto graph = use_cuda_graph ? ggml_cuda_get_graph(*cuda_ctx, ggml_cuda_graph_get_key(cgraph)) : nullptr;
|
||||
#endif
|
||||
|
||||
//printf("======================== %s: graph with %d nodes on device %d. time = %ld\n", __func__, cgraph->n_nodes, cuda_ctx->device, ggml_time_us());
|
||||
#if IK_PRINT_TIMING
|
||||
printf("======================== %s: graph with %d nodes on device %d. time = %ld\n", __func__, cgraph->n_nodes, cuda_ctx->device, ggml_time_us());
|
||||
#endif
|
||||
while (!graph_evaluated_or_captured) {
|
||||
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
||||
// With the use of CUDA graphs, the execution will be performed by the graph launch.
|
||||
|
||||
@ -33,7 +33,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
|
||||
bool merge_up_gate_exps(const LLM_TN & tn, int i, int bias);
|
||||
|
||||
bool create_std_ffn_exps(int64_t n_embd, const LLM_TN & tn, int i, int flags = 0, int n_ff_exps_input = 0);
|
||||
bool create_std_ffn_exps(int64_t n_embd, const LLM_TN & tn, int i, int flags = 0, int n_ff_exps_input = 0,
|
||||
ggml_context * ffn_ctx = nullptr);
|
||||
|
||||
bool create_tensors() override;
|
||||
|
||||
@ -1585,7 +1586,7 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) {
|
||||
}
|
||||
|
||||
layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
|
||||
use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, flags, n_ff_exp);
|
||||
use_mmap_buffer &= !create_std_ffn_exps(n_embd, tn, i, flags, n_ff_exp, ctx_split);
|
||||
|
||||
// Shared experts
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
|
||||
@ -3590,14 +3591,17 @@ bool create_tensors_helper::merge_up_gate_exps(const LLM_TN & tn, int i, int bia
|
||||
return true;
|
||||
}
|
||||
|
||||
bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & tn, int i, int flags, int n_ff_exps_input) {
|
||||
bool create_tensors_helper::create_std_ffn_exps(int64_t n_embd, const LLM_TN & tn, int i, int flags, int n_ff_exps_input,
|
||||
ggml_context * ffn_ctx) {
|
||||
const int64_t n_expert = model.hparams.n_expert;
|
||||
const int64_t n_expert_used = model.hparams.n_expert_used;
|
||||
const int64_t n_ff = model.hparams.n_ff();
|
||||
const int64_t n_ff_exp = n_ff_exps_input > 0 ? n_ff_exps_input : model.hparams.n_ff_exp ? model.hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
auto ffn_ctx = ctx_for_layer_split(i);
|
||||
if (!ffn_ctx) {
|
||||
ffn_ctx = ctx_for_layer_split(i);
|
||||
}
|
||||
|
||||
bool merged = false;
|
||||
auto ug_name = tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", i);
|
||||
|
||||
@ -3319,12 +3319,6 @@ static bool llm_load_tensors(
|
||||
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
|
||||
LLAMA_LOG_WARN("===========================================================\n\n");
|
||||
split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (model.arch == LLM_ARCH_QWEN35MOE && mtp) {
|
||||
LLAMA_LOG_WARN("\n=========================================================\n");
|
||||
LLAMA_LOG_WARN("Split mode 'graph' curently does not work with MTP for Qwen35-MoE\n");
|
||||
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
|
||||
LLAMA_LOG_WARN("=======================================================\n\n");
|
||||
split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (!is_model_split_supported(model)) {
|
||||
LLAMA_LOG_WARN("\n=======================================================\n");
|
||||
LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user