From d47f484d299cafad2e606afc0d31677a91b242d0 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 19 Jun 2026 18:17:13 +0200 Subject: [PATCH] Force Gemma4 assistant to be loaded on last GPU (#1999) * Allow graph reuse for Gemma4 MTP * Force Gemma4 assistant to be loaded on last GPU --- src/graphs/build_gemma4.cpp | 2 +- src/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/graphs/build_gemma4.cpp b/src/graphs/build_gemma4.cpp index 0aee1f0d..3a0bb3fc 100644 --- a/src/graphs/build_gemma4.cpp +++ b/src/graphs/build_gemma4.cpp @@ -613,7 +613,7 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() { for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpL = cur; - const bool is_sliding = hparams.swa_layers[il] ? true : false; + const bool is_sliding = hparams.swa_layers[il] ? true : false; const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base; const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale; const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot; diff --git a/src/llama.cpp b/src/llama.cpp index b71eed23..331b64fd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3735,7 +3735,7 @@ static bool llm_load_tensors( } } if ((model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && split_mode == LLAMA_SPLIT_MODE_LAYER && device_count > 0 && n_gpu_layers > 0) { - const int mtp_device = std::clamp(main_gpu, 0, device_count - 1); + const int mtp_device = device_count - 1; //std::clamp(main_gpu, 0, device_count - 1); LLAMA_LOG_INFO("%s: Gemma 4 MTP assistant forcing layer placement to GPU %d under layer split\n", __func__, mtp_device);