Force Gemma4 assistant to be loaded on last GPU (#1999)

* Allow graph reuse for Gemma4 MTP

* Force Gemma4 assistant to be loaded on last GPU
This commit is contained in:
Kawrakow 2026-06-19 18:17:13 +02:00 committed by GitHub
parent 8369cf7412
commit d47f484d29
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 2 additions and 2 deletions

View File

@@ -613,7 +613,7 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpL = cur;
const bool is_sliding = hparams.swa_layers[il] ? true : false;
const bool is_sliding = hparams.swa_layers[il] ? true : false;
const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base;
const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;

View File

@@ -3735,7 +3735,7 @@ static bool llm_load_tensors(
}
}
if ((model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && split_mode == LLAMA_SPLIT_MODE_LAYER && device_count > 0 && n_gpu_layers > 0) {
const int mtp_device = std::clamp(main_gpu, 0, device_count - 1);
const int mtp_device = device_count - 1; //std::clamp(main_gpu, 0, device_count - 1);
LLAMA_LOG_INFO("%s: Gemma 4 MTP assistant forcing layer placement to GPU %d under layer split\n",
__func__, mtp_device);