From bf23a7599ca99366dde90af5c85361757bc9522e Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Wed, 24 Jun 2026 11:23:22 +0200
Subject: [PATCH 1/2] Avoid Gemma4 assistant strange tensor name warnings
 (#2023)

---
 src/llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index f5e6f30c..fba96ccb 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3174,7 +3174,9 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
             }
         }
         if (name == "mtp_pre_proj.weight"  || name == "mtp_post_proj.weight" ||
-            name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") {
+            name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight" ||
+            name == "nextn.post_projection.weight" || name == "nextn.pre_projection.weight" ||
+            name == "rope_freqs.weight") {
             continue;
         }
         if (name == "dflash_fc.weight" || name == "dflash_hidden_norm.weight") {

From d5507e33ae7ee2b7b41475f08044d3bde3b839ee Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Wed, 24 Jun 2026 18:29:32 +0200
Subject: [PATCH 2/2] Split mode graph for dense Gemma4 assistant  (#2022)

* WIP: Split mode graph for Gemma4 assistant

Something is not right - acceptance drops to nearly zero.

* Per model CUDA contexts

Still not working!?

* This works

The issue was that I was not correctly calculating the number
of KV heads for the split KV cache.

* Compiler warnings

* It is better to use llama_context pointers as keys
---
 examples/cvector-generator/pca.hpp |   2 +-
 ggml/include/ggml-cuda.h           |   4 +-
 ggml/src/ggml-cuda.cu              |  54 +++---
 ggml/src/ggml-cuda/common.cuh      |   6 +-
 ggml/src/ggml-cuda/reduce.cu       | 142 ++++++++-------
 src/graphs/build_gemma4.cpp        | 281 ++++++++++++++++++++++++-----
 src/llama-impl.h                   |   2 +
 src/llama-load-tensors.cpp         |  71 +++++++-
 src/llama-model-loader.cpp         |   3 +-
 src/llama-model-loader.h           |   1 +
 src/llama-reload.cpp               |   2 +-
 src/llama.cpp                      |  75 +++++++-
 12 files changed, 483 insertions(+), 160 deletions(-)

diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
index 8d920453..ac7c45a7 100644
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -66,7 +66,7 @@ struct pca_model {
     pca_model(struct ggml_tensor * t_input) {
 #ifdef GGML_USE_CUDA
         fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        backend = ggml_backend_cuda_init(0, nullptr); // init device 0
+        backend = ggml_backend_cuda_init(0, nullptr, nullptr); // init device 0
         if (!backend) {
             fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
         }
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index a9787eb5..291b7db9 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -21,7 +21,7 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params, const void * model);
 
 GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
 
@@ -43,7 +43,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
 GGML_API GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 
-GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(void);
+GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(const void * model);
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 51693a09..8536179c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -298,13 +298,22 @@ const ggml_cuda_device_info & ggml_cuda_info() {
 }
 
 /* ---------- hot-swap: invalidate all cached CUDA graphs ---------- */
-extern "C" void ggml_backend_cuda_invalidate_graphs(void) {
+extern "C" void ggml_backend_cuda_invalidate_graphs(const void * model) {
     auto & info = const_cast<ggml_cuda_device_info &>(ggml_cuda_info());
-    for (int i = 0; i < info.device_count; ++i) {
-        if (info.all_ctx[i]) {
-            info.all_ctx[i]->cuda_graphs.clear();
+    if (auto it = info.all_ctx.find(model); it != info.all_ctx.end()) {
+        for (auto ctx : it->second) {
+            if (ctx) {
+                ctx->cuda_graphs.clear();
+            }
         }
+    } else {
+        fprintf(stderr, "================================= %s: did not find entry for model at %p\n", __func__, model);
     }
+    //for (int i = 0; i < info.device_count; ++i) {
+    //    if (info.all_ctx[i]) {
+    //        info.all_ctx[i]->cuda_graphs.clear();
+    //    }
+    //}
 }
 
 // #define DEBUG_CUDA_MALLOC
@@ -517,13 +526,14 @@ static std::condition_variable ggml_cuda_lock_cv;
 //static std::atomic<int> ggml_cuda_lock_counter;
 static int ggml_cuda_lock_counter = 0;
 
-ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
-    device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
+ggml_backend_cuda_context::ggml_backend_cuda_context(int device, const void * model) :
+    device(device), name(GGML_CUDA_NAME + std::to_string(device)), model(model) {
     auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
-    if (info->all_ctx[device]) {
+    auto & all_ctx = info->all_ctx[model];
+    if (all_ctx[device]) {
         GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
     } else{
-        info->all_ctx[device] = this;
+        all_ctx[device] = this;
     }
 }
 
@@ -555,9 +565,12 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
         }
     }
     auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
-    if (info->all_ctx[device] == this) {
-        info->all_ctx[device] = nullptr;
+    if (auto it = info->all_ctx.find(model); it != info->all_ctx.end() && it->second[device] == this) {
+        it->second[device] = nullptr;
     }
+    //if (info->all_ctx[device] == this) {
+    //    info->all_ctx[device] = nullptr;
+    //}
 
 }
 
@@ -4247,23 +4260,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
                 needs_f16_f32_copy = true;
 
             } else {
-#ifdef GGML_USE_NCCL__
-                auto & info = ggml_cuda_info();
-                auto nbytes = ggml_nbytes(src);
-                ncclGroupStart();
-                ggml_cuda_set_device(cuda_ctx_src->device);
-                auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device],
-                        info.all_ctx[cuda_ctx_src->device]->stream());
-                ggml_cuda_set_device(cuda_ctx_dst->device);
-                auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device],
-                        info.all_ctx[cuda_ctx_dst->device]->stream());
-                ncclGroupEnd();
-                GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess);
-                return true;
-#else
                 ggml_cuda_set_device(cuda_ctx_src->device);
                 CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
-#endif
             }
 #endif
         }
@@ -5249,13 +5247,13 @@ static cuda_params ggml_cuda_parse_params(const char * params_string) {
     return params;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string) {
+GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string, const void * model) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
         GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
-    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device, model);
     if (ctx == nullptr) {
         GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
@@ -5370,7 +5368,7 @@ GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
 
 // backend registry
 GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr);
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr, nullptr);
     return cuda_backend;
 
     GGML_UNUSED(params);
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index bb82edeb..924100d1 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -762,7 +762,8 @@ struct ggml_cuda_device_info {
 
     std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
 
-    ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
+    std::unordered_map<const void *, std::array<ggml_backend_cuda_context *, GGML_CUDA_MAX_DEVICES>> all_ctx;
+    //ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
 #ifdef GGML_USE_NCCL
     ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
     bool have_nccl;
@@ -864,10 +865,11 @@ struct ggml_backend_cuda_context {
 
 #endif
 
+    const void * model;
     void * copy_buffer = nullptr;
     size_t copy_size   = 0;
 
-    explicit ggml_backend_cuda_context(int device);
+    explicit ggml_backend_cuda_context(int device, const void * model);
 
     ~ggml_backend_cuda_context();
 
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index c3177be0..b3a87c8b 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -94,6 +94,11 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
     if (ncopy < 1) return;
 
     auto & info = ggml_cuda_info();
+    auto it = info.all_ctx.find(ctx.model);
+    if (it == info.all_ctx.end()) {
+        GGML_ABORT("Fatal error");
+    }
+    auto & all_ctx = it->second;
     auto size = ggml_nbytes(dst);
     int isrc = 0;
     for (int ii = 0; ii < ncopy; ++ii) {
@@ -102,9 +107,9 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
         isrc = (isrc + 1)%nhave;
         //printf("%s: copying from device %d to device %d: %p -> %p\n", __func__, j, i, dst->src[j]->data, dst->src[i]->data);
         ggml_cuda_set_device(j);
-        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, info.all_ctx[i]->device, dst->src[j]->data, info.all_ctx[j]->device,
-                            size, info.all_ctx[j]->stream()));
-        CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, all_ctx[i]->device, dst->src[j]->data, all_ctx[j]->device,
+                            size, all_ctx[j]->stream()));
+        CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
     }
     isrc = 0;
     for (int ii = 0; ii < ncopy; ++ii) {
@@ -112,7 +117,7 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
         int j = idx[isrc];
         isrc = (isrc + 1)%nhave;
         ggml_cuda_set_device(i);
-        CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event, 0));
+        CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event, 0));
     }
     ggml_cuda_set_device(ctx.device);
 }
@@ -133,6 +138,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     }
 
     auto & info = ggml_cuda_info();
+    auto it = info.all_ctx.find(ctx.model);
+    if (it == info.all_ctx.end()) {
+        GGML_ABORT("Fatal error");
+    }
+    auto & all_ctx = it->second;
 #ifdef GGML_USE_NCCL
     // Somehow I'm not able to figure out how to use NCCL correctly.
     // It does not work at all if not all GPUs participate in the reduce op, and we
@@ -153,7 +163,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             ggml_cuda_set_device(i);
             auto status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
                     dst->src[i] ? dst->src[i]->data : nullptr,
-                    ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], info.all_ctx[i]->stream());
+                    ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], all_ctx[i]->stream());
             if (status != ncclSuccess) {
                 fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
                 GGML_ABORT("Fatal error");
@@ -275,7 +285,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         auto size_per_device  = nblocks_per_device * tt.type_size;
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
-            auto this_ctx = info.all_ctx[i];
+            auto this_ctx = all_ctx[i];
             if (!this_ctx->copy_event || !this_ctx->compute_event || size_per_device > this_ctx->copy_size) {
                 ggml_cuda_set_device(this_ctx->device);
                 if (!this_ctx->copy_event) {
@@ -300,14 +310,14 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                 int peer = idx[(ii+1)%nhave];
                 auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
                 auto this_size  = (this_nelem / tt.blck_size) * tt.type_size;
-                ggml_cuda_set_device(info.all_ctx[peer]->device);
+                ggml_cuda_set_device(all_ctx[peer]->device);
                 if (stage > 0) {
-                    CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
+                    CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
                 }
-                CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device,
-                            (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
-                            this_size, info.all_ctx[peer]->stream()));
-                CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaMemcpyPeerAsync(all_ctx[i]->copy_buffer, all_ctx[i]->device,
+                            (const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
+                            this_size, all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
                 ichunk = (ichunk + 1)%nhave;
             }
             ichunk = stage;
@@ -315,24 +325,24 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                 int i = idx[ii];
                 int peer = idx[(ii+1)%nhave];
                 auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
-                ggml_cuda_set_device(info.all_ctx[i]->device);
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+                ggml_cuda_set_device(all_ctx[i]->device);
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
                 int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
                 if (dst->type == GGML_TYPE_F16) {
-                    k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
-                            (const half *)info.all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
+                    k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
+                            (const half *)all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
                 } else if (dst->type == GGML_TYPE_Q8_0) {
-                    k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
-                            (const block_q8_0 *)info.all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
+                    k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
+                            (const block_q8_0 *)all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
                 } else if (dst->type == GGML_TYPE_BF16) {
-                    k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(
-                            this_nelem, (const nv_bfloat16 *)info.all_ctx[i]->copy_buffer,
+                    k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(
+                            this_nelem, (const nv_bfloat16 *)all_ctx[i]->copy_buffer,
                             (nv_bfloat16 *)dst->src[i]->data + ichunk*nelem_per_device);
                 } else {
-                    k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
-                            (const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
+                    k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
+                            (const float *)all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
                 }
-                CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->compute_event, info.all_ctx[i]->stream()));
+                CUDA_CHECK(cudaEventRecord(all_ctx[i]->compute_event, all_ctx[i]->stream()));
                 ichunk = (ichunk + 1)%nhave;
             }
         }
@@ -343,21 +353,21 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                 int peer = idx[(ii+1)%nhave];
                 auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
                 auto this_size  = (this_nelem / tt.blck_size) * tt.type_size;
-                ggml_cuda_set_device(info.all_ctx[peer]->device);
+                ggml_cuda_set_device(all_ctx[peer]->device);
                 if (stage == 0) {
-                    CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
+                    CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
                 }
-                CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, info.all_ctx[i]->device,
-                            (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
-                            this_size, info.all_ctx[peer]->stream()));
-                CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, all_ctx[i]->device,
+                            (const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
+                            this_size, all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
                 ichunk = (ichunk + 1)%nhave;
             }
             for (int ii = 0; ii < nhave; ++ii) {
                 int i = idx[ii];
                 int peer = idx[(ii+1)%nhave];
-                ggml_cuda_set_device(info.all_ctx[i]->device);
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+                ggml_cuda_set_device(all_ctx[i]->device);
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
             }
         }
         ggml_cuda_set_device(ctx.device);
@@ -372,8 +382,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             GGML_ASSERT(dst->src[i]->type == dst->type);
             GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
             ggml_cuda_set_device(i);
-            if (!info.all_ctx[i]->copy_event) {
-                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
+            if (!all_ctx[i]->copy_event) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
             }
         }
         auto nelem = ggml_nelements(dst);
@@ -386,20 +396,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             task.ptrs[0] = (char *)dst->src[i]->data;
             int j = idx[2*ii+1];
             ggml_cuda_set_device(j);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
             task.ptrs[1] = (char *)dst->src[j]->data;
             ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+            CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
             if (dst->type == GGML_TYPE_F16) {
-                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
             } else {
-                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
             }
         }
         for (int ii = 0; ii < nhave/2; ++ii) {
             int i = idx[2*ii+0];
             ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
         }
         for (int ii = 0; ii < nhave/2; ++ii) {
             int i = idx[2*ii+1];
@@ -411,23 +421,23 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             int j = idx[(2*ii+2)%nhave];
             task.ptrs[1] = (char *)dst->src[j]->data;
             ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+            CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
             if (dst->type == GGML_TYPE_F16) {
-                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
             } else {
-                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
             }
         }
         for (int ii = 0; ii < nhave/2; ++ii) {
             int i = idx[2*ii+1];
             ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
         }
         for (int ii = 0; ii < nhave/2; ++ii) {
             int i = idx[(2*ii+2)%nhave];
             ggml_cuda_set_device(i);
             int j = idx[2*ii+1];
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+            CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
         }
         ggml_cuda_set_device(ctx.device);
         if (ncopy > 0) {
@@ -442,10 +452,10 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             GGML_ASSERT(dst->src[i]->type == dst->type);
             GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
             ggml_cuda_set_device(i);
-            if (!info.all_ctx[i]->copy_event) {
-                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
+            if (!all_ctx[i]->copy_event) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
             }
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
         }
         //printf("Recorded events\n");
         auto nelem = ggml_nelements(dst);
@@ -465,37 +475,37 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             for (int jj = 0; jj < nhave; ++jj) {
                 if (jj == ii) continue;
                 int j = idx[jj];
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
                 task.ptrs[k++] = (char *)dst->src[j]->data + ii*nelem_per_device*elem_size;
             }
             int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
             if (dst->type == GGML_TYPE_F16) {
                 switch (nhave) {
                     case 2:
-                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                         break;
                     case 3:
-                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                         break;
                     case 4:
-                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                         break;
                     default:
-                        k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                 }
             } else {
                 switch (nhave) {
                     case 2:
-                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                         break;
                     case 3:
-                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                         break;
                     case 4:
-                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                         break;
                     default:
-                        k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                 }
             }
         }
@@ -503,7 +513,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         for (int ii = 0; ii < nhave; ++ii) {
             int i = idx[ii];
             ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
         }
         //printf("Recorded events again\n");
         for (int ii = 0; ii < nhave; ++ii) {
@@ -512,7 +522,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             for (int jj = 0; jj < nhave; ++jj) {
                 if (jj == ii) continue;
                 int j = idx[jj];
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
             }
         }
         ggml_cuda_set_device(ctx.device);
@@ -536,11 +546,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
         if (i == ctx.device) continue;
         ggml_cuda_set_device(i);
-        CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
-        if (!info.all_ctx[i]->copy_event) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
+        CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, all_ctx[i]->stream()));
+        if (!all_ctx[i]->copy_event) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
         }
-        CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+        CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
         ptr += nbytes;
     }
     auto nelem = ggml_nelements(dst);
@@ -550,7 +560,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     for (int ii = 0; ii < nhave; ++ii) {
         int i = idx[ii];
         if (i == ctx.device) continue;
-        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
+        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
         if (dst->type == GGML_TYPE_F16) {
             k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->data);
         } else if (dst->type == GGML_TYPE_BF16) {
@@ -572,15 +582,15 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
         int i = idx[ii];
         if (i == ctx.device) continue;
         ggml_cuda_set_device(i);
-        CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
-        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
-        CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+        CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), ctx.copy_event, 0));
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, all_ctx[i]->stream()));
+        CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
     }
     ggml_cuda_set_device(ctx.device);
     for (int ii = 0; ii < nhave; ++ii) {
         int i = idx[ii];
         if (i == ctx.device) continue;
-        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
+        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
     }
     if (ncopy > 0) {
         copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
diff --git a/src/graphs/build_gemma4.cpp b/src/graphs/build_gemma4.cpp
index 3a0bb3fc..145a30b1 100644
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@@ -591,10 +591,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
 
     ggml_tensor * KQ_mask = nullptr;
     ggml_tensor * KQ_mask_swa = nullptr;
-    ggml_tensor * frozen_k_swa = nullptr;
-    ggml_tensor * frozen_v_swa = nullptr;
-    ggml_tensor * frozen_k_full = nullptr;
-    ggml_tensor * frozen_v_full = nullptr;
     {
         const int64_t n_mask_tokens = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
         lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32, target_n_kv, n_mask_tokens);
@@ -610,59 +606,248 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
         }
     }
 
-    for (int il = 0; il < n_layer; ++il) {
+    if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
+        int n_device = model.splits.size();
+        std::vector<ggml_tensor *> sa_inp(n_device, nullptr);
+        std::vector<ggml_tensor *> sa_out(n_device, nullptr);
+        std::vector<ggml_tensor *> ffn_inp(n_device, nullptr);
+        std::vector<ggml_tensor *> ffn_out(n_device, nullptr);
         ggml_tensor * inpL = cur;
 
-        const bool  is_sliding   = hparams.swa_layers[il] ? true : false;
-        const float freq_base_l  = is_sliding ? target_hparams.rope_freq_base_train_swa  : target_cparams.rope_freq_base;
-        const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
-        const int   n_rot_l      = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
-        const int   n_swa        = is_sliding ? target_hparams.n_swa : 0;
-        const int   n_embd_head  = hparams.n_embd_head_k(il);
-        const int   n_head       = hparams.n_head(il);
-        ggml_tensor * KQ_mask_l  = is_sliding ? KQ_mask_swa : KQ_mask;
+        for (int il = 0; il < n_layer; ++il) {
 
-        cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_norm", il);
+            const bool  is_sliding   = hparams.swa_layers[il] ? true : false;
+            const float freq_base_l  = is_sliding ? target_hparams.rope_freq_base_train_swa  : target_cparams.rope_freq_base;
+            const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
+            const int   n_rot_l      = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
+            const int   n_swa        = is_sliding ? target_hparams.n_swa : 0;
+            const int   n_embd_head  = hparams.n_embd_head_k(il);
+            ggml_tensor * KQ_mask_l  = is_sliding ? KQ_mask_swa : KQ_mask;
 
-        ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-        cb(Qcur, "Qcur", il);
-        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-        Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
-        cb(Qcur, "Qcur_normed", il);
-        auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
-        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow);
-        cb(Qcur, "Qcur_rope", il);
+            const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
 
-        const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
-        ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
-        ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
-        gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
-        cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
-            nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
-            &frozen_k, &frozen_v);
+            auto split_kl = (const ggml_split_tensor_t *)target_kv.k_l[target_il]->extra;
+            auto split_vl = (const ggml_split_tensor_t *)target_kv.v_l[target_il]->extra;
+            GGML_ASSERT(split_kl && split_vl);
+            auto split_ql = (const ggml_split_tensor_t *)model.layers[il].wq->extra;
+            auto split_ol = (const ggml_split_tensor_t *)model.layers[il].wo->extra;
+            GGML_ASSERT(split_ql && split_ol);
+            GGML_ASSERT(split_ql->n_device == n_device && split_kl->n_device == n_device && split_vl->n_device == n_device && split_ol->n_device == n_device);
+            ggml_tensor * sa_last = nullptr;
+            int nhave = 0;
+            for (int id = 0; id < n_device; ++id) {
+                GGML_ASSERT((split_kl->splits[id] && split_vl->splits[id] && split_ql->splits[id] && split_ol->splits[id]) ||
+                           !(split_kl->splits[id] || split_vl->splits[id] || split_ql->splits[id] || split_ol->splits[id]));
+                if (!split_kl->splits[id]) {
+                    sa_inp[id] = sa_out[id] = nullptr;
+                    continue;
+                }
 
-        cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_post_norm", il);
-        cur = ggml_add(ctx0, cur, inpL);
-        cb(cur, "attn_out", il);
+                int il_cb = 1000*(il + 1) + id;
 
-        ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
-                model.layers[il].ffn_up,   nullptr, nullptr,
-                model.layers[il].ffn_gate, nullptr, nullptr,
-                model.layers[il].ffn_down, nullptr, nullptr,
-                nullptr,
-                LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
-        cb(ffn, "ffn_out", il);
+                if (il == 0) {
+                    sa_inp[id] = inpL;
+                } else {
+                    GGML_ASSERT(inpL->op == GGML_OP_REDUCE);
+                    cur = get_input_tensor_sm_graph(ctx0, inpL, id);
+                    GGML_ASSERT(model.layers[il-1].ffn_post_norm && model.layers[il-1].ffn_post_norm->extra);
+                    cur = do_split_norm(ctx0, cur, model.layers[il-1].ffn_post_norm, hparams, cb, id, il_cb, false);
+                    cb(cur, "ffn_normed", il_cb);
+                    auto add = ffn_inp[id];
+                    if (!add) {
+                        for (int j = 0; j < n_device; ++j) {
+                            if (ffn_inp[j]) {
+                                add = ffn_inp[j]; break;
+                            }
+                        }
+                        GGML_ASSERT(add);
+                    }
+                    sa_inp[id] = ggml_add(ctx0, cur, add);
+                    cb(sa_inp[id], "sa_inp", il_cb);
+                    if (model.layers[il-1].out_scale) {
+                        auto scale = (const ggml_split_tensor_t *)model.layers[il-1].out_scale->extra;
+                        sa_inp[id] = ggml_mul(ctx0, sa_inp[id], scale->splits[id]);
+                        cb(sa_inp[id], "sa_inp_scaled", il_cb);
+                    }
+                }
+                GGML_ASSERT(model.layers[il].attn_norm && model.layers[il].attn_norm->extra);
+                cur = do_split_norm(ctx0, sa_inp[id], model.layers[il].attn_norm, hparams, cb, id, il_cb, false);
+                cb(cur, "sa_inp_normed", il_cb);
+                auto Qcur = llm_build_lora_mm(lctx, ctx0, split_ql->splits[id], cur);
+                cb(Qcur, "Qcur", il_cb);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, Qcur->ne[0]/n_embd_head, n_tokens);
+                GGML_ASSERT(model.layers[il].attn_q_norm && model.layers[il].attn_q_norm->extra);
+                Qcur = do_split_norm(ctx0, Qcur, model.layers[il].attn_q_norm, hparams, cb, id, il_cb, false);
+                cb(Qcur, "Qcur_normed", il_cb);
+                auto freq_factors = is_sliding ? nullptr : ((const ggml_split_tensor_t *)model.layers[il].rope_freqs->extra)->splits[id];
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur_rope", il_cb);
+                GGML_ASSERT(split_kl->splits[id]->ne[1] % target_kv.size == 0);
+                int n_head_kv = split_kl->splits[id]->ne[1] / target_kv.size;
+                auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                auto k = ggml_view_3d(ctx0, split_kl->splits[id], n_embd_head, target_n_kv, n_head_kv,
+                    ggml_row_size(split_kl->splits[id]->type, n_embd_head)*n_head_kv,
+                    ggml_row_size(split_kl->splits[id]->type, n_embd_head), 0);
+                auto v = ggml_view_3d(ctx0, split_vl->splits[id], n_embd_head, target_n_kv, n_head_kv,
+                    ggml_row_size(split_vl->splits[id]->type, n_embd_head)*n_head_kv,
+                    ggml_row_size(split_vl->splits[id]->type, n_embd_head), 0);
+                cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask_l, hparams.f_attention_scale, 0.0f, 0.0f);
+                cur->op_params[4] = n_swa;
+                cb(cur, "fa", il_cb);
+                cur = ggml_reshape_2d(ctx0, cur, split_ol->splits[id]->ne[0], ggml_nelements(cur)/split_ol->splits[id]->ne[0]);
+                cur = llm_build_lora_mm(lctx, ctx0, split_ol->splits[id], cur);
+                cb(cur, "qkv", il_cb);
+                ggml_build_forward_expand(gf, cur);
+                sa_out[id] = cur;
+                sa_last = cur;
+                ++nhave;
+            }
 
-        cur = ffn;
-        if (model.layers[il].out_scale) {
-            cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
-            cb(cur, "out_scaled", il);
+            auto last_ffn_inp = nhave > 1 ? ggml_reduce(ctx0, sa_out.data(), n_device, GGML_OP_ADD) : sa_last;
+            ggml_build_forward_expand(gf, last_ffn_inp);
+            cb(last_ffn_inp, "sa_reduce", il);
+
+            auto ffn_up   = (const ggml_split_tensor_t *)model.layers[il].ffn_up->extra;
+            auto ffn_gate = (const ggml_split_tensor_t *)model.layers[il].ffn_gate->extra;
+            auto ffn_down = (const ggml_split_tensor_t *)model.layers[il].ffn_down->extra;
+            GGML_ASSERT(ffn_up && ffn_gate && ffn_down);
+
+            for (int id = 0; id < n_device; ++id) {
+                GGML_ASSERT((ffn_up->splits[id] && ffn_gate->splits[id] && ffn_down->splits[id]) ||
+                        (!ffn_up->splits[id] && !ffn_gate->splits[id] && !ffn_down->splits[id]));
+                if (!ffn_up->splits[id]) {
+                    ffn_inp[id] = ffn_out[id] = nullptr;
+                    continue;
+                }
+
+                GGML_ASSERT(last_ffn_inp && (nhave == 1 || last_ffn_inp->op == GGML_OP_REDUCE));
+
+                int il_cb = 1000*(il + 1) + id;
+
+                cur = get_input_tensor_sm_graph(ctx0, last_ffn_inp, id);
+                cur = do_split_norm(ctx0, cur, model.layers[il].attn_post_norm, hparams, cb, id, il_cb, false);
+                cb(cur, "sa_post", il_cb);
+                auto add = sa_inp[id];
+                if (!add) {
+                    for (int j = 0; j < n_device; ++j) {
+                        if (sa_inp[j]) {
+                            add = sa_inp[j]; break;
+                        }
+                    }
+                }
+                ffn_inp[id] = ggml_add(ctx0, cur, add);
+                cb(ffn_inp[id], "ffn_inp", il_cb);
+                cur = do_split_norm(ctx0, ffn_inp[id], model.layers[il].ffn_norm, hparams, cb, id, il_cb, false);
+                cb(cur, "ffn_inp_normed", il_cb);
+                cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
+                        ffn_up->splits[id], nullptr, nullptr,
+                        ffn_gate->splits[id], nullptr, nullptr,
+                        ffn_down->splits[id], nullptr, nullptr,
+                        nullptr,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, false, false, nullptr, nullptr);
+                cb(cur, "ffn", il_cb);
+                ggml_build_forward_expand(gf, cur);
+                ffn_out[id] = cur;
+
+            }
+
+            inpL = ggml_reduce(ctx0, ffn_out.data(), n_device, GGML_OP_ADD);
+            cb(inpL, "ffn_reduce", il);
+            ggml_build_forward_expand(gf, inpL);
+        }
+
+        int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
+        int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
+        if (idx_out >= 0) idx = idx_out;
+        cur = inpL->src[idx];
+        if (!cur) {
+            cur = inpL->view_src;
+        }
+
+        auto post_norm   = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].ffn_post_norm->extra;
+        cur = llm_build_norm(ctx0, cur, hparams, post_norm->splits[idx], NULL, LLM_NORM_RMS, cb, -1);
+
+        cb(cur, "ffn_normed", hparams.n_layer-1);
+        auto add = ffn_inp[idx];
+        if (!add) {
+            for (int j = 0; j < n_device; ++j) {
+                if (ffn_inp[j]) {
+                    add = ffn_inp[j]; break;
+                }
+            }
+        }
+        cur = ggml_add(ctx0, cur, add);
+        cb(cur, "ffn_out", hparams.n_layer-1);
+
+        if (model.layers[hparams.n_layer-1].out_scale) {
+            auto scale = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].out_scale->extra;
+            cur = ggml_mul(ctx0, cur, scale->splits[idx]);
+            cb(cur, "ffn_out_scaled", hparams.n_layer-1);
+        }
+
+    } else {
+
+        ggml_tensor * frozen_k_swa = nullptr;
+        ggml_tensor * frozen_v_swa = nullptr;
+        ggml_tensor * frozen_k_full = nullptr;
+        ggml_tensor * frozen_v_full = nullptr;
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpL = cur;
+
+            const bool  is_sliding   = hparams.swa_layers[il] ? true : false;
+            const float freq_base_l  = is_sliding ? target_hparams.rope_freq_base_train_swa  : target_cparams.rope_freq_base;
+            const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
+            const int   n_rot_l      = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
+            const int   n_swa        = is_sliding ? target_hparams.n_swa : 0;
+            const int   n_embd_head  = hparams.n_embd_head_k(il);
+            const int   n_head       = hparams.n_head(il);
+            ggml_tensor * KQ_mask_l  = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
+
+            cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(Qcur, "Qcur_normed", il);
+            auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(Qcur, "Qcur_rope", il);
+
+            ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
+            ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
+            gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
+            cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
+                    nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
+                    &frozen_k, &frozen_v);
+
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "attn_out", il);
+
+            ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
+                    model.layers[il].ffn_up,   nullptr, nullptr,
+                    model.layers[il].ffn_gate, nullptr, nullptr,
+                    model.layers[il].ffn_down, nullptr, nullptr,
+                    nullptr,
+                    LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
+            cb(ffn, "ffn_out", il);
+
+            cur = ffn;
+            if (model.layers[il].out_scale) {
+                cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
+                cb(cur, "out_scaled", il);
+            }
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
         }
-        cur = lctx.cvec.apply_to(ctx0, cur, il);
-        cb(cur, "l_out", il);
     }
 
     cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, nullptr, LLM_NORM_RMS, cb, -1);
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 741c11df..df6fc741 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -233,3 +233,5 @@ struct llama_split_tensor {
 
 void  llama_decode_reset();
 void  llama_decode_stop();
+
+std::vector<llama_model *> & llama_all_loaded_models();
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 33ca9553..60bdd666 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -194,6 +194,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
     ggml_context * ctx_output;
     ggml_context * ctx_output_split;
 
+    llama_model  * tgt_model = nullptr;
+
     ggml_backend_buffer_type_t default_cpu_buft;
     bool has_buft_overrides = false;
 
@@ -221,6 +223,21 @@ struct create_tensors_helper : public create_tensors_helper_interface {
 
 create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) {
 
+    if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
+        auto & all_models = llama_all_loaded_models();
+        for (auto model : all_models) {
+            if (model->arch == LLM_ARCH_GEMMA4) {
+                tgt_model = model;
+            }
+        }
+        if (tgt_model) {
+            LLAMA_LOG_INFO("==================== Found target model for Gemma4-Assistant. split mode graph: %d\n", model.split_mode == LLAMA_SPLIT_MODE_GRAPH);
+        } else {
+            LLAMA_LOG_INFO("==================== Did not find target model for Gemma4-Assistant\n");
+            model.split_mode = LLAMA_SPLIT_MODE_LAYER;
+        }
+    }
+
     const int n_layer = model.hparams.n_layer;
     buft_layer_count[model.buft_input.buft]++;
     buft_layer_count[model.buft_input.buft_matrix]++;
@@ -2242,7 +2259,7 @@ bool create_tensors_helper::create_gemma4_mtp_tensors(const LLM_TN & tn) {
         const int64_t n_ff_cur    = hparams.n_ff(i);
 
         if (!hparams.swa_layers[i]) {
-            layer.rope_freqs = create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
+            layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
                     llama_model_loader::TENSOR_NOT_REQUIRED | rope_flag);
             rope_flag = llama_model_loader::TENSOR_DUPLICATED;
         }
@@ -4540,7 +4557,7 @@ bool create_tensors_helper::create_tensors() {
 
     {
         const bool unsupported =
-            (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
+            //(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
             (model.arch == LLM_ARCH_GEMMA4 && model.tok_embd_per_layer);
         if (unsupported && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) {
             LLAMA_LOG_WARN("\n=========================================================\n");
@@ -4583,6 +4600,12 @@ bool create_tensors_helper::create_tensors() {
         if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
             gpu_split_count.resize(model.splits.size(), 0.0f);
         }
+        auto is_gemma4_model = [this] () {
+            return model.arch == LLM_ARCH_GEMMA4 || model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
+        };
+        auto is_gemma4_assistant = [this] () {
+            return model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
+        };
         for (int il = 0; il < n_layer; ++il) {
             // For now only run MTP into the per-layer
             if (model.mtp && hparams.nextn_predict_layers > 0 &&
@@ -4620,7 +4643,7 @@ bool create_tensors_helper::create_tensors() {
             if (layer.attn_norm) {
                 prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, mirror, mem_used);
             }
-            if (model.arch == LLM_ARCH_GEMMA4 && layer.attn_post_norm) {
+            if (is_gemma4_model() && layer.attn_post_norm) {
                 prepare_split_tensors(-1, ctx_split, layer.attn_post_norm, layer.split_attn_post_norm, mirror, mem_used);
             }
             if (layer.rope_freqs) {
@@ -4630,6 +4653,48 @@ bool create_tensors_helper::create_tensors() {
             if (hparams.is_recurrent(il)) {
                 split_recurrent_tensors(hparams, layer, cur_splits, mem_used, ctx_split, il); //, model.arch == LLM_ARCH_QWEN3NEXT ? 0 : 1);
             }
+            else if (is_gemma4_assistant()) {
+                GGML_ASSERT(layer.wo && layer.wq);
+                GGML_ASSERT(tgt_model);
+                int n_embd_head = hparams.n_embd_head_k(il);
+                int n_head      = hparams.n_head(il);
+                bool is_sliding = hparams.swa_layers[il] != 0;
+                int target_n_kv_layer = tgt_model->hparams.n_layer_kv_from_start > 0
+                                      ? std::min<int>((int) tgt_model->hparams.n_layer, tgt_model->hparams.n_layer_kv_from_start)
+                                      : (int) tgt_model->hparams.n_layer;
+                int target_il = target_n_kv_layer - 1;
+                for (; target_il >= 0; --target_il) {
+                    if ((tgt_model->hparams.swa_layers[target_il] != 0) == is_sliding) break;
+                }
+                GGML_ASSERT(target_il >= 0 && "Gemma4 MTP could not find a matching target KV layer");
+                int n_head_tgt  = tgt_model->hparams.n_head(target_il);
+                GGML_ASSERT(tgt_model->hparams.n_embd_head_k(target_il) == n_embd_head);
+                auto & target_layer = tgt_model->layers[target_il];
+                auto split_wq = (const ggml_split_tensor_t *)target_layer.wq->extra;
+                auto split_wo = (const ggml_split_tensor_t *)target_layer.wo->extra;
+                GGML_ASSERT(split_wq && split_wo);
+                std::vector<int> q_split(split_wq->n_device, 0);
+                std::vector<int> o_split(split_wo->n_device, 0);
+                for (int id = 0; id < split_wq->n_device; ++id) {
+                    if (split_wq->splits[id]) {
+                        int nh = split_wq->splits[id]->ne[1] / n_embd_head;
+                        GGML_ASSERT((nh*n_head) % n_head_tgt == 0);
+                        q_split[id] = ((nh*n_head)/n_head_tgt)*n_embd_head;
+                    }
+                }
+                for (int id = 0; id < split_wo->n_device; ++id) {
+                    if (split_wo->splits[id]) {
+                        int64_t no = split_wo->splits[id]->ne[0] * layer.wo->ne[0];
+                        GGML_ASSERT(no % target_layer.wo->ne[0] == 0);
+                        o_split[id] = no / target_layer.wo->ne[0];
+                    }
+                }
+                prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, q_split, mem_used);
+                prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, o_split, mem_used);
+                if (layer.attn_q_norm) {
+                    prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, o_split, mem_used);
+                }
+            }
             else if (layer.wo && layer.wq && layer.wk && (layer.wv || model.arch == LLM_ARCH_GEMMA4)) {
                 auto granularity_kq = hparams.n_embd_head_k(il) * gqa_ratio;
                 int wq_ne1 = layer.wq->ne[1];
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index b372a678..180669d6 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -1053,6 +1053,7 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
 // Returns false if cancelled by progress_callback
 bool llama_model_loader::load_all_data(
             struct ggml_context * ctx,
+            [[maybe_unused]] llama_model * model,
             llama_buf_map & bufs_mmap,
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
@@ -1083,7 +1084,7 @@ bool llama_model_loader::load_all_data(
             for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
                 auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
                 if (buffer_type == cuda_buffer_type) {
-                    cuda_backend = ggml_backend_cuda_init(i, nullptr);
+                    cuda_backend = ggml_backend_cuda_init(i, nullptr, model);
                     break;
                 }
             }
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index f8c09c0e..01b47ebf 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -184,6 +184,7 @@ struct llama_model_loader {
     // Returns false if cancelled by progress_callback
     bool load_all_data(
             struct ggml_context * ctx,
+            struct llama_model * model,
             llama_buf_map & bufs_mmap,
             llama_mlocks * lmlocks,
             llama_progress_callback progress_callback,
diff --git a/src/llama-reload.cpp b/src/llama-reload.cpp
index 5dc163aa..c44f208e 100644
--- a/src/llama-reload.cpp
+++ b/src/llama-reload.cpp
@@ -939,7 +939,7 @@ bool reload_info::reload_changed_tensors(llama_model & model) {
 
     if (r) {
 #ifdef GGML_USE_CUDA
-        ggml_backend_cuda_invalidate_graphs();
+        ggml_backend_cuda_invalidate_graphs(&model);
 #endif
     }
     return r;
diff --git a/src/llama.cpp b/src/llama.cpp
index fba96ccb..308f296f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -682,6 +682,11 @@ bool llama_context::update_cache_copies() {
     return true;
 }
 
+static std::vector<llama_context *> & llama_all_contexts() {
+    static std::vector<llama_context *> contexts;
+    return contexts;
+}
+
 llama_context::llama_context(const llama_model & model)
     : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
     const auto & hparams = model.hparams;
@@ -690,6 +695,7 @@ llama_context::llama_context(const llama_model & model)
     } else {
         cache_copies.resize(2*hparams.n_layer);
     }
+    llama_all_contexts().push_back(this);
 }
 
 void llama_context::set_mtp_op_type(llama_mtp_op_type value) {
@@ -710,6 +716,14 @@ llama_context::~llama_context() {
     }
 
     ggml_backend_buffer_free(buf_output);
+
+    auto & all_contexts = llama_all_contexts();
+    for (auto it = all_contexts.begin(); it != all_contexts.end(); ++it) {
+        if (*it == this) {
+            all_contexts.erase(it);
+            break;
+        }
+    }
 }
 
 int llama_context::max_nodes(int n_tokens, int n_kv) const {
@@ -3093,6 +3107,8 @@ static bool is_model_split_supported(const llama_model & model) {
         LLM_ARCH_QWEN35,
         LLM_ARCH_QWEN35MOE,
         LLM_ARCH_GEMMA4,
+        LLM_ARCH_GEMMA4_MTP,
+        LLM_ARCH_GEMMA4_ASSISTANT,
         LLM_ARCH_DEEPSEEK2,
         LLM_ARCH_GLM_DSA,
         LLM_ARCH_MISTRAL4,
@@ -3371,11 +3387,21 @@ static bool llm_load_tensors(
 
     auto & hparams = model.hparams;
 
+    if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
+        auto & all_models = llama_all_loaded_models();
+        llama_model * tgt_model = nullptr;
+        for (auto model : all_models) {
+            if (model->arch == LLM_ARCH_GEMMA4) {
+                tgt_model = model;
+            }
+        }
+        if (tgt_model) {
+            split_mode = tgt_model->split_mode;
+        }
+    }
+
     if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) {
-        const bool unsupported_gemma_split =
-            model.arch == LLM_ARCH_GEMMA4_MTP ||
-            model.arch == LLM_ARCH_GEMMA4_ASSISTANT ||
-            (model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);
+        const bool unsupported_gemma_split = model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0;
 
         if (unsupported_gemma_split) {
             LLAMA_LOG_WARN("\n=========================================================\n");
@@ -3401,6 +3427,25 @@ static bool llm_load_tensors(
             }
         }
     }
+    if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) &&
+        (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT)) {
+        auto & all_models = llama_all_loaded_models();
+        bool has_target_gemma = false;
+        for (auto model : all_models) {
+            if (model->arch == LLM_ARCH_GEMMA4) {
+                has_target_gemma = true;
+                break;
+            }
+        }
+        if (!has_target_gemma) {
+            LLAMA_LOG_WARN("\n=======================================================\n");
+            LLAMA_LOG_WARN("Split mode 'graph' requested for Gemma4-assistant model\n");
+            LLAMA_LOG_WARN("but no loaded Gemma4 model found.\n");
+            LLAMA_LOG_WARN("  => changing split mode to 'layer'\n");
+            LLAMA_LOG_WARN("=======================================================\n\n");
+            split_mode = LLAMA_SPLIT_MODE_LAYER;
+        }
+    }
 
     if (iqk_has_fancy_simd()) {
         LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is defined\n");
@@ -3971,7 +4016,7 @@ static bool llm_load_tensors(
         for (auto & it : ctx_bufs) {
             ggml_context * ctx = it.first;
             auto & bufs = it.second;
-            if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+            if (!ml.load_all_data(ctx, &model, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
                 return false;
             }
         }
@@ -6722,10 +6767,19 @@ struct llama_model * llama_model_load_from_file(
         return nullptr;
     }
 
+    llama_all_loaded_models().push_back(model);
+
     return model;
 }
 
 void llama_free_model(struct llama_model * model) {
+    auto & all_models = llama_all_loaded_models();
+    for (auto it = all_models.begin(); it != all_models.end(); ++it) {
+        if (*it == model) {
+            all_models.erase(it);
+            break;
+        }
+    }
     delete model;
 }
 
@@ -7094,7 +7148,7 @@ struct llama_context * llama_init_from_model(
         // main_gpu is a local index into model->devices throughout the codebase
         // (auto-fit assigns device_count-1, MTP clamps to [0, device_count), buffer-type
         // setup wraps with model.devices[main_gpu]). Translate to a raw device id here.
-        const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
+        [[maybe_unused]] const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
             ? model->devices[model->main_gpu]
             : model->main_gpu;
 #if defined(GGML_USE_METAL)
@@ -7110,7 +7164,7 @@ struct llama_context * llama_init_from_model(
 #elif defined(GGML_USE_CUDA)
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
             // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
-            ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params);
+            ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params, ctx);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, main_gpu_id);
                 llama_free(ctx);
@@ -7129,7 +7183,7 @@ struct llama_context * llama_init_from_model(
                 params = new_params.data();
             }
             for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cuda_init(device, params);
+                ggml_backend_t backend = ggml_backend_cuda_init(device, params, ctx);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
                     llama_free(ctx);
@@ -11099,3 +11153,8 @@ bool llama_reload_changed_tensors(struct llama_context * ctx) {
     }
     return result;
 }
+
+std::vector<llama_model *> & llama_all_loaded_models() {
+    static std::vector<llama_model *> models;
+    return models;
+}