Split mode graph for dense Gemma4 assistant (#2022)

* WIP: Split mode graph for Gemma4 assistant Something is not right - acceptance drops to nearly zero. * Per model CUDA contexts Still not working!? * This works The issue was that I was not correctly calculating the number of KV heads for the split KV cache. * Compiler warnings * It is better to use llama_context pointers as keys
2026-06-28 04:30:15 -05:00 · 2026-06-24 18:29:32 +02:00 · 2026-06-24 18:29:32 +02:00 · d5507e33ae
commit d5507e33ae
parent bf23a7599c
12 changed files with 483 additions and 160 deletions
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@ -66,7 +66,7 @@ struct pca_model {
    pca_model(struct ggml_tensor * t_input) {
 #ifdef GGML_USE_CUDA
        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        backend = ggml_backend_cuda_init(0, nullptr); // init device 0
+        backend = ggml_backend_cuda_init(0, nullptr, nullptr); // init device 0
        if (!backend) {
            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
        }
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@ -21,7 +21,7 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16

 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params, const void * model);

 GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

@ -43,7 +43,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

 GGML_API GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);

-GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(void);
+GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(const void * model);
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@ -298,13 +298,22 @@ const ggml_cuda_device_info & ggml_cuda_info() {
 }

 /* ---------- hot-swap: invalidate all cached CUDA graphs ---------- */
-extern "C" void ggml_backend_cuda_invalidate_graphs(void) {
+extern "C" void ggml_backend_cuda_invalidate_graphs(const void * model) {
    auto & info = const_cast<ggml_cuda_device_info &>(ggml_cuda_info());
-    for (int i = 0; i < info.device_count; ++i) {
-        if (info.all_ctx[i]) {
-            info.all_ctx[i]->cuda_graphs.clear();
+    if (auto it = info.all_ctx.find(model); it != info.all_ctx.end()) {
+        for (auto ctx : it->second) {
+            if (ctx) {
+                ctx->cuda_graphs.clear();
+            }
        }
+    } else {
+        fprintf(stderr, "================================= %s: did not find entry for model at %p\n", __func__, model);
    }
+    //for (int i = 0; i < info.device_count; ++i) {
+    //    if (info.all_ctx[i]) {
+    //        info.all_ctx[i]->cuda_graphs.clear();
+    //    }
+    //}
 }

 // #define DEBUG_CUDA_MALLOC
@ -517,13 +526,14 @@ static std::condition_variable ggml_cuda_lock_cv;
 //static std::atomic<int> ggml_cuda_lock_counter;
 static int ggml_cuda_lock_counter = 0;

-ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
-    device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
+ggml_backend_cuda_context::ggml_backend_cuda_context(int device, const void * model) :
+    device(device), name(GGML_CUDA_NAME + std::to_string(device)), model(model) {
    auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
-    if (info->all_ctx[device]) {
+    auto & all_ctx = info->all_ctx[model];
+    if (all_ctx[device]) {
        GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
    } else{
-        info->all_ctx[device] = this;
+        all_ctx[device] = this;
    }
 }

@ -555,9 +565,12 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
        }
    }
    auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
-    if (info->all_ctx[device] == this) {
-        info->all_ctx[device] = nullptr;
+    if (auto it = info->all_ctx.find(model); it != info->all_ctx.end() && it->second[device] == this) {
+        it->second[device] = nullptr;
    }
+    //if (info->all_ctx[device] == this) {
+    //    info->all_ctx[device] = nullptr;
+    //}

 }

@ -4247,23 +4260,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
                needs_f16_f32_copy = true;

            } else {
-#ifdef GGML_USE_NCCL__
-                auto & info = ggml_cuda_info();
-                auto nbytes = ggml_nbytes(src);
-                ncclGroupStart();
-                ggml_cuda_set_device(cuda_ctx_src->device);
-                auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device],
-                        info.all_ctx[cuda_ctx_src->device]->stream());
-                ggml_cuda_set_device(cuda_ctx_dst->device);
-                auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device],
-                        info.all_ctx[cuda_ctx_dst->device]->stream());
-                ncclGroupEnd();
-                GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess);
-                return true;
-#else
                ggml_cuda_set_device(cuda_ctx_src->device);
                CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
-#endif
            }
 #endif
        }
@ -5249,13 +5247,13 @@ static cuda_params ggml_cuda_parse_params(const char * params_string) {
    return params;
 }

-GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string) {
+GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string, const void * model) {
    if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
        return nullptr;
    }

-    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
+    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device, model);
    if (ctx == nullptr) {
        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
        return nullptr;
@ -5370,7 +5368,7 @@ GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {

 // backend registry
 GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr);
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr, nullptr);
    return cuda_backend;

    GGML_UNUSED(params);
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@ -762,7 +762,8 @@ struct ggml_cuda_device_info {

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};

-    ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
+    std::unordered_map<const void *, std::array<ggml_backend_cuda_context *, GGML_CUDA_MAX_DEVICES>> all_ctx;
+    //ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
 #ifdef GGML_USE_NCCL
    ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
    bool have_nccl;
@ -864,10 +865,11 @@ struct ggml_backend_cuda_context {

 #endif

+    const void * model;
    void * copy_buffer = nullptr;
    size_t copy_size   = 0;

-    explicit ggml_backend_cuda_context(int device);
+    explicit ggml_backend_cuda_context(int device, const void * model);

    ~ggml_backend_cuda_context();

--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@ -94,6 +94,11 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
    if (ncopy < 1) return;

    auto & info = ggml_cuda_info();
+    auto it = info.all_ctx.find(ctx.model);
+    if (it == info.all_ctx.end()) {
+        GGML_ABORT("Fatal error");
+    }
+    auto & all_ctx = it->second;
    auto size = ggml_nbytes(dst);
    int isrc = 0;
    for (int ii = 0; ii < ncopy; ++ii) {
@ -102,9 +107,9 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
        isrc = (isrc + 1)%nhave;
        //printf("%s: copying from device %d to device %d: %p -> %p\n", __func__, j, i, dst->src[j]->data, dst->src[i]->data);
        ggml_cuda_set_device(j);
-        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, info.all_ctx[i]->device, dst->src[j]->data, info.all_ctx[j]->device,
-                            size, info.all_ctx[j]->stream()));
-        CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, all_ctx[i]->device, dst->src[j]->data, all_ctx[j]->device,
+                            size, all_ctx[j]->stream()));
+        CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
    }
    isrc = 0;
    for (int ii = 0; ii < ncopy; ++ii) {
@ -112,7 +117,7 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
        int j = idx[isrc];
        isrc = (isrc + 1)%nhave;
        ggml_cuda_set_device(i);
-        CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event, 0));
+        CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event, 0));
    }
    ggml_cuda_set_device(ctx.device);
 }
@ -133,6 +138,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
    }

    auto & info = ggml_cuda_info();
+    auto it = info.all_ctx.find(ctx.model);
+    if (it == info.all_ctx.end()) {
+        GGML_ABORT("Fatal error");
+    }
+    auto & all_ctx = it->second;
 #ifdef GGML_USE_NCCL
    // Somehow I'm not able to figure out how to use NCCL correctly.
    // It does not work at all if not all GPUs participate in the reduce op, and we
@ -153,7 +163,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            ggml_cuda_set_device(i);
            auto status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
                    dst->src[i] ? dst->src[i]->data : nullptr,
-                    ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], info.all_ctx[i]->stream());
+                    ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], all_ctx[i]->stream());
            if (status != ncclSuccess) {
                fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
                GGML_ABORT("Fatal error");
@ -275,7 +285,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
        auto size_per_device  = nblocks_per_device * tt.type_size;
        for (int ii = 0; ii < nhave; ++ii) {
            int i = idx[ii];
-            auto this_ctx = info.all_ctx[i];
+            auto this_ctx = all_ctx[i];
            if (!this_ctx->copy_event || !this_ctx->compute_event || size_per_device > this_ctx->copy_size) {
                ggml_cuda_set_device(this_ctx->device);
                if (!this_ctx->copy_event) {
@ -300,14 +310,14 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                int peer = idx[(ii+1)%nhave];
                auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
                auto this_size  = (this_nelem / tt.blck_size) * tt.type_size;
-                ggml_cuda_set_device(info.all_ctx[peer]->device);
+                ggml_cuda_set_device(all_ctx[peer]->device);
                if (stage > 0) {
-                    CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
+                    CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
                }
-                CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device,
-                            (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
-                            this_size, info.all_ctx[peer]->stream()));
-                CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaMemcpyPeerAsync(all_ctx[i]->copy_buffer, all_ctx[i]->device,
+                            (const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
+                            this_size, all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
                ichunk = (ichunk + 1)%nhave;
            }
            ichunk = stage;
@ -315,24 +325,24 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                int i = idx[ii];
                int peer = idx[(ii+1)%nhave];
                auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
-                ggml_cuda_set_device(info.all_ctx[i]->device);
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+                ggml_cuda_set_device(all_ctx[i]->device);
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
                int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
                if (dst->type == GGML_TYPE_F16) {
-                    k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
-                            (const half *)info.all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
+                    k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
+                            (const half *)all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
                } else if (dst->type == GGML_TYPE_Q8_0) {
-                    k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
-                            (const block_q8_0 *)info.all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
+                    k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
+                            (const block_q8_0 *)all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
                } else if (dst->type == GGML_TYPE_BF16) {
-                    k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(
-                            this_nelem, (const nv_bfloat16 *)info.all_ctx[i]->copy_buffer,
+                    k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(
+                            this_nelem, (const nv_bfloat16 *)all_ctx[i]->copy_buffer,
                            (nv_bfloat16 *)dst->src[i]->data + ichunk*nelem_per_device);
                } else {
-                    k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
-                            (const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
+                    k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
+                            (const float *)all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
                }
-                CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->compute_event, info.all_ctx[i]->stream()));
+                CUDA_CHECK(cudaEventRecord(all_ctx[i]->compute_event, all_ctx[i]->stream()));
                ichunk = (ichunk + 1)%nhave;
            }
        }
@ -343,21 +353,21 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
                int peer = idx[(ii+1)%nhave];
                auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
                auto this_size  = (this_nelem / tt.blck_size) * tt.type_size;
-                ggml_cuda_set_device(info.all_ctx[peer]->device);
+                ggml_cuda_set_device(all_ctx[peer]->device);
                if (stage == 0) {
-                    CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
+                    CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
                }
-                CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, info.all_ctx[i]->device,
-                            (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
-                            this_size, info.all_ctx[peer]->stream()));
-                CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, all_ctx[i]->device,
+                            (const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
+                            this_size, all_ctx[peer]->stream()));
+                CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
                ichunk = (ichunk + 1)%nhave;
            }
            for (int ii = 0; ii < nhave; ++ii) {
                int i = idx[ii];
                int peer = idx[(ii+1)%nhave];
-                ggml_cuda_set_device(info.all_ctx[i]->device);
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+                ggml_cuda_set_device(all_ctx[i]->device);
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
            }
        }
        ggml_cuda_set_device(ctx.device);
@ -372,8 +382,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            GGML_ASSERT(dst->src[i]->type == dst->type);
            GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
            ggml_cuda_set_device(i);
-            if (!info.all_ctx[i]->copy_event) {
-                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
+            if (!all_ctx[i]->copy_event) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
            }
        }
        auto nelem = ggml_nelements(dst);
@ -386,20 +396,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            task.ptrs[0] = (char *)dst->src[i]->data;
            int j = idx[2*ii+1];
            ggml_cuda_set_device(j);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
            task.ptrs[1] = (char *)dst->src[j]->data;
            ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+            CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
            if (dst->type == GGML_TYPE_F16) {
-                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
            } else {
-                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
            }
        }
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[2*ii+0];
            ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
        }
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[2*ii+1];
@ -411,23 +421,23 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            int j = idx[(2*ii+2)%nhave];
            task.ptrs[1] = (char *)dst->src[j]->data;
            ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+            CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
            if (dst->type == GGML_TYPE_F16) {
-                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
            } else {
-                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
            }
        }
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[2*ii+1];
            ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
        }
        for (int ii = 0; ii < nhave/2; ++ii) {
            int i = idx[(2*ii+2)%nhave];
            ggml_cuda_set_device(i);
            int j = idx[2*ii+1];
-            CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+            CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
        }
        ggml_cuda_set_device(ctx.device);
        if (ncopy > 0) {
@ -442,10 +452,10 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            GGML_ASSERT(dst->src[i]->type == dst->type);
            GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
            ggml_cuda_set_device(i);
-            if (!info.all_ctx[i]->copy_event) {
-                CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
+            if (!all_ctx[i]->copy_event) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
            }
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
        }
        //printf("Recorded events\n");
        auto nelem = ggml_nelements(dst);
@ -465,37 +475,37 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            for (int jj = 0; jj < nhave; ++jj) {
                if (jj == ii) continue;
                int j = idx[jj];
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
                task.ptrs[k++] = (char *)dst->src[j]->data + ii*nelem_per_device*elem_size;
            }
            int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
            if (dst->type == GGML_TYPE_F16) {
                switch (nhave) {
                    case 2:
-                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                        break;
                    case 3:
-                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                        break;
                    case 4:
-                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                        break;
                    default:
-                        k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                }
            } else {
                switch (nhave) {
                    case 2:
-                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                        break;
                    case 3:
-                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                        break;
                    case 4:
-                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                        break;
                    default:
-                        k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
+                        k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
                }
            }
        }
@ -503,7 +513,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
        for (int ii = 0; ii < nhave; ++ii) {
            int i = idx[ii];
            ggml_cuda_set_device(i);
-            CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+            CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
        }
        //printf("Recorded events again\n");
        for (int ii = 0; ii < nhave; ++ii) {
@ -512,7 +522,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
            for (int jj = 0; jj < nhave; ++jj) {
                if (jj == ii) continue;
                int j = idx[jj];
-                CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
+                CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
            }
        }
        ggml_cuda_set_device(ctx.device);
@ -536,11 +546,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
        GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
        if (i == ctx.device) continue;
        ggml_cuda_set_device(i);
-        CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
-        if (!info.all_ctx[i]->copy_event) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
+        CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, all_ctx[i]->stream()));
+        if (!all_ctx[i]->copy_event) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
        }
-        CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+        CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
        ptr += nbytes;
    }
    auto nelem = ggml_nelements(dst);
@ -550,7 +560,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
    for (int ii = 0; ii < nhave; ++ii) {
        int i = idx[ii];
        if (i == ctx.device) continue;
-        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
+        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
        if (dst->type == GGML_TYPE_F16) {
            k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->data);
        } else if (dst->type == GGML_TYPE_BF16) {
@ -572,15 +582,15 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
        int i = idx[ii];
        if (i == ctx.device) continue;
        ggml_cuda_set_device(i);
-        CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
-        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
-        CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
+        CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), ctx.copy_event, 0));
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, all_ctx[i]->stream()));
+        CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
    }
    ggml_cuda_set_device(ctx.device);
    for (int ii = 0; ii < nhave; ++ii) {
        int i = idx[ii];
        if (i == ctx.device) continue;
-        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
+        CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
    }
    if (ncopy > 0) {
        copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@ -591,10 +591,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {

    ggml_tensor * KQ_mask = nullptr;
    ggml_tensor * KQ_mask_swa = nullptr;
-    ggml_tensor * frozen_k_swa = nullptr;
-    ggml_tensor * frozen_v_swa = nullptr;
-    ggml_tensor * frozen_k_full = nullptr;
-    ggml_tensor * frozen_v_full = nullptr;
    {
        const int64_t n_mask_tokens = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
        lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32, target_n_kv, n_mask_tokens);
@ -610,59 +606,248 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
        }
    }

-    for (int il = 0; il < n_layer; ++il) {
+    if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
+        int n_device = model.splits.size();
+        std::vector<ggml_tensor *> sa_inp(n_device, nullptr);
+        std::vector<ggml_tensor *> sa_out(n_device, nullptr);
+        std::vector<ggml_tensor *> ffn_inp(n_device, nullptr);
+        std::vector<ggml_tensor *> ffn_out(n_device, nullptr);
        ggml_tensor * inpL = cur;

-        const bool  is_sliding   = hparams.swa_layers[il] ? true : false;
-        const float freq_base_l  = is_sliding ? target_hparams.rope_freq_base_train_swa  : target_cparams.rope_freq_base;
-        const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
-        const int   n_rot_l      = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
-        const int   n_swa        = is_sliding ? target_hparams.n_swa : 0;
-        const int   n_embd_head  = hparams.n_embd_head_k(il);
-        const int   n_head       = hparams.n_head(il);
-        ggml_tensor * KQ_mask_l  = is_sliding ? KQ_mask_swa : KQ_mask;
+        for (int il = 0; il < n_layer; ++il) {

-        cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_norm", il);
+            const bool  is_sliding   = hparams.swa_layers[il] ? true : false;
+            const float freq_base_l  = is_sliding ? target_hparams.rope_freq_base_train_swa  : target_cparams.rope_freq_base;
+            const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
+            const int   n_rot_l      = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
+            const int   n_swa        = is_sliding ? target_hparams.n_swa : 0;
+            const int   n_embd_head  = hparams.n_embd_head_k(il);
+            ggml_tensor * KQ_mask_l  = is_sliding ? KQ_mask_swa : KQ_mask;

-        ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-        cb(Qcur, "Qcur", il);
-        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-        Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
-        cb(Qcur, "Qcur_normed", il);
-        auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
-        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow);
-        cb(Qcur, "Qcur_rope", il);
+            const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);

-        const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
-        ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
-        ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
-        gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
-        cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
-            nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
-            &frozen_k, &frozen_v);
+            auto split_kl = (const ggml_split_tensor_t *)target_kv.k_l[target_il]->extra;
+            auto split_vl = (const ggml_split_tensor_t *)target_kv.v_l[target_il]->extra;
+            GGML_ASSERT(split_kl && split_vl);
+            auto split_ql = (const ggml_split_tensor_t *)model.layers[il].wq->extra;
+            auto split_ol = (const ggml_split_tensor_t *)model.layers[il].wo->extra;
+            GGML_ASSERT(split_ql && split_ol);
+            GGML_ASSERT(split_ql->n_device == n_device && split_kl->n_device == n_device && split_vl->n_device == n_device && split_ol->n_device == n_device);
+            ggml_tensor * sa_last = nullptr;
+            int nhave = 0;
+            for (int id = 0; id < n_device; ++id) {
+                GGML_ASSERT((split_kl->splits[id] && split_vl->splits[id] && split_ql->splits[id] && split_ol->splits[id]) ||
+                           !(split_kl->splits[id] || split_vl->splits[id] || split_ql->splits[id] || split_ol->splits[id]));
+                if (!split_kl->splits[id]) {
+                    sa_inp[id] = sa_out[id] = nullptr;
+                    continue;
+                }

-        cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
-        cb(cur, "attn_post_norm", il);
-        cur = ggml_add(ctx0, cur, inpL);
-        cb(cur, "attn_out", il);
+                int il_cb = 1000*(il + 1) + id;

-        ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
-                model.layers[il].ffn_up,   nullptr, nullptr,
-                model.layers[il].ffn_gate, nullptr, nullptr,
-                model.layers[il].ffn_down, nullptr, nullptr,
-                nullptr,
-                LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
-        cb(ffn, "ffn_out", il);
+                if (il == 0) {
+                    sa_inp[id] = inpL;
+                } else {
+                    GGML_ASSERT(inpL->op == GGML_OP_REDUCE);
+                    cur = get_input_tensor_sm_graph(ctx0, inpL, id);
+                    GGML_ASSERT(model.layers[il-1].ffn_post_norm && model.layers[il-1].ffn_post_norm->extra);
+                    cur = do_split_norm(ctx0, cur, model.layers[il-1].ffn_post_norm, hparams, cb, id, il_cb, false);
+                    cb(cur, "ffn_normed", il_cb);
+                    auto add = ffn_inp[id];
+                    if (!add) {
+                        for (int j = 0; j < n_device; ++j) {
+                            if (ffn_inp[j]) {
+                                add = ffn_inp[j]; break;
+                            }
+                        }
+                        GGML_ASSERT(add);
+                    }
+                    sa_inp[id] = ggml_add(ctx0, cur, add);
+                    cb(sa_inp[id], "sa_inp", il_cb);
+                    if (model.layers[il-1].out_scale) {
+                        auto scale = (const ggml_split_tensor_t *)model.layers[il-1].out_scale->extra;
+                        sa_inp[id] = ggml_mul(ctx0, sa_inp[id], scale->splits[id]);
+                        cb(sa_inp[id], "sa_inp_scaled", il_cb);
+                    }
+                }
+                GGML_ASSERT(model.layers[il].attn_norm && model.layers[il].attn_norm->extra);
+                cur = do_split_norm(ctx0, sa_inp[id], model.layers[il].attn_norm, hparams, cb, id, il_cb, false);
+                cb(cur, "sa_inp_normed", il_cb);
+                auto Qcur = llm_build_lora_mm(lctx, ctx0, split_ql->splits[id], cur);
+                cb(Qcur, "Qcur", il_cb);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, Qcur->ne[0]/n_embd_head, n_tokens);
+                GGML_ASSERT(model.layers[il].attn_q_norm && model.layers[il].attn_q_norm->extra);
+                Qcur = do_split_norm(ctx0, Qcur, model.layers[il].attn_q_norm, hparams, cb, id, il_cb, false);
+                cb(Qcur, "Qcur_normed", il_cb);
+                auto freq_factors = is_sliding ? nullptr : ((const ggml_split_tensor_t *)model.layers[il].rope_freqs->extra)->splits[id];
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur_rope", il_cb);
+                GGML_ASSERT(split_kl->splits[id]->ne[1] % target_kv.size == 0);
+                int n_head_kv = split_kl->splits[id]->ne[1] / target_kv.size;
+                auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                auto k = ggml_view_3d(ctx0, split_kl->splits[id], n_embd_head, target_n_kv, n_head_kv,
+                    ggml_row_size(split_kl->splits[id]->type, n_embd_head)*n_head_kv,
+                    ggml_row_size(split_kl->splits[id]->type, n_embd_head), 0);
+                auto v = ggml_view_3d(ctx0, split_vl->splits[id], n_embd_head, target_n_kv, n_head_kv,
+                    ggml_row_size(split_vl->splits[id]->type, n_embd_head)*n_head_kv,
+                    ggml_row_size(split_vl->splits[id]->type, n_embd_head), 0);
+                cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask_l, hparams.f_attention_scale, 0.0f, 0.0f);
+                cur->op_params[4] = n_swa;
+                cb(cur, "fa", il_cb);
+                cur = ggml_reshape_2d(ctx0, cur, split_ol->splits[id]->ne[0], ggml_nelements(cur)/split_ol->splits[id]->ne[0]);
+                cur = llm_build_lora_mm(lctx, ctx0, split_ol->splits[id], cur);
+                cb(cur, "qkv", il_cb);
+                ggml_build_forward_expand(gf, cur);
+                sa_out[id] = cur;
+                sa_last = cur;
+                ++nhave;
+            }

-        cur = ffn;
-        if (model.layers[il].out_scale) {
-            cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
-            cb(cur, "out_scaled", il);
+            auto last_ffn_inp = nhave > 1 ? ggml_reduce(ctx0, sa_out.data(), n_device, GGML_OP_ADD) : sa_last;
+            ggml_build_forward_expand(gf, last_ffn_inp);
+            cb(last_ffn_inp, "sa_reduce", il);
+
+            auto ffn_up   = (const ggml_split_tensor_t *)model.layers[il].ffn_up->extra;
+            auto ffn_gate = (const ggml_split_tensor_t *)model.layers[il].ffn_gate->extra;
+            auto ffn_down = (const ggml_split_tensor_t *)model.layers[il].ffn_down->extra;
+            GGML_ASSERT(ffn_up && ffn_gate && ffn_down);
+
+            for (int id = 0; id < n_device; ++id) {
+                GGML_ASSERT((ffn_up->splits[id] && ffn_gate->splits[id] && ffn_down->splits[id]) ||
+                        (!ffn_up->splits[id] && !ffn_gate->splits[id] && !ffn_down->splits[id]));
+                if (!ffn_up->splits[id]) {
+                    ffn_inp[id] = ffn_out[id] = nullptr;
+                    continue;
+                }
+
+                GGML_ASSERT(last_ffn_inp && (nhave == 1 || last_ffn_inp->op == GGML_OP_REDUCE));
+
+                int il_cb = 1000*(il + 1) + id;
+
+                cur = get_input_tensor_sm_graph(ctx0, last_ffn_inp, id);
+                cur = do_split_norm(ctx0, cur, model.layers[il].attn_post_norm, hparams, cb, id, il_cb, false);
+                cb(cur, "sa_post", il_cb);
+                auto add = sa_inp[id];
+                if (!add) {
+                    for (int j = 0; j < n_device; ++j) {
+                        if (sa_inp[j]) {
+                            add = sa_inp[j]; break;
+                        }
+                    }
+                }
+                ffn_inp[id] = ggml_add(ctx0, cur, add);
+                cb(ffn_inp[id], "ffn_inp", il_cb);
+                cur = do_split_norm(ctx0, ffn_inp[id], model.layers[il].ffn_norm, hparams, cb, id, il_cb, false);
+                cb(cur, "ffn_inp_normed", il_cb);
+                cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
+                        ffn_up->splits[id], nullptr, nullptr,
+                        ffn_gate->splits[id], nullptr, nullptr,
+                        ffn_down->splits[id], nullptr, nullptr,
+                        nullptr,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, false, false, nullptr, nullptr);
+                cb(cur, "ffn", il_cb);
+                ggml_build_forward_expand(gf, cur);
+                ffn_out[id] = cur;
+
+            }
+
+            inpL = ggml_reduce(ctx0, ffn_out.data(), n_device, GGML_OP_ADD);
+            cb(inpL, "ffn_reduce", il);
+            ggml_build_forward_expand(gf, inpL);
+        }
+
+        int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
+        int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
+        if (idx_out >= 0) idx = idx_out;
+        cur = inpL->src[idx];
+        if (!cur) {
+            cur = inpL->view_src;
+        }
+
+        auto post_norm   = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].ffn_post_norm->extra;
+        cur = llm_build_norm(ctx0, cur, hparams, post_norm->splits[idx], NULL, LLM_NORM_RMS, cb, -1);
+
+        cb(cur, "ffn_normed", hparams.n_layer-1);
+        auto add = ffn_inp[idx];
+        if (!add) {
+            for (int j = 0; j < n_device; ++j) {
+                if (ffn_inp[j]) {
+                    add = ffn_inp[j]; break;
+                }
+            }
+        }
+        cur = ggml_add(ctx0, cur, add);
+        cb(cur, "ffn_out", hparams.n_layer-1);
+
+        if (model.layers[hparams.n_layer-1].out_scale) {
+            auto scale = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].out_scale->extra;
+            cur = ggml_mul(ctx0, cur, scale->splits[idx]);
+            cb(cur, "ffn_out_scaled", hparams.n_layer-1);
+        }
+
+    } else {
+
+        ggml_tensor * frozen_k_swa = nullptr;
+        ggml_tensor * frozen_v_swa = nullptr;
+        ggml_tensor * frozen_k_full = nullptr;
+        ggml_tensor * frozen_v_full = nullptr;
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpL = cur;
+
+            const bool  is_sliding   = hparams.swa_layers[il] ? true : false;
+            const float freq_base_l  = is_sliding ? target_hparams.rope_freq_base_train_swa  : target_cparams.rope_freq_base;
+            const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
+            const int   n_rot_l      = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
+            const int   n_swa        = is_sliding ? target_hparams.n_swa : 0;
+            const int   n_embd_head  = hparams.n_embd_head_k(il);
+            const int   n_head       = hparams.n_head(il);
+            ggml_tensor * KQ_mask_l  = is_sliding ? KQ_mask_swa : KQ_mask;
+
+            const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
+
+            cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(Qcur, "Qcur_normed", il);
+            auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(Qcur, "Qcur_rope", il);
+
+            ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
+            ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
+            gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
+            cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
+                    nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
+                    &frozen_k, &frozen_v);
+
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "attn_out", il);
+
+            ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
+                    model.layers[il].ffn_up,   nullptr, nullptr,
+                    model.layers[il].ffn_gate, nullptr, nullptr,
+                    model.layers[il].ffn_down, nullptr, nullptr,
+                    nullptr,
+                    LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
+            cb(ffn, "ffn_out", il);
+
+            cur = ffn;
+            if (model.layers[il].out_scale) {
+                cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
+                cb(cur, "out_scaled", il);
+            }
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
        }
-        cur = lctx.cvec.apply_to(ctx0, cur, il);
-        cb(cur, "l_out", il);
    }

    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, nullptr, LLM_NORM_RMS, cb, -1);
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@ -233,3 +233,5 @@ struct llama_split_tensor {

 void  llama_decode_reset();
 void  llama_decode_stop();
+
+std::vector<llama_model *> & llama_all_loaded_models();
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@ -194,6 +194,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
    ggml_context * ctx_output;
    ggml_context * ctx_output_split;

+    llama_model  * tgt_model = nullptr;
+
    ggml_backend_buffer_type_t default_cpu_buft;
    bool has_buft_overrides = false;

@ -221,6 +223,21 @@ struct create_tensors_helper : public create_tensors_helper_interface {

 create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) {

+    if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
+        auto & all_models = llama_all_loaded_models();
+        for (auto model : all_models) {
+            if (model->arch == LLM_ARCH_GEMMA4) {
+                tgt_model = model;
+            }
+        }
+        if (tgt_model) {
+            LLAMA_LOG_INFO("==================== Found target model for Gemma4-Assistant. split mode graph: %d\n", model.split_mode == LLAMA_SPLIT_MODE_GRAPH);
+        } else {
+            LLAMA_LOG_INFO("==================== Did not find target model for Gemma4-Assistant\n");
+            model.split_mode = LLAMA_SPLIT_MODE_LAYER;
+        }
+    }
+
    const int n_layer = model.hparams.n_layer;
    buft_layer_count[model.buft_input.buft]++;
    buft_layer_count[model.buft_input.buft_matrix]++;
@ -2242,7 +2259,7 @@ bool create_tensors_helper::create_gemma4_mtp_tensors(const LLM_TN & tn) {
        const int64_t n_ff_cur    = hparams.n_ff(i);

        if (!hparams.swa_layers[i]) {
-            layer.rope_freqs = create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
+            layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
                    llama_model_loader::TENSOR_NOT_REQUIRED | rope_flag);
            rope_flag = llama_model_loader::TENSOR_DUPLICATED;
        }
@ -4540,7 +4557,7 @@ bool create_tensors_helper::create_tensors() {

    {
        const bool unsupported =
-            (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
+            //(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
            (model.arch == LLM_ARCH_GEMMA4 && model.tok_embd_per_layer);
        if (unsupported && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) {
            LLAMA_LOG_WARN("\n=========================================================\n");
@ -4583,6 +4600,12 @@ bool create_tensors_helper::create_tensors() {
        if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
            gpu_split_count.resize(model.splits.size(), 0.0f);
        }
+        auto is_gemma4_model = [this] () {
+            return model.arch == LLM_ARCH_GEMMA4 || model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
+        };
+        auto is_gemma4_assistant = [this] () {
+            return model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
+        };
        for (int il = 0; il < n_layer; ++il) {
            // For now only run MTP into the per-layer
            if (model.mtp && hparams.nextn_predict_layers > 0 &&
@ -4620,7 +4643,7 @@ bool create_tensors_helper::create_tensors() {
            if (layer.attn_norm) {
                prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, mirror, mem_used);
            }
-            if (model.arch == LLM_ARCH_GEMMA4 && layer.attn_post_norm) {
+            if (is_gemma4_model() && layer.attn_post_norm) {
                prepare_split_tensors(-1, ctx_split, layer.attn_post_norm, layer.split_attn_post_norm, mirror, mem_used);
            }
            if (layer.rope_freqs) {
@ -4630,6 +4653,48 @@ bool create_tensors_helper::create_tensors() {
            if (hparams.is_recurrent(il)) {
                split_recurrent_tensors(hparams, layer, cur_splits, mem_used, ctx_split, il); //, model.arch == LLM_ARCH_QWEN3NEXT ? 0 : 1);
            }
+            else if (is_gemma4_assistant()) {
+                GGML_ASSERT(layer.wo && layer.wq);
+                GGML_ASSERT(tgt_model);
+                int n_embd_head = hparams.n_embd_head_k(il);
+                int n_head      = hparams.n_head(il);
+                bool is_sliding = hparams.swa_layers[il] != 0;
+                int target_n_kv_layer = tgt_model->hparams.n_layer_kv_from_start > 0
+                                      ? std::min<int>((int) tgt_model->hparams.n_layer, tgt_model->hparams.n_layer_kv_from_start)
+                                      : (int) tgt_model->hparams.n_layer;
+                int target_il = target_n_kv_layer - 1;
+                for (; target_il >= 0; --target_il) {
+                    if ((tgt_model->hparams.swa_layers[target_il] != 0) == is_sliding) break;
+                }
+                GGML_ASSERT(target_il >= 0 && "Gemma4 MTP could not find a matching target KV layer");
+                int n_head_tgt  = tgt_model->hparams.n_head(target_il);
+                GGML_ASSERT(tgt_model->hparams.n_embd_head_k(target_il) == n_embd_head);
+                auto & target_layer = tgt_model->layers[target_il];
+                auto split_wq = (const ggml_split_tensor_t *)target_layer.wq->extra;
+                auto split_wo = (const ggml_split_tensor_t *)target_layer.wo->extra;
+                GGML_ASSERT(split_wq && split_wo);
+                std::vector<int> q_split(split_wq->n_device, 0);
+                std::vector<int> o_split(split_wo->n_device, 0);
+                for (int id = 0; id < split_wq->n_device; ++id) {
+                    if (split_wq->splits[id]) {
+                        int nh = split_wq->splits[id]->ne[1] / n_embd_head;
+                        GGML_ASSERT((nh*n_head) % n_head_tgt == 0);
+                        q_split[id] = ((nh*n_head)/n_head_tgt)*n_embd_head;
+                    }
+                }
+                for (int id = 0; id < split_wo->n_device; ++id) {
+                    if (split_wo->splits[id]) {
+                        int64_t no = split_wo->splits[id]->ne[0] * layer.wo->ne[0];
+                        GGML_ASSERT(no % target_layer.wo->ne[0] == 0);
+                        o_split[id] = no / target_layer.wo->ne[0];
+                    }
+                }
+                prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, q_split, mem_used);
+                prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, o_split, mem_used);
+                if (layer.attn_q_norm) {
+                    prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, o_split, mem_used);
+                }
+            }
            else if (layer.wo && layer.wq && layer.wk && (layer.wv || model.arch == LLM_ARCH_GEMMA4)) {
                auto granularity_kq = hparams.n_embd_head_k(il) * gqa_ratio;
                int wq_ne1 = layer.wq->ne[1];
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@ -1053,6 +1053,7 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
 // Returns false if cancelled by progress_callback
 bool llama_model_loader::load_all_data(
            struct ggml_context * ctx,
+            [[maybe_unused]] llama_model * model,
            llama_buf_map & bufs_mmap,
            llama_mlocks * lmlocks,
            llama_progress_callback progress_callback,
@ -1083,7 +1084,7 @@ bool llama_model_loader::load_all_data(
            for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
                auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
                if (buffer_type == cuda_buffer_type) {
-                    cuda_backend = ggml_backend_cuda_init(i, nullptr);
+                    cuda_backend = ggml_backend_cuda_init(i, nullptr, model);
                    break;
                }
            }
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@ -184,6 +184,7 @@ struct llama_model_loader {
    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context * ctx,
+            struct llama_model * model,
            llama_buf_map & bufs_mmap,
            llama_mlocks * lmlocks,
            llama_progress_callback progress_callback,
--- a/src/llama-reload.cpp
+++ b/src/llama-reload.cpp
@ -939,7 +939,7 @@ bool reload_info::reload_changed_tensors(llama_model & model) {

    if (r) {
 #ifdef GGML_USE_CUDA
-        ggml_backend_cuda_invalidate_graphs();
+        ggml_backend_cuda_invalidate_graphs(&model);
 #endif
    }
    return r;
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -682,6 +682,11 @@ bool llama_context::update_cache_copies() {
    return true;
 }

+static std::vector<llama_context *> & llama_all_contexts() {
+    static std::vector<llama_context *> contexts;
+    return contexts;
+}
+
 llama_context::llama_context(const llama_model & model)
    : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
    const auto & hparams = model.hparams;
@ -690,6 +695,7 @@ llama_context::llama_context(const llama_model & model)
    } else {
        cache_copies.resize(2*hparams.n_layer);
    }
+    llama_all_contexts().push_back(this);
 }

 void llama_context::set_mtp_op_type(llama_mtp_op_type value) {
@ -710,6 +716,14 @@ llama_context::~llama_context() {
    }

    ggml_backend_buffer_free(buf_output);
+
+    auto & all_contexts = llama_all_contexts();
+    for (auto it = all_contexts.begin(); it != all_contexts.end(); ++it) {
+        if (*it == this) {
+            all_contexts.erase(it);
+            break;
+        }
+    }
 }

 int llama_context::max_nodes(int n_tokens, int n_kv) const {
@ -3093,6 +3107,8 @@ static bool is_model_split_supported(const llama_model & model) {
        LLM_ARCH_QWEN35,
        LLM_ARCH_QWEN35MOE,
        LLM_ARCH_GEMMA4,
+        LLM_ARCH_GEMMA4_MTP,
+        LLM_ARCH_GEMMA4_ASSISTANT,
        LLM_ARCH_DEEPSEEK2,
        LLM_ARCH_GLM_DSA,
        LLM_ARCH_MISTRAL4,
@ -3371,11 +3387,21 @@ static bool llm_load_tensors(

    auto & hparams = model.hparams;

+    if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
+        auto & all_models = llama_all_loaded_models();
+        llama_model * tgt_model = nullptr;
+        for (auto model : all_models) {
+            if (model->arch == LLM_ARCH_GEMMA4) {
+                tgt_model = model;
+            }
+        }
+        if (tgt_model) {
+            split_mode = tgt_model->split_mode;
+        }
+    }
+
    if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) {
-        const bool unsupported_gemma_split =
-            model.arch == LLM_ARCH_GEMMA4_MTP ||
-            model.arch == LLM_ARCH_GEMMA4_ASSISTANT ||
-            (model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);
+        const bool unsupported_gemma_split = model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0;

        if (unsupported_gemma_split) {
            LLAMA_LOG_WARN("\n=========================================================\n");
@ -3401,6 +3427,25 @@ static bool llm_load_tensors(
            }
        }
    }
+    if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) &&
+        (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT)) {
+        auto & all_models = llama_all_loaded_models();
+        bool has_target_gemma = false;
+        for (auto model : all_models) {
+            if (model->arch == LLM_ARCH_GEMMA4) {
+                has_target_gemma = true;
+                break;
+            }
+        }
+        if (!has_target_gemma) {
+            LLAMA_LOG_WARN("\n=======================================================\n");
+            LLAMA_LOG_WARN("Split mode 'graph' requested for Gemma4-assistant model\n");
+            LLAMA_LOG_WARN("but no loaded Gemma4 model found.\n");
+            LLAMA_LOG_WARN("  => changing split mode to 'layer'\n");
+            LLAMA_LOG_WARN("=======================================================\n\n");
+            split_mode = LLAMA_SPLIT_MODE_LAYER;
+        }
+    }

    if (iqk_has_fancy_simd()) {
        LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is defined\n");
@ -3971,7 +4016,7 @@ static bool llm_load_tensors(
        for (auto & it : ctx_bufs) {
            ggml_context * ctx = it.first;
            auto & bufs = it.second;
-            if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+            if (!ml.load_all_data(ctx, &model, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
                return false;
            }
        }
@ -6722,10 +6767,19 @@ struct llama_model * llama_model_load_from_file(
        return nullptr;
    }

+    llama_all_loaded_models().push_back(model);
+
    return model;
 }

 void llama_free_model(struct llama_model * model) {
+    auto & all_models = llama_all_loaded_models();
+    for (auto it = all_models.begin(); it != all_models.end(); ++it) {
+        if (*it == model) {
+            all_models.erase(it);
+            break;
+        }
+    }
    delete model;
 }

@ -7094,7 +7148,7 @@ struct llama_context * llama_init_from_model(
        // main_gpu is a local index into model->devices throughout the codebase
        // (auto-fit assigns device_count-1, MTP clamps to [0, device_count), buffer-type
        // setup wraps with model.devices[main_gpu]). Translate to a raw device id here.
-        const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
+        [[maybe_unused]] const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
            ? model->devices[model->main_gpu]
            : model->main_gpu;
 #if defined(GGML_USE_METAL)
@ -7110,7 +7164,7 @@ struct llama_context * llama_init_from_model(
 #elif defined(GGML_USE_CUDA)
        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
-            ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params);
+            ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params, ctx);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, main_gpu_id);
                llama_free(ctx);
@ -7129,7 +7183,7 @@ struct llama_context * llama_init_from_model(
                params = new_params.data();
            }
            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_cuda_init(device, params);
+                ggml_backend_t backend = ggml_backend_cuda_init(device, params, ctx);
                if (backend == nullptr) {
                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
                    llama_free(ctx);
@ -11099,3 +11153,8 @@ bool llama_reload_changed_tensors(struct llama_context * ctx) {
    }
    return result;
 }
+
+std::vector<llama_model *> & llama_all_loaded_models() {
+    static std::vector<llama_model *> models;
+    return models;
+}