From bf23a7599ca99366dde90af5c85361757bc9522e Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 24 Jun 2026 11:23:22 +0200 Subject: [PATCH 1/2] Avoid Gemma4 assistant strange tensor name warnings (#2023) --- src/llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index f5e6f30c..fba96ccb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3174,7 +3174,9 @@ static std::pair, double> get_layer_sizes(const llama_model_ } } if (name == "mtp_pre_proj.weight" || name == "mtp_post_proj.weight" || - name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") { + name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight" || + name == "nextn.post_projection.weight" || name == "nextn.pre_projection.weight" || + name == "rope_freqs.weight") { continue; } if (name == "dflash_fc.weight" || name == "dflash_hidden_norm.weight") { From d5507e33ae7ee2b7b41475f08044d3bde3b839ee Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 24 Jun 2026 18:29:32 +0200 Subject: [PATCH 2/2] Split mode graph for dense Gemma4 assistant (#2022) * WIP: Split mode graph for Gemma4 assistant Something is not right - acceptance drops to nearly zero. * Per model CUDA contexts Still not working!? * This works The issue was that I was not correctly calculating the number of KV heads for the split KV cache. * Compiler warnings * It is better to use llama_context pointers as keys --- examples/cvector-generator/pca.hpp | 2 +- ggml/include/ggml-cuda.h | 4 +- ggml/src/ggml-cuda.cu | 54 +++--- ggml/src/ggml-cuda/common.cuh | 6 +- ggml/src/ggml-cuda/reduce.cu | 142 ++++++++------- src/graphs/build_gemma4.cpp | 281 ++++++++++++++++++++++++----- src/llama-impl.h | 2 + src/llama-load-tensors.cpp | 71 +++++++- src/llama-model-loader.cpp | 3 +- src/llama-model-loader.h | 1 + src/llama-reload.cpp | 2 +- src/llama.cpp | 75 +++++++- 12 files changed, 483 insertions(+), 160 deletions(-) diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 8d920453..ac7c45a7 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -66,7 +66,7 @@ struct pca_model { pca_model(struct ggml_tensor * t_input) { #ifdef GGML_USE_CUDA fprintf(stderr, "%s: using CUDA backend\n", __func__); - backend = ggml_backend_cuda_init(0, nullptr); // init device 0 + backend = ggml_backend_cuda_init(0, nullptr, nullptr); // init device 0 if (!backend) { fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); } diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index a9787eb5..291b7db9 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -21,7 +21,7 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params); +GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params, const void * model); GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); @@ -43,7 +43,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer); GGML_API GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data); -GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(void); +GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(const void * model); #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 51693a09..8536179c 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -298,13 +298,22 @@ const ggml_cuda_device_info & ggml_cuda_info() { } /* ---------- hot-swap: invalidate all cached CUDA graphs ---------- */ -extern "C" void ggml_backend_cuda_invalidate_graphs(void) { +extern "C" void ggml_backend_cuda_invalidate_graphs(const void * model) { auto & info = const_cast(ggml_cuda_info()); - for (int i = 0; i < info.device_count; ++i) { - if (info.all_ctx[i]) { - info.all_ctx[i]->cuda_graphs.clear(); + if (auto it = info.all_ctx.find(model); it != info.all_ctx.end()) { + for (auto ctx : it->second) { + if (ctx) { + ctx->cuda_graphs.clear(); + } } + } else { + fprintf(stderr, "================================= %s: did not find entry for model at %p\n", __func__, model); } + //for (int i = 0; i < info.device_count; ++i) { + // if (info.all_ctx[i]) { + // info.all_ctx[i]->cuda_graphs.clear(); + // } + //} } // #define DEBUG_CUDA_MALLOC @@ -517,13 +526,14 @@ static std::condition_variable ggml_cuda_lock_cv; //static std::atomic ggml_cuda_lock_counter; static int ggml_cuda_lock_counter = 0; -ggml_backend_cuda_context::ggml_backend_cuda_context(int device) : - device(device), name(GGML_CUDA_NAME + std::to_string(device)) { +ggml_backend_cuda_context::ggml_backend_cuda_context(int device, const void * model) : + device(device), name(GGML_CUDA_NAME + std::to_string(device)), model(model) { auto info = const_cast(&ggml_cuda_info()); - if (info->all_ctx[device]) { + auto & all_ctx = info->all_ctx[model]; + if (all_ctx[device]) { GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device); } else{ - info->all_ctx[device] = this; + all_ctx[device] = this; } } @@ -555,9 +565,12 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() { } } auto info = const_cast(&ggml_cuda_info()); - if (info->all_ctx[device] == this) { - info->all_ctx[device] = nullptr; + if (auto it = info->all_ctx.find(model); it != info->all_ctx.end() && it->second[device] == this) { + it->second[device] = nullptr; } + //if (info->all_ctx[device] == this) { + // info->all_ctx[device] = nullptr; + //} } @@ -4247,23 +4260,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ needs_f16_f32_copy = true; } else { -#ifdef GGML_USE_NCCL__ - auto & info = ggml_cuda_info(); - auto nbytes = ggml_nbytes(src); - ncclGroupStart(); - ggml_cuda_set_device(cuda_ctx_src->device); - auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device], - info.all_ctx[cuda_ctx_src->device]->stream()); - ggml_cuda_set_device(cuda_ctx_dst->device); - auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device], - info.all_ctx[cuda_ctx_dst->device]->stream()); - ncclGroupEnd(); - GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess); - return true; -#else ggml_cuda_set_device(cuda_ctx_src->device); CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); -#endif } #endif } @@ -5249,13 +5247,13 @@ static cuda_params ggml_cuda_parse_params(const char * params_string) { return params; } -GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string) { +GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string, const void * model) { if (device < 0 || device >= ggml_backend_cuda_get_device_count()) { GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device); return nullptr; } - ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device); + ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device, model); if (ctx == nullptr) { GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__); return nullptr; @@ -5370,7 +5368,7 @@ GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { // backend registry GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { - ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr); + ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr, nullptr); return cuda_backend; GGML_UNUSED(params); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index bb82edeb..924100d1 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -762,7 +762,8 @@ struct ggml_cuda_device_info { std::array default_tensor_split = {}; - ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr }; + std::unordered_map> all_ctx; + //ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr }; #ifdef GGML_USE_NCCL ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES]; bool have_nccl; @@ -864,10 +865,11 @@ struct ggml_backend_cuda_context { #endif + const void * model; void * copy_buffer = nullptr; size_t copy_size = 0; - explicit ggml_backend_cuda_context(int device); + explicit ggml_backend_cuda_context(int device, const void * model); ~ggml_backend_cuda_context(); diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu index c3177be0..b3a87c8b 100644 --- a/ggml/src/ggml-cuda/reduce.cu +++ b/ggml/src/ggml-cuda/reduce.cu @@ -94,6 +94,11 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor * if (ncopy < 1) return; auto & info = ggml_cuda_info(); + auto it = info.all_ctx.find(ctx.model); + if (it == info.all_ctx.end()) { + GGML_ABORT("Fatal error"); + } + auto & all_ctx = it->second; auto size = ggml_nbytes(dst); int isrc = 0; for (int ii = 0; ii < ncopy; ++ii) { @@ -102,9 +107,9 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor * isrc = (isrc + 1)%nhave; //printf("%s: copying from device %d to device %d: %p -> %p\n", __func__, j, i, dst->src[j]->data, dst->src[i]->data); ggml_cuda_set_device(j); - CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, info.all_ctx[i]->device, dst->src[j]->data, info.all_ctx[j]->device, - size, info.all_ctx[j]->stream())); - CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream())); + CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, all_ctx[i]->device, dst->src[j]->data, all_ctx[j]->device, + size, all_ctx[j]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream())); } isrc = 0; for (int ii = 0; ii < ncopy; ++ii) { @@ -112,7 +117,7 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor * int j = idx[isrc]; isrc = (isrc + 1)%nhave; ggml_cuda_set_device(i); - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event, 0)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event, 0)); } ggml_cuda_set_device(ctx.device); } @@ -133,6 +138,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ } auto & info = ggml_cuda_info(); + auto it = info.all_ctx.find(ctx.model); + if (it == info.all_ctx.end()) { + GGML_ABORT("Fatal error"); + } + auto & all_ctx = it->second; #ifdef GGML_USE_NCCL // Somehow I'm not able to figure out how to use NCCL correctly. // It does not work at all if not all GPUs participate in the reduce op, and we @@ -153,7 +163,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ ggml_cuda_set_device(i); auto status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr, dst->src[i] ? dst->src[i]->data : nullptr, - ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], info.all_ctx[i]->stream()); + ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], all_ctx[i]->stream()); if (status != ncclSuccess) { fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status); GGML_ABORT("Fatal error"); @@ -275,7 +285,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ auto size_per_device = nblocks_per_device * tt.type_size; for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; - auto this_ctx = info.all_ctx[i]; + auto this_ctx = all_ctx[i]; if (!this_ctx->copy_event || !this_ctx->compute_event || size_per_device > this_ctx->copy_size) { ggml_cuda_set_device(this_ctx->device); if (!this_ctx->copy_event) { @@ -300,14 +310,14 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ int peer = idx[(ii+1)%nhave]; auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device); auto this_size = (this_nelem / tt.blck_size) * tt.type_size; - ggml_cuda_set_device(info.all_ctx[peer]->device); + ggml_cuda_set_device(all_ctx[peer]->device); if (stage > 0) { - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0)); } - CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device, - (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device, - this_size, info.all_ctx[peer]->stream())); - CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream())); + CUDA_CHECK(cudaMemcpyPeerAsync(all_ctx[i]->copy_buffer, all_ctx[i]->device, + (const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device, + this_size, all_ctx[peer]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream())); ichunk = (ichunk + 1)%nhave; } ichunk = stage; @@ -315,24 +325,24 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ int i = idx[ii]; int peer = idx[(ii+1)%nhave]; auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device); - ggml_cuda_set_device(info.all_ctx[i]->device); - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0)); + ggml_cuda_set_device(all_ctx[i]->device); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0)); int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE; if (dst->type == GGML_TYPE_F16) { - k_add<<stream()>>>(this_nelem, - (const half *)info.all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device); + k_add<<stream()>>>(this_nelem, + (const half *)all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device); } else if (dst->type == GGML_TYPE_Q8_0) { - k_add<<stream()>>>(this_nelem, - (const block_q8_0 *)info.all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size); + k_add<<stream()>>>(this_nelem, + (const block_q8_0 *)all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size); } else if (dst->type == GGML_TYPE_BF16) { - k_add<<stream()>>>( - this_nelem, (const nv_bfloat16 *)info.all_ctx[i]->copy_buffer, + k_add<<stream()>>>( + this_nelem, (const nv_bfloat16 *)all_ctx[i]->copy_buffer, (nv_bfloat16 *)dst->src[i]->data + ichunk*nelem_per_device); } else { - k_add<<stream()>>>(this_nelem, - (const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device); + k_add<<stream()>>>(this_nelem, + (const float *)all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device); } - CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->compute_event, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[i]->compute_event, all_ctx[i]->stream())); ichunk = (ichunk + 1)%nhave; } } @@ -343,21 +353,21 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ int peer = idx[(ii+1)%nhave]; auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device); auto this_size = (this_nelem / tt.blck_size) * tt.type_size; - ggml_cuda_set_device(info.all_ctx[peer]->device); + ggml_cuda_set_device(all_ctx[peer]->device); if (stage == 0) { - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0)); } - CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, info.all_ctx[i]->device, - (const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device, - this_size, info.all_ctx[peer]->stream())); - CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream())); + CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, all_ctx[i]->device, + (const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device, + this_size, all_ctx[peer]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream())); ichunk = (ichunk + 1)%nhave; } for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; int peer = idx[(ii+1)%nhave]; - ggml_cuda_set_device(info.all_ctx[i]->device); - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0)); + ggml_cuda_set_device(all_ctx[i]->device); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0)); } } ggml_cuda_set_device(ctx.device); @@ -372,8 +382,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ GGML_ASSERT(dst->src[i]->type == dst->type); GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i])); ggml_cuda_set_device(i); - if (!info.all_ctx[i]->copy_event) { - CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming)); + if (!all_ctx[i]->copy_event) { + CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming)); } } auto nelem = ggml_nelements(dst); @@ -386,20 +396,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ task.ptrs[0] = (char *)dst->src[i]->data; int j = idx[2*ii+1]; ggml_cuda_set_device(j); - CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream())); task.ptrs[1] = (char *)dst->src[j]->data; ggml_cuda_set_device(i); - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event)); if (dst->type == GGML_TYPE_F16) { - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); } else { - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); } } for (int ii = 0; ii < nhave/2; ++ii) { int i = idx[2*ii+0]; ggml_cuda_set_device(i); - CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream())); } for (int ii = 0; ii < nhave/2; ++ii) { int i = idx[2*ii+1]; @@ -411,23 +421,23 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ int j = idx[(2*ii+2)%nhave]; task.ptrs[1] = (char *)dst->src[j]->data; ggml_cuda_set_device(i); - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event)); if (dst->type == GGML_TYPE_F16) { - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); } else { - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); } } for (int ii = 0; ii < nhave/2; ++ii) { int i = idx[2*ii+1]; ggml_cuda_set_device(i); - CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream())); } for (int ii = 0; ii < nhave/2; ++ii) { int i = idx[(2*ii+2)%nhave]; ggml_cuda_set_device(i); int j = idx[2*ii+1]; - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event)); } ggml_cuda_set_device(ctx.device); if (ncopy > 0) { @@ -442,10 +452,10 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ GGML_ASSERT(dst->src[i]->type == dst->type); GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i])); ggml_cuda_set_device(i); - if (!info.all_ctx[i]->copy_event) { - CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming)); + if (!all_ctx[i]->copy_event) { + CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming)); } - CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream())); } //printf("Recorded events\n"); auto nelem = ggml_nelements(dst); @@ -465,37 +475,37 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ for (int jj = 0; jj < nhave; ++jj) { if (jj == ii) continue; int j = idx[jj]; - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event)); task.ptrs[k++] = (char *)dst->src[j]->data + ii*nelem_per_device*elem_size; } int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE; if (dst->type == GGML_TYPE_F16) { switch (nhave) { case 2: - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); break; case 3: - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); break; case 4: - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); break; default: - k_reduce_add<<stream()>>>(task); + k_reduce_add<<stream()>>>(task); } } else { switch (nhave) { case 2: - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); break; case 3: - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); break; case 4: - k_reduce_add_T<<stream()>>>(task); + k_reduce_add_T<<stream()>>>(task); break; default: - k_reduce_add<<stream()>>>(task); + k_reduce_add<<stream()>>>(task); } } } @@ -503,7 +513,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; ggml_cuda_set_device(i); - CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream())); } //printf("Recorded events again\n"); for (int ii = 0; ii < nhave; ++ii) { @@ -512,7 +522,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ for (int jj = 0; jj < nhave; ++jj) { if (jj == ii) continue; int j = idx[jj]; - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event)); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event)); } } ggml_cuda_set_device(ctx.device); @@ -536,11 +546,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i])); if (i == ctx.device) continue; ggml_cuda_set_device(i); - CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream())); - if (!info.all_ctx[i]->copy_event) { - CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming)); + CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, all_ctx[i]->stream())); + if (!all_ctx[i]->copy_event) { + CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming)); } - CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream())); ptr += nbytes; } auto nelem = ggml_nelements(dst); @@ -550,7 +560,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; if (i == ctx.device) continue; - CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0)); + CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0)); if (dst->type == GGML_TYPE_F16) { k_add<<>>(nelem, (const half *)ptr, (half *)dst->data); } else if (dst->type == GGML_TYPE_BF16) { @@ -572,15 +582,15 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_ int i = idx[ii]; if (i == ctx.device) continue; ggml_cuda_set_device(i); - CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0)); - CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream())); - CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream())); + CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), ctx.copy_event, 0)); + CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, all_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream())); } ggml_cuda_set_device(ctx.device); for (int ii = 0; ii < nhave; ++ii) { int i = idx[ii]; if (i == ctx.device) continue; - CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0)); + CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0)); } if (ncopy > 0) { copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx); diff --git a/src/graphs/build_gemma4.cpp b/src/graphs/build_gemma4.cpp index 3a0bb3fc..145a30b1 100644 --- a/src/graphs/build_gemma4.cpp +++ b/src/graphs/build_gemma4.cpp @@ -591,10 +591,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() { ggml_tensor * KQ_mask = nullptr; ggml_tensor * KQ_mask_swa = nullptr; - ggml_tensor * frozen_k_swa = nullptr; - ggml_tensor * frozen_v_swa = nullptr; - ggml_tensor * frozen_k_full = nullptr; - ggml_tensor * frozen_v_full = nullptr; { const int64_t n_mask_tokens = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32, target_n_kv, n_mask_tokens); @@ -610,59 +606,248 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() { } } - for (int il = 0; il < n_layer; ++il) { + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) { + int n_device = model.splits.size(); + std::vector sa_inp(n_device, nullptr); + std::vector sa_out(n_device, nullptr); + std::vector ffn_inp(n_device, nullptr); + std::vector ffn_out(n_device, nullptr); ggml_tensor * inpL = cur; - const bool is_sliding = hparams.swa_layers[il] ? true : false; - const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base; - const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale; - const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot; - const int n_swa = is_sliding ? target_hparams.n_swa : 0; - const int n_embd_head = hparams.n_embd_head_k(il); - const int n_head = hparams.n_head(il); - ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + const bool is_sliding = hparams.swa_layers[il] ? true : false; + const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base; + const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale; + const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot; + const int n_swa = is_sliding ? target_hparams.n_swa : 0; + const int n_embd_head = hparams.n_embd_head_k(il); + ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; - ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il); - cb(Qcur, "Qcur_normed", il); - auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs; - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur_rope", il); + const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il); - const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il); - ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full; - ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full; - gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb); - cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo, - nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il, - &frozen_k, &frozen_v); + auto split_kl = (const ggml_split_tensor_t *)target_kv.k_l[target_il]->extra; + auto split_vl = (const ggml_split_tensor_t *)target_kv.v_l[target_il]->extra; + GGML_ASSERT(split_kl && split_vl); + auto split_ql = (const ggml_split_tensor_t *)model.layers[il].wq->extra; + auto split_ol = (const ggml_split_tensor_t *)model.layers[il].wo->extra; + GGML_ASSERT(split_ql && split_ol); + GGML_ASSERT(split_ql->n_device == n_device && split_kl->n_device == n_device && split_vl->n_device == n_device && split_ol->n_device == n_device); + ggml_tensor * sa_last = nullptr; + int nhave = 0; + for (int id = 0; id < n_device; ++id) { + GGML_ASSERT((split_kl->splits[id] && split_vl->splits[id] && split_ql->splits[id] && split_ol->splits[id]) || + !(split_kl->splits[id] || split_vl->splits[id] || split_ql->splits[id] || split_ol->splits[id])); + if (!split_kl->splits[id]) { + sa_inp[id] = sa_out[id] = nullptr; + continue; + } - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il); - cb(cur, "attn_post_norm", il); - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "attn_out", il); + int il_cb = 1000*(il + 1) + id; - ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur, - model.layers[il].ffn_up, nullptr, nullptr, - model.layers[il].ffn_gate, nullptr, nullptr, - model.layers[il].ffn_down, nullptr, nullptr, - nullptr, - LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm); - cb(ffn, "ffn_out", il); + if (il == 0) { + sa_inp[id] = inpL; + } else { + GGML_ASSERT(inpL->op == GGML_OP_REDUCE); + cur = get_input_tensor_sm_graph(ctx0, inpL, id); + GGML_ASSERT(model.layers[il-1].ffn_post_norm && model.layers[il-1].ffn_post_norm->extra); + cur = do_split_norm(ctx0, cur, model.layers[il-1].ffn_post_norm, hparams, cb, id, il_cb, false); + cb(cur, "ffn_normed", il_cb); + auto add = ffn_inp[id]; + if (!add) { + for (int j = 0; j < n_device; ++j) { + if (ffn_inp[j]) { + add = ffn_inp[j]; break; + } + } + GGML_ASSERT(add); + } + sa_inp[id] = ggml_add(ctx0, cur, add); + cb(sa_inp[id], "sa_inp", il_cb); + if (model.layers[il-1].out_scale) { + auto scale = (const ggml_split_tensor_t *)model.layers[il-1].out_scale->extra; + sa_inp[id] = ggml_mul(ctx0, sa_inp[id], scale->splits[id]); + cb(sa_inp[id], "sa_inp_scaled", il_cb); + } + } + GGML_ASSERT(model.layers[il].attn_norm && model.layers[il].attn_norm->extra); + cur = do_split_norm(ctx0, sa_inp[id], model.layers[il].attn_norm, hparams, cb, id, il_cb, false); + cb(cur, "sa_inp_normed", il_cb); + auto Qcur = llm_build_lora_mm(lctx, ctx0, split_ql->splits[id], cur); + cb(Qcur, "Qcur", il_cb); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, Qcur->ne[0]/n_embd_head, n_tokens); + GGML_ASSERT(model.layers[il].attn_q_norm && model.layers[il].attn_q_norm->extra); + Qcur = do_split_norm(ctx0, Qcur, model.layers[il].attn_q_norm, hparams, cb, id, il_cb, false); + cb(Qcur, "Qcur_normed", il_cb); + auto freq_factors = is_sliding ? nullptr : ((const ggml_split_tensor_t *)model.layers[il].rope_freqs->extra)->splits[id]; + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_rope", il_cb); + GGML_ASSERT(split_kl->splits[id]->ne[1] % target_kv.size == 0); + int n_head_kv = split_kl->splits[id]->ne[1] / target_kv.size; + auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + auto k = ggml_view_3d(ctx0, split_kl->splits[id], n_embd_head, target_n_kv, n_head_kv, + ggml_row_size(split_kl->splits[id]->type, n_embd_head)*n_head_kv, + ggml_row_size(split_kl->splits[id]->type, n_embd_head), 0); + auto v = ggml_view_3d(ctx0, split_vl->splits[id], n_embd_head, target_n_kv, n_head_kv, + ggml_row_size(split_vl->splits[id]->type, n_embd_head)*n_head_kv, + ggml_row_size(split_vl->splits[id]->type, n_embd_head), 0); + cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask_l, hparams.f_attention_scale, 0.0f, 0.0f); + cur->op_params[4] = n_swa; + cb(cur, "fa", il_cb); + cur = ggml_reshape_2d(ctx0, cur, split_ol->splits[id]->ne[0], ggml_nelements(cur)/split_ol->splits[id]->ne[0]); + cur = llm_build_lora_mm(lctx, ctx0, split_ol->splits[id], cur); + cb(cur, "qkv", il_cb); + ggml_build_forward_expand(gf, cur); + sa_out[id] = cur; + sa_last = cur; + ++nhave; + } - cur = ffn; - if (model.layers[il].out_scale) { - cur = ggml_mul(ctx0, cur, model.layers[il].out_scale); - cb(cur, "out_scaled", il); + auto last_ffn_inp = nhave > 1 ? ggml_reduce(ctx0, sa_out.data(), n_device, GGML_OP_ADD) : sa_last; + ggml_build_forward_expand(gf, last_ffn_inp); + cb(last_ffn_inp, "sa_reduce", il); + + auto ffn_up = (const ggml_split_tensor_t *)model.layers[il].ffn_up->extra; + auto ffn_gate = (const ggml_split_tensor_t *)model.layers[il].ffn_gate->extra; + auto ffn_down = (const ggml_split_tensor_t *)model.layers[il].ffn_down->extra; + GGML_ASSERT(ffn_up && ffn_gate && ffn_down); + + for (int id = 0; id < n_device; ++id) { + GGML_ASSERT((ffn_up->splits[id] && ffn_gate->splits[id] && ffn_down->splits[id]) || + (!ffn_up->splits[id] && !ffn_gate->splits[id] && !ffn_down->splits[id])); + if (!ffn_up->splits[id]) { + ffn_inp[id] = ffn_out[id] = nullptr; + continue; + } + + GGML_ASSERT(last_ffn_inp && (nhave == 1 || last_ffn_inp->op == GGML_OP_REDUCE)); + + int il_cb = 1000*(il + 1) + id; + + cur = get_input_tensor_sm_graph(ctx0, last_ffn_inp, id); + cur = do_split_norm(ctx0, cur, model.layers[il].attn_post_norm, hparams, cb, id, il_cb, false); + cb(cur, "sa_post", il_cb); + auto add = sa_inp[id]; + if (!add) { + for (int j = 0; j < n_device; ++j) { + if (sa_inp[j]) { + add = sa_inp[j]; break; + } + } + } + ffn_inp[id] = ggml_add(ctx0, cur, add); + cb(ffn_inp[id], "ffn_inp", il_cb); + cur = do_split_norm(ctx0, ffn_inp[id], model.layers[il].ffn_norm, hparams, cb, id, il_cb, false); + cb(cur, "ffn_inp_normed", il_cb); + cur = llm_build_ffn(ctx0, lctx, nullptr, cur, + ffn_up->splits[id], nullptr, nullptr, + ffn_gate->splits[id], nullptr, nullptr, + ffn_down->splits[id], nullptr, nullptr, + nullptr, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, false, false, nullptr, nullptr); + cb(cur, "ffn", il_cb); + ggml_build_forward_expand(gf, cur); + ffn_out[id] = cur; + + } + + inpL = ggml_reduce(ctx0, ffn_out.data(), n_device, GGML_OP_ADD); + cb(inpL, "ffn_reduce", il); + ggml_build_forward_expand(gf, inpL); + } + + int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer]; + int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer); + if (idx_out >= 0) idx = idx_out; + cur = inpL->src[idx]; + if (!cur) { + cur = inpL->view_src; + } + + auto post_norm = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].ffn_post_norm->extra; + cur = llm_build_norm(ctx0, cur, hparams, post_norm->splits[idx], NULL, LLM_NORM_RMS, cb, -1); + + cb(cur, "ffn_normed", hparams.n_layer-1); + auto add = ffn_inp[idx]; + if (!add) { + for (int j = 0; j < n_device; ++j) { + if (ffn_inp[j]) { + add = ffn_inp[j]; break; + } + } + } + cur = ggml_add(ctx0, cur, add); + cb(cur, "ffn_out", hparams.n_layer-1); + + if (model.layers[hparams.n_layer-1].out_scale) { + auto scale = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].out_scale->extra; + cur = ggml_mul(ctx0, cur, scale->splits[idx]); + cb(cur, "ffn_out_scaled", hparams.n_layer-1); + } + + } else { + + ggml_tensor * frozen_k_swa = nullptr; + ggml_tensor * frozen_v_swa = nullptr; + ggml_tensor * frozen_k_full = nullptr; + ggml_tensor * frozen_v_full = nullptr; + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpL = cur; + + const bool is_sliding = hparams.swa_layers[il] ? true : false; + const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base; + const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale; + const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot; + const int n_swa = is_sliding ? target_hparams.n_swa : 0; + const int n_embd_head = hparams.n_embd_head_k(il); + const int n_head = hparams.n_head(il); + ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + + const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il); + + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs; + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_rope", il); + + ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full; + ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full; + gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb); + cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo, + nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il, + &frozen_k, &frozen_v); + + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il); + cb(cur, "attn_post_norm", il); + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "attn_out", il); + + ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm); + cb(ffn, "ffn_out", il); + + cur = ffn; + if (model.layers[il].out_scale) { + cur = ggml_mul(ctx0, cur, model.layers[il].out_scale); + cb(cur, "out_scaled", il); + } + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); } - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); } cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, nullptr, LLM_NORM_RMS, cb, -1); diff --git a/src/llama-impl.h b/src/llama-impl.h index 741c11df..df6fc741 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -233,3 +233,5 @@ struct llama_split_tensor { void llama_decode_reset(); void llama_decode_stop(); + +std::vector & llama_all_loaded_models(); diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 33ca9553..60bdd666 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -194,6 +194,8 @@ struct create_tensors_helper : public create_tensors_helper_interface { ggml_context * ctx_output; ggml_context * ctx_output_split; + llama_model * tgt_model = nullptr; + ggml_backend_buffer_type_t default_cpu_buft; bool has_buft_overrides = false; @@ -221,6 +223,21 @@ struct create_tensors_helper : public create_tensors_helper_interface { create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) { + if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { + auto & all_models = llama_all_loaded_models(); + for (auto model : all_models) { + if (model->arch == LLM_ARCH_GEMMA4) { + tgt_model = model; + } + } + if (tgt_model) { + LLAMA_LOG_INFO("==================== Found target model for Gemma4-Assistant. split mode graph: %d\n", model.split_mode == LLAMA_SPLIT_MODE_GRAPH); + } else { + LLAMA_LOG_INFO("==================== Did not find target model for Gemma4-Assistant\n"); + model.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + } + const int n_layer = model.hparams.n_layer; buft_layer_count[model.buft_input.buft]++; buft_layer_count[model.buft_input.buft_matrix]++; @@ -2242,7 +2259,7 @@ bool create_tensors_helper::create_gemma4_mtp_tensors(const LLM_TN & tn) { const int64_t n_ff_cur = hparams.n_ff(i); if (!hparams.swa_layers[i]) { - layer.rope_freqs = create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 }, + layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | rope_flag); rope_flag = llama_model_loader::TENSOR_DUPLICATED; } @@ -4540,7 +4557,7 @@ bool create_tensors_helper::create_tensors() { { const bool unsupported = - (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) || + //(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) || (model.arch == LLM_ARCH_GEMMA4 && model.tok_embd_per_layer); if (unsupported && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) { LLAMA_LOG_WARN("\n=========================================================\n"); @@ -4583,6 +4600,12 @@ bool create_tensors_helper::create_tensors() { if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) { gpu_split_count.resize(model.splits.size(), 0.0f); } + auto is_gemma4_model = [this] () { + return model.arch == LLM_ARCH_GEMMA4 || model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT; + }; + auto is_gemma4_assistant = [this] () { + return model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT; + }; for (int il = 0; il < n_layer; ++il) { // For now only run MTP into the per-layer if (model.mtp && hparams.nextn_predict_layers > 0 && @@ -4620,7 +4643,7 @@ bool create_tensors_helper::create_tensors() { if (layer.attn_norm) { prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, mirror, mem_used); } - if (model.arch == LLM_ARCH_GEMMA4 && layer.attn_post_norm) { + if (is_gemma4_model() && layer.attn_post_norm) { prepare_split_tensors(-1, ctx_split, layer.attn_post_norm, layer.split_attn_post_norm, mirror, mem_used); } if (layer.rope_freqs) { @@ -4630,6 +4653,48 @@ bool create_tensors_helper::create_tensors() { if (hparams.is_recurrent(il)) { split_recurrent_tensors(hparams, layer, cur_splits, mem_used, ctx_split, il); //, model.arch == LLM_ARCH_QWEN3NEXT ? 0 : 1); } + else if (is_gemma4_assistant()) { + GGML_ASSERT(layer.wo && layer.wq); + GGML_ASSERT(tgt_model); + int n_embd_head = hparams.n_embd_head_k(il); + int n_head = hparams.n_head(il); + bool is_sliding = hparams.swa_layers[il] != 0; + int target_n_kv_layer = tgt_model->hparams.n_layer_kv_from_start > 0 + ? std::min((int) tgt_model->hparams.n_layer, tgt_model->hparams.n_layer_kv_from_start) + : (int) tgt_model->hparams.n_layer; + int target_il = target_n_kv_layer - 1; + for (; target_il >= 0; --target_il) { + if ((tgt_model->hparams.swa_layers[target_il] != 0) == is_sliding) break; + } + GGML_ASSERT(target_il >= 0 && "Gemma4 MTP could not find a matching target KV layer"); + int n_head_tgt = tgt_model->hparams.n_head(target_il); + GGML_ASSERT(tgt_model->hparams.n_embd_head_k(target_il) == n_embd_head); + auto & target_layer = tgt_model->layers[target_il]; + auto split_wq = (const ggml_split_tensor_t *)target_layer.wq->extra; + auto split_wo = (const ggml_split_tensor_t *)target_layer.wo->extra; + GGML_ASSERT(split_wq && split_wo); + std::vector q_split(split_wq->n_device, 0); + std::vector o_split(split_wo->n_device, 0); + for (int id = 0; id < split_wq->n_device; ++id) { + if (split_wq->splits[id]) { + int nh = split_wq->splits[id]->ne[1] / n_embd_head; + GGML_ASSERT((nh*n_head) % n_head_tgt == 0); + q_split[id] = ((nh*n_head)/n_head_tgt)*n_embd_head; + } + } + for (int id = 0; id < split_wo->n_device; ++id) { + if (split_wo->splits[id]) { + int64_t no = split_wo->splits[id]->ne[0] * layer.wo->ne[0]; + GGML_ASSERT(no % target_layer.wo->ne[0] == 0); + o_split[id] = no / target_layer.wo->ne[0]; + } + } + prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, q_split, mem_used); + prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, o_split, mem_used); + if (layer.attn_q_norm) { + prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, o_split, mem_used); + } + } else if (layer.wo && layer.wq && layer.wk && (layer.wv || model.arch == LLM_ARCH_GEMMA4)) { auto granularity_kq = hparams.n_embd_head_k(il) * gqa_ratio; int wq_ne1 = layer.wq->ne[1]; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index b372a678..180669d6 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1053,6 +1053,7 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { // Returns false if cancelled by progress_callback bool llama_model_loader::load_all_data( struct ggml_context * ctx, + [[maybe_unused]] llama_model * model, llama_buf_map & bufs_mmap, llama_mlocks * lmlocks, llama_progress_callback progress_callback, @@ -1083,7 +1084,7 @@ bool llama_model_loader::load_all_data( for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i); if (buffer_type == cuda_buffer_type) { - cuda_backend = ggml_backend_cuda_init(i, nullptr); + cuda_backend = ggml_backend_cuda_init(i, nullptr, model); break; } } diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index f8c09c0e..01b47ebf 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -184,6 +184,7 @@ struct llama_model_loader { // Returns false if cancelled by progress_callback bool load_all_data( struct ggml_context * ctx, + struct llama_model * model, llama_buf_map & bufs_mmap, llama_mlocks * lmlocks, llama_progress_callback progress_callback, diff --git a/src/llama-reload.cpp b/src/llama-reload.cpp index 5dc163aa..c44f208e 100644 --- a/src/llama-reload.cpp +++ b/src/llama-reload.cpp @@ -939,7 +939,7 @@ bool reload_info::reload_changed_tensors(llama_model & model) { if (r) { #ifdef GGML_USE_CUDA - ggml_backend_cuda_invalidate_graphs(); + ggml_backend_cuda_invalidate_graphs(&model); #endif } return r; diff --git a/src/llama.cpp b/src/llama.cpp index fba96ccb..308f296f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -682,6 +682,11 @@ bool llama_context::update_cache_copies() { return true; } +static std::vector & llama_all_contexts() { + static std::vector contexts; + return contexts; +} + llama_context::llama_context(const llama_model & model) : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) { const auto & hparams = model.hparams; @@ -690,6 +695,7 @@ llama_context::llama_context(const llama_model & model) } else { cache_copies.resize(2*hparams.n_layer); } + llama_all_contexts().push_back(this); } void llama_context::set_mtp_op_type(llama_mtp_op_type value) { @@ -710,6 +716,14 @@ llama_context::~llama_context() { } ggml_backend_buffer_free(buf_output); + + auto & all_contexts = llama_all_contexts(); + for (auto it = all_contexts.begin(); it != all_contexts.end(); ++it) { + if (*it == this) { + all_contexts.erase(it); + break; + } + } } int llama_context::max_nodes(int n_tokens, int n_kv) const { @@ -3093,6 +3107,8 @@ static bool is_model_split_supported(const llama_model & model) { LLM_ARCH_QWEN35, LLM_ARCH_QWEN35MOE, LLM_ARCH_GEMMA4, + LLM_ARCH_GEMMA4_MTP, + LLM_ARCH_GEMMA4_ASSISTANT, LLM_ARCH_DEEPSEEK2, LLM_ARCH_GLM_DSA, LLM_ARCH_MISTRAL4, @@ -3371,11 +3387,21 @@ static bool llm_load_tensors( auto & hparams = model.hparams; + if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { + auto & all_models = llama_all_loaded_models(); + llama_model * tgt_model = nullptr; + for (auto model : all_models) { + if (model->arch == LLM_ARCH_GEMMA4) { + tgt_model = model; + } + } + if (tgt_model) { + split_mode = tgt_model->split_mode; + } + } + if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) { - const bool unsupported_gemma_split = - model.arch == LLM_ARCH_GEMMA4_MTP || - model.arch == LLM_ARCH_GEMMA4_ASSISTANT || - (model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0); + const bool unsupported_gemma_split = model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0; if (unsupported_gemma_split) { LLAMA_LOG_WARN("\n=========================================================\n"); @@ -3401,6 +3427,25 @@ static bool llm_load_tensors( } } } + if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) && + (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT)) { + auto & all_models = llama_all_loaded_models(); + bool has_target_gemma = false; + for (auto model : all_models) { + if (model->arch == LLM_ARCH_GEMMA4) { + has_target_gemma = true; + break; + } + } + if (!has_target_gemma) { + LLAMA_LOG_WARN("\n=======================================================\n"); + LLAMA_LOG_WARN("Split mode 'graph' requested for Gemma4-assistant model\n"); + LLAMA_LOG_WARN("but no loaded Gemma4 model found.\n"); + LLAMA_LOG_WARN(" => changing split mode to 'layer'\n"); + LLAMA_LOG_WARN("=======================================================\n\n"); + split_mode = LLAMA_SPLIT_MODE_LAYER; + } + } if (iqk_has_fancy_simd()) { LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is defined\n"); @@ -3971,7 +4016,7 @@ static bool llm_load_tensors( for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; auto & bufs = it.second; - if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) { + if (!ml.load_all_data(ctx, &model, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) { return false; } } @@ -6722,10 +6767,19 @@ struct llama_model * llama_model_load_from_file( return nullptr; } + llama_all_loaded_models().push_back(model); + return model; } void llama_free_model(struct llama_model * model) { + auto & all_models = llama_all_loaded_models(); + for (auto it = all_models.begin(); it != all_models.end(); ++it) { + if (*it == model) { + all_models.erase(it); + break; + } + } delete model; } @@ -7094,7 +7148,7 @@ struct llama_context * llama_init_from_model( // main_gpu is a local index into model->devices throughout the codebase // (auto-fit assigns device_count-1, MTP clamps to [0, device_count), buffer-type // setup wraps with model.devices[main_gpu]). Translate to a raw device id here. - const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size()) + [[maybe_unused]] const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size()) ? model->devices[model->main_gpu] : model->main_gpu; #if defined(GGML_USE_METAL) @@ -7110,7 +7164,7 @@ struct llama_context * llama_init_from_model( #elif defined(GGML_USE_CUDA) if (model->split_mode == LLAMA_SPLIT_MODE_NONE) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used - ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params); + ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params, ctx); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, main_gpu_id); llama_free(ctx); @@ -7129,7 +7183,7 @@ struct llama_context * llama_init_from_model( params = new_params.data(); } for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { - ggml_backend_t backend = ggml_backend_cuda_init(device, params); + ggml_backend_t backend = ggml_backend_cuda_init(device, params, ctx); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); llama_free(ctx); @@ -11099,3 +11153,8 @@ bool llama_reload_changed_tensors(struct llama_context * ctx) { } return result; } + +std::vector & llama_all_loaded_models() { + static std::vector models; + return models; +}