mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Split mode graph for dense Gemma4 assistant (#2022)
* WIP: Split mode graph for Gemma4 assistant Something is not right - acceptance drops to nearly zero. * Per model CUDA contexts Still not working!? * This works The issue was that I was not correctly calculating the number of KV heads for the split KV cache. * Compiler warnings * It is better to use llama_context pointers as keys
This commit is contained in:
parent
bf23a7599c
commit
d5507e33ae
@ -66,7 +66,7 @@ struct pca_model {
|
||||
pca_model(struct ggml_tensor * t_input) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||
backend = ggml_backend_cuda_init(0, nullptr); // init device 0
|
||||
backend = ggml_backend_cuda_init(0, nullptr, nullptr); // init device 0
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||
}
|
||||
|
||||
@ -21,7 +21,7 @@ extern "C" {
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params);
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params, const void * model);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
|
||||
@ -43,7 +43,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(void);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(const void * model);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -298,13 +298,22 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
}
|
||||
|
||||
/* ---------- hot-swap: invalidate all cached CUDA graphs ---------- */
|
||||
extern "C" void ggml_backend_cuda_invalidate_graphs(void) {
|
||||
extern "C" void ggml_backend_cuda_invalidate_graphs(const void * model) {
|
||||
auto & info = const_cast<ggml_cuda_device_info &>(ggml_cuda_info());
|
||||
for (int i = 0; i < info.device_count; ++i) {
|
||||
if (info.all_ctx[i]) {
|
||||
info.all_ctx[i]->cuda_graphs.clear();
|
||||
if (auto it = info.all_ctx.find(model); it != info.all_ctx.end()) {
|
||||
for (auto ctx : it->second) {
|
||||
if (ctx) {
|
||||
ctx->cuda_graphs.clear();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "================================= %s: did not find entry for model at %p\n", __func__, model);
|
||||
}
|
||||
//for (int i = 0; i < info.device_count; ++i) {
|
||||
// if (info.all_ctx[i]) {
|
||||
// info.all_ctx[i]->cuda_graphs.clear();
|
||||
// }
|
||||
//}
|
||||
}
|
||||
|
||||
// #define DEBUG_CUDA_MALLOC
|
||||
@ -517,13 +526,14 @@ static std::condition_variable ggml_cuda_lock_cv;
|
||||
//static std::atomic<int> ggml_cuda_lock_counter;
|
||||
static int ggml_cuda_lock_counter = 0;
|
||||
|
||||
ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
|
||||
device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
|
||||
ggml_backend_cuda_context::ggml_backend_cuda_context(int device, const void * model) :
|
||||
device(device), name(GGML_CUDA_NAME + std::to_string(device)), model(model) {
|
||||
auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
|
||||
if (info->all_ctx[device]) {
|
||||
auto & all_ctx = info->all_ctx[model];
|
||||
if (all_ctx[device]) {
|
||||
GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
|
||||
} else{
|
||||
info->all_ctx[device] = this;
|
||||
all_ctx[device] = this;
|
||||
}
|
||||
}
|
||||
|
||||
@ -555,9 +565,12 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
|
||||
}
|
||||
}
|
||||
auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
|
||||
if (info->all_ctx[device] == this) {
|
||||
info->all_ctx[device] = nullptr;
|
||||
if (auto it = info->all_ctx.find(model); it != info->all_ctx.end() && it->second[device] == this) {
|
||||
it->second[device] = nullptr;
|
||||
}
|
||||
//if (info->all_ctx[device] == this) {
|
||||
// info->all_ctx[device] = nullptr;
|
||||
//}
|
||||
|
||||
}
|
||||
|
||||
@ -4247,23 +4260,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
needs_f16_f32_copy = true;
|
||||
|
||||
} else {
|
||||
#ifdef GGML_USE_NCCL__
|
||||
auto & info = ggml_cuda_info();
|
||||
auto nbytes = ggml_nbytes(src);
|
||||
ncclGroupStart();
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device],
|
||||
info.all_ctx[cuda_ctx_src->device]->stream());
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device],
|
||||
info.all_ctx[cuda_ctx_dst->device]->stream());
|
||||
ncclGroupEnd();
|
||||
GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess);
|
||||
return true;
|
||||
#else
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -5249,13 +5247,13 @@ static cuda_params ggml_cuda_parse_params(const char * params_string) {
|
||||
return params;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string) {
|
||||
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string, const void * model) {
|
||||
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
|
||||
GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
|
||||
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device, model);
|
||||
if (ctx == nullptr) {
|
||||
GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
|
||||
return nullptr;
|
||||
@ -5370,7 +5368,7 @@ GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
||||
|
||||
// backend registry
|
||||
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr);
|
||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr, nullptr);
|
||||
return cuda_backend;
|
||||
|
||||
GGML_UNUSED(params);
|
||||
|
||||
@ -762,7 +762,8 @@ struct ggml_cuda_device_info {
|
||||
|
||||
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
||||
std::unordered_map<const void *, std::array<ggml_backend_cuda_context *, GGML_CUDA_MAX_DEVICES>> all_ctx;
|
||||
//ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
||||
#ifdef GGML_USE_NCCL
|
||||
ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
|
||||
bool have_nccl;
|
||||
@ -864,10 +865,11 @@ struct ggml_backend_cuda_context {
|
||||
|
||||
#endif
|
||||
|
||||
const void * model;
|
||||
void * copy_buffer = nullptr;
|
||||
size_t copy_size = 0;
|
||||
|
||||
explicit ggml_backend_cuda_context(int device);
|
||||
explicit ggml_backend_cuda_context(int device, const void * model);
|
||||
|
||||
~ggml_backend_cuda_context();
|
||||
|
||||
|
||||
@ -94,6 +94,11 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
if (ncopy < 1) return;
|
||||
|
||||
auto & info = ggml_cuda_info();
|
||||
auto it = info.all_ctx.find(ctx.model);
|
||||
if (it == info.all_ctx.end()) {
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
auto & all_ctx = it->second;
|
||||
auto size = ggml_nbytes(dst);
|
||||
int isrc = 0;
|
||||
for (int ii = 0; ii < ncopy; ++ii) {
|
||||
@ -102,9 +107,9 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
isrc = (isrc + 1)%nhave;
|
||||
//printf("%s: copying from device %d to device %d: %p -> %p\n", __func__, j, i, dst->src[j]->data, dst->src[i]->data);
|
||||
ggml_cuda_set_device(j);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, info.all_ctx[i]->device, dst->src[j]->data, info.all_ctx[j]->device,
|
||||
size, info.all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, all_ctx[i]->device, dst->src[j]->data, all_ctx[j]->device,
|
||||
size, all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
|
||||
}
|
||||
isrc = 0;
|
||||
for (int ii = 0; ii < ncopy; ++ii) {
|
||||
@ -112,7 +117,7 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
int j = idx[isrc];
|
||||
isrc = (isrc + 1)%nhave;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event, 0));
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
}
|
||||
@ -133,6 +138,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
}
|
||||
|
||||
auto & info = ggml_cuda_info();
|
||||
auto it = info.all_ctx.find(ctx.model);
|
||||
if (it == info.all_ctx.end()) {
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
auto & all_ctx = it->second;
|
||||
#ifdef GGML_USE_NCCL
|
||||
// Somehow I'm not able to figure out how to use NCCL correctly.
|
||||
// It does not work at all if not all GPUs participate in the reduce op, and we
|
||||
@ -153,7 +163,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
ggml_cuda_set_device(i);
|
||||
auto status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], info.all_ctx[i]->stream());
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], all_ctx[i]->stream());
|
||||
if (status != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
|
||||
GGML_ABORT("Fatal error");
|
||||
@ -275,7 +285,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
auto size_per_device = nblocks_per_device * tt.type_size;
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
auto this_ctx = info.all_ctx[i];
|
||||
auto this_ctx = all_ctx[i];
|
||||
if (!this_ctx->copy_event || !this_ctx->compute_event || size_per_device > this_ctx->copy_size) {
|
||||
ggml_cuda_set_device(this_ctx->device);
|
||||
if (!this_ctx->copy_event) {
|
||||
@ -300,14 +310,14 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
|
||||
auto this_size = (this_nelem / tt.blck_size) * tt.type_size;
|
||||
ggml_cuda_set_device(info.all_ctx[peer]->device);
|
||||
ggml_cuda_set_device(all_ctx[peer]->device);
|
||||
if (stage > 0) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
|
||||
this_size, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(all_ctx[i]->copy_buffer, all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
|
||||
this_size, all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
|
||||
ichunk = (ichunk + 1)%nhave;
|
||||
}
|
||||
ichunk = stage;
|
||||
@ -315,24 +325,24 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int i = idx[ii];
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
|
||||
ggml_cuda_set_device(info.all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
|
||||
ggml_cuda_set_device(all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
|
||||
int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const half *)info.all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const half *)all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
} else if (dst->type == GGML_TYPE_Q8_0) {
|
||||
k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const block_q8_0 *)info.all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
|
||||
k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const block_q8_0 *)all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
|
||||
} else if (dst->type == GGML_TYPE_BF16) {
|
||||
k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(
|
||||
this_nelem, (const nv_bfloat16 *)info.all_ctx[i]->copy_buffer,
|
||||
k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(
|
||||
this_nelem, (const nv_bfloat16 *)all_ctx[i]->copy_buffer,
|
||||
(nv_bfloat16 *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
} else {
|
||||
k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const float *)all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
}
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->compute_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->compute_event, all_ctx[i]->stream()));
|
||||
ichunk = (ichunk + 1)%nhave;
|
||||
}
|
||||
}
|
||||
@ -343,21 +353,21 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
|
||||
auto this_size = (this_nelem / tt.blck_size) * tt.type_size;
|
||||
ggml_cuda_set_device(info.all_ctx[peer]->device);
|
||||
ggml_cuda_set_device(all_ctx[peer]->device);
|
||||
if (stage == 0) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, info.all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
|
||||
this_size, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
|
||||
this_size, all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
|
||||
ichunk = (ichunk + 1)%nhave;
|
||||
}
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
ggml_cuda_set_device(info.all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
|
||||
ggml_cuda_set_device(all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
|
||||
}
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
@ -372,8 +382,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
GGML_ASSERT(dst->src[i]->type == dst->type);
|
||||
GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
|
||||
ggml_cuda_set_device(i);
|
||||
if (!info.all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
if (!all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
}
|
||||
}
|
||||
auto nelem = ggml_nelements(dst);
|
||||
@ -386,20 +396,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
task.ptrs[0] = (char *)dst->src[i]->data;
|
||||
int j = idx[2*ii+1];
|
||||
ggml_cuda_set_device(j);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
|
||||
task.ptrs[1] = (char *)dst->src[j]->data;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
} else {
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[2*ii+0];
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[2*ii+1];
|
||||
@ -411,23 +421,23 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int j = idx[(2*ii+2)%nhave];
|
||||
task.ptrs[1] = (char *)dst->src[j]->data;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
} else {
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[2*ii+1];
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[(2*ii+2)%nhave];
|
||||
ggml_cuda_set_device(i);
|
||||
int j = idx[2*ii+1];
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
if (ncopy > 0) {
|
||||
@ -442,10 +452,10 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
GGML_ASSERT(dst->src[i]->type == dst->type);
|
||||
GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
|
||||
ggml_cuda_set_device(i);
|
||||
if (!info.all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
if (!all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
}
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
//printf("Recorded events\n");
|
||||
auto nelem = ggml_nelements(dst);
|
||||
@ -465,37 +475,37 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int jj = 0; jj < nhave; ++jj) {
|
||||
if (jj == ii) continue;
|
||||
int j = idx[jj];
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
task.ptrs[k++] = (char *)dst->src[j]->data + ii*nelem_per_device*elem_size;
|
||||
}
|
||||
int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
switch (nhave) {
|
||||
case 2:
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 3:
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 4:
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
default:
|
||||
k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
} else {
|
||||
switch (nhave) {
|
||||
case 2:
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 3:
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 4:
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
default:
|
||||
k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -503,7 +513,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
//printf("Recorded events again\n");
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
@ -512,7 +522,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int jj = 0; jj < nhave; ++jj) {
|
||||
if (jj == ii) continue;
|
||||
int j = idx[jj];
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
}
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
@ -536,11 +546,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
|
||||
if (i == ctx.device) continue;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
|
||||
if (!info.all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, all_ctx[i]->stream()));
|
||||
if (!all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
}
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
ptr += nbytes;
|
||||
}
|
||||
auto nelem = ggml_nelements(dst);
|
||||
@ -550,7 +560,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
if (i == ctx.device) continue;
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->data);
|
||||
} else if (dst->type == GGML_TYPE_BF16) {
|
||||
@ -572,15 +582,15 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int i = idx[ii];
|
||||
if (i == ctx.device) continue;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), ctx.copy_event, 0));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
if (i == ctx.device) continue;
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
|
||||
}
|
||||
if (ncopy > 0) {
|
||||
copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
|
||||
|
||||
@ -591,10 +591,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
|
||||
|
||||
ggml_tensor * KQ_mask = nullptr;
|
||||
ggml_tensor * KQ_mask_swa = nullptr;
|
||||
ggml_tensor * frozen_k_swa = nullptr;
|
||||
ggml_tensor * frozen_v_swa = nullptr;
|
||||
ggml_tensor * frozen_k_full = nullptr;
|
||||
ggml_tensor * frozen_v_full = nullptr;
|
||||
{
|
||||
const int64_t n_mask_tokens = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
|
||||
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32, target_n_kv, n_mask_tokens);
|
||||
@ -610,59 +606,248 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
|
||||
}
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
|
||||
int n_device = model.splits.size();
|
||||
std::vector<ggml_tensor *> sa_inp(n_device, nullptr);
|
||||
std::vector<ggml_tensor *> sa_out(n_device, nullptr);
|
||||
std::vector<ggml_tensor *> ffn_inp(n_device, nullptr);
|
||||
std::vector<ggml_tensor *> ffn_out(n_device, nullptr);
|
||||
ggml_tensor * inpL = cur;
|
||||
|
||||
const bool is_sliding = hparams.swa_layers[il] ? true : false;
|
||||
const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
|
||||
const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
|
||||
const int n_swa = is_sliding ? target_hparams.n_swa : 0;
|
||||
const int n_embd_head = hparams.n_embd_head_k(il);
|
||||
const int n_head = hparams.n_head(il);
|
||||
ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
const bool is_sliding = hparams.swa_layers[il] ? true : false;
|
||||
const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
|
||||
const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
|
||||
const int n_swa = is_sliding ? target_hparams.n_swa : 0;
|
||||
const int n_embd_head = hparams.n_embd_head_k(il);
|
||||
ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
|
||||
ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
|
||||
|
||||
const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
|
||||
ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
|
||||
ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
|
||||
gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
|
||||
cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
|
||||
nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
|
||||
&frozen_k, &frozen_v);
|
||||
auto split_kl = (const ggml_split_tensor_t *)target_kv.k_l[target_il]->extra;
|
||||
auto split_vl = (const ggml_split_tensor_t *)target_kv.v_l[target_il]->extra;
|
||||
GGML_ASSERT(split_kl && split_vl);
|
||||
auto split_ql = (const ggml_split_tensor_t *)model.layers[il].wq->extra;
|
||||
auto split_ol = (const ggml_split_tensor_t *)model.layers[il].wo->extra;
|
||||
GGML_ASSERT(split_ql && split_ol);
|
||||
GGML_ASSERT(split_ql->n_device == n_device && split_kl->n_device == n_device && split_vl->n_device == n_device && split_ol->n_device == n_device);
|
||||
ggml_tensor * sa_last = nullptr;
|
||||
int nhave = 0;
|
||||
for (int id = 0; id < n_device; ++id) {
|
||||
GGML_ASSERT((split_kl->splits[id] && split_vl->splits[id] && split_ql->splits[id] && split_ol->splits[id]) ||
|
||||
!(split_kl->splits[id] || split_vl->splits[id] || split_ql->splits[id] || split_ol->splits[id]));
|
||||
if (!split_kl->splits[id]) {
|
||||
sa_inp[id] = sa_out[id] = nullptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cb(cur, "attn_out", il);
|
||||
int il_cb = 1000*(il + 1) + id;
|
||||
|
||||
ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
|
||||
model.layers[il].ffn_up, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate, nullptr, nullptr,
|
||||
model.layers[il].ffn_down, nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
|
||||
cb(ffn, "ffn_out", il);
|
||||
if (il == 0) {
|
||||
sa_inp[id] = inpL;
|
||||
} else {
|
||||
GGML_ASSERT(inpL->op == GGML_OP_REDUCE);
|
||||
cur = get_input_tensor_sm_graph(ctx0, inpL, id);
|
||||
GGML_ASSERT(model.layers[il-1].ffn_post_norm && model.layers[il-1].ffn_post_norm->extra);
|
||||
cur = do_split_norm(ctx0, cur, model.layers[il-1].ffn_post_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "ffn_normed", il_cb);
|
||||
auto add = ffn_inp[id];
|
||||
if (!add) {
|
||||
for (int j = 0; j < n_device; ++j) {
|
||||
if (ffn_inp[j]) {
|
||||
add = ffn_inp[j]; break;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(add);
|
||||
}
|
||||
sa_inp[id] = ggml_add(ctx0, cur, add);
|
||||
cb(sa_inp[id], "sa_inp", il_cb);
|
||||
if (model.layers[il-1].out_scale) {
|
||||
auto scale = (const ggml_split_tensor_t *)model.layers[il-1].out_scale->extra;
|
||||
sa_inp[id] = ggml_mul(ctx0, sa_inp[id], scale->splits[id]);
|
||||
cb(sa_inp[id], "sa_inp_scaled", il_cb);
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(model.layers[il].attn_norm && model.layers[il].attn_norm->extra);
|
||||
cur = do_split_norm(ctx0, sa_inp[id], model.layers[il].attn_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "sa_inp_normed", il_cb);
|
||||
auto Qcur = llm_build_lora_mm(lctx, ctx0, split_ql->splits[id], cur);
|
||||
cb(Qcur, "Qcur", il_cb);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, Qcur->ne[0]/n_embd_head, n_tokens);
|
||||
GGML_ASSERT(model.layers[il].attn_q_norm && model.layers[il].attn_q_norm->extra);
|
||||
Qcur = do_split_norm(ctx0, Qcur, model.layers[il].attn_q_norm, hparams, cb, id, il_cb, false);
|
||||
cb(Qcur, "Qcur_normed", il_cb);
|
||||
auto freq_factors = is_sliding ? nullptr : ((const ggml_split_tensor_t *)model.layers[il].rope_freqs->extra)->splits[id];
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_rope", il_cb);
|
||||
GGML_ASSERT(split_kl->splits[id]->ne[1] % target_kv.size == 0);
|
||||
int n_head_kv = split_kl->splits[id]->ne[1] / target_kv.size;
|
||||
auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
||||
auto k = ggml_view_3d(ctx0, split_kl->splits[id], n_embd_head, target_n_kv, n_head_kv,
|
||||
ggml_row_size(split_kl->splits[id]->type, n_embd_head)*n_head_kv,
|
||||
ggml_row_size(split_kl->splits[id]->type, n_embd_head), 0);
|
||||
auto v = ggml_view_3d(ctx0, split_vl->splits[id], n_embd_head, target_n_kv, n_head_kv,
|
||||
ggml_row_size(split_vl->splits[id]->type, n_embd_head)*n_head_kv,
|
||||
ggml_row_size(split_vl->splits[id]->type, n_embd_head), 0);
|
||||
cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask_l, hparams.f_attention_scale, 0.0f, 0.0f);
|
||||
cur->op_params[4] = n_swa;
|
||||
cb(cur, "fa", il_cb);
|
||||
cur = ggml_reshape_2d(ctx0, cur, split_ol->splits[id]->ne[0], ggml_nelements(cur)/split_ol->splits[id]->ne[0]);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, split_ol->splits[id], cur);
|
||||
cb(cur, "qkv", il_cb);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
sa_out[id] = cur;
|
||||
sa_last = cur;
|
||||
++nhave;
|
||||
}
|
||||
|
||||
cur = ffn;
|
||||
if (model.layers[il].out_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
|
||||
cb(cur, "out_scaled", il);
|
||||
auto last_ffn_inp = nhave > 1 ? ggml_reduce(ctx0, sa_out.data(), n_device, GGML_OP_ADD) : sa_last;
|
||||
ggml_build_forward_expand(gf, last_ffn_inp);
|
||||
cb(last_ffn_inp, "sa_reduce", il);
|
||||
|
||||
auto ffn_up = (const ggml_split_tensor_t *)model.layers[il].ffn_up->extra;
|
||||
auto ffn_gate = (const ggml_split_tensor_t *)model.layers[il].ffn_gate->extra;
|
||||
auto ffn_down = (const ggml_split_tensor_t *)model.layers[il].ffn_down->extra;
|
||||
GGML_ASSERT(ffn_up && ffn_gate && ffn_down);
|
||||
|
||||
for (int id = 0; id < n_device; ++id) {
|
||||
GGML_ASSERT((ffn_up->splits[id] && ffn_gate->splits[id] && ffn_down->splits[id]) ||
|
||||
(!ffn_up->splits[id] && !ffn_gate->splits[id] && !ffn_down->splits[id]));
|
||||
if (!ffn_up->splits[id]) {
|
||||
ffn_inp[id] = ffn_out[id] = nullptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
GGML_ASSERT(last_ffn_inp && (nhave == 1 || last_ffn_inp->op == GGML_OP_REDUCE));
|
||||
|
||||
int il_cb = 1000*(il + 1) + id;
|
||||
|
||||
cur = get_input_tensor_sm_graph(ctx0, last_ffn_inp, id);
|
||||
cur = do_split_norm(ctx0, cur, model.layers[il].attn_post_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "sa_post", il_cb);
|
||||
auto add = sa_inp[id];
|
||||
if (!add) {
|
||||
for (int j = 0; j < n_device; ++j) {
|
||||
if (sa_inp[j]) {
|
||||
add = sa_inp[j]; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
ffn_inp[id] = ggml_add(ctx0, cur, add);
|
||||
cb(ffn_inp[id], "ffn_inp", il_cb);
|
||||
cur = do_split_norm(ctx0, ffn_inp[id], model.layers[il].ffn_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "ffn_inp_normed", il_cb);
|
||||
cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
|
||||
ffn_up->splits[id], nullptr, nullptr,
|
||||
ffn_gate->splits[id], nullptr, nullptr,
|
||||
ffn_down->splits[id], nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, false, false, nullptr, nullptr);
|
||||
cb(cur, "ffn", il_cb);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
ffn_out[id] = cur;
|
||||
|
||||
}
|
||||
|
||||
inpL = ggml_reduce(ctx0, ffn_out.data(), n_device, GGML_OP_ADD);
|
||||
cb(inpL, "ffn_reduce", il);
|
||||
ggml_build_forward_expand(gf, inpL);
|
||||
}
|
||||
|
||||
int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
|
||||
int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
|
||||
if (idx_out >= 0) idx = idx_out;
|
||||
cur = inpL->src[idx];
|
||||
if (!cur) {
|
||||
cur = inpL->view_src;
|
||||
}
|
||||
|
||||
auto post_norm = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].ffn_post_norm->extra;
|
||||
cur = llm_build_norm(ctx0, cur, hparams, post_norm->splits[idx], NULL, LLM_NORM_RMS, cb, -1);
|
||||
|
||||
cb(cur, "ffn_normed", hparams.n_layer-1);
|
||||
auto add = ffn_inp[idx];
|
||||
if (!add) {
|
||||
for (int j = 0; j < n_device; ++j) {
|
||||
if (ffn_inp[j]) {
|
||||
add = ffn_inp[j]; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, add);
|
||||
cb(cur, "ffn_out", hparams.n_layer-1);
|
||||
|
||||
if (model.layers[hparams.n_layer-1].out_scale) {
|
||||
auto scale = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].out_scale->extra;
|
||||
cur = ggml_mul(ctx0, cur, scale->splits[idx]);
|
||||
cb(cur, "ffn_out_scaled", hparams.n_layer-1);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
ggml_tensor * frozen_k_swa = nullptr;
|
||||
ggml_tensor * frozen_v_swa = nullptr;
|
||||
ggml_tensor * frozen_k_full = nullptr;
|
||||
ggml_tensor * frozen_v_full = nullptr;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpL = cur;
|
||||
|
||||
const bool is_sliding = hparams.swa_layers[il] ? true : false;
|
||||
const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
|
||||
const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
|
||||
const int n_swa = is_sliding ? target_hparams.n_swa : 0;
|
||||
const int n_embd_head = hparams.n_embd_head_k(il);
|
||||
const int n_head = hparams.n_head(il);
|
||||
ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
|
||||
const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
|
||||
ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
|
||||
ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
|
||||
gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
|
||||
cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
|
||||
nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
|
||||
&frozen_k, &frozen_v);
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cb(cur, "attn_out", il);
|
||||
|
||||
ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
|
||||
model.layers[il].ffn_up, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate, nullptr, nullptr,
|
||||
model.layers[il].ffn_down, nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
|
||||
cb(ffn, "ffn_out", il);
|
||||
|
||||
cur = ffn;
|
||||
if (model.layers[il].out_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
|
||||
cb(cur, "out_scaled", il);
|
||||
}
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
}
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, nullptr, LLM_NORM_RMS, cb, -1);
|
||||
|
||||
@ -233,3 +233,5 @@ struct llama_split_tensor {
|
||||
|
||||
void llama_decode_reset();
|
||||
void llama_decode_stop();
|
||||
|
||||
std::vector<llama_model *> & llama_all_loaded_models();
|
||||
|
||||
@ -194,6 +194,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
ggml_context * ctx_output;
|
||||
ggml_context * ctx_output_split;
|
||||
|
||||
llama_model * tgt_model = nullptr;
|
||||
|
||||
ggml_backend_buffer_type_t default_cpu_buft;
|
||||
bool has_buft_overrides = false;
|
||||
|
||||
@ -221,6 +223,21 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
|
||||
create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) {
|
||||
|
||||
if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
for (auto model : all_models) {
|
||||
if (model->arch == LLM_ARCH_GEMMA4) {
|
||||
tgt_model = model;
|
||||
}
|
||||
}
|
||||
if (tgt_model) {
|
||||
LLAMA_LOG_INFO("==================== Found target model for Gemma4-Assistant. split mode graph: %d\n", model.split_mode == LLAMA_SPLIT_MODE_GRAPH);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("==================== Did not find target model for Gemma4-Assistant\n");
|
||||
model.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_layer = model.hparams.n_layer;
|
||||
buft_layer_count[model.buft_input.buft]++;
|
||||
buft_layer_count[model.buft_input.buft_matrix]++;
|
||||
@ -2242,7 +2259,7 @@ bool create_tensors_helper::create_gemma4_mtp_tensors(const LLM_TN & tn) {
|
||||
const int64_t n_ff_cur = hparams.n_ff(i);
|
||||
|
||||
if (!hparams.swa_layers[i]) {
|
||||
layer.rope_freqs = create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
|
||||
layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED | rope_flag);
|
||||
rope_flag = llama_model_loader::TENSOR_DUPLICATED;
|
||||
}
|
||||
@ -4540,7 +4557,7 @@ bool create_tensors_helper::create_tensors() {
|
||||
|
||||
{
|
||||
const bool unsupported =
|
||||
(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
|
||||
//(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
|
||||
(model.arch == LLM_ARCH_GEMMA4 && model.tok_embd_per_layer);
|
||||
if (unsupported && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) {
|
||||
LLAMA_LOG_WARN("\n=========================================================\n");
|
||||
@ -4583,6 +4600,12 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
|
||||
gpu_split_count.resize(model.splits.size(), 0.0f);
|
||||
}
|
||||
auto is_gemma4_model = [this] () {
|
||||
return model.arch == LLM_ARCH_GEMMA4 || model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
|
||||
};
|
||||
auto is_gemma4_assistant = [this] () {
|
||||
return model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
|
||||
};
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// For now only run MTP into the per-layer
|
||||
if (model.mtp && hparams.nextn_predict_layers > 0 &&
|
||||
@ -4620,7 +4643,7 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (layer.attn_norm) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, mirror, mem_used);
|
||||
}
|
||||
if (model.arch == LLM_ARCH_GEMMA4 && layer.attn_post_norm) {
|
||||
if (is_gemma4_model() && layer.attn_post_norm) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.attn_post_norm, layer.split_attn_post_norm, mirror, mem_used);
|
||||
}
|
||||
if (layer.rope_freqs) {
|
||||
@ -4630,6 +4653,48 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (hparams.is_recurrent(il)) {
|
||||
split_recurrent_tensors(hparams, layer, cur_splits, mem_used, ctx_split, il); //, model.arch == LLM_ARCH_QWEN3NEXT ? 0 : 1);
|
||||
}
|
||||
else if (is_gemma4_assistant()) {
|
||||
GGML_ASSERT(layer.wo && layer.wq);
|
||||
GGML_ASSERT(tgt_model);
|
||||
int n_embd_head = hparams.n_embd_head_k(il);
|
||||
int n_head = hparams.n_head(il);
|
||||
bool is_sliding = hparams.swa_layers[il] != 0;
|
||||
int target_n_kv_layer = tgt_model->hparams.n_layer_kv_from_start > 0
|
||||
? std::min<int>((int) tgt_model->hparams.n_layer, tgt_model->hparams.n_layer_kv_from_start)
|
||||
: (int) tgt_model->hparams.n_layer;
|
||||
int target_il = target_n_kv_layer - 1;
|
||||
for (; target_il >= 0; --target_il) {
|
||||
if ((tgt_model->hparams.swa_layers[target_il] != 0) == is_sliding) break;
|
||||
}
|
||||
GGML_ASSERT(target_il >= 0 && "Gemma4 MTP could not find a matching target KV layer");
|
||||
int n_head_tgt = tgt_model->hparams.n_head(target_il);
|
||||
GGML_ASSERT(tgt_model->hparams.n_embd_head_k(target_il) == n_embd_head);
|
||||
auto & target_layer = tgt_model->layers[target_il];
|
||||
auto split_wq = (const ggml_split_tensor_t *)target_layer.wq->extra;
|
||||
auto split_wo = (const ggml_split_tensor_t *)target_layer.wo->extra;
|
||||
GGML_ASSERT(split_wq && split_wo);
|
||||
std::vector<int> q_split(split_wq->n_device, 0);
|
||||
std::vector<int> o_split(split_wo->n_device, 0);
|
||||
for (int id = 0; id < split_wq->n_device; ++id) {
|
||||
if (split_wq->splits[id]) {
|
||||
int nh = split_wq->splits[id]->ne[1] / n_embd_head;
|
||||
GGML_ASSERT((nh*n_head) % n_head_tgt == 0);
|
||||
q_split[id] = ((nh*n_head)/n_head_tgt)*n_embd_head;
|
||||
}
|
||||
}
|
||||
for (int id = 0; id < split_wo->n_device; ++id) {
|
||||
if (split_wo->splits[id]) {
|
||||
int64_t no = split_wo->splits[id]->ne[0] * layer.wo->ne[0];
|
||||
GGML_ASSERT(no % target_layer.wo->ne[0] == 0);
|
||||
o_split[id] = no / target_layer.wo->ne[0];
|
||||
}
|
||||
}
|
||||
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, q_split, mem_used);
|
||||
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, o_split, mem_used);
|
||||
if (layer.attn_q_norm) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, o_split, mem_used);
|
||||
}
|
||||
}
|
||||
else if (layer.wo && layer.wq && layer.wk && (layer.wv || model.arch == LLM_ARCH_GEMMA4)) {
|
||||
auto granularity_kq = hparams.n_embd_head_k(il) * gqa_ratio;
|
||||
int wq_ne1 = layer.wq->ne[1];
|
||||
|
||||
@ -1053,6 +1053,7 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
|
||||
// Returns false if cancelled by progress_callback
|
||||
bool llama_model_loader::load_all_data(
|
||||
struct ggml_context * ctx,
|
||||
[[maybe_unused]] llama_model * model,
|
||||
llama_buf_map & bufs_mmap,
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
@ -1083,7 +1084,7 @@ bool llama_model_loader::load_all_data(
|
||||
for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
|
||||
auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
|
||||
if (buffer_type == cuda_buffer_type) {
|
||||
cuda_backend = ggml_backend_cuda_init(i, nullptr);
|
||||
cuda_backend = ggml_backend_cuda_init(i, nullptr, model);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -184,6 +184,7 @@ struct llama_model_loader {
|
||||
// Returns false if cancelled by progress_callback
|
||||
bool load_all_data(
|
||||
struct ggml_context * ctx,
|
||||
struct llama_model * model,
|
||||
llama_buf_map & bufs_mmap,
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
|
||||
@ -939,7 +939,7 @@ bool reload_info::reload_changed_tensors(llama_model & model) {
|
||||
|
||||
if (r) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
ggml_backend_cuda_invalidate_graphs();
|
||||
ggml_backend_cuda_invalidate_graphs(&model);
|
||||
#endif
|
||||
}
|
||||
return r;
|
||||
|
||||
@ -682,6 +682,11 @@ bool llama_context::update_cache_copies() {
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::vector<llama_context *> & llama_all_contexts() {
|
||||
static std::vector<llama_context *> contexts;
|
||||
return contexts;
|
||||
}
|
||||
|
||||
llama_context::llama_context(const llama_model & model)
|
||||
: model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
|
||||
const auto & hparams = model.hparams;
|
||||
@ -690,6 +695,7 @@ llama_context::llama_context(const llama_model & model)
|
||||
} else {
|
||||
cache_copies.resize(2*hparams.n_layer);
|
||||
}
|
||||
llama_all_contexts().push_back(this);
|
||||
}
|
||||
|
||||
void llama_context::set_mtp_op_type(llama_mtp_op_type value) {
|
||||
@ -710,6 +716,14 @@ llama_context::~llama_context() {
|
||||
}
|
||||
|
||||
ggml_backend_buffer_free(buf_output);
|
||||
|
||||
auto & all_contexts = llama_all_contexts();
|
||||
for (auto it = all_contexts.begin(); it != all_contexts.end(); ++it) {
|
||||
if (*it == this) {
|
||||
all_contexts.erase(it);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int llama_context::max_nodes(int n_tokens, int n_kv) const {
|
||||
@ -3093,6 +3107,8 @@ static bool is_model_split_supported(const llama_model & model) {
|
||||
LLM_ARCH_QWEN35,
|
||||
LLM_ARCH_QWEN35MOE,
|
||||
LLM_ARCH_GEMMA4,
|
||||
LLM_ARCH_GEMMA4_MTP,
|
||||
LLM_ARCH_GEMMA4_ASSISTANT,
|
||||
LLM_ARCH_DEEPSEEK2,
|
||||
LLM_ARCH_GLM_DSA,
|
||||
LLM_ARCH_MISTRAL4,
|
||||
@ -3371,11 +3387,21 @@ static bool llm_load_tensors(
|
||||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
llama_model * tgt_model = nullptr;
|
||||
for (auto model : all_models) {
|
||||
if (model->arch == LLM_ARCH_GEMMA4) {
|
||||
tgt_model = model;
|
||||
}
|
||||
}
|
||||
if (tgt_model) {
|
||||
split_mode = tgt_model->split_mode;
|
||||
}
|
||||
}
|
||||
|
||||
if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) {
|
||||
const bool unsupported_gemma_split =
|
||||
model.arch == LLM_ARCH_GEMMA4_MTP ||
|
||||
model.arch == LLM_ARCH_GEMMA4_ASSISTANT ||
|
||||
(model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);
|
||||
const bool unsupported_gemma_split = model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0;
|
||||
|
||||
if (unsupported_gemma_split) {
|
||||
LLAMA_LOG_WARN("\n=========================================================\n");
|
||||
@ -3401,6 +3427,25 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) &&
|
||||
(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT)) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
bool has_target_gemma = false;
|
||||
for (auto model : all_models) {
|
||||
if (model->arch == LLM_ARCH_GEMMA4) {
|
||||
has_target_gemma = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!has_target_gemma) {
|
||||
LLAMA_LOG_WARN("\n=======================================================\n");
|
||||
LLAMA_LOG_WARN("Split mode 'graph' requested for Gemma4-assistant model\n");
|
||||
LLAMA_LOG_WARN("but no loaded Gemma4 model found.\n");
|
||||
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
|
||||
LLAMA_LOG_WARN("=======================================================\n\n");
|
||||
split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
}
|
||||
|
||||
if (iqk_has_fancy_simd()) {
|
||||
LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is defined\n");
|
||||
@ -3971,7 +4016,7 @@ static bool llm_load_tensors(
|
||||
for (auto & it : ctx_bufs) {
|
||||
ggml_context * ctx = it.first;
|
||||
auto & bufs = it.second;
|
||||
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
if (!ml.load_all_data(ctx, &model, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -6722,10 +6767,19 @@ struct llama_model * llama_model_load_from_file(
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
llama_all_loaded_models().push_back(model);
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
void llama_free_model(struct llama_model * model) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
for (auto it = all_models.begin(); it != all_models.end(); ++it) {
|
||||
if (*it == model) {
|
||||
all_models.erase(it);
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete model;
|
||||
}
|
||||
|
||||
@ -7094,7 +7148,7 @@ struct llama_context * llama_init_from_model(
|
||||
// main_gpu is a local index into model->devices throughout the codebase
|
||||
// (auto-fit assigns device_count-1, MTP clamps to [0, device_count), buffer-type
|
||||
// setup wraps with model.devices[main_gpu]). Translate to a raw device id here.
|
||||
const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
|
||||
[[maybe_unused]] const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
|
||||
? model->devices[model->main_gpu]
|
||||
: model->main_gpu;
|
||||
#if defined(GGML_USE_METAL)
|
||||
@ -7110,7 +7164,7 @@ struct llama_context * llama_init_from_model(
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params);
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params, ctx);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, main_gpu_id);
|
||||
llama_free(ctx);
|
||||
@ -7129,7 +7183,7 @@ struct llama_context * llama_init_from_model(
|
||||
params = new_params.data();
|
||||
}
|
||||
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(device, params);
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(device, params, ctx);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
||||
llama_free(ctx);
|
||||
@ -11099,3 +11153,8 @@ bool llama_reload_changed_tensors(struct llama_context * ctx) {
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<llama_model *> & llama_all_loaded_models() {
|
||||
static std::vector<llama_model *> models;
|
||||
return models;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user