mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Some tweaks
This commit is contained in:
parent
d2ccbe92a6
commit
7dd19e197d
@ -240,7 +240,7 @@
|
||||
// if you need to load more than 64 model shards.
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#endif
|
||||
#define GGML_MAX_SRC 10
|
||||
#define GGML_MAX_SRC 12
|
||||
#ifndef GGML_MAX_NAME
|
||||
#define GGML_MAX_NAME 64
|
||||
#endif
|
||||
|
||||
@ -1127,6 +1127,18 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (extra->split_dim == 2) {
|
||||
size_t cur_offset = 0;
|
||||
for (int i = 0; i < extra->n_device; ++i) {
|
||||
auto split = extra->splits[i];
|
||||
if (!split) continue;
|
||||
ggml_cuda_set_device(i);
|
||||
auto size = ggml_nbytes(split);
|
||||
const char * buf_host = (const char *)data + cur_offset;
|
||||
CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||
cur_offset += size;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0);
|
||||
GGML_ABORT("fatal error");
|
||||
@ -1147,6 +1159,8 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor([[maybe_unused]]
|
||||
|
||||
if (!tensor->extra) return;
|
||||
|
||||
printf("%s(%s): offset = %zu, size = %zu\n", __func__, tensor->name, offset, size);
|
||||
|
||||
// Inverse of split_buffer_set_tensor; refuses paths with no defined inverse.
|
||||
auto extra = (ggml_split_tensor_t *)tensor->extra;
|
||||
GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
|
||||
|
||||
@ -2132,7 +2132,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
|
||||
// return;
|
||||
//} else {
|
||||
if (Q->ne[1] <= 8/ncols2) {
|
||||
if constexpr (DKQ == 512) {
|
||||
if constexpr (DKQ == 512 || DKQ == 576) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 2, ncols2>(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
|
||||
@ -2275,8 +2275,14 @@ void ggml_cuda_flash_attn_ext_mma_new(ggml_backend_cuda_context & ctx, ggml_tens
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (gqa_ratio % 12 == 0 && Q->ne[1] <= 4 && K->ne[1] >= 2048) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<576, 512, 1, 16>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
if (gqa_ratio % 16 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
|
||||
} else if (gqa_ratio % 8 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 8>(ctx, dst);
|
||||
} else if (gqa_ratio % 4 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
|
||||
} else {
|
||||
|
||||
@ -124,6 +124,9 @@ ggml_tensor * llm_build_context::build_deepseek2_tp_attention(
|
||||
cache_local->ne[0], n_tokens, row_size_cache, row_size_cache * kv_head);
|
||||
|
||||
ggml_tensor * kvr = ggml_concat(ctx0, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), kv_compressed, 0);
|
||||
if (cparams.k_cache_hadamard) {
|
||||
kvr = ggml_hadamard(ctx0, kvr, 64);
|
||||
}
|
||||
|
||||
// Per-rank cache_copies slot.
|
||||
const int cc_idx = 2 * n_device * il + 2 * id;
|
||||
@ -142,20 +145,31 @@ ggml_tensor * llm_build_context::build_deepseek2_tp_attention(
|
||||
auto wk_b_split = (const ggml_split_tensor_t *)model.layers[il].wk_b->extra;
|
||||
GGML_ASSERT(wk_b_split);
|
||||
ggml_tensor * wk_b_local = wk_b_split->splits[id];
|
||||
const int head_offset = head_offsets[id];
|
||||
const size_t wk_b_head_stride = wk_b_local->nb[1] * kv_lora_rank;
|
||||
ggml_tensor * wk_b_slice = ggml_view_3d(ctx0, wk_b_local,
|
||||
n_embd_head_qk_nope, kv_lora_rank, n_head_local,
|
||||
wk_b_local->nb[1], wk_b_head_stride,
|
||||
head_offset * wk_b_head_stride);
|
||||
cb(wk_b_slice, "wk_b_slice", il_id);
|
||||
//const int head_offset = head_offsets[id];
|
||||
//const size_t wk_b_head_stride = wk_b_local->nb[1] * kv_lora_rank;
|
||||
//ggml_tensor * wk_b_slice = ggml_view_3d(ctx0, wk_b_local,
|
||||
// n_embd_head_qk_nope, kv_lora_rank, n_head_local,
|
||||
// wk_b_local->nb[1], wk_b_head_stride,
|
||||
// head_offset * wk_b_head_stride);
|
||||
//cb(wk_b_slice, "wk_b_slice", il_id);
|
||||
//if (il == 0) {
|
||||
// auto wk_b = model.layers[il].wk_b;
|
||||
// printf("wk_b(%2d): %ld x %ld x %ld; %zu x %zu x %zu. view: %ld x %ld x %ld; %zu x %zu x %zu, offset = %zu, %d\n", id,
|
||||
// wk_b->ne[0], wk_b->ne[1], wk_b->ne[2], wk_b->nb[1], wk_b->nb[2], wk_b->nb[3],
|
||||
// wk_b_slice->ne[0], wk_b_slice->ne[1], wk_b_slice->ne[2], wk_b_slice->nb[1], wk_b_slice->nb[2], wk_b_slice->nb[3],
|
||||
// head_offset * wk_b_head_stride, head_offset);
|
||||
//}
|
||||
|
||||
ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
|
||||
|
||||
ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b_slice, q_nope_perm);
|
||||
//ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b_slice, q_nope_perm);
|
||||
ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b_local, q_nope_perm);
|
||||
|
||||
ggml_tensor * q_combined = ggml_concat(ctx0,
|
||||
ggml_permute(ctx0, q_rope, 0, 2, 1, 3), q_nope2, 0);
|
||||
if (cparams.k_cache_hadamard) {
|
||||
q_combined = ggml_hadamard(ctx0, q_combined, 64);
|
||||
}
|
||||
|
||||
// FlashMLA-3 path: K = kv_cache (full latent + rope), V = kv_cache_lora (latent only)
|
||||
ggml_tensor * kv_cache_lora = ggml_view_2d(ctx0, cache_local,
|
||||
@ -171,18 +185,29 @@ ggml_tensor * llm_build_context::build_deepseek2_tp_attention(
|
||||
if (use_f32_attn_precision) {
|
||||
ggml_flash_attn_ext_set_prec(kqv_compressed, GGML_PREC_F32);
|
||||
}
|
||||
if (cparams.k_cache_hadamard) {
|
||||
kqv_compressed = ggml_hadamard(ctx0, kqv_compressed, 64);
|
||||
}
|
||||
kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3);
|
||||
|
||||
auto wv_b_split = (const ggml_split_tensor_t *)model.layers[il].wv_b->extra;
|
||||
GGML_ASSERT(wv_b_split);
|
||||
ggml_tensor * wv_b_local = wv_b_split->splits[id];
|
||||
const size_t wv_b_head_stride = wv_b_local->nb[1] * n_embd_head_v;
|
||||
ggml_tensor * wv_b_slice = ggml_view_3d(ctx0, wv_b_local,
|
||||
kv_lora_rank, n_embd_head_v, n_head_local,
|
||||
wv_b_local->nb[1], wv_b_head_stride,
|
||||
head_offset * wv_b_head_stride);
|
||||
//const size_t wv_b_head_stride = wv_b_local->nb[1] * n_embd_head_v;
|
||||
//ggml_tensor * wv_b_slice = ggml_view_3d(ctx0, wv_b_local,
|
||||
// kv_lora_rank, n_embd_head_v, n_head_local,
|
||||
// wv_b_local->nb[1], wv_b_head_stride,
|
||||
// head_offset * wv_b_head_stride);
|
||||
//if (il == 0) {
|
||||
// auto wv_b = model.layers[il].wv_b;
|
||||
// printf("wv_b(%2d): %ld x %ld x %ld; %zu x %zu x %zu. view: %ld x %ld x %ld; %zu x %zu x %zu, offset = %zu, %d\n", id,
|
||||
// wv_b->ne[0], wv_b->ne[1], wv_b->ne[2], wv_b->nb[1], wv_b->nb[2], wv_b->nb[3],
|
||||
// wv_b_slice->ne[0], wv_b_slice->ne[1], wv_b_slice->ne[2], wv_b_slice->nb[1], wv_b_slice->nb[2], wv_b_slice->nb[3],
|
||||
// head_offset * wv_b_head_stride, head_offset);
|
||||
//}
|
||||
|
||||
ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_slice, kqv_compressed);
|
||||
//ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_slice, kqv_compressed);
|
||||
ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_local, kqv_compressed);
|
||||
if (n_tokens > 1) {
|
||||
kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
|
||||
}
|
||||
|
||||
@ -3730,7 +3730,7 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i
|
||||
|
||||
static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor,
|
||||
const std::vector<int> & splits, std::vector<size_t> & mem_used) {
|
||||
GGML_ASSERT(split_dim <= 1);
|
||||
GGML_ASSERT(split_dim <= 2);
|
||||
GGML_ASSERT(splits.size() > 1);
|
||||
std::string name{tensor->name};
|
||||
split_tensor.tensor_splits.resize(splits.size());
|
||||
@ -3755,6 +3755,17 @@ static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor
|
||||
split_tensor.tensor_splits[i] = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (split_dim == 2) {
|
||||
for (int i = 0; i < int(splits.size()); ++i) {
|
||||
if (splits[i] > 0) {
|
||||
split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, tensor->ne[0], tensor->ne[1], splits[i]);
|
||||
auto name_i = name + '.' + std::to_string(i);
|
||||
ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
|
||||
} else {
|
||||
split_tensor.tensor_splits[i] = nullptr;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < int(splits.size()); ++i) {
|
||||
if (splits[i] > 0) {
|
||||
@ -3837,10 +3848,10 @@ static void distribute_mla_tensors_for_split_mode_graph(
|
||||
prepare_split_tensors(-1, ctx_split, layer.wkv_a_mqa, layer.split_wkv_a_mqa, mirror, mem_used);
|
||||
}
|
||||
if (layer.wk_b) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.wk_b, layer.split_wk_b, mirror, mem_used);
|
||||
prepare_split_tensors( 2, ctx_split, layer.wk_b, layer.split_wk_b, split_heads, mem_used);
|
||||
}
|
||||
if (layer.wv_b) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.wv_b, layer.split_wv_b, mirror, mem_used);
|
||||
prepare_split_tensors( 2, ctx_split, layer.wv_b, layer.split_wv_b, split_heads, mem_used);
|
||||
}
|
||||
|
||||
// Output projection: row-split, partial outputs all-reduced after.
|
||||
|
||||
@ -2429,7 +2429,7 @@ static void llm_prepare_mla(llama_model & model, int mla) {
|
||||
}
|
||||
ggml_free(ctx);
|
||||
}
|
||||
if (mla == 1) return;
|
||||
if (mla == 1 || model.split_mode == LLAMA_SPLIT_MODE_GRAPH) return;
|
||||
|
||||
n_to_compute = 0;
|
||||
for (auto& l : model.layers) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user