Some tweaks

2026-06-28 04:30:15 -05:00 · 2026-05-18 15:52:26 +00:00 · 2026-05-18 15:52:26 +00:00 · 7dd19e197d
commit 7dd19e197d
parent d2ccbe92a6
6 changed files with 76 additions and 20 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -240,7 +240,7 @@
 // if you need to load more than 64 model shards.
 #define GGML_MAX_CONTEXTS       64
 #endif
-#define GGML_MAX_SRC            10
+#define GGML_MAX_SRC            12
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
 #endif
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@ -1127,6 +1127,18 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
            }
        }
    }
+    else if (extra->split_dim == 2) {
+        size_t cur_offset = 0;
+        for (int i = 0; i < extra->n_device; ++i) {
+            auto split = extra->splits[i];
+            if (!split) continue;
+            ggml_cuda_set_device(i);
+            auto size = ggml_nbytes(split);
+            const char * buf_host = (const char *)data + cur_offset;
+            CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+            cur_offset += size;
+        }
+    }
    else {
        fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0);
        GGML_ABORT("fatal error");
@ -1147,6 +1159,8 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor([[maybe_unused]]

    if (!tensor->extra) return;

+    printf("%s(%s): offset = %zu, size = %zu\n", __func__, tensor->name, offset, size);
+
    // Inverse of split_buffer_set_tensor; refuses paths with no defined inverse.
    auto extra = (ggml_split_tensor_t *)tensor->extra;
    GGML_ASSERT(extra->n_device <= ggml_backend_cuda_get_device_count());
--- a/ggml/src/ggml-cuda/fattn-new-mma.cu
+++ b/ggml/src/ggml-cuda/fattn-new-mma.cu
@ -2132,7 +2132,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
        //    return;
        //} else {
        if (Q->ne[1] <= 8/ncols2) {
-            if constexpr (DKQ == 512) {
+            if constexpr (DKQ == 512 || DKQ == 576) {
                ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 2, ncols2>(ctx, dst);
            } else {
                ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
@ -2275,8 +2275,14 @@ void ggml_cuda_flash_attn_ext_mma_new(ggml_backend_cuda_context & ctx, ggml_tens
        }
        return;
    }
+    if (gqa_ratio % 12 == 0 && Q->ne[1] <= 4 && K->ne[1] >= 2048) {
+        ggml_cuda_flash_attn_ext_mma_f16_case<576, 512, 1, 16>(ctx, dst);
+        return;
+    }
    if (gqa_ratio % 16 == 0) {
        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+    } else if (gqa_ratio % 8 == 0) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 8>(ctx, dst);
    } else if (gqa_ratio % 4 == 0) {
        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
    } else {
--- a/src/graphs/build_deepseek2.cpp
+++ b/src/graphs/build_deepseek2.cpp
@ -124,6 +124,9 @@ ggml_tensor * llm_build_context::build_deepseek2_tp_attention(
                cache_local->ne[0], n_tokens, row_size_cache, row_size_cache * kv_head);

        ggml_tensor * kvr = ggml_concat(ctx0, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), kv_compressed, 0);
+        if (cparams.k_cache_hadamard) {
+            kvr = ggml_hadamard(ctx0, kvr, 64);
+        }

        // Per-rank cache_copies slot.
        const int cc_idx = 2 * n_device * il + 2 * id;
@ -142,20 +145,31 @@ ggml_tensor * llm_build_context::build_deepseek2_tp_attention(
        auto wk_b_split = (const ggml_split_tensor_t *)model.layers[il].wk_b->extra;
        GGML_ASSERT(wk_b_split);
        ggml_tensor * wk_b_local = wk_b_split->splits[id];
-        const int head_offset = head_offsets[id];
-        const size_t wk_b_head_stride = wk_b_local->nb[1] * kv_lora_rank;
-        ggml_tensor * wk_b_slice = ggml_view_3d(ctx0, wk_b_local,
-                n_embd_head_qk_nope, kv_lora_rank, n_head_local,
-                wk_b_local->nb[1], wk_b_head_stride,
-                head_offset * wk_b_head_stride);
-        cb(wk_b_slice, "wk_b_slice", il_id);
+        //const int head_offset = head_offsets[id];
+        //const size_t wk_b_head_stride = wk_b_local->nb[1] * kv_lora_rank;
+        //ggml_tensor * wk_b_slice = ggml_view_3d(ctx0, wk_b_local,
+        //        n_embd_head_qk_nope, kv_lora_rank, n_head_local,
+        //        wk_b_local->nb[1], wk_b_head_stride,
+        //        head_offset * wk_b_head_stride);
+        //cb(wk_b_slice, "wk_b_slice", il_id);
+        //if (il == 0) {
+        //    auto wk_b = model.layers[il].wk_b;
+        //    printf("wk_b(%2d): %ld x %ld x %ld; %zu x %zu x %zu. view: %ld x %ld x %ld; %zu x %zu x %zu, offset = %zu, %d\n", id,
+        //            wk_b->ne[0], wk_b->ne[1], wk_b->ne[2], wk_b->nb[1], wk_b->nb[2], wk_b->nb[3],
+        //            wk_b_slice->ne[0], wk_b_slice->ne[1], wk_b_slice->ne[2], wk_b_slice->nb[1], wk_b_slice->nb[2], wk_b_slice->nb[3],
+        //            head_offset * wk_b_head_stride, head_offset);
+        //}

        ggml_tensor * q_nope_perm = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);

-        ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b_slice, q_nope_perm);
+        //ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b_slice, q_nope_perm);
+        ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b_local, q_nope_perm);

        ggml_tensor * q_combined = ggml_concat(ctx0,
                ggml_permute(ctx0, q_rope, 0, 2, 1, 3), q_nope2, 0);
+        if (cparams.k_cache_hadamard) {
+            q_combined = ggml_hadamard(ctx0, q_combined, 64);
+        }

        // FlashMLA-3 path: K = kv_cache (full latent + rope), V = kv_cache_lora (latent only)
        ggml_tensor * kv_cache_lora = ggml_view_2d(ctx0, cache_local,
@ -171,18 +185,29 @@ ggml_tensor * llm_build_context::build_deepseek2_tp_attention(
        if (use_f32_attn_precision) {
            ggml_flash_attn_ext_set_prec(kqv_compressed, GGML_PREC_F32);
        }
+        if (cparams.k_cache_hadamard) {
+            kqv_compressed = ggml_hadamard(ctx0, kqv_compressed, 64);
+        }
        kqv_compressed = ggml_permute(ctx0, kqv_compressed, 0, 2, 1, 3);

        auto wv_b_split = (const ggml_split_tensor_t *)model.layers[il].wv_b->extra;
        GGML_ASSERT(wv_b_split);
        ggml_tensor * wv_b_local = wv_b_split->splits[id];
-        const size_t wv_b_head_stride = wv_b_local->nb[1] * n_embd_head_v;
-        ggml_tensor * wv_b_slice = ggml_view_3d(ctx0, wv_b_local,
-                kv_lora_rank, n_embd_head_v, n_head_local,
-                wv_b_local->nb[1], wv_b_head_stride,
-                head_offset * wv_b_head_stride);
+        //const size_t wv_b_head_stride = wv_b_local->nb[1] * n_embd_head_v;
+        //ggml_tensor * wv_b_slice = ggml_view_3d(ctx0, wv_b_local,
+        //        kv_lora_rank, n_embd_head_v, n_head_local,
+        //        wv_b_local->nb[1], wv_b_head_stride,
+        //        head_offset * wv_b_head_stride);
+        //if (il == 0) {
+        //    auto wv_b = model.layers[il].wv_b;
+        //    printf("wv_b(%2d): %ld x %ld x %ld; %zu x %zu x %zu. view: %ld x %ld x %ld; %zu x %zu x %zu, offset = %zu, %d\n", id,
+        //            wv_b->ne[0], wv_b->ne[1], wv_b->ne[2], wv_b->nb[1], wv_b->nb[2], wv_b->nb[3],
+        //            wv_b_slice->ne[0], wv_b_slice->ne[1], wv_b_slice->ne[2], wv_b_slice->nb[1], wv_b_slice->nb[2], wv_b_slice->nb[3],
+        //            head_offset * wv_b_head_stride, head_offset);
+        //}

-        ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_slice, kqv_compressed);
+        //ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_slice, kqv_compressed);
+        ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b_local, kqv_compressed);
        if (n_tokens > 1) {
            kqv = ggml_cont(ctx0, ggml_permute(ctx0, kqv, 0, 2, 1, 3));
        }
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@ -3730,7 +3730,7 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias, bool i

 static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor * tensor, llama_split_tensor & split_tensor,
        const std::vector<int> & splits, std::vector<size_t> & mem_used) {
-    GGML_ASSERT(split_dim <= 1);
+    GGML_ASSERT(split_dim <= 2);
    GGML_ASSERT(splits.size() > 1);
    std::string name{tensor->name};
    split_tensor.tensor_splits.resize(splits.size());
@ -3755,6 +3755,17 @@ static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor
                split_tensor.tensor_splits[i] = nullptr;
            }
        }
+    }
+    else if (split_dim == 2) {
+        for (int i = 0; i < int(splits.size()); ++i) {
+            if (splits[i] > 0) {
+                split_tensor.tensor_splits[i] = ggml_new_tensor_3d(ctx, tensor->type, tensor->ne[0], tensor->ne[1], splits[i]);
+                auto name_i = name + '.' + std::to_string(i);
+                ggml_set_name(split_tensor.tensor_splits[i], name_i.c_str());
+            } else {
+                split_tensor.tensor_splits[i] = nullptr;
+            }
+        }
    } else {
        for (int i = 0; i < int(splits.size()); ++i) {
            if (splits[i] > 0) {
@ -3837,10 +3848,10 @@ static void distribute_mla_tensors_for_split_mode_graph(
        prepare_split_tensors(-1, ctx_split, layer.wkv_a_mqa, layer.split_wkv_a_mqa, mirror, mem_used);
    }
    if (layer.wk_b) {
-        prepare_split_tensors(-1, ctx_split, layer.wk_b, layer.split_wk_b, mirror, mem_used);
+        prepare_split_tensors( 2, ctx_split, layer.wk_b, layer.split_wk_b, split_heads, mem_used);
    }
    if (layer.wv_b) {
-        prepare_split_tensors(-1, ctx_split, layer.wv_b, layer.split_wv_b, mirror, mem_used);
+        prepare_split_tensors( 2, ctx_split, layer.wv_b, layer.split_wv_b, split_heads, mem_used);
    }

    // Output projection: row-split, partial outputs all-reduced after.
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -2429,7 +2429,7 @@ static void llm_prepare_mla(llama_model & model, int mla) {
        }
        ggml_free(ctx);
    }
-    if (mla == 1) return;
+    if (mla == 1 || model.split_mode == LLAMA_SPLIT_MODE_GRAPH) return;

    n_to_compute = 0;
    for (auto& l : model.layers) {