Add Laguna M.1 GGUF support (#2003)

2026-06-28 04:30:15 -05:00 · 2026-06-22 10:53:10 -04:00 · 2026-06-22 10:53:10 -04:00 · b47b90d0be
commit b47b90d0be
parent 64fceb70bc
5 changed files with 78 additions and 26 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -5427,6 +5427,12 @@ class LagunaModel(Model):
        rope_params = hparams.get("rope_parameters", {})
        full_rope = rope_params.get("full_attention", rope_params)
        swa_rope = rope_params.get("sliding_attention", {})
+        # Laguna can specify different rotary widths for full-attention and SWA layers.
+        # Full attention reads rope_parameters first (as M.1 sets it), then the top-level
+        # key; SWA defaults to 1.0 because XS.2 omits the key and rotates the whole head.
+        partial_rotary_factor = float(hparams.get("partial_rotary_factor", 1.0))
+        partial_rotary_factor_full = float(full_rope.get("partial_rotary_factor", partial_rotary_factor))
+        partial_rotary_factor_swa = float(swa_rope.get("partial_rotary_factor", 1.0))

        self.gguf_writer.add_context_length(int(hparams["max_position_embeddings"]))
        self.gguf_writer.add_embedding_length(int(hparams["hidden_size"]))
@ -5443,8 +5449,11 @@ class LagunaModel(Model):
        self.gguf_writer.add_file_type(self.ftype)

        self.gguf_writer.add_sliding_window(int(hparams["sliding_window"]))
-        self.gguf_writer.add_rope_dimension_count(head_dim // 2)
-        self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", head_dim)
+        # GGUF's rope.dimension_count is the number of scalar Q/K dimensions
+        # that ggml_rope_ext should rotate. It is not the number of RoPE pairs;
+        # the frequency table uses dimension_count / 2 entries later.
+        self.gguf_writer.add_rope_dimension_count(int(head_dim * partial_rotary_factor_full))
+        self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", int(head_dim * partial_rotary_factor_swa))
        self.gguf_writer.add_rope_freq_base(float(full_rope.get("rope_theta", 500000.0)))
        self.gguf_writer.add_float32(f"{arch}.rope.freq_base_swa", float(swa_rope.get("rope_theta", 10000.0)))
        if full_rope.get("rope_type") == "yarn":
@ -5454,7 +5463,9 @@ class LagunaModel(Model):
                "original_max_position_embeddings",
                rope_params.get("original_max_position_embeddings", hparams["max_position_embeddings"]),
            )))
-            self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("factor", 1.0)))
+            # GGUF's YaRN ext_factor is the config's extrapolation_factor. The main
+            # factor above is the context-extension scale and should not be mirrored here.
+            self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("extrapolation_factor", 1.0)))
            self.gguf_writer.add_rope_scaling_yarn_attn_factor(float(full_rope.get("attention_factor", 1.0)))
            self.gguf_writer.add_rope_scaling_yarn_beta_fast(float(full_rope.get("beta_fast", 32.0)))
            self.gguf_writer.add_rope_scaling_yarn_beta_slow(float(full_rope.get("beta_slow", 1.0)))
--- a/src/graphs/build_laguna.cpp
+++ b/src/graphs/build_laguna.cpp
@ -9,13 +9,18 @@ ggml_cgraph * llm_build_context::build_laguna() {
    ggml_tensor * inp_pos     = build_inp_pos();
    ggml_tensor * inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr;
    ggml_tensor * KQ_mask     = build_inp_KQ_mask();
-    ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+    // Laguna M.1 has only global-attention layers and leaves n_swa at zero; building
+    // the SWA mask in that case trips the generic SWA precondition.
+    ggml_tensor * KQ_mask_swa = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : nullptr;

    for (int il = 0; il < n_layer; ++il) {
        const bool is_swa = hparams.swa_layers[il];
        const int n_swa_l = is_swa ? hparams.n_swa : 0;

        auto KQ_mask_l = is_swa ? KQ_mask_swa : KQ_mask;
+        // If a future Laguna GGUF marks SWA layers, it must also carry a real
+        // sliding-window size so those layers get an SWA mask.
+        GGML_ASSERT(KQ_mask_l != nullptr);
        auto rope_factors = is_swa ? nullptr : build_rope_factors(il);

        auto cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@ -2873,10 +2873,19 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                    cb(gate, "attn_gate", il_cb);
                    int nh = split_wo->ne[0]/n_embd_head_v;
                    auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, nh, n_tokens);
-                    auto gate_3d = ggml_reshape_3d(ctx0, gate,            1, nh, n_tokens);
                    if (model.arch == LLM_ARCH_LAGUNA) {
-                        cur = ggml_mul(ctx0, attn_3d, gate_3d);
+                        // Laguna uses a softplus gate. XS.2 stores one gate per head,
+                        // while M.1 stores one gate per attention output element.
+                        if (gate->ne[0] == nh) {
+                            auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
+                            cur = ggml_mul(ctx0, attn_3d, gate_3d);
+                        } else {
+                            GGML_ASSERT(gate->ne[0] == split_wo->ne[0]);
+                            cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
+                            cur = ggml_mul(ctx0, cur, gate);
+                        }
                    } else {
+                        auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
                        cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
                    }
                    cb(attn_3d, "attn_gated_3d", il_cb);
@ -2994,17 +3003,25 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                nullptr, nullptr,
                Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
        cb(cur, "wqkv", il);
-        auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed); // [n_head_l, n_tokens]
+        auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed);
        if (model.arch == LLM_ARCH_LAGUNA) {
            gate = ggml_softplus(ctx0, gate);
        }
        cb(gate, "attn_gate", il);
        int n_head_l = hparams.n_head(il);
        auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head_l, n_tokens);
-        auto gate_3d = ggml_reshape_3d(ctx0, gate,            1, n_head_l, n_tokens);
        if (model.arch == LLM_ARCH_LAGUNA) {
-            cur = ggml_mul(ctx0, attn_3d, gate_3d);
+            // Laguna uses a softplus gate. XS.2 stores one gate per head,
+            // while M.1 stores one gate per attention output element.
+            if (gate->ne[0] == n_head_l) {
+                auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
+                cur = ggml_mul(ctx0, attn_3d, gate_3d);
+            } else {
+                GGML_ASSERT(gate->ne[0] == n_embd_head_v * n_head_l);
+                cur = ggml_mul(ctx0, cur, gate);
+            }
        } else {
+            auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
            cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
        }
        cb(cur, "attn_gated_3d", il);
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@ -1538,11 +1538,22 @@ void llm_load_hparams(
                    }
                }

-                // GGUF stores the Poolside partial-rotary setting; the graph RoPE
-                // argument for full-attention Laguna layers follows the upstream
-                // Laguna loader and uses half of that count. SWA layers remain
-                // full-head rotary via n_rot_swa.
-                hparams.n_rot /= 2;
+                const bool found_rope_dim     = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT,     hparams.n_rot,     false);
+                const bool found_rope_dim_swa = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
+
+                // Laguna GGUFs store the number of scalar Q/K dimensions that ggml_rope_ext
+                // rotates, and correct files carry both keys explicitly. Some early public
+                // XS.2 GGUFs omitted them, so when SWA is enabled and a key is missing, fall
+                // back to the HF XS.2 layout: full-attention layers rotate half the head, SWA
+                // layers the full head. Explicit values are used as-is, even if wrongly halved.
+                if (hparams.n_swa > 0) {
+                    if (!found_rope_dim) {
+                        hparams.n_rot = hparams.n_embd_head_k_full / 2;
+                    }
+                    if (!found_rope_dim_swa) {
+                        hparams.n_rot_swa = hparams.n_embd_head_k_swa;
+                    }
+                }

                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
@ -1553,12 +1564,6 @@ void llm_load_hparams(
                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
                        hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? hparams.n_rot_swa : hparams.n_rot;
                    }
-                } else {
-                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                        if (!hparams.swa_layers[i]) {
-                            hparams.rope_dim_per_layer[i] /= 2;
-                        }
-                    }
                }

                switch (hparams.n_layer) {
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@ -1198,8 +1198,18 @@ bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) {
        //layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
        //layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
        layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
-        // head-wise attention gate (Step35 self_attn.g_proj)
-        layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        const std::string attn_gate_name = tn(LLM_TENSOR_ATTN_GATE, "weight", i);
+        int64_t n_attn_gate = n_head_l;
+        if (model.arch == LLM_ARCH_LAGUNA) {
+            // Step35-style models normally use a head-wise attention gate. Laguna
+            // XS.2 keeps that layout, but M.1 gates every attention output element,
+            // so read the width from the tensor's stored shape rather than a model size.
+            const ggml_tensor * meta = ml.get_tensor_meta(attn_gate_name.c_str());
+            if (meta && meta->ne[1] == n_embd_head_v * n_head_l) {
+                n_attn_gate = n_embd_head_v * n_head_l;
+            }
+        }
+        layer.wqkv_gate = create_tensor(ctx_split, attn_gate_name, {n_embd, n_attn_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
        layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
        // dense MLP (leading dense blocks)
        layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
@ -4681,11 +4691,15 @@ bool create_tensors_helper::create_tensors() {
                }
                if (layer.wqkv_gate) {
                    auto wqkv_gate_split = split_kq;
-                    LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
-                    for (auto & s : wqkv_gate_split) {
-                        s /= hparams.n_embd_head_k(il);
-                        LLAMA_LOG_DEBUG(" %d", s);
+                    if (model.arch == LLM_ARCH_LAGUNA && layer.wqkv_gate->ne[1] == layer.wo->ne[0]) {
+                        // Full-width Laguna M.1 gates follow the value/output partition.
+                        // Head-wise gates still follow the K/Q partition collapsed by head size.
+                        wqkv_gate_split = split_vo;
+                    } else {
+                        for (auto & s : wqkv_gate_split) s /= hparams.n_embd_head_k(il);
                    }
+                    LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
+                    for ([[maybe_unused]] auto s : wqkv_gate_split) LLAMA_LOG_DEBUG(" %d", s);
                    LLAMA_LOG_DEBUG("\n");
                    prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used);
                }