From b47b90d0be80981f6f476c997afbdfab99bba6c7 Mon Sep 17 00:00:00 2001
From: empty-quiver <empty-quiver@github.com>
Date: Mon, 22 Jun 2026 10:53:10 -0400
Subject: [PATCH] Add Laguna M.1 GGUF support (#2003)

---
 convert_hf_to_gguf.py       | 17 ++++++++++++++---
 src/graphs/build_laguna.cpp |  7 ++++++-
 src/llama-build-context.cpp | 27 ++++++++++++++++++++++-----
 src/llama-hparams.cpp       | 27 ++++++++++++++++-----------
 src/llama-load-tensors.cpp  | 26 ++++++++++++++++++++------
 5 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e615d85d..cfe5bcd5 100644
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5427,6 +5427,12 @@ class LagunaModel(Model):
         rope_params = hparams.get("rope_parameters", {})
         full_rope = rope_params.get("full_attention", rope_params)
         swa_rope = rope_params.get("sliding_attention", {})
+        # Laguna can specify different rotary widths for full-attention and SWA layers.
+        # M.1 uses the full-attention value from rope_parameters (else the top-level value);
+        # XS.2 SWA omits the key because those layers rotate the whole head, so default to 1.0.
+        partial_rotary_factor = float(hparams.get("partial_rotary_factor", 1.0))
+        partial_rotary_factor_full = float(full_rope.get("partial_rotary_factor", partial_rotary_factor))
+        partial_rotary_factor_swa = float(swa_rope.get("partial_rotary_factor", 1.0))
 
         self.gguf_writer.add_context_length(int(hparams["max_position_embeddings"]))
         self.gguf_writer.add_embedding_length(int(hparams["hidden_size"]))
@@ -5443,8 +5449,11 @@ class LagunaModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
 
         self.gguf_writer.add_sliding_window(int(hparams["sliding_window"]))
-        self.gguf_writer.add_rope_dimension_count(head_dim // 2)
-        self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", head_dim)
+        # GGUF's rope.dimension_count is the number of scalar Q/K dimensions
+        # that ggml_rope_ext should rotate. It is not the number of RoPE pairs;
+        # the frequency table uses dimension_count / 2 entries later.
+        self.gguf_writer.add_rope_dimension_count(int(head_dim * partial_rotary_factor_full))
+        self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", int(head_dim * partial_rotary_factor_swa))
         self.gguf_writer.add_rope_freq_base(float(full_rope.get("rope_theta", 500000.0)))
         self.gguf_writer.add_float32(f"{arch}.rope.freq_base_swa", float(swa_rope.get("rope_theta", 10000.0)))
         if full_rope.get("rope_type") == "yarn":
@@ -5454,7 +5463,9 @@ class LagunaModel(Model):
                 "original_max_position_embeddings",
                 rope_params.get("original_max_position_embeddings", hparams["max_position_embeddings"]),
             )))
-            self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("factor", 1.0)))
+            # GGUF's YaRN ext_factor is the config's extrapolation_factor. The main
+            # factor above is the context-extension scale and should not be mirrored here.
+            self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("extrapolation_factor", 1.0)))
             self.gguf_writer.add_rope_scaling_yarn_attn_factor(float(full_rope.get("attention_factor", 1.0)))
             self.gguf_writer.add_rope_scaling_yarn_beta_fast(float(full_rope.get("beta_fast", 32.0)))
             self.gguf_writer.add_rope_scaling_yarn_beta_slow(float(full_rope.get("beta_slow", 1.0)))
diff --git a/src/graphs/build_laguna.cpp b/src/graphs/build_laguna.cpp
index 6be22d73..55d60902 100644
--- a/src/graphs/build_laguna.cpp
+++ b/src/graphs/build_laguna.cpp
@@ -9,13 +9,18 @@ ggml_cgraph * llm_build_context::build_laguna() {
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr;
     ggml_tensor * KQ_mask     = build_inp_KQ_mask();
-    ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+    // Laguna M.1 has only global-attention layers and leaves n_swa at zero; building
+    // the SWA mask in that case trips the generic SWA precondition.
+    ggml_tensor * KQ_mask_swa = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : nullptr;
 
     for (int il = 0; il < n_layer; ++il) {
         const bool is_swa = hparams.swa_layers[il];
         const int n_swa_l = is_swa ? hparams.n_swa : 0;
 
         auto KQ_mask_l = is_swa ? KQ_mask_swa : KQ_mask;
+        // If a future Laguna GGUF marks SWA layers, it must also carry a real
+        // sliding-window size so those layers get an SWA mask.
+        GGML_ASSERT(KQ_mask_l != nullptr);
         auto rope_factors = is_swa ? nullptr : build_rope_factors(il);
 
         auto cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 91d20401..93d9a607 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -2873,10 +2873,19 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                     cb(gate, "attn_gate", il_cb);
                     int nh = split_wo->ne[0]/n_embd_head_v;
                     auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, nh, n_tokens);
-                    auto gate_3d = ggml_reshape_3d(ctx0, gate,            1, nh, n_tokens);
                     if (model.arch == LLM_ARCH_LAGUNA) {
-                        cur = ggml_mul(ctx0, attn_3d, gate_3d);
+                        // Laguna uses a softplus gate. XS.2 stores one gate per head,
+                        // while M.1 stores one gate per attention output element.
+                        if (gate->ne[0] == nh) {
+                            auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
+                            cur = ggml_mul(ctx0, attn_3d, gate_3d);
+                        } else {
+                            GGML_ASSERT(gate->ne[0] == split_wo->ne[0]);
+                            cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
+                            cur = ggml_mul(ctx0, cur, gate);
+                        }
                     } else {
+                        auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
                         cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
                     }
                     cb(attn_3d, "attn_gated_3d", il_cb);
@@ -2994,17 +3003,25 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 nullptr, nullptr,
                 Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
         cb(cur, "wqkv", il);
-        auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed); // [n_head_l, n_tokens]
+        auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed);
         if (model.arch == LLM_ARCH_LAGUNA) {
             gate = ggml_softplus(ctx0, gate);
         }
         cb(gate, "attn_gate", il);
         int n_head_l = hparams.n_head(il);
         auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head_l, n_tokens);
-        auto gate_3d = ggml_reshape_3d(ctx0, gate,            1, n_head_l, n_tokens);
         if (model.arch == LLM_ARCH_LAGUNA) {
-            cur = ggml_mul(ctx0, attn_3d, gate_3d);
+            // Laguna uses a softplus gate. XS.2 stores one gate per head,
+            // while M.1 stores one gate per attention output element.
+            if (gate->ne[0] == n_head_l) {
+                auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
+                cur = ggml_mul(ctx0, attn_3d, gate_3d);
+            } else {
+                GGML_ASSERT(gate->ne[0] == n_embd_head_v * n_head_l);
+                cur = ggml_mul(ctx0, cur, gate);
+            }
         } else {
+            auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
             cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
         }
         cb(cur, "attn_gated_3d", il);
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 589e281d..fbca1a4b 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -1538,11 +1538,22 @@ void llm_load_hparams(
                     }
                 }
 
-                // GGUF stores the Poolside partial-rotary setting; the graph RoPE
-                // argument for full-attention Laguna layers follows the upstream
-                // Laguna loader and uses half of that count. SWA layers remain
-                // full-head rotary via n_rot_swa.
-                hparams.n_rot /= 2;
+                const bool found_rope_dim     = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT,     hparams.n_rot,     false);
+                const bool found_rope_dim_swa = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
+
+                // Laguna GGUFs store the number of scalar Q/K dimensions that ggml_rope_ext
+                // rotates. Correct files carry those values explicitly. Some early public
+                // XS.2 GGUFs omitted both keys, so fall back to the HF XS.2 layout only for
+                // missing metadata: full-attention layers rotate half the head, SWA layers
+                // rotate the full head. Explicit keys are used as-is, so wrong (halved) metadata is not repaired here.
+                if (hparams.n_swa > 0) {
+                    if (!found_rope_dim) {
+                        hparams.n_rot = hparams.n_embd_head_k_full / 2;
+                    }
+                    if (!found_rope_dim_swa) {
+                        hparams.n_rot_swa = hparams.n_embd_head_k_swa;
+                    }
+                }
 
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
@@ -1553,12 +1564,6 @@ void llm_load_hparams(
                     for (uint32_t i = 0; i < hparams.n_layer; ++i) {
                         hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? hparams.n_rot_swa : hparams.n_rot;
                     }
-                } else {
-                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                        if (!hparams.swa_layers[i]) {
-                            hparams.rope_dim_per_layer[i] /= 2;
-                        }
-                    }
                 }
 
                 switch (hparams.n_layer) {
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index f151d885..ccc2962b 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -1198,8 +1198,18 @@ bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) {
         //layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
         //layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
-        // head-wise attention gate (Step35 self_attn.g_proj)
-        layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        const std::string attn_gate_name = tn(LLM_TENSOR_ATTN_GATE, "weight", i);
+        int64_t n_attn_gate = n_head_l;
+        if (model.arch == LLM_ARCH_LAGUNA) {
+            // Step35-style models normally use a head-wise attention gate. Laguna
+            // XS.2 keeps that layout, but M.1 gates every attention output element,
+            // so infer the width from GGUF metadata instead of baking in a model size.
+            const ggml_tensor * meta = ml.get_tensor_meta(attn_gate_name.c_str());
+            if (meta && meta->ne[1] == n_embd_head_v * n_head_l) {
+                n_attn_gate = n_embd_head_v * n_head_l;
+            }
+        }
+        layer.wqkv_gate = create_tensor(ctx_split, attn_gate_name, {n_embd, n_attn_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
         layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
         // dense MLP (leading dense blocks)
         layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -4681,11 +4691,15 @@ bool create_tensors_helper::create_tensors() {
                 }
                 if (layer.wqkv_gate) {
                     auto wqkv_gate_split = split_kq;
-                    LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
-                    for (auto & s : wqkv_gate_split) {
-                        s /= hparams.n_embd_head_k(il);
-                        LLAMA_LOG_DEBUG(" %d", s);
+                    if (model.arch == LLM_ARCH_LAGUNA && layer.wqkv_gate->ne[1] == layer.wo->ne[0]) {
+                        // Full-width Laguna M.1 gates follow the value/output partition.
+                        // Head-wise gates still follow the K/Q partition collapsed by head size.
+                        wqkv_gate_split = split_vo;
+                    } else {
+                        for (auto & s : wqkv_gate_split) s /= hparams.n_embd_head_k(il);
                     }
+                    LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
+                    for ([[maybe_unused]] auto s : wqkv_gate_split) LLAMA_LOG_DEBUG(" %d", s);
                     LLAMA_LOG_DEBUG("\n");
                     prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used);
                 }