From b47b90d0be80981f6f476c997afbdfab99bba6c7 Mon Sep 17 00:00:00 2001 From: empty-quiver Date: Mon, 22 Jun 2026 10:53:10 -0400 Subject: [PATCH] Add Laguna M.1 GGUF support (#2003) --- convert_hf_to_gguf.py | 17 ++++++++++++++--- src/graphs/build_laguna.cpp | 7 ++++++- src/llama-build-context.cpp | 27 ++++++++++++++++++++++----- src/llama-hparams.cpp | 27 ++++++++++++++++----------- src/llama-load-tensors.cpp | 26 ++++++++++++++++++++------ 5 files changed, 78 insertions(+), 26 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e615d85d..cfe5bcd5 100644 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5427,6 +5427,12 @@ class LagunaModel(Model): rope_params = hparams.get("rope_parameters", {}) full_rope = rope_params.get("full_attention", rope_params) swa_rope = rope_params.get("sliding_attention", {}) + # Laguna can specify different rotary widths for full-attention and SWA layers. + # M.1 uses the full-attention value from rope_parameters; XS.2 SWA omits the key + # because those layers rotate the whole head. + partial_rotary_factor = float(hparams.get("partial_rotary_factor", 1.0)) + partial_rotary_factor_full = float(full_rope.get("partial_rotary_factor", partial_rotary_factor)) + partial_rotary_factor_swa = float(swa_rope.get("partial_rotary_factor", 1.0)) self.gguf_writer.add_context_length(int(hparams["max_position_embeddings"])) self.gguf_writer.add_embedding_length(int(hparams["hidden_size"])) @@ -5443,8 +5449,11 @@ class LagunaModel(Model): self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_sliding_window(int(hparams["sliding_window"])) - self.gguf_writer.add_rope_dimension_count(head_dim // 2) - self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", head_dim) + # GGUF's rope.dimension_count is the number of scalar Q/K dimensions + # that ggml_rope_ext should rotate. It is not the number of RoPE pairs; + # the frequency table uses dimension_count / 2 entries later. 
+ self.gguf_writer.add_rope_dimension_count(int(head_dim * partial_rotary_factor_full)) + self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", int(head_dim * partial_rotary_factor_swa)) self.gguf_writer.add_rope_freq_base(float(full_rope.get("rope_theta", 500000.0))) self.gguf_writer.add_float32(f"{arch}.rope.freq_base_swa", float(swa_rope.get("rope_theta", 10000.0))) if full_rope.get("rope_type") == "yarn": @@ -5454,7 +5463,9 @@ class LagunaModel(Model): "original_max_position_embeddings", rope_params.get("original_max_position_embeddings", hparams["max_position_embeddings"]), ))) - self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("factor", 1.0))) + # GGUF's YaRN ext_factor is the config's extrapolation_factor. The main + # factor above is the context-extension scale and should not be mirrored here. + self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("extrapolation_factor", 1.0))) self.gguf_writer.add_rope_scaling_yarn_attn_factor(float(full_rope.get("attention_factor", 1.0))) self.gguf_writer.add_rope_scaling_yarn_beta_fast(float(full_rope.get("beta_fast", 32.0))) self.gguf_writer.add_rope_scaling_yarn_beta_slow(float(full_rope.get("beta_slow", 1.0))) diff --git a/src/graphs/build_laguna.cpp b/src/graphs/build_laguna.cpp index 6be22d73..55d60902 100644 --- a/src/graphs/build_laguna.cpp +++ b/src/graphs/build_laguna.cpp @@ -9,13 +9,18 @@ ggml_cgraph * llm_build_context::build_laguna() { ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr; ggml_tensor * KQ_mask = build_inp_KQ_mask(); - ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + // Laguna M.1 has only global-attention layers and leaves n_swa at zero; building + // the SWA mask in that case trips the generic SWA precondition. + ggml_tensor * KQ_mask_swa = hparams.n_swa > 0 ? 
build_inp_KQ_mask_swa() : nullptr; for (int il = 0; il < n_layer; ++il) { const bool is_swa = hparams.swa_layers[il]; const int n_swa_l = is_swa ? hparams.n_swa : 0; auto KQ_mask_l = is_swa ? KQ_mask_swa : KQ_mask; + // If a future Laguna GGUF marks SWA layers, it must also carry a real + // sliding-window size so those layers get an SWA mask. + GGML_ASSERT(KQ_mask_l != nullptr); auto rope_factors = is_swa ? nullptr : build_rope_factors(il); auto cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 91d20401..93d9a607 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -2873,10 +2873,19 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens cb(gate, "attn_gate", il_cb); int nh = split_wo->ne[0]/n_embd_head_v; auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, nh, n_tokens); - auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens); if (model.arch == LLM_ARCH_LAGUNA) { - cur = ggml_mul(ctx0, attn_3d, gate_3d); + // Laguna uses a softplus gate. XS.2 stores one gate per head, + // while M.1 stores one gate per attention output element. 
+ if (gate->ne[0] == nh) { + auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens); + cur = ggml_mul(ctx0, attn_3d, gate_3d); + } else { + GGML_ASSERT(gate->ne[0] == split_wo->ne[0]); + cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens); + cur = ggml_mul(ctx0, cur, gate); + } } else { + auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens); cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID); } cb(attn_3d, "attn_gated_3d", il_cb); @@ -2994,17 +3003,25 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens nullptr, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa); cb(cur, "wqkv", il); - auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed); // [n_head_l, n_tokens] + auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed); if (model.arch == LLM_ARCH_LAGUNA) { gate = ggml_softplus(ctx0, gate); } cb(gate, "attn_gate", il); int n_head_l = hparams.n_head(il); auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head_l, n_tokens); - auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); if (model.arch == LLM_ARCH_LAGUNA) { - cur = ggml_mul(ctx0, attn_3d, gate_3d); + // Laguna uses a softplus gate. XS.2 stores one gate per head, + // while M.1 stores one gate per attention output element. 
+ if (gate->ne[0] == n_head_l) { + auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); + cur = ggml_mul(ctx0, attn_3d, gate_3d); + } else { + GGML_ASSERT(gate->ne[0] == n_embd_head_v * n_head_l); + cur = ggml_mul(ctx0, cur, gate); + } } else { + auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID); } cb(cur, "attn_gated_3d", il); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 589e281d..fbca1a4b 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -1538,11 +1538,22 @@ void llm_load_hparams( } } - // GGUF stores the Poolside partial-rotary setting; the graph RoPE - // argument for full-attention Laguna layers follows the upstream - // Laguna loader and uses half of that count. SWA layers remain - // full-head rotary via n_rot_swa. - hparams.n_rot /= 2; + const bool found_rope_dim = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); + const bool found_rope_dim_swa = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false); + + // Laguna GGUFs store the number of scalar Q/K dimensions that ggml_rope_ext + // rotates. Correct files carry those values explicitly. Some early public + // XS.2 GGUFs omitted both keys, so fall back to the HF XS.2 layout only for + // missing metadata: full-attention layers rotate half the head, SWA layers + // rotate the full head. Keys that are present are used as-is, so explicit but wrong (halved) metadata is not repaired here. + if (hparams.n_swa > 0) { + if (!found_rope_dim) { + hparams.n_rot = hparams.n_embd_head_k_full / 2; + } + if (!found_rope_dim_swa) { + hparams.n_rot_swa = hparams.n_embd_head_k_swa; + } + } ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false); ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false); @@ -1553,12 +1564,6 @@ void llm_load_hparams( for (uint32_t i = 0; i < hparams.n_layer; ++i) { hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? 
hparams.n_rot_swa : hparams.n_rot; } - } else { - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - if (!hparams.swa_layers[i]) { - hparams.rope_dim_per_layer[i] /= 2; - } - } } switch (hparams.n_layer) { diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index f151d885..ccc2962b 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -1198,8 +1198,18 @@ bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) { //layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); //layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0); - // head-wise attention gate (Step35 self_attn.g_proj) - layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED); + const std::string attn_gate_name = tn(LLM_TENSOR_ATTN_GATE, "weight", i); + int64_t n_attn_gate = n_head_l; + if (model.arch == LLM_ARCH_LAGUNA) { + // Step35-style models normally use a head-wise attention gate. Laguna + // XS.2 keeps that layout, but M.1 gates every attention output element, + // so infer the width from GGUF metadata instead of baking in a model size. 
+ const ggml_tensor * meta = ml.get_tensor_meta(attn_gate_name.c_str()); + if (meta && meta->ne[1] == n_embd_head_v * n_head_l) { + n_attn_gate = n_embd_head_v * n_head_l; + } + } + layer.wqkv_gate = create_tensor(ctx_split, attn_gate_name, {n_embd, n_attn_gate}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); // dense MLP (leading dense blocks) layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -4681,11 +4691,15 @@ bool create_tensors_helper::create_tensors() { } if (layer.wqkv_gate) { auto wqkv_gate_split = split_kq; - LLAMA_LOG_DEBUG("=================== wqkv_gate_split:"); - for (auto & s : wqkv_gate_split) { - s /= hparams.n_embd_head_k(il); - LLAMA_LOG_DEBUG(" %d", s); + if (model.arch == LLM_ARCH_LAGUNA && layer.wqkv_gate->ne[1] == layer.wo->ne[0]) { + // Full-width Laguna M.1 gates follow the value/output partition. + // Head-wise gates still follow the K/Q partition collapsed by head size. + wqkv_gate_split = split_vo; + } else { + for (auto & s : wqkv_gate_split) s /= hparams.n_embd_head_k(il); } + LLAMA_LOG_DEBUG("=================== wqkv_gate_split:"); + for ([[maybe_unused]] auto s : wqkv_gate_split) LLAMA_LOG_DEBUG(" %d", s); LLAMA_LOG_DEBUG("\n"); prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used); }