Add Laguna M.1 GGUF support (#2003)

This commit is contained in:
empty-quiver 2026-06-22 10:53:10 -04:00 committed by GitHub
parent 64fceb70bc
commit b47b90d0be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 78 additions and 26 deletions

View File

@ -5427,6 +5427,12 @@ class LagunaModel(Model):
rope_params = hparams.get("rope_parameters", {})
full_rope = rope_params.get("full_attention", rope_params)
swa_rope = rope_params.get("sliding_attention", {})
# Laguna can specify different rotary widths for full-attention and SWA layers.
# M.1 uses the full-attention value from rope_parameters; XS.2's sliding_attention
# entry omits partial_rotary_factor because those layers rotate the whole head,
# which is why the SWA factor defaults to 1.0 below.
partial_rotary_factor = float(hparams.get("partial_rotary_factor", 1.0))
partial_rotary_factor_full = float(full_rope.get("partial_rotary_factor", partial_rotary_factor))
partial_rotary_factor_swa = float(swa_rope.get("partial_rotary_factor", 1.0))
self.gguf_writer.add_context_length(int(hparams["max_position_embeddings"]))
self.gguf_writer.add_embedding_length(int(hparams["hidden_size"]))
@ -5443,8 +5449,11 @@ class LagunaModel(Model):
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_sliding_window(int(hparams["sliding_window"]))
self.gguf_writer.add_rope_dimension_count(head_dim // 2)
self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", head_dim)
# GGUF's rope.dimension_count is the number of scalar Q/K dimensions
# that ggml_rope_ext should rotate. It is not the number of RoPE pairs;
# the frequency table uses dimension_count / 2 entries later.
self.gguf_writer.add_rope_dimension_count(int(head_dim * partial_rotary_factor_full))
self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", int(head_dim * partial_rotary_factor_swa))
self.gguf_writer.add_rope_freq_base(float(full_rope.get("rope_theta", 500000.0)))
self.gguf_writer.add_float32(f"{arch}.rope.freq_base_swa", float(swa_rope.get("rope_theta", 10000.0)))
if full_rope.get("rope_type") == "yarn":
@ -5454,7 +5463,9 @@ class LagunaModel(Model):
"original_max_position_embeddings",
rope_params.get("original_max_position_embeddings", hparams["max_position_embeddings"]),
)))
self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("factor", 1.0)))
# GGUF's YaRN ext_factor is the config's extrapolation_factor. The main
# factor above is the context-extension scale and should not be mirrored here.
self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("extrapolation_factor", 1.0)))
self.gguf_writer.add_rope_scaling_yarn_attn_factor(float(full_rope.get("attention_factor", 1.0)))
self.gguf_writer.add_rope_scaling_yarn_beta_fast(float(full_rope.get("beta_fast", 32.0)))
self.gguf_writer.add_rope_scaling_yarn_beta_slow(float(full_rope.get("beta_slow", 1.0)))

View File

@ -9,13 +9,18 @@ ggml_cgraph * llm_build_context::build_laguna() {
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr;
ggml_tensor * KQ_mask = build_inp_KQ_mask();
ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
// Laguna M.1 has only global-attention layers and leaves n_swa at zero; building
// the SWA mask in that case trips the generic SWA precondition.
ggml_tensor * KQ_mask_swa = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : nullptr;
for (int il = 0; il < n_layer; ++il) {
const bool is_swa = hparams.swa_layers[il];
const int n_swa_l = is_swa ? hparams.n_swa : 0;
auto KQ_mask_l = is_swa ? KQ_mask_swa : KQ_mask;
// If a future Laguna GGUF marks SWA layers, it must also carry a real
// sliding-window size so those layers get an SWA mask.
GGML_ASSERT(KQ_mask_l != nullptr);
auto rope_factors = is_swa ? nullptr : build_rope_factors(il);
auto cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,

View File

@ -2873,10 +2873,19 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
cb(gate, "attn_gate", il_cb);
int nh = split_wo->ne[0]/n_embd_head_v;
auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, nh, n_tokens);
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
if (model.arch == LLM_ARCH_LAGUNA) {
cur = ggml_mul(ctx0, attn_3d, gate_3d);
// Laguna uses a softplus gate. XS.2 stores one gate per head,
// while M.1 stores one gate per attention output element.
if (gate->ne[0] == nh) {
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
cur = ggml_mul(ctx0, attn_3d, gate_3d);
} else {
GGML_ASSERT(gate->ne[0] == split_wo->ne[0]);
cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
cur = ggml_mul(ctx0, cur, gate);
}
} else {
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
}
cb(attn_3d, "attn_gated_3d", il_cb);
@ -2994,17 +3003,25 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
nullptr, nullptr,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
cb(cur, "wqkv", il);
auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed); // [n_head_l, n_tokens]
auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed);
if (model.arch == LLM_ARCH_LAGUNA) {
gate = ggml_softplus(ctx0, gate);
}
cb(gate, "attn_gate", il);
int n_head_l = hparams.n_head(il);
auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head_l, n_tokens);
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
if (model.arch == LLM_ARCH_LAGUNA) {
cur = ggml_mul(ctx0, attn_3d, gate_3d);
// Laguna uses a softplus gate. XS.2 stores one gate per head,
// while M.1 stores one gate per attention output element.
if (gate->ne[0] == n_head_l) {
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
cur = ggml_mul(ctx0, attn_3d, gate_3d);
} else {
GGML_ASSERT(gate->ne[0] == n_embd_head_v * n_head_l);
cur = ggml_mul(ctx0, cur, gate);
}
} else {
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
}
cb(cur, "attn_gated_3d", il);

View File

@ -1538,11 +1538,22 @@ void llm_load_hparams(
}
}
// GGUF stores the Poolside partial-rotary setting; the graph RoPE
// argument for full-attention Laguna layers follows the upstream
// Laguna loader and uses half of that count. SWA layers remain
// full-head rotary via n_rot_swa.
hparams.n_rot /= 2;
const bool found_rope_dim = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
const bool found_rope_dim_swa = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
// Laguna GGUFs store the number of scalar Q/K dimensions that ggml_rope_ext
// rotates. Correct files carry those values explicitly. Some early public
// XS.2 GGUFs omitted both keys, so fall back to the HF XS.2 layout only for
// missing metadata: full-attention layers rotate half the head, SWA layers
// rotate the full head. Files that explicitly store the old, incorrectly halved
// values are not corrected by this fallback and still need repair.
if (hparams.n_swa > 0) {
if (!found_rope_dim) {
hparams.n_rot = hparams.n_embd_head_k_full / 2;
}
if (!found_rope_dim_swa) {
hparams.n_rot_swa = hparams.n_embd_head_k_swa;
}
}
ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
@ -1553,12 +1564,6 @@ void llm_load_hparams(
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? hparams.n_rot_swa : hparams.n_rot;
}
} else {
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
if (!hparams.swa_layers[i]) {
hparams.rope_dim_per_layer[i] /= 2;
}
}
}
switch (hparams.n_layer) {

View File

@ -1198,8 +1198,18 @@ bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) {
//layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
//layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
// head-wise attention gate (Step35 self_attn.g_proj)
layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED);
const std::string attn_gate_name = tn(LLM_TENSOR_ATTN_GATE, "weight", i);
int64_t n_attn_gate = n_head_l;
if (model.arch == LLM_ARCH_LAGUNA) {
// Step35-style models normally use a head-wise attention gate. Laguna
// XS.2 keeps that layout, but M.1 gates every attention output element,
// so infer the width from GGUF metadata instead of baking in a model size.
const ggml_tensor * meta = ml.get_tensor_meta(attn_gate_name.c_str());
if (meta && meta->ne[1] == n_embd_head_v * n_head_l) {
n_attn_gate = n_embd_head_v * n_head_l;
}
}
layer.wqkv_gate = create_tensor(ctx_split, attn_gate_name, {n_embd, n_attn_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
// dense MLP (leading dense blocks)
layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
@ -4681,11 +4691,15 @@ bool create_tensors_helper::create_tensors() {
}
if (layer.wqkv_gate) {
auto wqkv_gate_split = split_kq;
LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
for (auto & s : wqkv_gate_split) {
s /= hparams.n_embd_head_k(il);
LLAMA_LOG_DEBUG(" %d", s);
if (model.arch == LLM_ARCH_LAGUNA && layer.wqkv_gate->ne[1] == layer.wo->ne[0]) {
// Full-width Laguna M.1 gates follow the value/output partition.
// Head-wise gates still follow the K/Q partition collapsed by head size.
wqkv_gate_split = split_vo;
} else {
for (auto & s : wqkv_gate_split) s /= hparams.n_embd_head_k(il);
}
LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
for ([[maybe_unused]] auto s : wqkv_gate_split) LLAMA_LOG_DEBUG(" %d", s);
LLAMA_LOG_DEBUG("\n");
prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used);
}