diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 59803bd6..187dfe80 100644
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -561,6 +561,9 @@ class Model:
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
+        if chkhsh == "972da7b59cec44d1f0a490a86c96df53859e486e481563e5dddac155013d87ac":
+            # ref: https://huggingface.co/poolside/Laguna-XS.2
+            res = "laguna"
         if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
             # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"
@@ -5041,6 +5044,175 @@ class BailingMoeV2Model(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("LagunaForCausalLM")
+class LagunaModel(Model):
+    """Converter for Poolside Laguna checkpoints (``LagunaForCausalLM``).
+
+    Writes the Laguna GGUF metadata and stacks the per-expert FFN weights of
+    each layer into one tensor per projection.
+    """
+
+    model_arch = gguf.MODEL_ARCH.LAGUNA
+
+    # Per-layer buffers holding expert tensors until a layer is complete.
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def _per_layer_counts(self, key: str, default: int, n_layers: int) -> list[int]:
+        """Return one head count per layer read from ``hparams[key]``.
+
+        Every layer gets ``default`` when the key is missing or null.
+
+        Raises:
+            ValueError: if the list has fewer than ``n_layers`` entries.
+        """
+        per_layer = self.hparams.get(key)
+        if per_layer is None:
+            return [default] * n_layers
+        if len(per_layer) < n_layers:
+            raise ValueError(f"{key} has {len(per_layer)} entries, expected at least {n_layers}")
+        return [int(n) for n in per_layer[:n_layers]]
+
+    def set_gguf_parameters(self):
+        """Write the Laguna hyperparameters to the GGUF header.
+
+        Raises:
+            ValueError: if ``moe_apply_router_weight_on_input`` is set or a
+                per-layer head-count list is shorter than the layer count.
+        """
+        hparams = self.hparams
+        # Reject unsupported configs before any metadata is written.
+        if hparams.get("moe_apply_router_weight_on_input", False):
+            raise ValueError("moe_apply_router_weight_on_input=True is not supported for Laguna")
+
+        arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
+        n_layers = int(hparams["num_hidden_layers"])
+        n_head_base = int(hparams["num_attention_heads"])
+        n_kv_base = int(hparams.get("num_key_value_heads", n_head_base))
+        head_dim = int(hparams.get("head_dim", hparams["hidden_size"] // n_head_base))
+
+        head_arr = self._per_layer_counts("num_attention_heads_per_layer", n_head_base, n_layers)
+        kv_arr = self._per_layer_counts("num_key_value_heads_per_layer", n_kv_base, n_layers)
+
+        # "rope_parameters" may hold one dict per attention type or a single flat dict.
+        rope_params = hparams.get("rope_parameters") or {}
+        full_rope = rope_params.get("full_attention", rope_params)
+        swa_rope = rope_params.get("sliding_attention", {})
+
+        self.gguf_writer.add_context_length(int(hparams["max_position_embeddings"]))
+        self.gguf_writer.add_embedding_length(int(hparams["hidden_size"]))
+        self.gguf_writer.add_block_count(n_layers)
+        self.gguf_writer.add_feed_forward_length(int(hparams["intermediate_size"]))
+        self.gguf_writer.add_head_count(head_arr)
+        # Store a scalar KV head count when every layer agrees, else the per-layer list.
+        if all(n_kv == kv_arr[0] for n_kv in kv_arr):
+            self.gguf_writer.add_head_count_kv(kv_arr[0])
+        else:
+            self.gguf_writer.add_head_count_kv(kv_arr)
+        self.gguf_writer.add_key_length(head_dim)
+        self.gguf_writer.add_value_length(head_dim)
+        self.gguf_writer.add_layer_norm_rms_eps(float(hparams["rms_norm_eps"]))
+        self.gguf_writer.add_file_type(self.ftype)
+
+        self.gguf_writer.add_sliding_window(int(hparams["sliding_window"]))
+        self.gguf_writer.add_rope_dimension_count(head_dim // 2)
+        self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", head_dim)
+        self.gguf_writer.add_rope_freq_base(float(full_rope.get("rope_theta", 500000.0)))
+        self.gguf_writer.add_float32(f"{arch}.rope.freq_base_swa", float(swa_rope.get("rope_theta", 10000.0)))
+        if full_rope.get("rope_type") == "yarn":
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(float(full_rope.get("factor", 1.0)))
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(int(full_rope.get(
+                "original_max_position_embeddings",
+                rope_params.get("original_max_position_embeddings", hparams["max_position_embeddings"]),
+            )))
+            # NOTE(review): ext_factor is written from the same "factor" key as
+            # the scaling factor above; confirm this is what the loader expects.
+            self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("factor", 1.0)))
+            self.gguf_writer.add_rope_scaling_yarn_attn_factor(float(full_rope.get("attention_factor", 1.0)))
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(float(full_rope.get("beta_fast", 32.0)))
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(float(full_rope.get("beta_slow", 1.0)))
+
+        self.gguf_writer.add_expert_count(int(hparams["num_experts"]))
+        self.gguf_writer.add_expert_used_count(int(hparams["num_experts_per_tok"]))
+        self.gguf_writer.add_expert_feed_forward_length(int(hparams["moe_intermediate_size"]))
+        if (shared_dim := hparams.get("shared_expert_intermediate_size")) is not None and int(shared_dim) > 0:
+            self.gguf_writer.add_expert_shared_feed_forward_length(int(shared_dim))
+        if (routing_scale := hparams.get("moe_routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(float(routing_scale))
+        self.gguf_writer.add_expert_weights_norm(True)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Count the dense MLP layers that precede the first non-dense layer.
+        leading_dense = 0
+        for mlp_type in hparams.get("mlp_layer_types") or []:
+            if mlp_type != "dense":
+                break
+            leading_dense += 1
+        self.gguf_writer.add_uint32(f"{arch}.leading_dense_block_count", leading_dense)
+
+    def set_vocab(self) -> None:
+        """Write the vocabulary, then the EOT token id and chat template if present."""
+        super().set_vocab()
+        if isinstance(eos_token_id := self.hparams.get("eos_token_id"), list) and len(eos_token_id) > 1:
+            # Poolside uses the second EOS id (token 24 in the original note) as a
+            # turn boundary; export it as the end-of-turn token.
+            self.gguf_writer.add_eot_token_id(int(eos_token_id[1]))
+        template_file = self.dir_model / "chat_template.jinja"
+        if template_file.is_file():
+            self.gguf_writer.add_chat_template(template_file.read_text(encoding="utf-8"))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Map the score-correction bias, squeeze the gate weight and stack expert weights.
+
+        Expert weights are buffered per layer; nothing is yielded for them
+        until all ``num_experts * 3`` tensors of that layer have been seen.
+        Every other tensor is delegated to the base class.
+        """
+        if bid is not None and name in (
+            f"model.layers.{bid}.mlp.experts.e_score_correction_bias",
+            f"model.layers.{bid}.mlp.experts.e_score_correction",
+        ):
+            # The C++ loader asks for this tensor through the ".bias" suffix.
+            # Keep the Laguna converter aligned with existing community GGUFs.
+            yield f"blk.{bid}.exp_probs_b.bias", data_torch
+            return
+
+        if name.endswith(".self_attn.g_proj.weight"):
+            # HF stores the head-wise attention gate with a singleton dimension.
+            # NOTE(review): squeeze() drops every size-1 dimension, not only that one.
+            data_torch = data_torch.squeeze().contiguous()
+
+        if bid is not None and re.match(r"model\.layers\.\d+\.mlp\.experts\.\d+\.(gate_proj|up_proj|down_proj)\.weight$", name):
+            n_experts = int(self.find_hparam(["num_experts"]))
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+            if len(self._experts[bid]) < n_experts * 3:
+                return
+
+            for w_name in ("down_proj", "gate_proj", "up_proj"):
+                datas: list[Tensor] = []
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                    datas.append(self._experts[bid][ename])
+                    del self._experts[bid][ename]
+
+                merged = torch.stack(datas, dim=0)
+                merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                yield from super().modify_tensors(merged, merged_name, bid)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        """Run the base preparation, then fail if any expert tensor was left unmerged."""
+        super().prepare_tensors()
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d]
+            if experts:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 ###### CONVERSION LOGIC ######
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 2cd58b17..0c81201b 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -116,11 +116,17 @@ class Keys:
         REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
         SLIDING_WINDOW = "{arch}.attention.sliding_window"
         SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
+        SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
+        KEY_LENGTH_SWA = "{arch}.attention.key_length_swa"
+        VALUE_LENGTH_SWA = "{arch}.attention.value_length_swa"
         OUTPUT_SCALE = "{arch}.attention.output_scale"
         TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
+        DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa"
+        DIMENSION_COUNT_PER_LAYER = "{arch}.rope.dimension_count_per_layer"
+        DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
         FREQ_BASE = "{arch}.rope.freq_base"
         FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
         SCALING_TYPE = "{arch}.rope.scaling.type"
@@ -162,6 +168,7 @@ class Keys:
         MASK_ID = "tokenizer.ggml.mask_token_id"
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
+        ADD_SEP = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
         PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
@@ -262,6 +269,7 @@ class MODEL_ARCH(IntEnum):
     MINIMAXM2 = auto()
     SMOLLM3 = auto()
     SEED_OSS = auto()
+    LAGUNA = auto()
 
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD = auto()
@@ -282,6 +290,7 @@ class MODEL_TENSOR(IntEnum):
     ATTN_NORM_2 = auto()
     ATTN_OUT_NORM = auto()
     ATTN_POST_NORM = auto()
+    ATTN_GATE = auto()
     ATTN_ROT_EMBD = auto()
     FFN_GATE_INP = auto()
     FFN_GATE_INP_SHEXP = auto()
@@ -429,6 +438,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MINIMAXM2: "minimax-m2",
     MODEL_ARCH.SMOLLM3: "smollm3",
     MODEL_ARCH.SEED_OSS: "seed_oss",
+    MODEL_ARCH.LAGUNA: "laguna",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -453,6 +463,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
     MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
+    MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate",
     MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
     MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
@@ -1489,6 +1500,32 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
     ],
+    MODEL_ARCH.LAGUNA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_GATE,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 63c45f4b..63932395 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -214,6 +214,10 @@ class TensorNameMap:
             "model.layers.{bid}.post_attention_layernorm", # gemma2
         ),
 
+        MODEL_TENSOR.ATTN_GATE: (
+            "model.layers.{bid}.self_attn.g_proj", # laguna
+        ),
+
         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@@ -280,6 +284,8 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
             "model.layers.{bid}.mlp.gate.expert_bias", # bailingmoe2
             "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
+            "model.layers.{bid}.mlp.experts.e_score_correction_bias", # laguna
+            "model.layers.{bid}.mlp.experts.e_score_correction", # laguna
         ),
 
         # Feed-forward up
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d5a18c65..18856bed 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -79,6 +79,7 @@ add_library(llama
             graphs/build_mpt.cpp
             graphs/build_stablelm.cpp
             graphs/build_seedoss.cpp
+            graphs/build_laguna.cpp
             graphs/build_step35.cpp
             graphs/build_qwen.cpp
             graphs/build_qwen2.cpp
diff --git a/src/graphs/build_laguna.cpp b/src/graphs/build_laguna.cpp
new file mode 100644
index 00000000..cc73e9d2
--- /dev/null
+++ b/src/graphs/build_laguna.cpp
@@ -0,0 +1,143 @@
+#include "../llama-build-context.h"
+#include "../llama-model.h"
+#include "../llama-context.h"
+
+ggml_cgraph * llm_build_context::build_laguna() {
+    ggml_cgraph * gf = new_graph_custom();
+
+    ggml_tensor * inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+    ggml_tensor * inp_pos = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    ggml_tensor * KQ_mask = build_inp_KQ_mask();
+    ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const bool is_swa = hparams.swa_layers[il];
+        const int n_swa_l = is_swa ? hparams.n_swa : 0;
+
+        ggml_tensor * inpSA = inpL;
+
+        ggml_tensor * cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
+        cb(cur, "attn_norm", il);
+        ggml_tensor * input_normed = cur;
+
+        ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+        ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+        ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+        ggml_build_forward_expand(gf, Qcur);
+        ggml_build_forward_expand(gf, Kcur);
+        ggml_build_forward_expand(gf, Vcur);
+
+        const int64_t n_head_l = hparams.n_head(il);
+        const int64_t n_head_kv_l = hparams.n_head_kv(il);
+        const int64_t n_embd_head_k = hparams.n_embd_head_k(il);
+        const int64_t n_embd_head_v = hparams.n_embd_head_v(il);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+
+        if (model.layers[il].attn_q_norm) {
+            Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(Qcur, "Qcur_normed", il);
+            ggml_build_forward_expand(gf, Qcur);
+        }
+        if (model.layers[il].attn_k_norm) {
+            Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, cb, il);
+            cb(Kcur, "Kcur_normed", il);
+            ggml_build_forward_expand(gf, Kcur);
+        }
+
+        const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+        const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+        const float ext_factor_l = is_swa ? 0.0f : ext_factor;
+        const float attn_factor_l = is_swa ? 1.0f : attn_factor;
+        const float beta_fast_l = is_swa ? 32.0f : beta_fast;
+        const float beta_slow_l = is_swa ? 1.0f : beta_slow;
+        ggml_tensor * rope_factors = is_swa ? nullptr : build_rope_factors(il);
+        const int n_rot_l = hparams.rope_n_rot(il);
+
+        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                ext_factor_l, attn_factor_l, beta_fast_l, beta_slow_l);
+        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                ext_factor_l, attn_factor_l, beta_fast_l, beta_slow_l);
+        cb(Qcur, "Qcur_roped", il);
+        cb(Kcur, "Kcur_roped", il);
+
+        cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                nullptr, nullptr,
+                Kcur, Vcur, Qcur,
+                is_swa ? KQ_mask_swa : KQ_mask,
+                n_tokens, kv_head, n_kv,
+                1.0f / sqrtf(float(n_embd_head_k)), cb, il, nullptr, n_swa_l);
+        cb(cur, "attn_out", il);
+
+        if (model.layers[il].wqkv_gate) {
+            ggml_tensor * gate = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv_gate, input_normed);
+            cb(gate, "attn_gate", il);
+            gate = ggml_softplus(ctx0, gate);
+            cb(gate, "attn_gate_softplus", il);
+
+            ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head_l, n_tokens);
+            ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
+            cb(gate_3d, "attn_gate_3d", il);
+
+            cur = ggml_mul(ctx0, attn_3d, gate_3d);
+            cb(cur, "attn_gated_3d", il);
+            cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v * n_head_l, n_tokens);
+            cb(cur, "attn_gated", il);
+        }
+
+        cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+        cb(cur, "attn_proj", il);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+            cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
+                    model.layers[il].ffn_up, nullptr, nullptr,
+                    model.layers[il].ffn_gate, nullptr, nullptr,
+                    model.layers[il].ffn_down, nullptr, nullptr,
+                    nullptr,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+        } else {
+            cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
+                    model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
+                    model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
+                    model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
+                    model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
+                    model.layers[il].ffn_exp_probs_b,
+                    model.layers[il].ffn_up_shexp, nullptr,
+                    model.layers[il].ffn_gate_shexp, nullptr,
+                    model.layers[il].ffn_down_shexp, nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, hparams.expert_weights_norm, hparams.expert_weights_scale != 0.0f, hparams.expert_weights_scale,
+                    (llm_expert_gating_func_type) hparams.expert_gating_func,
+                    LLM_FFN_SILU, cb, il, gf, true, model.layers[il].ffn_up_gate_exps);
+        }
+
+        cur = lctx.cvec.apply_to(ctx0, cur, il);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+
+    ggml_tensor * cur = build_output(lctx, ctx0, inpL, model.output, model.output_norm, cb);
+    cb(cur, "result_output", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c0b66cf1..a5aa9fa8 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -76,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MIMO2, "mimo2" },
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_STEP35, "step35" },
+    { LLM_ARCH_LAGUNA, "laguna" },
     { LLM_ARCH_GLM_DSA, "glm-dsa" },
     { LLM_ARCH_MISTRAL4, "mistral4" },
     { LLM_ARCH_GEMMA4, "gemma4" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 9ea08b85..cc82d63a 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -75,6 +75,7 @@ enum llm_arch {
     LLM_ARCH_MIMO2,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_STEP35,
+    LLM_ARCH_LAGUNA,
     LLM_ARCH_GLM_DSA,
     LLM_ARCH_MISTRAL4,
     LLM_ARCH_GEMMA4,
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index b50e020f..d2e0fd97 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1110,7 +1110,10 @@ llm_expert_gating_func_type gating_op,
         ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
 
-        if (lctx.model.arch == LLM_ARCH_BAILINGMOE2 || lctx.model.arch == LLM_ARCH_STEP35) {
+        if (lctx.model.arch == LLM_ARCH_LAGUNA) {
+            weights_sum = ggml_clamp(ctx, weights_sum, 6.103515625e-5f, INFINITY);
+            cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
+        } else if (lctx.model.arch == LLM_ARCH_BAILINGMOE2 || lctx.model.arch == LLM_ARCH_STEP35) {
             weights_sum = ggml_scale_bias(ctx, weights_sum, 1.0, 1e-20);
             cb(weights_sum, "ffn_moe_weights_sum_biased", il);
         }
@@ -1595,6 +1598,7 @@ static ggml_tensor * llm_build_kqv(
             || model.arch == LLM_ARCH_COMMAND_R
             || model.arch == LLM_ARCH_GLM4
             || model.arch == LLM_ARCH_GLM4_MOE
+            || model.arch == LLM_ARCH_LAGUNA
             || model.arch == LLM_ARCH_MIMO2;
         // || (model.arch == LLM_ARCH_DEEPSEEK2 && q->ne[1] <= 8);
 
@@ -2507,6 +2511,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
             {
                 result = llm.build_step35();
             } break;
+        case LLM_ARCH_LAGUNA:
+            {
+                result = llm.build_laguna();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
diff --git a/src/llama-build-context.h b/src/llama-build-context.h
index d46ff1ba..76f2c714 100644
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@@ -318,6 +318,8 @@ struct llm_build_context {
 
     ggml_cgraph * build_seedoss();
 
+    ggml_cgraph * build_laguna();
+
     ggml_cgraph * build_step35();
 
     //
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 448774ae..f00951f0 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -1368,6 +1368,65 @@ void llm_load_hparams(
                         hparams.rope_freq_base_per_layer, hparams.n_layer, false);
                 GGML_ASSERT(hparams.has_rope_freq_base_per_layer || have_rfb_train_swa);
             } break;
+        case LLM_ARCH_LAGUNA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_TYPE_NONE;
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+                // Older Laguna GGUFs encode one shared expert through the shared FFN length.
+                if (hparams.n_expert_shared == 0 && hparams.n_ff_shexp > 0) {
+                    hparams.n_expert_shared = 1;
+                }
+                if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+                hparams.rope_freq_scale_train_swa = 1.0f;
+                if (!ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false)) {
+                    // Laguna XS.2 alternates full-attention and SWA layers via per-layer head counts.
+                    const uint32_t n_head_full = hparams.n_head(0);
+                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                        hparams.swa_layers[i] = hparams.n_head(i) != n_head_full;
+                    }
+                }
+
+                // GGUF stores the Poolside partial-rotary setting; the graph RoPE
+                // argument for full-attention Laguna layers follows the upstream
+                // Laguna loader and uses half of that count. SWA layers remain
+                // full-head rotary via n_rot_swa.
+                hparams.n_rot /= 2;
+
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+
+                if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer, false)) {
+                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                        hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? hparams.n_rot_swa : hparams.n_rot;
+                    }
+                } else {
+                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                        if (!hparams.swa_layers[i]) {
+                            hparams.rope_dim_per_layer[i] /= 2;
+                        }
+                    }
+                }
+
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_33B_A3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_GLM_DSA:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 13fe1811..ca8b4db0 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -82,7 +82,6 @@ struct llama_hparams {
     float yarn_attn_factor = 1.0f;
     float yarn_beta_fast = 32.0f;
     float yarn_beta_slow = 1.0f;
-
     std::array rope_sections;
     std::array rope_freq_base_per_layer;
     std::array rope_dim_per_layer;
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 268094bd..6d65f084 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -4370,6 +4370,8 @@ bool create_tensors_helper::create_tensors() {
             use_mmap_buffer = create_seedoss_tensors(tn); break;
         case LLM_ARCH_STEP35:
             use_mmap_buffer = create_step35_tensors(tn); break;
+        case LLM_ARCH_LAGUNA:
+            use_mmap_buffer = create_step35_tensors(tn); break;
         default:
             throw std::runtime_error("unknown architecture");
     }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index f459849f..c5a0ac03 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1605,6 +1605,38 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         },
     },
+    {
+        LLM_ARCH_LAGUNA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,       "output_norm" },
+            { LLM_TENSOR_OUTPUT,            "output" },
+            { LLM_TENSOR_ROPE_FREQS,        "rope_freqs" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT,"rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,         "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,            "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,       "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,            "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,       "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,            "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_GATE,         "blk.%d.attn_gate" },
+            { LLM_TENSOR_ATTN_OUT,          "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,          "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,          "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,          "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,            "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,      "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,     "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,     "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,       "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_UP_EXPS,  "blk.%d.ffn_gate_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,    "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,    "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,      "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,   "blk.%d.exp_probs_b" },
+        },
+    },
     {
         LLM_ARCH_GLM_DSA,
         {
@@ -1892,6 +1924,7 @@ const char * llama_model_type_name(e_model type) {
         case MODEL_16B_A1B: return "16B.A1B";
         case MODEL_21B_A3B: return "21B.A3B";
         case MODEL_30B_A3B: return "30B.A3B";
+        case MODEL_33B_A3B: return "33B.A3B";
         case MODEL_35B_A3B: return "35B.A3B";
         case MODEL_80B_A3B: return "80B.A3B";
         case MODEL_80B_A13B: return "80B.A13B";
diff --git a/src/llama-model.h b/src/llama-model.h
index 9dfff26b..99dddecf 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -109,6 +109,7 @@ enum e_model {
     MODEL_16B_A1B,
     MODEL_21B_A3B, // Ernie MoE small
     MODEL_30B_A3B,
+    MODEL_33B_A3B,
     MODEL_35B_A3B,
     MODEL_80B_A3B, // Qwen3-Next
     MODEL_80B_A13B,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 1eca29eb..43ee758f 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1948,9 +1948,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (tokenizer_pre == "default") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
+                    // Laguna reuses the Llama-3 byte-level BPE pre-tokenizer shape.
                     tokenizer_pre == "llama3" ||
                     tokenizer_pre == "llama-v3" ||
                     tokenizer_pre == "llama-bpe"||
+                    tokenizer_pre == "laguna" ||
                     tokenizer_pre == "falcon3" ||
                     tokenizer_pre == "falcon-h1" ||
                     tokenizer_pre == "pixtral" ||
diff --git a/src/llama.cpp b/src/llama.cpp
index e55d7c20..4aced532 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3070,6 +3070,7 @@ static bool is_model_split_supported(const llama_model & model) {
         LLM_ARCH_MINIMAX_M2,
         LLM_ARCH_SEED_OSS,
         LLM_ARCH_STEP35,
+        LLM_ARCH_LAGUNA,
         //LLM_ARCH_QWEN3NEXT,
         LLM_ARCH_QWEN35,
         LLM_ARCH_QWEN35MOE,
@@ -6832,7 +6833,6 @@ struct llama_context * llama_init_from_model(
     if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
-
     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -7431,6 +7431,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MIMO2:
         case LLM_ARCH_SEED_OSS:
         case LLM_ARCH_STEP35:
+        case LLM_ARCH_LAGUNA:
         case LLM_ARCH_GEMMA4:
         case LLM_ARCH_GEMMA4_MTP:
             return LLAMA_ROPE_TYPE_NEOX;