mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
model: add Poolside Laguna XS.2 support (#1911)
* llama: register Laguna architecture
* llama: add Laguna graph support
* llama: place Laguna MoE tensors for cpu-moe
* gguf: add Laguna metadata and tokenizer ids
* convert: support Poolside Laguna XS.2
* model: align Laguna RoPE and graph semantics
* model: align Laguna partial offload with review feedback
* model: localize Laguna SWA YaRN defaults
* model: localize Laguna SWA RoPE constants

---------

Co-authored-by: Joel Farthing <262452229+joelfarthing@users.noreply.github.com>
This commit is contained in:
parent
eea6a82b25
commit
bbe1a511ee
@ -561,6 +561,9 @@ class Model:
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
if chkhsh == "972da7b59cec44d1f0a490a86c96df53859e486e481563e5dddac155013d87ac":
|
||||
# ref: https://huggingface.co/poolside/Laguna-XS.2
|
||||
res = "laguna"
|
||||
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
|
||||
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
|
||||
res = "deepseek-llm"
|
||||
@ -5041,6 +5044,138 @@ class BailingMoeV2Model(Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("LagunaForCausalLM")
class LagunaModel(Model):
    """Converter for Poolside Laguna (``LagunaForCausalLM``) checkpoints.

    Writes the Laguna-specific GGUF metadata (per-layer head counts, separate
    RoPE settings for full-attention and sliding-window layers, MoE routing
    parameters) and stacks the per-expert FFN weights of each block into a
    single 3D tensor per projection.
    """

    model_arch = gguf.MODEL_ARCH.LAGUNA

    # Per-block staging area for per-expert weights that are waiting to be
    # stacked; created lazily on the first expert tensor in modify_tensors().
    _experts: list[dict[str, Tensor]] | None = None

    def set_gguf_parameters(self):
        """Emit all Laguna hyper-parameters to the GGUF writer.

        Raises:
            ValueError: if ``moe_apply_router_weight_on_input`` is enabled, or
                if a per-layer head-count list has fewer entries than
                ``num_hidden_layers``.
            KeyError: if a mandatory hyper-parameter is missing.
        """
        hparams = self.hparams

        # Reject unsupported configurations before emitting any metadata.
        if hparams.get("moe_apply_router_weight_on_input", False):
            raise ValueError("moe_apply_router_weight_on_input=True is not supported for Laguna")

        arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
        n_layers = int(hparams["num_hidden_layers"])
        n_head_base = int(hparams["num_attention_heads"])
        n_kv_base = int(hparams.get("num_key_value_heads", n_head_base))
        head_dim = int(hparams.get("head_dim", hparams["hidden_size"] // n_head_base))

        def per_layer(key: str, default: int) -> list[int]:
            # Expand an optional per-layer list to exactly n_layers ints,
            # falling back to the model-wide value when the list is absent.
            values = hparams.get(key)
            if values is None:
                return [default] * n_layers
            if len(values) < n_layers:
                raise ValueError(f"{key} has {len(values)} entries, expected at least {n_layers}")
            return [int(v) for v in values[:n_layers]]

        head_arr = per_layer("num_attention_heads_per_layer", n_head_base)
        kv_arr = per_layer("num_key_value_heads_per_layer", n_kv_base)

        # dict.get(key, default) does not cover an explicit null in config.json,
        # so every optional mapping is normalised by hand.
        rope_params = hparams.get("rope_parameters") or {}
        full_rope = rope_params.get("full_attention")
        if full_rope is None:
            # Flat layout: the full-attention settings live at the top level.
            full_rope = rope_params
        swa_rope = rope_params.get("sliding_attention") or {}

        self.gguf_writer.add_context_length(int(hparams["max_position_embeddings"]))
        self.gguf_writer.add_embedding_length(int(hparams["hidden_size"]))
        self.gguf_writer.add_block_count(n_layers)
        self.gguf_writer.add_feed_forward_length(int(hparams["intermediate_size"]))
        self.gguf_writer.add_head_count(head_arr)
        # Store a scalar when every layer shares the same KV head count.
        if all(n_kv == kv_arr[0] for n_kv in kv_arr):
            self.gguf_writer.add_head_count_kv(kv_arr[0])
        else:
            self.gguf_writer.add_head_count_kv(kv_arr)
        self.gguf_writer.add_key_length(head_dim)
        self.gguf_writer.add_value_length(head_dim)
        self.gguf_writer.add_layer_norm_rms_eps(float(hparams["rms_norm_eps"]))
        self.gguf_writer.add_file_type(self.ftype)

        self.gguf_writer.add_sliding_window(int(hparams["sliding_window"]))
        # Full-attention layers get half the head dimension as rotary count,
        # sliding-window layers the whole head dimension.
        self.gguf_writer.add_rope_dimension_count(head_dim // 2)
        self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", head_dim)
        self.gguf_writer.add_rope_freq_base(float(full_rope.get("rope_theta", 500000.0)))
        self.gguf_writer.add_float32(f"{arch}.rope.freq_base_swa", float(swa_rope.get("rope_theta", 10000.0)))
        if full_rope.get("rope_type") == "yarn":
            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            self.gguf_writer.add_rope_scaling_factor(float(full_rope.get("factor", 1.0)))
            self.gguf_writer.add_rope_scaling_orig_ctx_len(int(full_rope.get(
                "original_max_position_embeddings",
                rope_params.get("original_max_position_embeddings", hparams["max_position_embeddings"]),
            )))
            # NOTE(review): the YaRN ext factor is written from the same "factor"
            # field as the scaling factor - confirm this is what the loader expects.
            self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("factor", 1.0)))
            self.gguf_writer.add_rope_scaling_yarn_attn_factor(float(full_rope.get("attention_factor", 1.0)))
            self.gguf_writer.add_rope_scaling_yarn_beta_fast(float(full_rope.get("beta_fast", 32.0)))
            self.gguf_writer.add_rope_scaling_yarn_beta_slow(float(full_rope.get("beta_slow", 1.0)))

        self.gguf_writer.add_expert_count(int(hparams["num_experts"]))
        self.gguf_writer.add_expert_used_count(int(hparams["num_experts_per_tok"]))
        self.gguf_writer.add_expert_feed_forward_length(int(hparams["moe_intermediate_size"]))
        if (shared_dim := hparams.get("shared_expert_intermediate_size")) is not None and int(shared_dim) > 0:
            self.gguf_writer.add_expert_shared_feed_forward_length(int(shared_dim))
        if (routing_scale := hparams.get("moe_routed_scaling_factor")) is not None:
            self.gguf_writer.add_expert_weights_scale(float(routing_scale))
        self.gguf_writer.add_expert_weights_norm(True)
        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)

        # Count the run of "dense" MLP layers at the start of the stack.
        leading_dense = 0
        for mlp_type in hparams.get("mlp_layer_types") or []:
            if mlp_type != "dense":
                break
            leading_dense += 1
        self.gguf_writer.add_uint32(f"{arch}.leading_dense_block_count", leading_dense)

    def set_vocab(self) -> None:
        """Write the vocabulary, then the Laguna end-of-turn id and chat template."""
        super().set_vocab()
        if isinstance(eos_token_id := self.hparams.get("eos_token_id"), list) and len(eos_token_id) > 1:
            # Poolside uses token 24 (</assistant>) as a turn boundary.
            self.gguf_writer.add_eot_token_id(int(eos_token_id[1]))
        # Use a standalone chat_template.jinja when the checkpoint ships one.
        template_file = self.dir_model / "chat_template.jinja"
        if template_file.is_file():
            self.gguf_writer.add_chat_template(template_file.read_text(encoding="utf-8"))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to zero or more GGUF tensors.

        Args:
            data_torch: the tensor as loaded from the checkpoint.
            name: its checkpoint name.
            bid: block index, or None when the tensor is not tied to a block.

        Yields:
            ``(gguf_name, tensor)`` pairs. Per-expert FFN weights yield nothing
            until every expert of the block has been seen; then the three
            stacked projections are emitted at once.
        """
        if bid is not None and name in (
            f"model.layers.{bid}.mlp.experts.e_score_correction_bias",
            f"model.layers.{bid}.mlp.experts.e_score_correction",
        ):
            # The C++ loader asks for this tensor through the ".bias" suffix.
            # Keep the Laguna converter aligned with existing community GGUFs.
            yield f"blk.{bid}.exp_probs_b.bias", data_torch
            return

        if name.endswith(".self_attn.g_proj.weight"):
            # HF stores the head-wise attention gate with a singleton dimension.
            data_torch = data_torch.squeeze().contiguous()

        if bid is not None and re.match(r"model\.layers\.\d+\.mlp\.experts\.\d+\.(gate_proj|up_proj|down_proj)\.weight$", name):
            n_experts = int(self.find_hparam(["num_experts"]))
            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch
            # Wait until all three projections of every expert have arrived.
            if len(self._experts[bid]) < n_experts * 3:
                return

            for w_name in ("down_proj", "gate_proj", "up_proj"):
                datas: list[Tensor] = []
                for xid in range(n_experts):
                    ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                    datas.append(self._experts[bid][ename])
                    del self._experts[bid][ename]

                # Stack along a new leading expert dimension.
                merged = torch.stack(datas, dim=0)
                merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
                yield from super().modify_tensors(merged, merged_name, bid)
            return

        yield from super().modify_tensors(data_torch, name, bid)

    def prepare_tensors(self):
        """Run the base conversion, then fail if any expert weights were left unmerged.

        Raises:
            ValueError: if a block still holds staged per-expert tensors.
        """
        super().prepare_tensors()
        if self._experts is not None:
            experts = [k for d in self._experts for k in d.keys()]
            if experts:
                raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@ -116,11 +116,17 @@ class Keys:
|
||||
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
|
||||
SLIDING_WINDOW = "{arch}.attention.sliding_window"
|
||||
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
|
||||
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
|
||||
KEY_LENGTH_SWA = "{arch}.attention.key_length_swa"
|
||||
VALUE_LENGTH_SWA = "{arch}.attention.value_length_swa"
|
||||
OUTPUT_SCALE = "{arch}.attention.output_scale"
|
||||
TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
|
||||
|
||||
class Rope:
|
||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa"
|
||||
DIMENSION_COUNT_PER_LAYER = "{arch}.rope.dimension_count_per_layer"
|
||||
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
||||
FREQ_BASE = "{arch}.rope.freq_base"
|
||||
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
|
||||
SCALING_TYPE = "{arch}.rope.scaling.type"
|
||||
@ -162,6 +168,7 @@ class Keys:
|
||||
MASK_ID = "tokenizer.ggml.mask_token_id"
|
||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||
ADD_SEP = "tokenizer.ggml.add_sep_token"
|
||||
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
|
||||
REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
|
||||
PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
|
||||
@ -262,6 +269,7 @@ class MODEL_ARCH(IntEnum):
|
||||
MINIMAXM2 = auto()
|
||||
SMOLLM3 = auto()
|
||||
SEED_OSS = auto()
|
||||
LAGUNA = auto()
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD = auto()
|
||||
@ -282,6 +290,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
ATTN_NORM_2 = auto()
|
||||
ATTN_OUT_NORM = auto()
|
||||
ATTN_POST_NORM = auto()
|
||||
ATTN_GATE = auto()
|
||||
ATTN_ROT_EMBD = auto()
|
||||
FFN_GATE_INP = auto()
|
||||
FFN_GATE_INP_SHEXP = auto()
|
||||
@ -429,6 +438,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.MINIMAXM2: "minimax-m2",
|
||||
MODEL_ARCH.SMOLLM3: "smollm3",
|
||||
MODEL_ARCH.SEED_OSS: "seed_oss",
|
||||
MODEL_ARCH.LAGUNA: "laguna",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@ -453,6 +463,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
|
||||
MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate",
|
||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
@ -1489,6 +1500,32 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
],
|
||||
MODEL_ARCH.LAGUNA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_GATE,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@ -214,6 +214,10 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.post_attention_layernorm", # gemma2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_GATE: (
|
||||
"model.layers.{bid}.self_attn.g_proj", # laguna
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||
@ -280,6 +284,8 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
|
||||
"model.layers.{bid}.mlp.gate.expert_bias", # bailingmoe2
|
||||
"model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
|
||||
"model.layers.{bid}.mlp.experts.e_score_correction_bias", # laguna
|
||||
"model.layers.{bid}.mlp.experts.e_score_correction", # laguna
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
|
||||
@ -79,6 +79,7 @@ add_library(llama
|
||||
graphs/build_mpt.cpp
|
||||
graphs/build_stablelm.cpp
|
||||
graphs/build_seedoss.cpp
|
||||
graphs/build_laguna.cpp
|
||||
graphs/build_step35.cpp
|
||||
graphs/build_qwen.cpp
|
||||
graphs/build_qwen2.cpp
|
||||
|
||||
143
src/graphs/build_laguna.cpp
Normal file
143
src/graphs/build_laguna.cpp
Normal file
@ -0,0 +1,143 @@
|
||||
#include "../llama-build-context.h"
|
||||
#include "../llama-model.h"
|
||||
#include "../llama-context.h"
|
||||
|
||||
// Builds the compute graph for the Laguna architecture.
//
// Each layer runs: RMS norm -> Q/K/V projections -> optional per-head Q/K RMS
// norm -> RoPE -> attention -> optional head-wise softplus gate -> output
// projection -> residual add -> dense FFN or MoE FFN. Layers flagged in
// hparams.swa_layers use the sliding-window mask and their own RoPE settings.
ggml_cgraph * llm_build_context::build_laguna() {
    ggml_cgraph * graph = new_graph_custom();

    ggml_tensor * hidden    = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
    ggml_tensor * positions = build_inp_pos();
    ggml_tensor * out_ids   = build_inp_out_ids();
    ggml_tensor * mask_full = build_inp_KQ_mask();
    ggml_tensor * mask_swa  = build_inp_KQ_mask_swa();

    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];

        const bool sliding = hparams.swa_layers[il];
        // Window size handed to the attention helper; 0 on full-attention layers.
        int window = 0;
        if (sliding) {
            window = hparams.n_swa;
        }

        ggml_tensor * residual = hidden;

        ggml_tensor * cur = llm_build_norm(ctx0, hidden, hparams, layer.attn_norm, nullptr, LLM_NORM_RMS, cb, il);
        cb(cur, "attn_norm", il);
        // Kept for the attention gate, which is projected from the same normed input.
        ggml_tensor * normed = cur;

        ggml_tensor * q = llm_build_lora_mm(lctx, ctx0, layer.wq, cur);
        cb(q, "Qcur", il);
        ggml_tensor * k = llm_build_lora_mm(lctx, ctx0, layer.wk, cur);
        cb(k, "Kcur", il);
        ggml_tensor * v = llm_build_lora_mm(lctx, ctx0, layer.wv, cur);
        cb(v, "Vcur", il);
        ggml_build_forward_expand(graph, q);
        ggml_build_forward_expand(graph, k);
        ggml_build_forward_expand(graph, v);

        // Head counts and head sizes are queried per layer.
        const int64_t heads     = hparams.n_head(il);
        const int64_t heads_kv  = hparams.n_head_kv(il);
        const int64_t head_dim_k = hparams.n_embd_head_k(il);
        const int64_t head_dim_v = hparams.n_embd_head_v(il);

        q = ggml_reshape_3d(ctx0, q, head_dim_k, heads, n_tokens);
        k = ggml_reshape_3d(ctx0, k, head_dim_k, heads_kv, n_tokens);

        if (layer.attn_q_norm) {
            q = llm_build_norm(ctx0, q, hparams, layer.attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
            cb(q, "Qcur_normed", il);
            ggml_build_forward_expand(graph, q);
        }
        if (layer.attn_k_norm) {
            k = llm_build_norm(ctx0, k, hparams, layer.attn_k_norm, nullptr, LLM_NORM_RMS, cb, il);
            cb(k, "Kcur_normed", il);
            ggml_build_forward_expand(graph, k);
        }

        // RoPE settings: full-attention layers follow the context parameters,
        // sliding-window layers use the SWA training base/scale with fixed
        // YaRN constants and no rope-factor tensor.
        float rope_base   = cparams.rope_freq_base;
        float rope_scale  = cparams.rope_freq_scale;
        float rope_ext    = ext_factor;
        float rope_attn   = attn_factor;
        float rope_fast   = beta_fast;
        float rope_slow   = beta_slow;
        ggml_tensor * rope_factors = nullptr;
        if (sliding) {
            rope_base  = hparams.rope_freq_base_train_swa;
            rope_scale = hparams.rope_freq_scale_train_swa;
            rope_ext   = 0.0f;
            rope_attn  = 1.0f;
            rope_fast  = 32.0f;
            rope_slow  = 1.0f;
        } else {
            rope_factors = build_rope_factors(il);
        }
        const int rot_dims = hparams.rope_n_rot(il);

        q = ggml_rope_ext(ctx0, q, positions, rope_factors,
                rot_dims, rope_type, n_ctx_orig, rope_base, rope_scale,
                rope_ext, rope_attn, rope_fast, rope_slow);
        k = ggml_rope_ext(ctx0, k, positions, rope_factors,
                rot_dims, rope_type, n_ctx_orig, rope_base, rope_scale,
                rope_ext, rope_attn, rope_fast, rope_slow);
        cb(q, "Qcur_roped", il);
        cb(k, "Kcur_roped", il);

        ggml_tensor * layer_mask = sliding ? mask_swa : mask_full;
        const float kq_scale = 1.0f / sqrtf(float(head_dim_k));

        // No output projection is passed here; wo is applied after the gate below.
        cur = llm_build_kv(ctx0, lctx, kv_self, graph,
                nullptr, nullptr,
                k, v, q,
                layer_mask,
                n_tokens, kv_head, n_kv,
                kq_scale, cb, il, nullptr, window);
        cb(cur, "attn_out", il);

        if (layer.wqkv_gate) {
            ggml_tensor * gate = llm_build_lora_mm(lctx, ctx0, layer.wqkv_gate, normed);
            cb(gate, "attn_gate", il);
            gate = ggml_softplus(ctx0, gate);
            cb(gate, "attn_gate_softplus", il);

            // One gate value per head, broadcast over that head's value dimension.
            ggml_tensor * attn_heads = ggml_reshape_3d(ctx0, cur, head_dim_v, heads, n_tokens);
            ggml_tensor * gate_heads = ggml_reshape_3d(ctx0, gate, 1, heads, n_tokens);
            cb(gate_heads, "attn_gate_3d", il);

            cur = ggml_mul(ctx0, attn_heads, gate_heads);
            cb(cur, "attn_gated_3d", il);
            cur = ggml_reshape_2d(ctx0, cur, head_dim_v * heads, n_tokens);
            cb(cur, "attn_gated", il);
        }

        cur = llm_build_lora_mm(lctx, ctx0, layer.wo, cur);
        cb(cur, "attn_proj", il);

        // On the last layer keep only the rows selected by the output ids.
        const bool last_layer = il == n_layer - 1;
        if (last_layer && out_ids) {
            cur      = ggml_get_rows(ctx0, cur, out_ids);
            residual = ggml_get_rows(ctx0, residual, out_ids);
        }

        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, residual);
        cb(ffn_inp, "ffn_inp", il);

        const bool dense_ffn = layer.ffn_gate_inp == nullptr;
        if (dense_ffn) {
            // No router tensor: plain gated SiLU FFN with an explicit residual add.
            cur = llm_build_ffn(ctx0, lctx, layer.ffn_norm, ffn_inp,
                    layer.ffn_up,   nullptr, nullptr,
                    layer.ffn_gate, nullptr, nullptr,
                    layer.ffn_down, nullptr, nullptr,
                    nullptr,
                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
            cb(cur, "ffn_out", il);
            cur = ggml_add(ctx0, cur, ffn_inp);
        } else {
            // NOTE(review): no residual add follows this call, so the helper
            // presumably adds ffn_inp itself (trailing `true`?) - confirm.
            cur = llm_build_std_moe_ffn(ctx0, lctx, layer.ffn_norm, ffn_inp,
                    layer.ffn_gate_inp,  layer.ffn_gate_inp_b,
                    layer.ffn_up_exps,   layer.ffn_up_exps_b,
                    layer.ffn_gate_exps, layer.ffn_gate_exps_b,
                    layer.ffn_down_exps, layer.ffn_down_exps_b,
                    layer.ffn_exp_probs_b,
                    layer.ffn_up_shexp,   nullptr,
                    layer.ffn_gate_shexp, nullptr,
                    layer.ffn_down_shexp, nullptr,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU, hparams.expert_weights_norm, hparams.expert_weights_scale != 0.0f, hparams.expert_weights_scale,
                    static_cast<llm_expert_gating_func_type>(hparams.expert_gating_func),
                    LLM_FFN_SILU, cb, il, graph, true, layer.ffn_up_gate_exps);
        }

        cur = lctx.cvec.apply_to(ctx0, cur, il);
        cb(cur, "l_out", il);

        hidden = cur;
    }

    ggml_tensor * logits = build_output(lctx, ctx0, hidden, model.output, model.output_norm, cb);
    cb(logits, "result_output", -1);

    ggml_build_forward_expand(graph, logits);

    return graph;
}
|
||||
@ -76,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_MIMO2, "mimo2" },
|
||||
{ LLM_ARCH_SEED_OSS, "seed_oss" },
|
||||
{ LLM_ARCH_STEP35, "step35" },
|
||||
{ LLM_ARCH_LAGUNA, "laguna" },
|
||||
{ LLM_ARCH_GLM_DSA, "glm-dsa" },
|
||||
{ LLM_ARCH_MISTRAL4, "mistral4" },
|
||||
{ LLM_ARCH_GEMMA4, "gemma4" },
|
||||
|
||||
@ -75,6 +75,7 @@ enum llm_arch {
|
||||
LLM_ARCH_MIMO2,
|
||||
LLM_ARCH_SEED_OSS,
|
||||
LLM_ARCH_STEP35,
|
||||
LLM_ARCH_LAGUNA,
|
||||
LLM_ARCH_GLM_DSA,
|
||||
LLM_ARCH_MISTRAL4,
|
||||
LLM_ARCH_GEMMA4,
|
||||
|
||||
@ -1110,7 +1110,10 @@ llm_expert_gating_func_type gating_op,
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
|
||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
||||
|
||||
if (lctx.model.arch == LLM_ARCH_BAILINGMOE2 || lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
if (lctx.model.arch == LLM_ARCH_LAGUNA) {
|
||||
weights_sum = ggml_clamp(ctx, weights_sum, 6.103515625e-5f, INFINITY);
|
||||
cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
|
||||
} else if (lctx.model.arch == LLM_ARCH_BAILINGMOE2 || lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
weights_sum = ggml_scale_bias(ctx, weights_sum, 1.0, 1e-20);
|
||||
cb(weights_sum, "ffn_moe_weights_sum_biased", il);
|
||||
}
|
||||
@ -1595,6 +1598,7 @@ static ggml_tensor * llm_build_kqv(
|
||||
|| model.arch == LLM_ARCH_COMMAND_R
|
||||
|| model.arch == LLM_ARCH_GLM4
|
||||
|| model.arch == LLM_ARCH_GLM4_MOE
|
||||
|| model.arch == LLM_ARCH_LAGUNA
|
||||
|| model.arch == LLM_ARCH_MIMO2;
|
||||
// || (model.arch == LLM_ARCH_DEEPSEEK2 && q->ne[1] <= 8);
|
||||
|
||||
@ -2507,6 +2511,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
|
||||
{
|
||||
result = llm.build_step35();
|
||||
} break;
|
||||
case LLM_ARCH_LAGUNA:
|
||||
{
|
||||
result = llm.build_laguna();
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
@ -318,6 +318,8 @@ struct llm_build_context {
|
||||
|
||||
ggml_cgraph * build_seedoss();
|
||||
|
||||
ggml_cgraph * build_laguna();
|
||||
|
||||
ggml_cgraph * build_step35();
|
||||
|
||||
//
|
||||
|
||||
@ -1368,6 +1368,65 @@ void llm_load_hparams(
|
||||
hparams.rope_freq_base_per_layer, hparams.n_layer, false);
|
||||
GGML_ASSERT(hparams.has_rope_freq_base_per_layer || have_rfb_train_swa);
|
||||
} break;
|
||||
case LLM_ARCH_LAGUNA:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_TYPE_NONE;
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
|
||||
// Older Laguna GGUFs encode one shared expert through the shared FFN length.
|
||||
if (hparams.n_expert_shared == 0 && hparams.n_ff_shexp > 0) {
|
||||
hparams.n_expert_shared = 1;
|
||||
}
|
||||
if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
|
||||
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
if (!ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false)) {
|
||||
// Laguna XS.2 alternates full-attention and SWA layers via per-layer head counts.
|
||||
const uint32_t n_head_full = hparams.n_head(0);
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.swa_layers[i] = hparams.n_head(i) != n_head_full;
|
||||
}
|
||||
}
|
||||
|
||||
// GGUF stores the Poolside partial-rotary setting; the graph RoPE
|
||||
// argument for full-attention Laguna layers follows the upstream
|
||||
// Laguna loader and uses half of that count. SWA layers remain
|
||||
// full-head rotary via n_rot_swa.
|
||||
hparams.n_rot /= 2;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
|
||||
if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer, false)) {
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? hparams.n_rot_swa : hparams.n_rot;
|
||||
}
|
||||
} else {
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
if (!hparams.swa_layers[i]) {
|
||||
hparams.rope_dim_per_layer[i] /= 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 40: model.type = e_model::MODEL_33B_A3B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GLM_DSA:
|
||||
{
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
|
||||
@ -82,7 +82,6 @@ struct llama_hparams {
|
||||
float yarn_attn_factor = 1.0f;
|
||||
float yarn_beta_fast = 32.0f;
|
||||
float yarn_beta_slow = 1.0f;
|
||||
|
||||
std::array<int, 4> rope_sections;
|
||||
std::array<float, LLAMA_MAX_LAYERS> rope_freq_base_per_layer;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> rope_dim_per_layer;
|
||||
|
||||
@ -4370,6 +4370,8 @@ bool create_tensors_helper::create_tensors() {
|
||||
use_mmap_buffer = create_seedoss_tensors(tn); break;
|
||||
case LLM_ARCH_STEP35:
|
||||
use_mmap_buffer = create_step35_tensors(tn); break;
|
||||
case LLM_ARCH_LAGUNA:
|
||||
use_mmap_buffer = create_step35_tensors(tn); break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
||||
@ -1605,6 +1605,38 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_LAGUNA,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
|
||||
{ LLM_TENSOR_ROPE_FACTORS_SHORT,"rope_factors_short" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GLM_DSA,
|
||||
{
|
||||
@ -1892,6 +1924,7 @@ const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_16B_A1B: return "16B.A1B";
|
||||
case MODEL_21B_A3B: return "21B.A3B";
|
||||
case MODEL_30B_A3B: return "30B.A3B";
|
||||
case MODEL_33B_A3B: return "33B.A3B";
|
||||
case MODEL_35B_A3B: return "35B.A3B";
|
||||
case MODEL_80B_A3B: return "80B.A3B";
|
||||
case MODEL_80B_A13B: return "80B.A13B";
|
||||
|
||||
@ -109,6 +109,7 @@ enum e_model {
|
||||
MODEL_16B_A1B,
|
||||
MODEL_21B_A3B, // Ernie MoE small
|
||||
MODEL_30B_A3B,
|
||||
MODEL_33B_A3B,
|
||||
MODEL_35B_A3B,
|
||||
MODEL_80B_A3B, // Qwen3-Next
|
||||
MODEL_80B_A13B,
|
||||
|
||||
@ -1948,9 +1948,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
} else if (tokenizer_pre == "default") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
// Laguna reuses the Llama-3 byte-level BPE pre-tokenizer shape.
|
||||
tokenizer_pre == "llama3" ||
|
||||
tokenizer_pre == "llama-v3" ||
|
||||
tokenizer_pre == "llama-bpe"||
|
||||
tokenizer_pre == "laguna" ||
|
||||
tokenizer_pre == "falcon3" ||
|
||||
tokenizer_pre == "falcon-h1" ||
|
||||
tokenizer_pre == "pixtral" ||
|
||||
|
||||
@ -3070,6 +3070,7 @@ static bool is_model_split_supported(const llama_model & model) {
|
||||
LLM_ARCH_MINIMAX_M2,
|
||||
LLM_ARCH_SEED_OSS,
|
||||
LLM_ARCH_STEP35,
|
||||
LLM_ARCH_LAGUNA,
|
||||
//LLM_ARCH_QWEN3NEXT,
|
||||
LLM_ARCH_QWEN35,
|
||||
LLM_ARCH_QWEN35MOE,
|
||||
@ -6832,7 +6833,6 @@ struct llama_context * llama_init_from_model(
|
||||
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
||||
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
|
||||
|
||||
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
||||
@ -7431,6 +7431,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_SEED_OSS:
|
||||
case LLM_ARCH_STEP35:
|
||||
case LLM_ARCH_LAGUNA:
|
||||
case LLM_ARCH_GEMMA4:
|
||||
case LLM_ARCH_GEMMA4_MTP:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user