Support Mimo-2.5 (#1723)

This commit is contained in:
Kawrakow 2026-05-03 08:16:02 +03:00 committed by GitHub
parent 2dd3818083
commit e76700119d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 12 additions and 0 deletions

View File

@@ -168,6 +168,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
{ LLM_KV_ATTENTION_VALUE_SCALE, "%s.attention.value_scale" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },

View File

@@ -161,6 +161,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_SWA,
LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
LLM_KV_ATTENTION_VALUE_SCALE,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_SWA,

View File

@@ -2575,6 +2575,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
split_wv, bv ? bv->splits[id] : nullptr,
the_q_norm, the_k_norm, f_attn_scale, il, add_graph_split);
Qcur = Q; Kcur = K; Vcur = V;
if (model.arch == LLM_ARCH_MIMO2 && std::abs(model.hparams.f_attn_v_scale - 1) > 1e-4f) {
Vcur = ggml_scale(ctx0, Vcur, model.hparams.f_attn_v_scale);
cb(Vcur, "Vcur_scales", il_cb);
}
}
auto rope_factors = rope_factors_in;
if (rope_factors) {
@@ -2781,6 +2785,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
model.layers[il].wq, model.layers[il].bq, model.layers[il].wk, model.layers[il].bk, model.layers[il].wv, model.layers[il].bv,
model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il);
Qcur = Q; Kcur = K; Vcur = V;
if (model.arch == LLM_ARCH_MIMO2 && std::abs(model.hparams.f_attn_v_scale - 1) > 1e-4f) {
Vcur = ggml_scale(ctx0, Vcur, model.hparams.f_attn_v_scale);
cb(Vcur, "Vcur_scales", il);
}
if (model.arch == LLM_ARCH_GEMMA4) {
Vcur = ggml_reshape_3d(ctx0, Vcur, model.hparams.n_embd_head_v(il), model.hparams.n_head_kv(il), n_tokens);
Vcur = ggml_rms_norm(ctx0, Vcur, model.hparams.f_norm_rms_eps);

View File

@@ -1248,6 +1248,7 @@ void llm_load_hparams(
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, hparams.f_attn_v_scale, false);
//TODO
//hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; // which is the same as OpenAI
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);

View File

@@ -105,6 +105,7 @@ struct llama_hparams {
float f_residual_scale = 0.0f;
float f_embedding_scale = 0.0f;
float f_attention_scale = 0.0f;
float f_attn_v_scale = 1.0f;
// grok-2
float f_attn_out_scale = 0.0f;