diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ff6fe0be..74568260 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -168,6 +168,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" }, { LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" }, { LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" }, + { LLM_KV_ATTENTION_VALUE_SCALE, "%s.attention.value_scale" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 20d6085c..1c5a6d99 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -161,6 +161,7 @@ enum llm_kv { LLM_KV_ATTENTION_SHARED_KV_LAYERS, LLM_KV_ATTENTION_KEY_LENGTH_SWA, LLM_KV_ATTENTION_VALUE_LENGTH_SWA, + LLM_KV_ATTENTION_VALUE_SCALE, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT_SWA, diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 087114be..df265893 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -2575,6 +2575,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens split_wv, bv ? bv->splits[id] : nullptr, the_q_norm, the_k_norm, f_attn_scale, il, add_graph_split); Qcur = Q; Kcur = K; Vcur = V; + if (model.arch == LLM_ARCH_MIMO2 && std::abs(model.hparams.f_attn_v_scale - 1) > 1e-4f) { + Vcur = ggml_scale(ctx0, Vcur, model.hparams.f_attn_v_scale); + cb(Vcur, "Vcur_scales", il_cb); + } } auto rope_factors = rope_factors_in; if (rope_factors) { @@ -2781,6 +2785,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens model.layers[il].wq, model.layers[il].bq, model.layers[il].wk, model.layers[il].bk, model.layers[il].wv, model.layers[il].bv, model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il); Qcur = Q; Kcur = K; Vcur = V; + if (model.arch == LLM_ARCH_MIMO2 && std::abs(model.hparams.f_attn_v_scale - 1) > 1e-4f) { + Vcur = ggml_scale(ctx0, Vcur, model.hparams.f_attn_v_scale); + cb(Vcur, "Vcur_scales", il); + } if (model.arch == LLM_ARCH_GEMMA4) { Vcur = ggml_reshape_3d(ctx0, Vcur, model.hparams.n_embd_head_v(il), model.hparams.n_head_kv(il), n_tokens); Vcur = ggml_rms_norm(ctx0, Vcur, model.hparams.f_norm_rms_eps); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 7053952c..7d2c4b7c 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -1248,6 +1248,7 @@ void llm_load_hparams( ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); + ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, hparams.f_attn_v_scale, false); //TODO //hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; // which is the same as OpenAI ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 7202ea84..125a6b66 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -105,6 +105,7 @@ struct llama_hparams { float f_residual_scale = 0.0f; float f_embedding_scale = 0.0f; float f_attention_scale = 0.0f; + float f_attn_v_scale = 1.0f; // grok-2 float f_attn_out_scale = 0.0f;