Gemma4 E2B/E4B tweaks (#1947)

* Gemma4 E2B/E4B tweaks
* A few more named nodes
2026-06-28 04:30:15 -05:00 · 2026-06-10 15:28:54 +02:00 · 2026-06-10 15:28:54 +02:00 · c0d25e8fa1
commit c0d25e8fa1
parent 4a1e2eaa69
2 changed files with 11 additions and 12 deletions
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@@ -687,15 +687,14 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
 static ggml_tensor * gemma4_project_per_layer_inputs(ggml_context * ctx0, const llama_model & model, const llm_build_cb & cb,
        int n_embd, int n_embd_per_layer, int n_layer, int n_tokens,
        ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
-    const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
    const float per_layer_input_scale      = 1.0f / sqrtf(2.0f);

    ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
-    per_layer_proj               = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
+    cb(per_layer_proj, "per_layer_proj", -1);
    per_layer_proj               = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_per_layer, n_layer, n_tokens);
    per_layer_proj               = llm_build_context::llm_build_norm(ctx0, per_layer_proj, model.hparams,
-            model.per_layer_proj_norm, nullptr, LLM_NORM_RMS, cb, -1);  // [n_embd_per_layer, n_layer, n_tokens]
-    cb(per_layer_proj, "per_layer_proj", -1);
+            model.per_layer_proj_norm, nullptr, LLM_NORM_RMS, cb, -1, 1.0f*n_embd);  // [n_embd_per_layer, n_layer, n_tokens]
+    cb(per_layer_proj, "per_layer_proj_normed", -1);

    inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
    inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
@@ -902,19 +901,19 @@ ggml_cgraph * llm_build_context::build_gemma4() {
            ggml_tensor * pe_in = cur;
            cb(cur, "pe_in", il);

-            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].per_layer_inp_gate, cur); // [n_embd_per_layer, n_tokens]
-            cur = ggml_gelu(ctx0, cur);
            ggml_tensor * inp_this_layer = ggml_view_2d(ctx0, inp_per_layer, inp_per_layer->ne[0], inp_per_layer->ne[1],
                    ggml_row_size(inp_per_layer->type, inp_per_layer->ne[0]),
                    il*inp_per_layer->ne[0]*inp_per_layer->ne[1]*ggml_element_size(inp_per_layer)); // [n_embd_per_layer, n_tokens]
-
-            // TODO @ngxson : improve this
            if (il == n_layer - 1 && inp_out_ids) {
                inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids);
            }

-            cur = ggml_mul(ctx0, cur, inp_this_layer);
+            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].per_layer_inp_gate, cur); // [n_embd_per_layer, n_tokens]
+            cb(cur, "cur_gated", il);
+            cur = ggml_fused_mul_unary(ctx0, cur, inp_this_layer, GGML_UNARY_OP_GELU);
+            cb(cur, "cur_gelu", il);
            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].per_layer_proj, cur); // [n_embd, n_tokens]
+            cb(cur, "cur_proj", il);
            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].per_layer_post_norm, nullptr, LLM_NORM_RMS, cb, il);
            cb(cur, "per_layer_embd_out", il);

@@ -950,6 +949,7 @@ ggml_cgraph * llm_build_context::build_gemma4() {
    cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);

    if (hparams.f_final_logit_softcapping > 0) {
+        cb(cur, "result_pre_softcap", -1);
        cur = ggml_softcap(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping, hparams.f_final_logit_softcapping);
    }

--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -843,16 +843,15 @@ ggml_tensor * llm_build_context::llm_build_ffn(
        }
        if (down) {
            cur = llm_build_lora_mm(lctx, ctx, down, cur);
+            cb(cur, "ffn_down", il);
            if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) {
                // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
                ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
            }
        }
-        if (down_b) {
-            cb(cur, "ffn_down", il);
-        }
        if (down_b) {
            cur = ggml_add(ctx, cur, down_b);
+            cb(cur, "ffn_down_b", il);
        }
        if (down_s) {
            cur = ggml_mul(ctx, cur, down_s);