feat: map Gemma 4 tensor and support with imatrix (#1796)

2026-06-28 04:30:15 -05:00 · 2026-05-14 03:01:24 -03:00 · 2026-05-14 03:01:24 -03:00 · 0fcffdb64d
commit 0fcffdb64d
parent b2e7f7f6cd
3 changed files with 329 additions and 16 deletions
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -26,6 +26,129 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+uint32_t llama_mtp_state_n_embd(const struct llama_context * ctx);
+void llama_set_mtp_target_context(struct llama_context * ctx, struct llama_context * target_ctx);
+
+// Loads the model from the HF repo/file pair if both are set, else from model_url if set, else from params.model.
+static llama_model * ik_load_model_from_params(const gpt_params & params, const llama_model_params & mparams) {
+    const char * path  = params.model.c_str();
+    const char * token = params.hf_token.c_str();
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        return llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), path, token, mparams);
+    }
+    return params.model_url.empty() ? llama_model_load_from_file(path, mparams)
+                                    : llama_load_model_from_url(params.model_url.c_str(), path, token, mparams);
+}
+
+// Reports whether the "general.architecture" metadata value of `model` is present and equals `expected_arch`.
+static bool ik_model_has_arch(const llama_model * model, const char * expected_arch) {
+    char value[64] = { 0 };
+    return llama_model_meta_val_str(model, "general.architecture", value, sizeof(value)) > 0 && std::string(value).compare(expected_arch) == 0;
+}
+
+// Creates a llama_context for an already-loaded `model` and applies the optional setup requested in
+// `params` (offload policy, control vectors, LoRA adapters, EOS bias, DRY window, warmup run).
+// Every failure path frees `model` (and the context, once created) and returns before model/context are set.
+static llama_init_result ik_init_from_loaded_model(llama_model * model, gpt_params & params) {
+    llama_init_result iparams;
+    if (model == nullptr) {
+        return iparams; // nothing was created, so there is nothing to free
+    }
+
+    auto cparams = common_context_params_to_llama(params);
+    llama_context * lctx = llama_init_from_model(model, cparams);
+    if (lctx == nullptr) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model); // the model is released here and on every later failure path
+        return iparams;
+    }
+
+    for (auto [op, on_off] : params.offload_policy) {
+        llama_set_offload_policy(lctx, op, on_off);
+    }
+    // control vectors: non-positive layer bounds are replaced by 1 and llama_n_layer(model) in `params`
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
+
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) { // n_embd == -1 is treated as a load failure
+            llama_free(lctx);
+            llama_free_model(model);
+            return iparams;
+        }
+
+        const int err = llama_control_vector_apply(lctx,
+                                                   cvec.data.data(),
+                                                   cvec.data.size(),
+                                                   cvec.n_embd,
+                                                   params.control_vector_layer_start,
+                                                   params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return iparams;
+        }
+    }
+    // load each requested LoRA adapter; the first adapter that fails to load aborts the whole init
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            llama_free(lctx);
+            llama_free_model(model);
+            return iparams;
+        }
+        iparams.lora_adapters.push_back(loaded_la);
+    }
+    if (!params.lora_init_without_apply) { // otherwise the adapters stay loaded but are not applied here
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+    }
+
+    if (params.ignore_eos) {
+        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+    }
+
+    if (params.sparams.dry_penalty_last_n == -1) {
+        LOG("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sparams.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.warmup) {
+        LOG("warming up the model with an empty run\n");
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        if (bos != -1) { // the warmup batch is seeded with BOS, or with EOS when bos == -1
+            tmp.push_back(bos);
+        } else {
+            tmp.push_back(eos);
+        }
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear(); // after encoding, the decoder is fed only its start token
+            tmp.push_back(decoder_start_token_id);
+        }
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
+        llama_kv_cache_clear(lctx); // the warmup run's cache contents and timings are discarded
+        llama_synchronize(lctx);
+        llama_reset_timings(lctx);
+    }
+
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
 static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);

@ -638,7 +761,75 @@ static void process_logits(
    }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+// Derives the parameter set for the draft model from the target's `params`: starts from a full copy,
+// points it at params.speculative.model, installs the imatrix eval callback, and substitutes the
+// params.speculative overrides that are set.
+static gpt_params build_draft_imatrix_params(const gpt_params & params) {
+    const auto & spec = params.speculative;
+    gpt_params dft = params;
+
+    // the draft model comes from the speculative model path only; remote sources are cleared
+    dft.model = spec.model;
+    dft.hf_file.clear();
+    dft.hf_repo.clear();
+    dft.model_url.clear();
+
+    // install ik_collect_imatrix as the eval callback; MTP is flagged on and warmup is disabled
+    dft.cb_eval           = ik_collect_imatrix;
+    dft.cb_eval_user_data = nullptr;
+    dft.has_mtp           = true;
+    dft.warmup            = false;
+
+    // context size and GPU layer count fall back to the target's values when not overridden
+    dft.n_ctx        = spec.n_ctx > 0 ? spec.n_ctx : params.n_ctx;
+    dft.n_gpu_layers = spec.n_gpu_layers >= 0 ? spec.n_gpu_layers : params.n_gpu_layers;
+
+    // the remaining overrides replace the copied target value only when they are set
+    if (spec.n_threads > 0)         dft.n_threads       = spec.n_threads;
+    if (spec.n_threads_batch > 0)   dft.n_threads_batch = spec.n_threads_batch;
+    if (!spec.devices.empty())      dft.devices         = spec.devices;
+    if (!spec.cache_type_k.empty()) dft.cache_type_k    = spec.cache_type_k;
+    if (!spec.cache_type_v.empty()) dft.cache_type_v    = spec.cache_type_v;
+
+    return dft;
+}
+
+// Replays one batch through the draft context, using the embeddings read from the target context as
+// the draft's input hidden state. Prints to stderr and returns false on missing inputs, width mismatch or decode failure.
+static bool compute_draft_imatrix_batch(
+        llama_context * ctx_tgt,
+        llama_context * ctx_dft,
+        llama_token * draft_tokens,
+        int batch_start,
+        int batch_size,
+        int batch_pos) {
+    const float * tgt_hidden = llama_get_embeddings(ctx_tgt);
+    const int width_tgt = llama_mtp_state_n_embd(ctx_tgt);
+    const int width_dft = llama_mtp_state_n_embd(ctx_dft);
+
+    const bool have_inputs = tgt_hidden != nullptr && width_tgt > 0 && width_dft > 0;
+    if (!have_inputs) {
+        fprintf(stderr, "%s: missing target hidden state for paired draft calibration\n", __func__);
+        return false;
+    }
+    if (width_tgt != width_dft) {
+        fprintf(stderr, "%s: hidden width mismatch between target (%d) and draft (%d)\n",
+                __func__, width_tgt, width_dft);
+        return false;
+    }
+
+    // the draft-generation op type is set only around this decode and reset whether or not it succeeds
+    llama_set_mtp_op_type(ctx_dft, MTP_OP_DRAFT_GEN);
+    llama_set_draft_input_hidden_state(ctx_dft, tgt_hidden);
+    const bool decoded = llama_decode(ctx_dft, llama_batch_get_one(draft_tokens + batch_start, batch_size, batch_pos, 0)) == 0;
+    llama_set_mtp_op_type(ctx_dft, MTP_OP_NONE);
+    if (!decoded) {
+        fprintf(stderr, "%s: paired draft eval failed\n", __func__);
+    }
+    return decoded;
+}
+
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, llama_context * ctx_dft = nullptr) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
    const int n_ctx = llama_n_ctx(ctx);
@ -706,6 +897,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

        // clear the KV cache
        llama_kv_cache_clear(ctx);
+        if (ctx_dft != nullptr) {
+            llama_kv_cache_clear(ctx_dft);
+        }

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@ -725,6 +919,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
                return false;
            }

+            if (ctx_dft != nullptr && !compute_draft_imatrix_batch(ctx, ctx_dft, tokens.data(), batch_start, batch_size, j * n_batch)) {
+                return false;
+            }
+
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

@ -829,14 +1027,83 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    // pass the callback to the backend scheduler
-    // it will be executed for each node during the graph computation
-    params.cb_eval = ik_collect_imatrix;
-    params.cb_eval_user_data = NULL;
-    params.warmup = false;
+    gpt_params target_params = params;
+    target_params.warmup = false;

-    // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    const bool has_draft_model = !params.speculative.model.empty();
+    if (!has_draft_model) {
+        // pass the callback to the backend scheduler
+        // it will be executed for each node during the graph computation
+        target_params.cb_eval = ik_collect_imatrix;
+        target_params.cb_eval_user_data = NULL;
+    }
+
+    llama_init_result llama_init;
+    llama_model * model_dft = nullptr;
+    llama_context * ctx_dft = nullptr;
+    bool use_paired_gemma4_mtp = false;
+
+    if (has_draft_model) {
+        gpt_params draft_params = build_draft_imatrix_params(params);
+        auto mparams_dft = common_model_params_to_llama(draft_params);
+
+        model_dft = ik_load_model_from_params(draft_params, mparams_dft);
+        if (model_dft == nullptr) {
+            fprintf(stderr, "%s : failed to load draft model '%s'\n", __func__, draft_params.model.c_str());
+            llama_backend_free();
+            return 1;
+        }
+
+        if (!llama_model_is_gemma4_mtp_assistant(model_dft)) {
+            fprintf(stderr, "%s : paired imatrix mode currently supports Gemma 4 assistant draft models only\n", __func__);
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        target_params.has_mtp = true;
+        target_params.cb_eval = nullptr;
+        target_params.cb_eval_user_data = nullptr;
+
+        auto mparams_tgt = common_model_params_to_llama(target_params);
+        llama_model * model_tgt = ik_load_model_from_params(target_params, mparams_tgt);
+        if (model_tgt == nullptr) {
+            fprintf(stderr, "%s : failed to load target model '%s'\n", __func__, target_params.model.c_str());
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        if (!ik_model_has_arch(model_tgt, "gemma4")) {
+            fprintf(stderr, "%s : paired imatrix mode currently supports Gemma 4 target models only\n", __func__);
+            llama_free_model(model_tgt);
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        llama_init = ik_init_from_loaded_model(model_tgt, target_params);
+        if (llama_init.model == nullptr || llama_init.context == nullptr) {
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        auto draft_init = ik_init_from_loaded_model(model_dft, draft_params);
+        model_dft = draft_init.model;
+        ctx_dft = draft_init.context;
+        if (model_dft == nullptr || ctx_dft == nullptr) {
+            llama_free(llama_init.context);
+            llama_free_model(llama_init.model);
+            llama_backend_free();
+            return 1;
+        }
+
+        llama_set_mtp_target_context(ctx_dft, llama_init.context);
+        use_paired_gemma4_mtp = true;
+    } else {
+        llama_init = llama_init_from_gpt_params(target_params);
+    }

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
@ -845,6 +1112,10 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    if (!use_paired_gemma4_mtp && llama_model_is_gemma4_mtp_assistant(model) && !params.process_output) {
+        fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> -mtp' for meaningful calibration.\n", __func__);
+    }
+
    const int n_ctx_train = llama_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
@ -857,7 +1128,16 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

-    if (!compute_imatrix(ctx, params)) {
+    if (!compute_imatrix(ctx, params, ctx_dft)) {
+        if (ctx_dft != nullptr) {
+            llama_free(ctx_dft);
+        }
+        if (model_dft != nullptr) {
+            llama_free_model(model_dft);
+        }
+        llama_free(ctx);
+        llama_free_model(model);
+        llama_backend_free();
        return 1;
    }

@ -866,6 +1146,13 @@ int main(int argc, char ** argv) {

    llama_print_timings(ctx);

+    if (ctx_dft != nullptr) {
+        llama_free(ctx_dft);
+    }
+    if (model_dft != nullptr) {
+        llama_free_model(model_dft);
+    }
+
    llama_free(ctx);
    llama_free_model(model);

--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@ -531,10 +531,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {

    GGML_ASSERT(n_backbone > 0);

-    lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch.n_tokens);
-    cb(lctx.inp_tokens, "inp_tokens", -1);
-    ggml_set_input(lctx.inp_tokens);
-
    ggml_tensor * hidden_state = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_backbone, n_tokens);
    ggml_set_name(hidden_state, "inp_mtp_states");
    ggml_set_input(hidden_state);
@ -557,6 +553,10 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
        return gf;
    }

+    lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch.n_tokens);
+    cb(lctx.inp_tokens, "inp_tokens", -1);
+    ggml_set_input(lctx.inp_tokens);
+
    const llama_model   & target_model   = lctx.mtp_target_ctx->model;
    const llama_hparams & target_hparams = target_model.hparams;
    const llama_cparams & target_cparams = lctx.mtp_target_ctx->cparams;
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -2557,6 +2557,18 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
    std::vector<bool> has_layer_norm(n_layer, false);
    size_t ow_size = 0;
    size_t embd_size = 0;
+    size_t output_misc_size = 0;
+    int gemma4_rope_layer = -1;
+
+    if (model.arch == LLM_ARCH_GEMMA4) {
+        for (int il = 0; il < n_layer; ++il) {
+            if (!model.hparams.swa_layers[il]) {
+                gemma4_rope_layer = il;
+                break;
+            }
+        }
+    }
+
    for (int i = 0; i < ml.n_tensors; ++i) {
        auto t = ml.get_weight(i)->tensor;
        std::string name(t->name);
@ -2572,6 +2584,20 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
        if (name == "output_norm.weight") {
            continue;
        }
+        if (model.arch == LLM_ARCH_GEMMA4) {
+            if (name == "per_layer_token_embd.weight" ||
+                name == "per_layer_model_proj.weight" ||
+                name == "per_layer_proj_norm.weight") {
+                output_misc_size += size;
+                continue;
+            }
+            if (name == "rope_freqs.weight") {
+                if (gemma4_rope_layer >= 0) {
+                    result[gemma4_rope_layer] += size;
+                }
+                continue;
+            }
+        }
        if (name == "mtp_pre_proj.weight"  || name == "mtp_post_proj.weight" ||
            name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") {
            continue;
@ -2707,7 +2733,7 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
        }
    }
    if (!ow_size) ow_size = embd_size;
-    result[n_layer] = ow_size;
+    result[n_layer] = ow_size + output_misc_size;
    LLAMA_LOG_INFO("------------------- Layer sizes:\n");
    double tot_model = 0, tot_cache = 0, max_compute = 0;
    for (int il = 0; il < n_layer; ++il) {
@ -3571,7 +3597,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
    const auto & cparams = lctx.cparams;
    const auto & kv_self = lctx.kv_self;

-    if (batch.token) {
+    if (batch.token && lctx.inp_tokens) {
 #if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
 #endif
@ -3645,7 +3671,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        auto tim1 = ggml_time_us();
 #endif
        const int64_t n_tokens = batch.n_tokens;
-        if (n_tokens > 1 && !cparams.mtp) {
+        if (n_tokens > 1 && !cparams.mtp && lctx.n_outputs < n_tokens) {
            GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
        }