# Review notes. These '#' lines sit above the first "diff --git" header and belong to no hunk.
#
# examples/imatrix/imatrix.cpp
#   ik_load_model_from_params    Chooses the loader: HF repo + file when both are set, otherwise the
#                                model URL when set, otherwise the local file in params.model.
#   ik_model_has_arch            Reads the "general.architecture" metadata string into a 64-byte buffer
#                                and compares it with expected_arch; false when the lookup returns <= 0.
#   ik_init_from_loaded_model    Creates a context for an already loaded model, then applies the offload
#                                policy, control vectors and LoRA adapters, and optionally runs a warmup
#                                decode. A null model returns an empty result. Every later failure path
#                                calls llama_free_model(model), so the caller must not free that model
#                                again when the returned model/context are null.
#   build_draft_imatrix_params   Copies params, points the copy at params.speculative.model, clears the
#                                URL/HF fields, applies the speculative overrides that are set, disables
#                                warmup, sets has_mtp and installs ik_collect_imatrix as cb_eval.
#   compute_draft_imatrix_batch  Passes llama_get_embeddings(ctx_tgt) to the draft context as its input
#                                hidden state and decodes the same token window with MTP_OP_DRAFT_GEN.
#                                The op type is reset to MTP_OP_NONE before the return code is checked.
#                                Returns false when the hidden state is missing, a width is <= 0, the
#                                widths differ, or llama_decode returns non-zero.
#   compute_imatrix              New optional ctx_dft argument: its KV cache is cleared with the target's
#                                and each batch is followed by compute_draft_imatrix_batch.
#   main                         With params.speculative.model set, cb_eval is cleared on the target and
#                                installed only through the draft params. All new error paths and the
#                                normal exit free the draft context/model before the target ones.
#
# src/graphs/build_gemma4.cpp
#   build_gemma4_mtp             Creation of lctx.inp_tokens moves from before the hidden-state input to
#                                after the early "return gf;".
#
# src/llama.cpp
#   get_layer_sizes              For LLM_ARCH_GEMMA4 the three per_layer_* tensors are added to the
#                                result[n_layer] slot and rope_freqs.weight to the first layer whose
#                                swa_layers entry is false.
#   llama_set_inputs             Token upload additionally requires lctx.inp_tokens; the inp_out_ids
#                                assert additionally requires lctx.n_outputs < n_tokens.
#
# Restoration notes
#   * The patch text had lost its newlines; it is re-broken here so that every hunk matches its
#     @@ line counts.
#   * "std::vector tmp;" had lost its template argument and is restored to std::vector<llama_token>.
#   * "std::vector has_layer_norm(n_layer, false);" is restored to std::vector<bool> -- ASSUMPTION,
#     confirm against src/llama.cpp.
#   * The function name in the get_layer_sizes hunk headers is still garbled ("std::pair, double>");
#     that text is informational only and is left as found.
#   * The warning text "Use '-m -md -mtp'" may have lost <placeholder> words; it is kept byte-for-byte.
#     NOTE(review): confirm the intended wording.
#   * Context-line indentation, blank-line placement and the break inside the "mtp_pre_proj.weight"
#     condition are reconstructed by convention; run "git apply --check" before relying on this file.
#
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 8bc44587..0279b865 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -26,6 +26,129 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+uint32_t llama_mtp_state_n_embd(const struct llama_context * ctx);
+void llama_set_mtp_target_context(struct llama_context * ctx, struct llama_context * target_ctx);
+
+static llama_model * ik_load_model_from_params(const gpt_params & params, const llama_model_params & mparams) {
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        return llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    }
+    if (!params.model_url.empty()) {
+        return llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    }
+
+    return llama_model_load_from_file(params.model.c_str(), mparams);
+}
+
+static bool ik_model_has_arch(const llama_model * model, const char * expected_arch) {
+    char arch[64] = { 0 };
+    const int32_t len = llama_model_meta_val_str(model, "general.architecture", arch, sizeof(arch));
+    return len > 0 && std::string(arch) == expected_arch;
+}
+
+static llama_init_result ik_init_from_loaded_model(llama_model * model, gpt_params & params) {
+    llama_init_result iparams;
+
+    if (model == nullptr) {
+        return iparams;
+    }
+
+    auto cparams = common_context_params_to_llama(params);
+
+    llama_context * lctx = llama_init_from_model(model, cparams);
+    if (lctx == nullptr) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return iparams;
+    }
+
+    for (auto [op, on_off] : params.offload_policy) {
+        llama_set_offload_policy(lctx, op, on_off);
+    }
+
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return iparams;
+        }
+
+        const int err = llama_control_vector_apply(lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return iparams;
+        }
+    }
+
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            llama_free(lctx);
+            llama_free_model(model);
+            return iparams;
+        }
+        iparams.lora_adapters.push_back(loaded_la);
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+    }
+
+    if (params.ignore_eos) {
+        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+    }
+
+    if (params.sparams.dry_penalty_last_n == -1) {
+        LOG("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sparams.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.warmup) {
+        LOG("warming up the model with an empty run\n");
+
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        if (bos != -1) {
+            tmp.push_back(bos);
+        } else {
+            tmp.push_back(eos);
+        }
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
+        llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
+        llama_reset_timings(lctx);
+    }
+
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
 static void print_usage(int argc, char ** argv, const gpt_params & params) {
     gpt_params_print_usage(argc, argv, params);
 
@@ -638,7 +761,75 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static gpt_params build_draft_imatrix_params(const gpt_params & params) {
+    gpt_params draft_params = params;
+
+    draft_params.model = params.speculative.model;
+    draft_params.model_url.clear();
+    draft_params.hf_repo.clear();
+    draft_params.hf_file.clear();
+    draft_params.n_ctx = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_ctx;
+    draft_params.n_gpu_layers = params.speculative.n_gpu_layers >= 0 ? params.speculative.n_gpu_layers : params.n_gpu_layers;
+    draft_params.has_mtp = true;
+    draft_params.warmup = false;
+    draft_params.cb_eval = ik_collect_imatrix;
+    draft_params.cb_eval_user_data = nullptr;
+
+    if (params.speculative.n_threads > 0) {
+        draft_params.n_threads = params.speculative.n_threads;
+    }
+    if (params.speculative.n_threads_batch > 0) {
+        draft_params.n_threads_batch = params.speculative.n_threads_batch;
+    }
+    if (!params.speculative.devices.empty()) {
+        draft_params.devices = params.speculative.devices;
+    }
+    if (!params.speculative.cache_type_k.empty()) {
+        draft_params.cache_type_k = params.speculative.cache_type_k;
+    }
+    if (!params.speculative.cache_type_v.empty()) {
+        draft_params.cache_type_v = params.speculative.cache_type_v;
+    }
+
+    return draft_params;
+}
+
+static bool compute_draft_imatrix_batch(
+        llama_context * ctx_tgt,
+        llama_context * ctx_dft,
+        llama_token * draft_tokens,
+        int batch_start,
+        int batch_size,
+        int batch_pos) {
+    const float * hidden = llama_get_embeddings(ctx_tgt);
+    const int n_embd_tgt = llama_mtp_state_n_embd(ctx_tgt);
+    const int n_embd_dft = llama_mtp_state_n_embd(ctx_dft);
+
+    if (hidden == nullptr || n_embd_tgt <= 0 || n_embd_dft <= 0) {
+        fprintf(stderr, "%s: missing target hidden state for paired draft calibration\n", __func__);
+        return false;
+    }
+
+    if (n_embd_tgt != n_embd_dft) {
+        fprintf(stderr, "%s: hidden width mismatch between target (%d) and draft (%d)\n",
+                __func__, n_embd_tgt, n_embd_dft);
+        return false;
+    }
+
+    llama_set_mtp_op_type(ctx_dft, MTP_OP_DRAFT_GEN);
+    llama_set_draft_input_hidden_state(ctx_dft, hidden);
+    const int ret = llama_decode(ctx_dft, llama_batch_get_one(draft_tokens + batch_start, batch_size, batch_pos, 0));
+    llama_set_mtp_op_type(ctx_dft, MTP_OP_NONE);
+
+    if (ret != 0) {
+        fprintf(stderr, "%s: paired draft eval failed\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, llama_context * ctx_dft = nullptr) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);
@@ -706,6 +897,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
         // clear the KV cache
         llama_kv_cache_clear(ctx);
+        if (ctx_dft != nullptr) {
+            llama_kv_cache_clear(ctx_dft);
+        }
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
@@ -725,6 +919,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
                 return false;
             }
 
+            if (ctx_dft != nullptr && !compute_draft_imatrix_batch(ctx, ctx_dft, tokens.data(), batch_start, batch_size, j * n_batch)) {
+                return false;
+            }
+
             // restore the original token in case it was set to BOS
             tokens[batch_start] = token_org;
 
@@ -829,14 +1027,83 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    // pass the callback to the backend scheduler
-    // it will be executed for each node during the graph computation
-    params.cb_eval = ik_collect_imatrix;
-    params.cb_eval_user_data = NULL;
-    params.warmup = false;
+    gpt_params target_params = params;
+    target_params.warmup = false;
 
-    // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    const bool has_draft_model = !params.speculative.model.empty();
+    if (!has_draft_model) {
+        // pass the callback to the backend scheduler
+        // it will be executed for each node during the graph computation
+        target_params.cb_eval = ik_collect_imatrix;
+        target_params.cb_eval_user_data = NULL;
+    }
+
+    llama_init_result llama_init;
+    llama_model * model_dft = nullptr;
+    llama_context * ctx_dft = nullptr;
+    bool use_paired_gemma4_mtp = false;
+
+    if (has_draft_model) {
+        gpt_params draft_params = build_draft_imatrix_params(params);
+        auto mparams_dft = common_model_params_to_llama(draft_params);
+
+        model_dft = ik_load_model_from_params(draft_params, mparams_dft);
+        if (model_dft == nullptr) {
+            fprintf(stderr, "%s : failed to load draft model '%s'\n", __func__, draft_params.model.c_str());
+            llama_backend_free();
+            return 1;
+        }
+
+        if (!llama_model_is_gemma4_mtp_assistant(model_dft)) {
+            fprintf(stderr, "%s : paired imatrix mode currently supports Gemma 4 assistant draft models only\n", __func__);
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        target_params.has_mtp = true;
+        target_params.cb_eval = nullptr;
+        target_params.cb_eval_user_data = nullptr;
+
+        auto mparams_tgt = common_model_params_to_llama(target_params);
+        llama_model * model_tgt = ik_load_model_from_params(target_params, mparams_tgt);
+        if (model_tgt == nullptr) {
+            fprintf(stderr, "%s : failed to load target model '%s'\n", __func__, target_params.model.c_str());
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        if (!ik_model_has_arch(model_tgt, "gemma4")) {
+            fprintf(stderr, "%s : paired imatrix mode currently supports Gemma 4 target models only\n", __func__);
+            llama_free_model(model_tgt);
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        llama_init = ik_init_from_loaded_model(model_tgt, target_params);
+        if (llama_init.model == nullptr || llama_init.context == nullptr) {
+            llama_free_model(model_dft);
+            llama_backend_free();
+            return 1;
+        }
+
+        auto draft_init = ik_init_from_loaded_model(model_dft, draft_params);
+        model_dft = draft_init.model;
+        ctx_dft = draft_init.context;
+        if (model_dft == nullptr || ctx_dft == nullptr) {
+            llama_free(llama_init.context);
+            llama_free_model(llama_init.model);
+            llama_backend_free();
+            return 1;
+        }
+
+        llama_set_mtp_target_context(ctx_dft, llama_init.context);
+        use_paired_gemma4_mtp = true;
+    } else {
+        llama_init = llama_init_from_gpt_params(target_params);
+    }
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -845,6 +1112,10 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (!use_paired_gemma4_mtp && llama_model_is_gemma4_mtp_assistant(model) && !params.process_output) {
+        fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m -md -mtp' for meaningful calibration.\n", __func__);
+    }
+
     const int n_ctx_train = llama_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
@@ -857,7 +1128,16 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
-    if (!compute_imatrix(ctx, params)) {
+    if (!compute_imatrix(ctx, params, ctx_dft)) {
+        if (ctx_dft != nullptr) {
+            llama_free(ctx_dft);
+        }
+        if (model_dft != nullptr) {
+            llama_free_model(model_dft);
+        }
+        llama_free(ctx);
+        llama_free_model(model);
+        llama_backend_free();
         return 1;
     }
 
@@ -866,6 +1146,13 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
 
+    if (ctx_dft != nullptr) {
+        llama_free(ctx_dft);
+    }
+    if (model_dft != nullptr) {
+        llama_free_model(model_dft);
+    }
+
     llama_free(ctx);
     llama_free_model(model);
 
diff --git a/src/graphs/build_gemma4.cpp b/src/graphs/build_gemma4.cpp
index b0853a26..249136f9 100644
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@@ -531,10 +531,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
 
     GGML_ASSERT(n_backbone > 0);
 
-    lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch.n_tokens);
-    cb(lctx.inp_tokens, "inp_tokens", -1);
-    ggml_set_input(lctx.inp_tokens);
-
     ggml_tensor * hidden_state = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_backbone, n_tokens);
     ggml_set_name(hidden_state, "inp_mtp_states");
     ggml_set_input(hidden_state);
@@ -557,6 +553,10 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
         return gf;
     }
 
+    lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch.n_tokens);
+    cb(lctx.inp_tokens, "inp_tokens", -1);
+    ggml_set_input(lctx.inp_tokens);
+
     const llama_model & target_model = lctx.mtp_target_ctx->model;
     const llama_hparams & target_hparams = target_model.hparams;
     const llama_cparams & target_cparams = lctx.mtp_target_ctx->cparams;
diff --git a/src/llama.cpp b/src/llama.cpp
index 0fc0157a..635a0fd1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2557,6 +2557,18 @@ static std::pair, double> get_layer_sizes(const llama_model_
     std::vector<bool> has_layer_norm(n_layer, false);
     size_t ow_size = 0;
     size_t embd_size = 0;
+    size_t output_misc_size = 0;
+    int gemma4_rope_layer = -1;
+
+    if (model.arch == LLM_ARCH_GEMMA4) {
+        for (int il = 0; il < n_layer; ++il) {
+            if (!model.hparams.swa_layers[il]) {
+                gemma4_rope_layer = il;
+                break;
+            }
+        }
+    }
+
     for (int i = 0; i < ml.n_tensors; ++i) {
         auto t = ml.get_weight(i)->tensor;
         std::string name(t->name);
@@ -2572,6 +2584,20 @@ static std::pair, double> get_layer_sizes(const llama_model_
         if (name == "output_norm.weight") {
             continue;
         }
+        if (model.arch == LLM_ARCH_GEMMA4) {
+            if (name == "per_layer_token_embd.weight" ||
+                name == "per_layer_model_proj.weight" ||
+                name == "per_layer_proj_norm.weight") {
+                output_misc_size += size;
+                continue;
+            }
+            if (name == "rope_freqs.weight") {
+                if (gemma4_rope_layer >= 0) {
+                    result[gemma4_rope_layer] += size;
+                }
+                continue;
+            }
+        }
         if (name == "mtp_pre_proj.weight" || name == "mtp_post_proj.weight" ||
             name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") {
             continue;
@@ -2707,7 +2733,7 @@ static std::pair, double> get_layer_sizes(const llama_model_
         }
     }
     if (!ow_size) ow_size = embd_size;
-    result[n_layer] = ow_size;
+    result[n_layer] = ow_size + output_misc_size;
     LLAMA_LOG_INFO("------------------- Layer sizes:\n");
     double tot_model = 0, tot_cache = 0, max_compute = 0;
     for (int il = 0; il < n_layer; ++il) {
@@ -3571,7 +3597,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
     const auto & cparams = lctx.cparams;
     const auto & kv_self = lctx.kv_self;
 
-    if (batch.token) {
+    if (batch.token && lctx.inp_tokens) {
 #if IK_PRINT_TIMING == 2
         auto tim1 = ggml_time_us();
 #endif
@@ -3645,7 +3671,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         auto tim1 = ggml_time_us();
 #endif
         const int64_t n_tokens = batch.n_tokens;
-        if (n_tokens > 1 && !cparams.mtp) {
+        if (n_tokens > 1 && !cparams.mtp && lctx.n_outputs < n_tokens) {
             GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
         }
 