mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
feat: map Gemma 4 tensor and support with imatrix (#1796)
This commit is contained in:
parent
b2e7f7f6cd
commit
0fcffdb64d
@ -26,6 +26,129 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
uint32_t llama_mtp_state_n_embd(const struct llama_context * ctx);
|
||||
void llama_set_mtp_target_context(struct llama_context * ctx, struct llama_context * target_ctx);
|
||||
|
||||
static llama_model * ik_load_model_from_params(const gpt_params & params, const llama_model_params & mparams) {
|
||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||
return llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
}
|
||||
if (!params.model_url.empty()) {
|
||||
return llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
}
|
||||
|
||||
return llama_model_load_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
|
||||
static bool ik_model_has_arch(const llama_model * model, const char * expected_arch) {
|
||||
char arch[64] = { 0 };
|
||||
const int32_t len = llama_model_meta_val_str(model, "general.architecture", arch, sizeof(arch));
|
||||
return len > 0 && std::string(arch) == expected_arch;
|
||||
}
|
||||
|
||||
static llama_init_result ik_init_from_loaded_model(llama_model * model, gpt_params & params) {
|
||||
llama_init_result iparams;
|
||||
|
||||
if (model == nullptr) {
|
||||
return iparams;
|
||||
}
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
if (lctx == nullptr) {
|
||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
for (auto [op, on_off] : params.offload_policy) {
|
||||
llama_set_offload_policy(lctx, op, on_off);
|
||||
}
|
||||
|
||||
if (!params.control_vectors.empty()) {
|
||||
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
||||
|
||||
const auto cvec = llama_control_vector_load(params.control_vectors);
|
||||
if (cvec.n_embd == -1) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
const int err = llama_control_vector_apply(lctx,
|
||||
cvec.data.data(),
|
||||
cvec.data.size(),
|
||||
cvec.n_embd,
|
||||
params.control_vector_layer_start,
|
||||
params.control_vector_layer_end);
|
||||
if (err) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto & la : params.lora_adapters) {
|
||||
llama_lora_adapter_container loaded_la;
|
||||
loaded_la.path = la.path;
|
||||
loaded_la.scale = la.scale;
|
||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||
if (loaded_la.adapter == nullptr) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
iparams.lora_adapters.push_back(loaded_la);
|
||||
}
|
||||
if (!params.lora_init_without_apply) {
|
||||
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
if (params.sparams.dry_penalty_last_n == -1) {
|
||||
LOG("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
params.sparams.dry_penalty_last_n = llama_n_ctx(lctx);
|
||||
}
|
||||
|
||||
if (params.warmup) {
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_token_bos(model);
|
||||
llama_token eos = llama_token_eos(model);
|
||||
if (bos != -1) {
|
||||
tmp.push_back(bos);
|
||||
} else {
|
||||
tmp.push_back(eos);
|
||||
}
|
||||
if (llama_model_has_encoder(model)) {
|
||||
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
|
||||
decoder_start_token_id = bos;
|
||||
}
|
||||
tmp.clear();
|
||||
tmp.push_back(decoder_start_token_id);
|
||||
}
|
||||
if (llama_model_has_decoder(model)) {
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
|
||||
}
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
llama_reset_timings(lctx);
|
||||
}
|
||||
|
||||
iparams.model = model;
|
||||
iparams.context = lctx;
|
||||
return iparams;
|
||||
}
|
||||
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
@ -638,7 +761,75 @@ static void process_logits(
|
||||
}
|
||||
}
|
||||
|
||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
static gpt_params build_draft_imatrix_params(const gpt_params & params) {
|
||||
gpt_params draft_params = params;
|
||||
|
||||
draft_params.model = params.speculative.model;
|
||||
draft_params.model_url.clear();
|
||||
draft_params.hf_repo.clear();
|
||||
draft_params.hf_file.clear();
|
||||
draft_params.n_ctx = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_ctx;
|
||||
draft_params.n_gpu_layers = params.speculative.n_gpu_layers >= 0 ? params.speculative.n_gpu_layers : params.n_gpu_layers;
|
||||
draft_params.has_mtp = true;
|
||||
draft_params.warmup = false;
|
||||
draft_params.cb_eval = ik_collect_imatrix;
|
||||
draft_params.cb_eval_user_data = nullptr;
|
||||
|
||||
if (params.speculative.n_threads > 0) {
|
||||
draft_params.n_threads = params.speculative.n_threads;
|
||||
}
|
||||
if (params.speculative.n_threads_batch > 0) {
|
||||
draft_params.n_threads_batch = params.speculative.n_threads_batch;
|
||||
}
|
||||
if (!params.speculative.devices.empty()) {
|
||||
draft_params.devices = params.speculative.devices;
|
||||
}
|
||||
if (!params.speculative.cache_type_k.empty()) {
|
||||
draft_params.cache_type_k = params.speculative.cache_type_k;
|
||||
}
|
||||
if (!params.speculative.cache_type_v.empty()) {
|
||||
draft_params.cache_type_v = params.speculative.cache_type_v;
|
||||
}
|
||||
|
||||
return draft_params;
|
||||
}
|
||||
|
||||
static bool compute_draft_imatrix_batch(
|
||||
llama_context * ctx_tgt,
|
||||
llama_context * ctx_dft,
|
||||
llama_token * draft_tokens,
|
||||
int batch_start,
|
||||
int batch_size,
|
||||
int batch_pos) {
|
||||
const float * hidden = llama_get_embeddings(ctx_tgt);
|
||||
const int n_embd_tgt = llama_mtp_state_n_embd(ctx_tgt);
|
||||
const int n_embd_dft = llama_mtp_state_n_embd(ctx_dft);
|
||||
|
||||
if (hidden == nullptr || n_embd_tgt <= 0 || n_embd_dft <= 0) {
|
||||
fprintf(stderr, "%s: missing target hidden state for paired draft calibration\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (n_embd_tgt != n_embd_dft) {
|
||||
fprintf(stderr, "%s: hidden width mismatch between target (%d) and draft (%d)\n",
|
||||
__func__, n_embd_tgt, n_embd_dft);
|
||||
return false;
|
||||
}
|
||||
|
||||
llama_set_mtp_op_type(ctx_dft, MTP_OP_DRAFT_GEN);
|
||||
llama_set_draft_input_hidden_state(ctx_dft, hidden);
|
||||
const int ret = llama_decode(ctx_dft, llama_batch_get_one(draft_tokens + batch_start, batch_size, batch_pos, 0));
|
||||
llama_set_mtp_op_type(ctx_dft, MTP_OP_NONE);
|
||||
|
||||
if (ret != 0) {
|
||||
fprintf(stderr, "%s: paired draft eval failed\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, llama_context * ctx_dft = nullptr) {
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
@ -706,6 +897,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_clear(ctx);
|
||||
if (ctx_dft != nullptr) {
|
||||
llama_kv_cache_clear(ctx_dft);
|
||||
}
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
@ -725,6 +919,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ctx_dft != nullptr && !compute_draft_imatrix_batch(ctx, ctx_dft, tokens.data(), batch_start, batch_size, j * n_batch)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// restore the original token in case it was set to BOS
|
||||
tokens[batch_start] = token_org;
|
||||
|
||||
@ -829,14 +1027,83 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
params.cb_eval = ik_collect_imatrix;
|
||||
params.cb_eval_user_data = NULL;
|
||||
params.warmup = false;
|
||||
gpt_params target_params = params;
|
||||
target_params.warmup = false;
|
||||
|
||||
// init
|
||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||
const bool has_draft_model = !params.speculative.model.empty();
|
||||
if (!has_draft_model) {
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
target_params.cb_eval = ik_collect_imatrix;
|
||||
target_params.cb_eval_user_data = NULL;
|
||||
}
|
||||
|
||||
llama_init_result llama_init;
|
||||
llama_model * model_dft = nullptr;
|
||||
llama_context * ctx_dft = nullptr;
|
||||
bool use_paired_gemma4_mtp = false;
|
||||
|
||||
if (has_draft_model) {
|
||||
gpt_params draft_params = build_draft_imatrix_params(params);
|
||||
auto mparams_dft = common_model_params_to_llama(draft_params);
|
||||
|
||||
model_dft = ik_load_model_from_params(draft_params, mparams_dft);
|
||||
if (model_dft == nullptr) {
|
||||
fprintf(stderr, "%s : failed to load draft model '%s'\n", __func__, draft_params.model.c_str());
|
||||
llama_backend_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!llama_model_is_gemma4_mtp_assistant(model_dft)) {
|
||||
fprintf(stderr, "%s : paired imatrix mode currently supports Gemma 4 assistant draft models only\n", __func__);
|
||||
llama_free_model(model_dft);
|
||||
llama_backend_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
target_params.has_mtp = true;
|
||||
target_params.cb_eval = nullptr;
|
||||
target_params.cb_eval_user_data = nullptr;
|
||||
|
||||
auto mparams_tgt = common_model_params_to_llama(target_params);
|
||||
llama_model * model_tgt = ik_load_model_from_params(target_params, mparams_tgt);
|
||||
if (model_tgt == nullptr) {
|
||||
fprintf(stderr, "%s : failed to load target model '%s'\n", __func__, target_params.model.c_str());
|
||||
llama_free_model(model_dft);
|
||||
llama_backend_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!ik_model_has_arch(model_tgt, "gemma4")) {
|
||||
fprintf(stderr, "%s : paired imatrix mode currently supports Gemma 4 target models only\n", __func__);
|
||||
llama_free_model(model_tgt);
|
||||
llama_free_model(model_dft);
|
||||
llama_backend_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_init = ik_init_from_loaded_model(model_tgt, target_params);
|
||||
if (llama_init.model == nullptr || llama_init.context == nullptr) {
|
||||
llama_free_model(model_dft);
|
||||
llama_backend_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto draft_init = ik_init_from_loaded_model(model_dft, draft_params);
|
||||
model_dft = draft_init.model;
|
||||
ctx_dft = draft_init.context;
|
||||
if (model_dft == nullptr || ctx_dft == nullptr) {
|
||||
llama_free(llama_init.context);
|
||||
llama_free_model(llama_init.model);
|
||||
llama_backend_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_set_mtp_target_context(ctx_dft, llama_init.context);
|
||||
use_paired_gemma4_mtp = true;
|
||||
} else {
|
||||
llama_init = llama_init_from_gpt_params(target_params);
|
||||
}
|
||||
|
||||
llama_model * model = llama_init.model;
|
||||
llama_context * ctx = llama_init.context;
|
||||
@ -845,6 +1112,10 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!use_paired_gemma4_mtp && llama_model_is_gemma4_mtp_assistant(model) && !params.process_output) {
|
||||
fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> -mtp' for meaningful calibration.\n", __func__);
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
@ -857,7 +1128,16 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
if (!compute_imatrix(ctx, params)) {
|
||||
if (!compute_imatrix(ctx, params, ctx_dft)) {
|
||||
if (ctx_dft != nullptr) {
|
||||
llama_free(ctx_dft);
|
||||
}
|
||||
if (model_dft != nullptr) {
|
||||
llama_free_model(model_dft);
|
||||
}
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
llama_backend_free();
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -866,6 +1146,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
if (ctx_dft != nullptr) {
|
||||
llama_free(ctx_dft);
|
||||
}
|
||||
if (model_dft != nullptr) {
|
||||
llama_free_model(model_dft);
|
||||
}
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
|
||||
@ -531,10 +531,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
|
||||
|
||||
GGML_ASSERT(n_backbone > 0);
|
||||
|
||||
lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch.n_tokens);
|
||||
cb(lctx.inp_tokens, "inp_tokens", -1);
|
||||
ggml_set_input(lctx.inp_tokens);
|
||||
|
||||
ggml_tensor * hidden_state = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_backbone, n_tokens);
|
||||
ggml_set_name(hidden_state, "inp_mtp_states");
|
||||
ggml_set_input(hidden_state);
|
||||
@ -557,6 +553,10 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
|
||||
return gf;
|
||||
}
|
||||
|
||||
lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, batch.n_tokens);
|
||||
cb(lctx.inp_tokens, "inp_tokens", -1);
|
||||
ggml_set_input(lctx.inp_tokens);
|
||||
|
||||
const llama_model & target_model = lctx.mtp_target_ctx->model;
|
||||
const llama_hparams & target_hparams = target_model.hparams;
|
||||
const llama_cparams & target_cparams = lctx.mtp_target_ctx->cparams;
|
||||
|
||||
@ -2557,6 +2557,18 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
|
||||
std::vector<bool> has_layer_norm(n_layer, false);
|
||||
size_t ow_size = 0;
|
||||
size_t embd_size = 0;
|
||||
size_t output_misc_size = 0;
|
||||
int gemma4_rope_layer = -1;
|
||||
|
||||
if (model.arch == LLM_ARCH_GEMMA4) {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
if (!model.hparams.swa_layers[il]) {
|
||||
gemma4_rope_layer = il;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
auto t = ml.get_weight(i)->tensor;
|
||||
std::string name(t->name);
|
||||
@ -2572,6 +2584,20 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
|
||||
if (name == "output_norm.weight") {
|
||||
continue;
|
||||
}
|
||||
if (model.arch == LLM_ARCH_GEMMA4) {
|
||||
if (name == "per_layer_token_embd.weight" ||
|
||||
name == "per_layer_model_proj.weight" ||
|
||||
name == "per_layer_proj_norm.weight") {
|
||||
output_misc_size += size;
|
||||
continue;
|
||||
}
|
||||
if (name == "rope_freqs.weight") {
|
||||
if (gemma4_rope_layer >= 0) {
|
||||
result[gemma4_rope_layer] += size;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (name == "mtp_pre_proj.weight" || name == "mtp_post_proj.weight" ||
|
||||
name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") {
|
||||
continue;
|
||||
@ -2707,7 +2733,7 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
|
||||
}
|
||||
}
|
||||
if (!ow_size) ow_size = embd_size;
|
||||
result[n_layer] = ow_size;
|
||||
result[n_layer] = ow_size + output_misc_size;
|
||||
LLAMA_LOG_INFO("------------------- Layer sizes:\n");
|
||||
double tot_model = 0, tot_cache = 0, max_compute = 0;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@ -3571,7 +3597,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
const auto & cparams = lctx.cparams;
|
||||
const auto & kv_self = lctx.kv_self;
|
||||
|
||||
if (batch.token) {
|
||||
if (batch.token && lctx.inp_tokens) {
|
||||
#if IK_PRINT_TIMING == 2
|
||||
auto tim1 = ggml_time_us();
|
||||
#endif
|
||||
@ -3645,7 +3671,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
auto tim1 = ggml_time_us();
|
||||
#endif
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
if (n_tokens > 1 && !cparams.mtp) {
|
||||
if (n_tokens > 1 && !cparams.mtp && lctx.n_outputs < n_tokens) {
|
||||
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user