From 67735a4587f1cc9fe32e951a92eb15d76f97032e Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 13 May 2026 09:37:26 +0000 Subject: [PATCH] More MTP tweaks --- common/speculative.cpp | 4 +-- src/llama-context.h | 1 - src/llama.cpp | 70 +----------------------------------------- 3 files changed, 2 insertions(+), 73 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 3b417881..10f365b3 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1463,9 +1463,7 @@ std::vector mtp_speculative_gen_draft( int i0 = 0; if (last.last_id >= 0) { if (last.prob < p_min) { - llama_batch_free(mtp_batch); - llama_set_mtp_op_type(ctx, MTP_OP_NONE); - return drafts; + n_draft = 1; } current_input_id = last.last_id; last.last_id = -1; diff --git a/src/llama-context.h b/src/llama-context.h index a882ed0c..220f0801 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -146,7 +146,6 @@ struct llama_kv_cache { void checkpoint_delete(); // Per-step checkpoint: allocate, restore step k's full state (SSM + conv) to cache - bool per_step_save(ggml_backend_sched_t sched); bool per_step_alloc(const llama_model & model, int max_tokens); bool per_step_restore(const llama_model & model, ggml_backend_sched_t sched, int step); diff --git a/src/llama.cpp b/src/llama.cpp index 853a0367..0fc0157a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1477,68 +1477,6 @@ void llama_kv_cache::checkpoint_delete() { ckpt.saved = false; } -bool llama_kv_cache::per_step_save(ggml_backend_sched_t sched) { - const uint32_t n_layer = (uint32_t)s_l.size(); - const int64_t conv_state_dim = ckpt.per_step_conv_state_dim; - - ckpt.cells_snapshot = cells; - ckpt.head_snapshot = head; - ckpt.used_snapshot = used; - - if (conv_state_dim > 0 && !checkpoint_alloc_shadows(true)) { - return false; - } - - // Non-split recurrent tensors only need the pre-spec conv complement in - // their reduced shadow buffers. Split tensors keep full shadow copies so - // restore can still seed each split conv prefix from split_s_l_shadow. - const size_t conv_bytes = (size_t)std::max(conv_state_dim, 0) * sizeof(float); - std::unordered_set backends_to_sync; - - for (uint32_t il = 0; il < n_layer; ++il) { - if (s_l[il] == nullptr) { - continue; - } - - if (s_l[il]->extra != nullptr) { - auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra; - auto & shadow_split = ckpt.split_s_l_shadow[il]; - for (int d = 0; d < split_info->n_device; ++d) { - if (split_info->splits[d] == nullptr || shadow_split[d] == nullptr) { - continue; - } - auto src_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]); - GGML_ASSERT(src_backend != nullptr); - ggml_backend_tensor_copy_async(src_backend, src_backend, split_info->splits[d], shadow_split[d]); - backends_to_sync.insert(src_backend); - } - continue; - } - - if (conv_bytes == 0) { - continue; - } - - GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr); - - ggml_tensor src = *s_l[il]; - src.ne[0] = conv_bytes / sizeof(float); - src.nb[1] = src.nb[2] = src.nb[3] = conv_bytes; - - auto src_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]); - GGML_ASSERT(src_backend != nullptr); - ggml_backend_tensor_copy_async(src_backend, src_backend, &src, ckpt.s_l_shadow[il]); - backends_to_sync.insert(src_backend); - } - - for (auto backend : backends_to_sync) { - ggml_backend_synchronize(backend); - } - - ckpt.saved = true; - return true; -} - bool llama_kv_cache::per_step_alloc(const llama_model & model, int max_tokens) { if (ckpt.per_step_max_allocated >= max_tokens) { return true; @@ -1707,12 +1645,6 @@ bool llama_kv_cache::per_step_restore(const llama_model & model, ggml_backend_sc const int64_t ssm_bytes = ssm_state_dim * sizeof(float); const int64_t conv_bytes = conv_state_dim * sizeof(float); - std::vector ssm_buf(ssm_state_dim); - std::vector conv_buf(conv_state_dim); // reconstructed conv state - std::vector old_conv_buf(conv_state_dim); // pre-spec conv state from shadow - const int64_t qkv_needed = (int64_t)(step + 1) * conv_dim; - std::vector qkv_buf(qkv_needed); - int num_v_heads = model.hparams.ssm_dt_rank; int head_v_dim = model.hparams.ssm_d_inner / num_v_heads; @@ -7315,7 +7247,7 @@ bool llama_spec_ckpt_save(struct llama_context * ctx, llama_seq_id seq_id) { switch (kv.ckpt.selected_spec_mode) { case LLAMA_SPEC_CKPT_PER_STEP: kv.save_per_step_ssm = true; - return kv.per_step_save(ctx->sched); + return true; case LLAMA_SPEC_CKPT_GPU_FALLBACK: return kv.checkpoint_save(ctx->sched);