diff --git a/common/spec-tuner.cpp b/common/spec-tuner.cpp index 90bfdfa1..80427d41 100644 --- a/common/spec-tuner.cpp +++ b/common/spec-tuner.cpp @@ -117,7 +117,7 @@ void spec_tuner::write_best(common_params_speculative & params) const { } } -void spec_tuner::init(common_speculative_type type, const common_params_speculative & user_params) { +void spec_tuner::init(common_speculative_type type, const common_params_speculative & user_params, const llama_model * model_tgt) { enabled = true; spec_type = type; coords.clear(); @@ -136,7 +136,9 @@ void spec_tuner::init(common_speculative_type type, const common_params_speculat { spec_tuner_coord coord; coord.name = "n_max"; - int hi = std::max(16, (int)user_params.n_max); + const bool recurrent_target = model_tgt != nullptr && llama_model_has_recurrent(model_tgt); + int hi = recurrent_target ? std::max(1, (int) user_params.n_max) + : std::max(16, (int) user_params.n_max); coord.build_grid_int(1, hi, 1, user_params.n_max); coords.push_back(std::move(coord)); } diff --git a/common/spec-tuner.h b/common/spec-tuner.h index 8435b9a4..c948915b 100644 --- a/common/spec-tuner.h +++ b/common/spec-tuner.h @@ -2,6 +2,8 @@ #include "common.h" +struct llama_model; + struct spec_tuner_arm { float value; double Q = 0.0; // mean per-step Tokens-Per-Second (TPS) @@ -55,7 +57,7 @@ struct spec_tuner { common_speculative_type spec_type = COMMON_SPECULATIVE_TYPE_NONE; std::vector coords; - void init(common_speculative_type type, const common_params_speculative & user_params); + void init(common_speculative_type type, const common_params_speculative & user_params, const llama_model * model_tgt); void propose(common_params_speculative & params); void accept_feedback(int n_accepted, int n_drafted, double step_tps); void end_of_request(double slot_tps, int n_past, common_params_speculative & active_params); diff --git a/common/speculative.cpp b/common/speculative.cpp index ecadde4c..3b417881 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1122,6 +1122,25 @@ common_speculative * common_speculative_init( } } + if (!configs.empty() && llama_model_has_recurrent(llama_get_model(ctx_tgt))) { + const int ckpt_tokens = std::max(1, params.n_max + 1); + const int actual_mode = llama_spec_ckpt_init(ctx_tgt, params.recurrent_ckpt_mode, ckpt_tokens); + if (actual_mode == LLAMA_SPEC_CKPT_NONE) { + LOG_ERR("%s: failed to prepare recurrent checkpoint mode '%s' during speculative init (max_tokens=%d)\n", + __func__, + params.recurrent_ckpt_mode == LLAMA_SPEC_CKPT_PER_STEP ? "per-step" : + params.recurrent_ckpt_mode == LLAMA_SPEC_CKPT_GPU_FALLBACK ? "gpu-fallback" : + params.recurrent_ckpt_mode == LLAMA_SPEC_CKPT_CPU ? "cpu" : "auto", + ckpt_tokens); + if (ctx_dft != nullptr) { + llama_free(ctx_dft); + } + return nullptr; + } + llama_spec_ckpt_discard(ctx_tgt); + params.recurrent_ckpt_mode = actual_mode; + } + std::vector> impls = {}; for (const common_speculative_config & config : configs) { @@ -1221,7 +1240,7 @@ common_speculative * common_speculative_init( if (actual_type != COMMON_SPECULATIVE_TYPE_NONE && actual_type != COMMON_SPECULATIVE_TYPE_EAGLE3) { result->tuner = std::make_unique(); - result->tuner->init(actual_type, params); + result->tuner->init(actual_type, params, llama_get_model(ctx_tgt)); LOG_DBG("Autotune initialized for %s, tuning %zu parameters\n", common_speculative_type_to_str(actual_type).c_str(), result->tuner->coords.size()); diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index 7ec7c51d..3246ec2f 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -454,6 +454,9 @@ void server_context::init() { } } + const bool requested_spec = params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE || + params_base.speculative.has_dft(); + bool can_spec = true; if (!params_base.dry_run) { can_spec = common_speculative_is_compat(ctx); @@ -462,7 +465,7 @@ void server_context::init() { SRV_WRN("%s", "speculative decoding not supported by this context\n"); } // try speculative decoding - if (can_spec) { + if (can_spec && requested_spec) { slot.spec = common_speculative_init(params_base.speculative, slot.ctx); if (slot.spec) { if (mctx && !slot.has_mtp) { @@ -471,11 +474,15 @@ void server_context::init() { } SLT_INF(slot, "%s", "speculative decoding context initialized\n"); } else { - if (slot.has_mtp) { - SRV_ERR("%s", "failed to initialize MTP speculative context, aborting\n"); - GGML_ABORT("MTP context creation failed"); + if (llama_model_has_recurrent(model)) { + SRV_ERR("%s", "failed to initialize recurrent speculative context\n"); + throw std::runtime_error("recurrent speculative context initialization failed"); + } else if (slot.has_mtp) { + SRV_ERR("%s", "failed to initialize MTP speculative context\n"); + throw std::runtime_error("MTP speculative context initialization failed"); } else { - SLT_INF(slot, "%s", "speculative decoding context not initialized\n"); + SRV_ERR("%s", "failed to initialize speculative decoding context\n"); + throw std::runtime_error("speculative decoding context initialization failed"); } } } @@ -1233,6 +1240,17 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 0); slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0); + if (slot.can_speculate() && + llama_model_has_recurrent(model) && + slot.params.speculative.n_max > params_base.speculative.n_max) { + send_error(task, + "Error: speculative.n_max=" + std::to_string(slot.params.speculative.n_max) + + " exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.n_max) + + "; restart the server with a higher --draft-max to reserve checkpoint capacity", + ERROR_TYPE_INVALID_REQUEST); + return false; + } + slot.params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n); slot.params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m); slot.params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits); diff --git a/examples/server/server-context.h b/examples/server/server-context.h index f1009ae2..02194ab9 100644 --- a/examples/server/server-context.h +++ b/examples/server/server-context.h @@ -258,7 +258,7 @@ struct server_context { gpt_params params_base; - llama_batch batch; + llama_batch batch = {}; bool clean_kv_cache = true; bool add_bos_token = true; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index feaf1b4e..24eafcb4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -584,7 +584,13 @@ int main(int argc, char ** argv) { state.store(SERVER_STATE_ERROR); return 1; } else { - ctx_server.init(); + try { + ctx_server.init(); + } catch (const std::exception & e) { + LOG_ERROR("server init failed", {{"error", e.what()}}); + state.store(SERVER_STATE_ERROR); + return 1; + } state.store(SERVER_STATE_READY); } diff --git a/src/llama-context.h b/src/llama-context.h index 02a20052..c14ba927 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -103,6 +103,8 @@ struct llama_kv_cache { int32_t per_step_d_conv = 0; int selected_spec_mode = -1; + int fixed_spec_mode = LLAMA_SPEC_CKPT_NONE; + int32_t fixed_max_tokens = 0; // Serialised sequence state for CPU mode std::vector cpu_state_data; @@ -115,6 +117,7 @@ struct llama_kv_cache { std::vector shadow_bufs; bool allocated = false; + bool shadow_conv_only = false; bool saved = false; ~gpu_checkpoint() { @@ -135,13 +138,14 @@ struct llama_kv_cache { gpu_checkpoint ckpt; - bool checkpoint_alloc_shadows(); + bool checkpoint_alloc_shadows(bool conv_only_shadow = false); bool checkpoint_supported() const; bool checkpoint_save(ggml_backend_sched_t sched); bool checkpoint_restore(ggml_backend_sched_t sched); void checkpoint_delete(); // Per-step checkpoint: allocate, restore step k's full state (SSM + conv) to cache + bool per_step_save(ggml_backend_sched_t sched); bool per_step_alloc(const llama_model & model, int max_tokens); bool per_step_restore(const llama_model & model, ggml_backend_sched_t sched, int step); diff --git a/src/llama-delta-net.cpp b/src/llama-delta-net.cpp index e8996686..862c983b 100644 --- a/src/llama-delta-net.cpp +++ b/src/llama-delta-net.cpp @@ -70,7 +70,9 @@ delta_net::delta_net(llama_context & _lctx, const llama_batch & _batch) : lctx(_ GGML_ASSERT((uint32_t) s < qnext_state_slots); } - int max_per_step = lctx.kv_self.save_per_step_ssm ? std::min(8, lctx.kv_self.ckpt.per_step_max_allocated) : 0; + int max_per_step = lctx.kv_self.save_per_step_ssm + ? lctx.kv_self.ckpt.per_step_max_allocated + : 0; save_per_step_states = lctx.kv_self.save_per_step_ssm && batch.n_tokens > 1 && batch.n_tokens <= max_per_step; } diff --git a/src/llama.cpp b/src/llama.cpp index 99b6b850..434dfcb4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1255,8 +1255,15 @@ bool llama_kv_cache::checkpoint_supported() const { return false; } -bool llama_kv_cache::checkpoint_alloc_shadows() { +bool llama_kv_cache::checkpoint_alloc_shadows(bool conv_only_shadow) { if (ckpt.allocated) { + if (ckpt.shadow_conv_only != conv_only_shadow) { + LLAMA_LOG_ERROR("%s: requested %s shadow buffers, but %s shadow buffers are already allocated\n", + __func__, + conv_only_shadow ? "conv-state-only" : "full-state", + ckpt.shadow_conv_only ? "conv-state-only" : "full-state"); + return false; + } return true; } @@ -1269,10 +1276,7 @@ bool llama_kv_cache::checkpoint_alloc_shadows() { int split_idx; // -1 for non-split }; - const bool conv_only_shadow = save_per_step_ssm && ckpt.per_step_conv_state_dim > 0; - std::vector nonsplit_entries; - - std::map> split_buft_entries; + std::map> buft_entries; for (uint32_t il = 0; il < n_layer; ++il) { if (s_l[il] == nullptr) { @@ -1285,16 +1289,18 @@ bool llama_kv_cache::checkpoint_alloc_shadows() { for (int d = 0; d < split_info->n_device; ++d) { if (split_info->splits[d] == nullptr) continue; ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(split_info->splits[d]->buffer); - split_buft_entries[buft].push_back({split_info->splits[d], il, d}); + buft_entries[buft].push_back({split_info->splits[d], il, d}); } } else { - nonsplit_entries.push_back({s_l[il], il, -1}); + ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(s_l[il]->buffer); + buft_entries[buft].push_back({s_l[il], il, -1}); } } - if (!nonsplit_entries.empty()) { + // Allocate all shadows on the same backend type as the source tensor. + for (auto & [buft, entries] : buft_entries) { ggml_init_params params = { - /*.mem_size =*/ nonsplit_entries.size() * ggml_tensor_overhead(), + /*.mem_size =*/ entries.size() * ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -1304,60 +1310,39 @@ bool llama_kv_cache::checkpoint_alloc_shadows() { return false; } - for (auto & entry : nonsplit_entries) { - // Only need the conv portion when per-step is active. - const int64_t nelems = conv_only_shadow - ? ckpt.per_step_conv_state_dim - : (int64_t)ggml_nelements(entry.primary); - ggml_tensor * shadow = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelems); - ggml_format_name(shadow, "shadow_s_l%d", entry.il); - ckpt.s_l_shadow[entry.il] = shadow; - } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); - if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate CPU buffer for shadow tensors\n", __func__); - ggml_free(ctx); - return false; - } - ggml_backend_buffer_clear(buf, 0); - LLAMA_LOG_INFO("%s: CPU shadow buffer = %8.2f MiB (%s)\n", __func__, - ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0, - conv_only_shadow ? "conv-state only" : "full recurrent state"); - ckpt.shadow_ctxs.push_back(ctx); - ckpt.shadow_bufs.push_back(buf); - } - - // Allocate split shadows on their respective devices - for (auto & [buft, entries] : split_buft_entries) { - ggml_init_params params = { - /*.mem_size =*/ entries.size() * ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - ggml_context * ctx = ggml_init(params); - if (!ctx) { - LLAMA_LOG_ERROR("%s: failed to create ggml context for split shadow tensors\n", __func__); - return false; - } - for (auto & entry : entries) { - ggml_tensor * shadow = ggml_dup_tensor(ctx, entry.primary); - ggml_format_name(shadow, "shadow_s_l%d_d%d", entry.il, entry.split_idx); + ggml_tensor * shadow = nullptr; + if (conv_only_shadow && entry.split_idx < 0) { + shadow = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ckpt.per_step_conv_state_dim); + } else { + shadow = ggml_dup_tensor(ctx, entry.primary); + } + if (entry.split_idx >= 0) { + ggml_format_name(shadow, "shadow_s_l%d_d%d", entry.il, entry.split_idx); + } else { + ggml_format_name(shadow, "shadow_s_l%d", entry.il); + } entry.primary = shadow; } ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for split shadow tensors\n", __func__); + LLAMA_LOG_ERROR("%s: failed to allocate buffer for shadow tensors\n", __func__); ggml_free(ctx); return false; } ggml_backend_buffer_clear(buf, 0); - LLAMA_LOG_INFO("%s: %10s split shadow buffer = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: %10s shadow buffer = %8.2f MiB%s\n", __func__, + ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0, + conv_only_shadow ? " (conv-state only)" : ""); ckpt.shadow_ctxs.push_back(ctx); ckpt.shadow_bufs.push_back(buf); + + for (const auto & entry : entries) { + if (entry.split_idx < 0) { + ckpt.s_l_shadow[entry.il] = entry.primary; + } + } } // Build split shadow lookup @@ -1374,7 +1359,7 @@ bool llama_kv_cache::checkpoint_alloc_shadows() { for (int d = 0; d < split_info->n_device; ++d) { if (split_info->splits[d] == nullptr) continue; ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(split_info->splits[d]->buffer); - for (auto & entry : split_buft_entries[buft]) { + for (auto & entry : buft_entries[buft]) { if (entry.il == il && entry.split_idx == d) { shadow_split[d] = entry.primary; break; @@ -1383,15 +1368,18 @@ bool llama_kv_cache::checkpoint_alloc_shadows() { } } + ckpt.shadow_conv_only = conv_only_shadow; ckpt.allocated = true; return true; } bool llama_kv_cache::checkpoint_save(ggml_backend_sched_t sched) { - if (!checkpoint_alloc_shadows()) { + if (!checkpoint_alloc_shadows(false)) { return false; } + GGML_ASSERT(!ckpt.shadow_conv_only); + const uint32_t n_layer = (uint32_t)s_l.size(); ckpt.cells_snapshot = cells; @@ -1417,8 +1405,11 @@ bool llama_kv_cache::checkpoint_save(ggml_backend_sched_t sched) { } } } else { - const size_t nbytes = ggml_nbytes(ckpt.s_l_shadow[il]); - ggml_backend_tensor_get(s_l[il], ckpt.s_l_shadow[il]->data, 0, nbytes); + GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr); + auto src_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]); + GGML_ASSERT(src_backend != nullptr); + ggml_backend_tensor_copy_async(src_backend, src_backend, s_l[il], ckpt.s_l_shadow[il]); + backends_to_sync.insert(src_backend); } } @@ -1436,6 +1427,8 @@ bool llama_kv_cache::checkpoint_restore(ggml_backend_sched_t sched) { return false; } + GGML_ASSERT(!ckpt.shadow_conv_only); + const uint32_t n_layer = (uint32_t)s_l.size(); cells = ckpt.cells_snapshot; @@ -1460,8 +1453,12 @@ bool llama_kv_cache::checkpoint_restore(ggml_backend_sched_t sched) { } } } else { + GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr); GGML_ASSERT(ggml_nbytes(ckpt.s_l_shadow[il]) == ggml_nbytes(s_l[il])); - ggml_backend_tensor_copy(ckpt.s_l_shadow[il], s_l[il]); + auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]); + GGML_ASSERT(dst_backend != nullptr); + ggml_backend_tensor_copy_async(dst_backend, dst_backend, ckpt.s_l_shadow[il], s_l[il]); + backends_to_sync.insert(dst_backend); } } @@ -1476,6 +1473,68 @@ void llama_kv_cache::checkpoint_delete() { ckpt.saved = false; } +bool llama_kv_cache::per_step_save(ggml_backend_sched_t sched) { + const uint32_t n_layer = (uint32_t)s_l.size(); + const int64_t conv_state_dim = ckpt.per_step_conv_state_dim; + + ckpt.cells_snapshot = cells; + ckpt.head_snapshot = head; + ckpt.used_snapshot = used; + + if (conv_state_dim > 0 && !checkpoint_alloc_shadows(true)) { + return false; + } + + // Non-split recurrent tensors only need the pre-spec conv complement in + // their reduced shadow buffers. Split tensors keep full shadow copies so + // restore can still seed each split conv prefix from split_s_l_shadow. + const size_t conv_bytes = (size_t)std::max(conv_state_dim, 0) * sizeof(float); + std::unordered_set backends_to_sync; + + for (uint32_t il = 0; il < n_layer; ++il) { + if (s_l[il] == nullptr) { + continue; + } + + if (s_l[il]->extra != nullptr) { + auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra; + auto & shadow_split = ckpt.split_s_l_shadow[il]; + for (int d = 0; d < split_info->n_device; ++d) { + if (split_info->splits[d] == nullptr || shadow_split[d] == nullptr) { + continue; + } + auto src_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]); + GGML_ASSERT(src_backend != nullptr); + ggml_backend_tensor_copy_async(src_backend, src_backend, split_info->splits[d], shadow_split[d]); + backends_to_sync.insert(src_backend); + } + continue; + } + + if (conv_bytes == 0) { + continue; + } + + GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr); + + ggml_tensor src = *s_l[il]; + src.ne[0] = conv_bytes / sizeof(float); + src.nb[1] = src.nb[2] = src.nb[3] = conv_bytes; + + auto src_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]); + GGML_ASSERT(src_backend != nullptr); + ggml_backend_tensor_copy_async(src_backend, src_backend, &src, ckpt.s_l_shadow[il]); + backends_to_sync.insert(src_backend); + } + + for (auto backend : backends_to_sync) { + ggml_backend_synchronize(backend); + } + + ckpt.saved = true; + return true; +} + bool llama_kv_cache::per_step_alloc(const llama_model & model, int max_tokens) { if (ckpt.per_step_max_allocated >= max_tokens) { return true; @@ -1500,7 +1559,7 @@ bool llama_kv_cache::per_step_alloc(const llama_model & model, int max_tokens) { ckpt.per_step_qkv.resize(n_layer); const int64_t ssm_state_dim = ckpt.per_step_ssm_state_size; - const int64_t conv_dim = ckpt.per_step_conv_dim; + const int64_t conv_dim = ckpt.per_step_conv_dim; if (ssm_state_dim <= 0 || conv_dim <= 0) { LLAMA_LOG_ERROR("%s: per_step dimensions not set (ssm=%lld, conv_dim=%lld)\n", __func__, (long long)ssm_state_dim, (long long)conv_dim); @@ -7089,9 +7148,9 @@ void llama_kv_cache_clear(struct llama_context * ctx) { // Unified speculative-checkpoint static bool spec_ckpt_try_per_step(llama_kv_cache & kv, const llama_model & model, int max_tokens) { - // Graph-split recurrent tensors are not supported. CPU-only and mixed - // CPU/GPU recurrent placement are allowed as long as each layer has a - // concrete backend buffer for the per-step tensors. + // Split recurrent tensors are supported as long as each layer exposes + // concrete backend buffers for the per-step tensors. CPU-only and mixed + // CPU/GPU recurrent placement are also allowed. bool has_gpu = false; bool has_cpu = false; for (const auto * sl : kv.s_l) { @@ -7100,10 +7159,6 @@ static bool spec_ckpt_try_per_step(llama_kv_cache & kv, const llama_model & mode has_gpu = true; continue; } - //if (sl->extra) { - // kv.save_per_step_ssm = false; - // return false; - //} if (sl->buffer && !ggml_backend_buffer_is_host(sl->buffer)) { has_gpu = true; } else if (sl->buffer) { @@ -7137,9 +7192,73 @@ static bool spec_ckpt_try_per_step(llama_kv_cache & kv, const llama_model & mode return false; } + if (!kv.checkpoint_alloc_shadows(true)) { + LLAMA_LOG_ERROR("%s: failed to allocate conv-state shadow buffers for per-step checkpoints\n", __func__); + kv.save_per_step_ssm = false; + return false; + } + return true; } +static size_t llama_spec_ckpt_cpu_state_reserve(const llama_context * ctx, llama_seq_id seq_id) { + const auto & kv_self = ctx->kv_self; + + size_t size = sizeof(uint32_t); // cell_count + + if (seq_id >= 0 && llama_kv_qnext_seq_id_in_range(kv_self, seq_id) && (uint32_t) seq_id < kv_self.size) { + size += sizeof(llama_pos); + size += sizeof(uint32_t); // n_seq_id = 0 for seq-specific saves + } + + const uint32_t v_state = kv_self.v_l.empty() ? 2 : kv_self.v_trans ? 1 : 0; + const uint32_t n_layer = kv_self.k_l.size(); + + size += sizeof(v_state); + size += sizeof(n_layer); + size += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint64_t)); + + if (v_state == 0) { + size += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint64_t)); + } else if (v_state == 1) { + size += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint32_t) + sizeof(uint32_t)); + } + + const uint32_t qnext_state = llama_kv_has_qnext_state_storage(kv_self) ? 1 : 0; + size += sizeof(qnext_state); + + if (qnext_state != 0) { + for (uint32_t il = 0; il < n_layer; ++il) { + const bool has_s_cache = il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr; + const uint64_t s_size_row = has_s_cache ? ggml_row_size(kv_self.s_l[il]->type, kv_self.s_l[il]->ne[0]) : 0; + const uint32_t s_rows = has_s_cache && seq_id >= 0 && llama_kv_qnext_seq_id_in_range(kv_self, seq_id) && (uint32_t) seq_id < kv_self.size + ? 1 + : 0; + + size += sizeof(int32_t); + size += sizeof(uint64_t); + size += sizeof(uint32_t); + size += (size_t) s_rows * s_size_row; + } + } + return size; +} + +static const char * llama_spec_ckpt_mode_name(int mode) { + switch (mode) { + case LLAMA_SPEC_CKPT_PER_STEP: + return "per-step"; + case LLAMA_SPEC_CKPT_GPU_FALLBACK: + return "gpu-fallback"; + case LLAMA_SPEC_CKPT_CPU: + return "cpu"; + case LLAMA_SPEC_CKPT_AUTO: + return "auto"; + default: + return "none"; + } +} + int llama_spec_ckpt_init(struct llama_context * ctx, int mode, int max_tokens) { auto & kv = ctx->kv_self; @@ -7150,7 +7269,19 @@ int llama_spec_ckpt_init(struct llama_context * ctx, int mode, int max_tokens) { return (int)LLAMA_SPEC_CKPT_NONE; } + if (kv.ckpt.fixed_spec_mode != LLAMA_SPEC_CKPT_NONE) { + if (kv.ckpt.fixed_spec_mode == LLAMA_SPEC_CKPT_PER_STEP && max_tokens > kv.ckpt.fixed_max_tokens) { + LLAMA_LOG_WARN("%s: fixed per-step checkpoint capacity is %d tokens, but the current speculative batch requests %d; disabling checkpoint for this batch\n", + __func__, kv.ckpt.fixed_max_tokens, max_tokens); + return (int)LLAMA_SPEC_CKPT_NONE; + } + + kv.ckpt.selected_spec_mode = kv.ckpt.fixed_spec_mode; + return kv.ckpt.selected_spec_mode; + } + int requested = mode; + int resolved = LLAMA_SPEC_CKPT_NONE; // prefer PER_STEP → GPU_FALLBACK → CPU if (requested == LLAMA_SPEC_CKPT_AUTO) { @@ -7159,22 +7290,53 @@ int llama_spec_ckpt_init(struct llama_context * ctx, int mode, int max_tokens) { if (requested == LLAMA_SPEC_CKPT_PER_STEP) { if (spec_ckpt_try_per_step(kv, ctx->model, max_tokens)) { - kv.ckpt.selected_spec_mode = LLAMA_SPEC_CKPT_PER_STEP; - return (int)LLAMA_SPEC_CKPT_PER_STEP; + resolved = LLAMA_SPEC_CKPT_PER_STEP; + } else if (mode == LLAMA_SPEC_CKPT_PER_STEP) { + LLAMA_LOG_ERROR("%s: failed to preallocate per-step checkpoint buffers for max_tokens=%d; --recurrent-ckpt-mode=%s requires startup allocation\n", + __func__, max_tokens, llama_spec_ckpt_mode_name(mode)); + return (int)LLAMA_SPEC_CKPT_NONE; + } else { + LLAMA_LOG_WARN("%s: auto checkpoint mode could not preallocate per-step buffers for max_tokens=%d; falling back to gpu-fallback\n", + __func__, max_tokens); + requested = LLAMA_SPEC_CKPT_GPU_FALLBACK; } - if (mode == LLAMA_SPEC_CKPT_PER_STEP) { - LLAMA_LOG_WARN("%s: per-step not available, falling back to GPU fallback mode\n", __func__); - } - requested = LLAMA_SPEC_CKPT_GPU_FALLBACK; } - if (requested == LLAMA_SPEC_CKPT_GPU_FALLBACK) { - kv.ckpt.selected_spec_mode = LLAMA_SPEC_CKPT_GPU_FALLBACK; - return (int)LLAMA_SPEC_CKPT_GPU_FALLBACK; + if (resolved == LLAMA_SPEC_CKPT_NONE && requested == LLAMA_SPEC_CKPT_GPU_FALLBACK) { + if (kv.checkpoint_alloc_shadows()) { + resolved = LLAMA_SPEC_CKPT_GPU_FALLBACK; + } else if (mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) { + LLAMA_LOG_ERROR("%s: failed to preallocate gpu-fallback checkpoint shadows at startup; --recurrent-ckpt-mode=%s requires startup allocation\n", + __func__, llama_spec_ckpt_mode_name(mode)); + return (int)LLAMA_SPEC_CKPT_NONE; + } else { + LLAMA_LOG_WARN("%s: auto checkpoint mode could not preallocate gpu-fallback checkpoint shadows; falling back to cpu\n", + __func__); + requested = LLAMA_SPEC_CKPT_CPU; + } } - kv.ckpt.selected_spec_mode = LLAMA_SPEC_CKPT_CPU; - return (int)LLAMA_SPEC_CKPT_CPU; + if (resolved == LLAMA_SPEC_CKPT_NONE) { + resolved = LLAMA_SPEC_CKPT_CPU; + } + + if (resolved == LLAMA_SPEC_CKPT_CPU) { + const size_t cpu_reserve = llama_spec_ckpt_cpu_state_reserve(ctx, 0); + kv.ckpt.cpu_state_data.clear(); + kv.ckpt.cpu_state_data.reserve(cpu_reserve); + LLAMA_LOG_INFO("%s: CPU serialized checkpoint reserve = %8.2f MiB (per seq)\n", + __func__, cpu_reserve / 1024.0 / 1024.0); + } + + kv.ckpt.fixed_spec_mode = resolved; + kv.ckpt.fixed_max_tokens = resolved == LLAMA_SPEC_CKPT_PER_STEP ? max_tokens : 0; + kv.ckpt.selected_spec_mode = resolved; + + LLAMA_LOG_INFO("%s: fixed recurrent checkpoint mode = %s%s\n", + __func__, llama_spec_ckpt_mode_name(resolved), + resolved == LLAMA_SPEC_CKPT_PER_STEP ? (std::string(" (max_tokens=") + std::to_string(max_tokens) + ")").c_str() : ""); + + return resolved; } bool llama_spec_ckpt_save(struct llama_context * ctx, llama_seq_id seq_id) { @@ -7183,7 +7345,7 @@ bool llama_spec_ckpt_save(struct llama_context * ctx, llama_seq_id seq_id) { switch (kv.ckpt.selected_spec_mode) { case LLAMA_SPEC_CKPT_PER_STEP: kv.save_per_step_ssm = true; - return kv.checkpoint_save(ctx->sched); + return kv.per_step_save(ctx->sched); case LLAMA_SPEC_CKPT_GPU_FALLBACK: return kv.checkpoint_save(ctx->sched);