From 010da571be87f84bba82a65d8e8e289de18162d5 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 8 May 2026 13:04:00 +0000 Subject: [PATCH] Use async copies to save/restore recurrent state --- ggml/src/ggml-cuda.cu | 2 +- src/llama-context.h | 4 ++-- src/llama.cpp | 31 ++++++++++++++++++++++++------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 23c76b3a..4604db95 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -4064,7 +4064,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ } } else { // src and dst are on the same backend - printf("Why is this being invoked?\n"); + // printf("Why is this being invoked?\n"); CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); } return true; diff --git a/src/llama-context.h b/src/llama-context.h index 07086bc4..075ec991 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -137,8 +137,8 @@ struct llama_kv_cache { bool checkpoint_alloc_shadows(); bool checkpoint_supported() const; - bool checkpoint_save(); - bool checkpoint_restore(); + bool checkpoint_save(ggml_backend_sched_t sched); + bool checkpoint_restore(ggml_backend_sched_t sched); void checkpoint_delete(); // Per-step checkpoint: allocate, restore step k's full state (SSM + conv) to cache diff --git a/src/llama.cpp b/src/llama.cpp index 02915969..51619cea 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1383,7 +1383,7 @@ bool llama_kv_cache::checkpoint_alloc_shadows() { return true; } -bool llama_kv_cache::checkpoint_save() { +bool llama_kv_cache::checkpoint_save(ggml_backend_sched_t sched) { if (!checkpoint_alloc_shadows()) { return false; } @@ -1394,6 +1394,8 @@ bool llama_kv_cache::checkpoint_save() { ckpt.head_snapshot = head; ckpt.used_snapshot = used; + std::unordered_set backends_to_sync; + uint32_t split_s_idx = 0; for (uint32_t il = 0; il < n_layer; ++il) { if (s_l[il] == nullptr) { @@ -1405,7 +1407,10 @@ bool llama_kv_cache::checkpoint_save() { auto & shadow_split = ckpt.split_s_l_shadow[split_s_idx]; for (int d = 0; d < split_info->n_device; ++d) { if (split_info->splits[d] && shadow_split[d]) { - ggml_backend_tensor_copy(split_info->splits[d], shadow_split[d]); + //ggml_backend_tensor_copy(split_info->splits[d], shadow_split[d]); + auto src_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]); + ggml_backend_tensor_copy_async(src_backend, src_backend, split_info->splits[d], shadow_split[d]); + backends_to_sync.insert(src_backend); } } split_s_idx++; @@ -1415,11 +1420,15 @@ bool llama_kv_cache::checkpoint_save() { } } + for (auto backend : backends_to_sync) { + ggml_backend_synchronize(backend); + } + ckpt.saved = true; return true; } -bool llama_kv_cache::checkpoint_restore() { +bool llama_kv_cache::checkpoint_restore(ggml_backend_sched_t sched) { if (!ckpt.saved) { LLAMA_LOG_ERROR("%s: no checkpoint saved\n", __func__); return false; @@ -1431,6 +1440,8 @@ bool llama_kv_cache::checkpoint_restore() { head = ckpt.head_snapshot; used = ckpt.used_snapshot; + std::unordered_set backends_to_sync; + uint32_t split_s_idx = 0; for (uint32_t il = 0; il < n_layer; ++il) { if (s_l[il] == nullptr) { @@ -1442,7 +1453,9 @@ bool llama_kv_cache::checkpoint_restore() { auto & shadow_split = ckpt.split_s_l_shadow[split_s_idx]; for (int d = 0; d < split_info->n_device; ++d) { if (split_info->splits[d] && shadow_split[d]) { - ggml_backend_tensor_copy(shadow_split[d], split_info->splits[d]); + auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]); + ggml_backend_tensor_copy_async(dst_backend, dst_backend, shadow_split[d], split_info->splits[d]); + backends_to_sync.insert(dst_backend); } } split_s_idx++; @@ -1452,6 +1465,10 @@ bool llama_kv_cache::checkpoint_restore() { } } + for (auto backend : backends_to_sync) { + ggml_backend_synchronize(backend); + } + return true; } @@ -7015,10 +7032,10 @@ bool llama_spec_ckpt_save(struct llama_context * ctx, llama_seq_id seq_id) { switch (kv.ckpt.selected_spec_mode) { case LLAMA_SPEC_CKPT_PER_STEP: kv.save_per_step_ssm = true; - return kv.checkpoint_save(); + return kv.checkpoint_save(ctx->sched); case LLAMA_SPEC_CKPT_GPU_FALLBACK: - return kv.checkpoint_save(); + return kv.checkpoint_save(ctx->sched); case LLAMA_SPEC_CKPT_CPU: { const size_t need = llama_state_seq_get_size(ctx, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); @@ -7052,7 +7069,7 @@ bool llama_spec_ckpt_restore(struct llama_context * ctx, llama_seq_id seq_id, } case LLAMA_SPEC_CKPT_GPU_FALLBACK: - kv.checkpoint_restore(); + kv.checkpoint_restore(ctx->sched); llama_kv_cache_seq_rm(kv, seq_id, n_past, -1); return false;