From 4f3a4beb8d7e7296aabee3593bb65aa88dceb9ed Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 2 Jun 2026 10:30:38 +0300 Subject: [PATCH] llama : deprecate `llama_set_warmup` (#24009) * llama : deprecate `llama_set_warmup` * cont : fix type Co-authored-by: Daniel Bevenius --------- Co-authored-by: Daniel Bevenius --- common/common.cpp | 3 --- include/llama.h | 6 +++++- src/llama-cparams.h | 2 +- tests/test-backend-sampler.cpp | 2 -- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 81b8b75002..0460c6c530 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1389,8 +1389,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode if (params.warmup) { LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); - llama_set_warmup(lctx, true); - std::vector tmp; llama_token bos = llama_vocab_bos(vocab); llama_token eos = llama_vocab_eos(vocab); @@ -1421,7 +1419,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode llama_memory_clear(llama_get_memory(lctx), true); llama_synchronize(lctx); llama_perf_context_reset(lctx); - llama_set_warmup(lctx, false); // reset samplers to reset RNG state after warmup to the seeded state res->reset_samplers(); diff --git a/include/llama.h b/include/llama.h index a79a491c59..9f78aa9a05 100644 --- a/include/llama.h +++ b/include/llama.h @@ -976,7 +976,11 @@ extern "C" { // Set whether the model is in warmup mode or not // If true, all model tensors are activated during llama_decode() to load and cache their weights. - LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); + // + // note: using this can cause extra graph reallocations because it changes the graph topology with MoE models, + // so it is generally not recommended to use in practice. will be removed in the future + DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup), + "user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]"); // Set abort callback LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); diff --git a/src/llama-cparams.h b/src/llama-cparams.h index ba4951a09a..52e1c4f54a 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -39,7 +39,7 @@ struct llama_cparams { bool fused_gdn_ch; // use fused gated delta net (chunked) bool auto_fgdn; bool no_perf; - bool warmup; + bool warmup; // TODO: remove [TAG_LLAMA_GRAPH_NO_WARMUP] bool op_offload; bool kv_unified; bool pipeline_parallel; diff --git a/tests/test-backend-sampler.cpp b/tests/test-backend-sampler.cpp index 58361ae80a..61ddf91fea 100644 --- a/tests/test-backend-sampler.cpp +++ b/tests/test-backend-sampler.cpp @@ -107,8 +107,6 @@ struct test_context { throw std::runtime_error("failed to create context"); } - llama_set_warmup(ctx.get(), false); - vocab = llama_model_get_vocab(model); n_vocab = llama_vocab_n_tokens(vocab); }