From 4f3a4beb8d7e7296aabee3593bb65aa88dceb9ed Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 2 Jun 2026 10:30:38 +0300
Subject: [PATCH] llama : deprecate `llama_set_warmup` (#24009)

* llama : deprecate `llama_set_warmup`

* cont : fix type

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 common/common.cpp              | 3 ---
 include/llama.h                | 6 +++++-
 src/llama-cparams.h            | 2 +-
 tests/test-backend-sampler.cpp | 2 --
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 81b8b75002..0460c6c530 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1389,8 +1389,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
     if (params.warmup) {
         LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
-        llama_set_warmup(lctx, true);
-
         std::vector<llama_token> tmp;
         llama_token bos = llama_vocab_bos(vocab);
         llama_token eos = llama_vocab_eos(vocab);
@@ -1421,7 +1419,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
         llama_memory_clear(llama_get_memory(lctx), true);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
-        llama_set_warmup(lctx, false);
 
         // reset samplers to reset RNG state after warmup to the seeded state
         res->reset_samplers();
diff --git a/include/llama.h b/include/llama.h
index a79a491c59..9f78aa9a05 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -976,7 +976,11 @@ extern "C" {
 
     // Set whether the model is in warmup mode or not
     // If true, all model tensors are activated during llama_decode() to load and cache their weights.
-    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+    //
+    // note: using this function can cause extra graph reallocations, because it changes the graph topology with MoE models,
+    //       so it is generally not recommended in practice. This function will be removed in the future.
+    DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup),
+            "user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]");
 
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index ba4951a09a..52e1c4f54a 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -39,7 +39,7 @@ struct llama_cparams {
     bool fused_gdn_ch;       // use fused gated delta net (chunked)
     bool auto_fgdn;
     bool no_perf;
-    bool warmup;
+    bool warmup;             // TODO: remove [TAG_LLAMA_GRAPH_NO_WARMUP]
     bool op_offload;
     bool kv_unified;
     bool pipeline_parallel;
diff --git a/tests/test-backend-sampler.cpp b/tests/test-backend-sampler.cpp
index 58361ae80a..61ddf91fea 100644
--- a/tests/test-backend-sampler.cpp
+++ b/tests/test-backend-sampler.cpp
@@ -107,8 +107,6 @@ struct test_context {
             throw std::runtime_error("failed to create context");
         }
 
-        llama_set_warmup(ctx.get(), false);
-
         vocab = llama_model_get_vocab(model);
         n_vocab = llama_vocab_n_tokens(vocab);
     }