llama : deprecate llama_set_warmup (#24009)

* llama : deprecate `llama_set_warmup`

* cont : fix type

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>
This commit is contained in:
Georgi Gerganov 2026-06-02 10:30:38 +03:00 committed by GitHub
parent 8f7f3bf141
commit 4f3a4beb8d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 6 additions and 7 deletions

View File

@ -1389,8 +1389,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
if (params.warmup) { if (params.warmup) {
LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
llama_set_warmup(lctx, true);
std::vector<llama_token> tmp; std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab); llama_token bos = llama_vocab_bos(vocab);
llama_token eos = llama_vocab_eos(vocab); llama_token eos = llama_vocab_eos(vocab);
@ -1421,7 +1419,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
llama_memory_clear(llama_get_memory(lctx), true); llama_memory_clear(llama_get_memory(lctx), true);
llama_synchronize(lctx); llama_synchronize(lctx);
llama_perf_context_reset(lctx); llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
// reset samplers to reset RNG state after warmup to the seeded state // reset samplers to reset RNG state after warmup to the seeded state
res->reset_samplers(); res->reset_samplers();

View File

@ -976,7 +976,11 @@ extern "C" {
// Set whether the model is in warmup mode or not // Set whether the model is in warmup mode or not
// If true, all model tensors are activated during llama_decode() to load and cache their weights. // If true, all model tensors are activated during llama_decode() to load and cache their weights.
LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); //
// note: using this can cause extra graph reallocations because it changes the graph topology with MoE models,
// so it is generally not recommended in practice. This function will be removed in the future.
DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup),
"user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]");
// Set abort callback // Set abort callback
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

View File

@ -39,7 +39,7 @@ struct llama_cparams {
bool fused_gdn_ch; // use fused gated delta net (chunked) bool fused_gdn_ch; // use fused gated delta net (chunked)
bool auto_fgdn; bool auto_fgdn;
bool no_perf; bool no_perf;
bool warmup; bool warmup; // TODO: remove [TAG_LLAMA_GRAPH_NO_WARMUP]
bool op_offload; bool op_offload;
bool kv_unified; bool kv_unified;
bool pipeline_parallel; bool pipeline_parallel;

View File

@ -107,8 +107,6 @@ struct test_context {
throw std::runtime_error("failed to create context"); throw std::runtime_error("failed to create context");
} }
llama_set_warmup(ctx.get(), false);
vocab = llama_model_get_vocab(model); vocab = llama_model_get_vocab(model);
n_vocab = llama_vocab_n_tokens(vocab); n_vocab = llama_vocab_n_tokens(vocab);
} }