mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
llama : deprecate llama_set_warmup (#24009)
* llama : deprecate `llama_set_warmup`

* cont : fix type

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>
This commit is contained in:
parent
8f7f3bf141
commit
4f3a4beb8d
@ -1389,8 +1389,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
|
|||||||
if (params.warmup) {
|
if (params.warmup) {
|
||||||
LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||||
|
|
||||||
llama_set_warmup(lctx, true);
|
|
||||||
|
|
||||||
std::vector<llama_token> tmp;
|
std::vector<llama_token> tmp;
|
||||||
llama_token bos = llama_vocab_bos(vocab);
|
llama_token bos = llama_vocab_bos(vocab);
|
||||||
llama_token eos = llama_vocab_eos(vocab);
|
llama_token eos = llama_vocab_eos(vocab);
|
||||||
@ -1421,7 +1419,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
|
|||||||
llama_memory_clear(llama_get_memory(lctx), true);
|
llama_memory_clear(llama_get_memory(lctx), true);
|
||||||
llama_synchronize(lctx);
|
llama_synchronize(lctx);
|
||||||
llama_perf_context_reset(lctx);
|
llama_perf_context_reset(lctx);
|
||||||
llama_set_warmup(lctx, false);
|
|
||||||
|
|
||||||
// reset samplers to reset RNG state after warmup to the seeded state
|
// reset samplers to reset RNG state after warmup to the seeded state
|
||||||
res->reset_samplers();
|
res->reset_samplers();
|
||||||
|
|||||||
@ -976,7 +976,11 @@ extern "C" {
|
|||||||
|
|
||||||
// Set whether the model is in warmup mode or not
|
// Set whether the model is in warmup mode or not
|
||||||
// If true, all model tensors are activated during llama_decode() to load and cache their weights.
|
// If true, all model tensors are activated during llama_decode() to load and cache their weights.
|
||||||
LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
|
//
|
||||||
|
// note: using this can cause extra graph reallocations because it changes the graph topology with MoE models,
|
||||||
|
// so it is generally not recommended to use in practice. will be removed in the future
|
||||||
|
DEPRECATED(LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup),
|
||||||
|
"user code should do warmup runs manually [TAG_LLAMA_GRAPH_NO_WARMUP]");
|
||||||
|
|
||||||
// Set abort callback
|
// Set abort callback
|
||||||
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
|
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||||
|
|||||||
@ -39,7 +39,7 @@ struct llama_cparams {
|
|||||||
bool fused_gdn_ch; // use fused gated delta net (chunked)
|
bool fused_gdn_ch; // use fused gated delta net (chunked)
|
||||||
bool auto_fgdn;
|
bool auto_fgdn;
|
||||||
bool no_perf;
|
bool no_perf;
|
||||||
bool warmup;
|
bool warmup; // TODO: remove [TAG_LLAMA_GRAPH_NO_WARMUP]
|
||||||
bool op_offload;
|
bool op_offload;
|
||||||
bool kv_unified;
|
bool kv_unified;
|
||||||
bool pipeline_parallel;
|
bool pipeline_parallel;
|
||||||
|
|||||||
@ -107,8 +107,6 @@ struct test_context {
|
|||||||
throw std::runtime_error("failed to create context");
|
throw std::runtime_error("failed to create context");
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_set_warmup(ctx.get(), false);
|
|
||||||
|
|
||||||
vocab = llama_model_get_vocab(model);
|
vocab = llama_model_get_vocab(model);
|
||||||
n_vocab = llama_vocab_n_tokens(vocab);
|
n_vocab = llama_vocab_n_tokens(vocab);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user