mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
llama: add --dry-run option (#1462)
Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
parent
1f4f09419b
commit
f9b7fe9749
@ -1733,6 +1733,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
exit(0);
|
||||
}
|
||||
if (arg == "--dry-run" || arg == "-dr") {
|
||||
params.dry_run = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--in-prefix-bos") {
|
||||
params.input_prefix_bos = true;
|
||||
params.enable_chat_template = false;
|
||||
@ -2239,6 +2243,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", "-v, --verbose", "print verbose information" });
|
||||
options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
|
||||
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", "-dr, --dry-run", "skip loading tensors in the files"});
|
||||
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
||||
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
||||
@ -3233,6 +3238,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
mparams.mla = params.mla_attn;
|
||||
mparams.dry_run = params.dry_run;
|
||||
mparams.rpc_servers = params.rpc_servers.c_str();
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.max_gpu = params.max_gpu;
|
||||
|
||||
@ -395,8 +395,9 @@ struct gpt_params {
|
||||
bool enable_chat_template = true;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
thinking_tokens think_tokens;
|
||||
int reasoning_budget = -1;
|
||||
bool prefill_assistant = true;
|
||||
int reasoning_budget = -1;
|
||||
bool prefill_assistant = true;
|
||||
bool dry_run = false;
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
|
||||
@ -224,7 +224,10 @@ void server_context::init() {
|
||||
}
|
||||
}
|
||||
|
||||
const bool can_spec = common_speculative_is_compat(ctx);
|
||||
bool can_spec = true;
|
||||
if (!params_base.dry_run) {
|
||||
can_spec = common_speculative_is_compat(ctx);
|
||||
}
|
||||
if (!can_spec) {
|
||||
SRV_WRN("%s", "speculative decoding not supported by this context\n");
|
||||
}
|
||||
|
||||
@ -401,6 +401,7 @@ extern "C" {
|
||||
bool merge_qkv; // if true, merge separate Q, K, V tensors into a single, contiguous tensor
|
||||
bool merge_up_gate_exps; // if true, merge ffn_up_exps and ffn_gate_exps tensors into a single, contiguous tensor
|
||||
bool mtp; // if true, load MTP layers if present
|
||||
bool dry_run; // skip loading tensors
|
||||
};
|
||||
|
||||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
|
||||
@ -2008,6 +2008,7 @@ static bool llm_load_tensors(
|
||||
bool use_mlock,
|
||||
bool validate_quants,
|
||||
bool mtp,
|
||||
bool dry_run,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
model.t_start_us = ggml_time_us();
|
||||
@ -2286,11 +2287,13 @@ static bool llm_load_tensors(
|
||||
}
|
||||
|
||||
// load tensor data
|
||||
for (auto & it : ctx_bufs) {
|
||||
ggml_context * ctx = it.first;
|
||||
auto & bufs = it.second;
|
||||
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
return false;
|
||||
if (!dry_run) {
|
||||
for (auto & it : ctx_bufs) {
|
||||
ggml_context * ctx = it.first;
|
||||
auto & bufs = it.second;
|
||||
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -2429,7 +2432,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
|
||||
params.use_mlock, params.validate_quants, params.mtp,
|
||||
params.use_mlock, params.validate_quants, params.mtp, params.dry_run,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
)) {
|
||||
return -2;
|
||||
@ -4407,6 +4410,7 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.merge_qkv =*/ false,
|
||||
/*.merge_up_gate_exps =*/ false,
|
||||
/*.mtp =*/ false,
|
||||
/*.dry_run =*/ false,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user