llama: add --dry-run option (#1462)

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
firecoperana 2026-03-18 11:20:17 -05:00 committed by GitHub
parent 1f4f09419b
commit f9b7fe9749
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 24 additions and 9 deletions

View File

@ -1733,6 +1733,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
}
// "--dry-run" (short form "-dr") is a value-less switch: it only sets
// params.dry_run and returns true; no further argv entry is read here.
if (arg == "--dry-run" || arg == "-dr") {
params.dry_run = true;
return true;
}
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
params.enable_chat_template = false;
@ -2239,6 +2243,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-v, --verbose", "print verbose information" });
options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
options.push_back({ "*", "-dr, --dry-run", "skip loading tensors in the files"});
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
@ -3233,6 +3238,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.mla = params.mla_attn;
mparams.dry_run = params.dry_run;
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.max_gpu = params.max_gpu;

View File

@ -395,8 +395,9 @@ struct gpt_params {
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
thinking_tokens think_tokens;
int reasoning_budget = -1;
bool prefill_assistant = true;
int reasoning_budget = -1;
bool prefill_assistant = true;
bool dry_run = false; // set by --dry-run / -dr; copied into llama_model_params::dry_run
std::vector<std::string> api_keys;

View File

@ -224,7 +224,10 @@ void server_context::init() {
}
}
const bool can_spec = common_speculative_is_compat(ctx);
// Default to "compatible" and only run the compatibility probe when this is
// not a dry run; in dry-run mode can_spec stays true, so the warning that
// follows is never emitted.
// NOTE(review): presumably the probe is skipped because no tensor data is
// loaded in dry-run mode - confirm.
bool can_spec = true;
if (!params_base.dry_run) {
can_spec = common_speculative_is_compat(ctx);
}
if (!can_spec) {
SRV_WRN("%s", "speculative decoding not supported by this context\n");
}

View File

@ -401,6 +401,7 @@ extern "C" {
bool merge_qkv; // if true, merge separate Q, K, V tensors into a single, contiguous tensor
bool merge_up_gate_exps; // if true, merge ffn_up_exps and ffn_gate_exps tensors into a single, contiguous tensor
bool mtp; // if true, load MTP layers if present
bool dry_run; // if true, skip loading tensor data
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

View File

@ -2008,6 +2008,7 @@ static bool llm_load_tensors(
bool use_mlock,
bool validate_quants,
bool mtp,
bool dry_run,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
model.t_start_us = ggml_time_us();
@ -2286,11 +2287,13 @@ static bool llm_load_tensors(
}
// load tensor data
for (auto & it : ctx_bufs) {
ggml_context * ctx = it.first;
auto & bufs = it.second;
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
return false;
// In dry-run mode the tensor data is never read: the whole load_all_data
// pass over ctx_bufs is skipped.
if (!dry_run) {
for (auto & it : ctx_bufs) {
ggml_context * ctx = it.first;
auto & bufs = it.second;
// mlock bookkeeping is passed only when use_mlock is set; the progress
// callback and its user data are forwarded unchanged.
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
// a failure for any single context fails the whole call
return false;
}
}
}
@ -2429,7 +2432,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
params.use_mlock, params.validate_quants, params.mtp,
params.use_mlock, params.validate_quants, params.mtp, params.dry_run,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;
@ -4407,6 +4410,7 @@ struct llama_model_params llama_model_default_params() {
/*.merge_qkv =*/ false,
/*.merge_up_gate_exps =*/ false,
/*.mtp =*/ false,
/*.dry_run =*/ false,
};
#ifdef GGML_USE_METAL