llama: add --dry-run option (#1462)

Co-authored-by: firecoperana <firecoperana>
2026-06-28 04:30:15 -05:00 · 2026-03-18 11:20:17 -05:00 · 2026-03-18 11:20:17 -05:00 · f9b7fe9749
commit f9b7fe9749
parent 1f4f09419b
5 changed files with 24 additions and 9 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1733,6 +1733,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
        exit(0);
    }
+    if (arg == "--dry-run" || arg == "-dr") {
+        params.dry_run = true;
+        return true;
+    }
    if (arg == "--in-prefix-bos") {
        params.input_prefix_bos = true;
        params.enable_chat_template = false;
@ -2239,6 +2243,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "-v,    --verbose",              "print verbose information" });
    options.push_back({ "*",           "       --verbosity N",          "set specific verbosity level (default: %d)", params.verbosity });
    options.push_back({ "*",           "       --verbose-prompt",       "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
+    options.push_back({ "*",           "-dr,   --dry-run",       "skip loading tensors in the files"});
    options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
    options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
    options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", params.seed });
@ -3233,6 +3238,7 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
    mparams.mla             = params.mla_attn;
+    mparams.dry_run         = params.dry_run;
    mparams.rpc_servers     = params.rpc_servers.c_str();
    mparams.main_gpu        = params.main_gpu;
    mparams.max_gpu         = params.max_gpu;
--- a/common/common.h
+++ b/common/common.h
@ -395,8 +395,9 @@ struct gpt_params {
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    thinking_tokens think_tokens;
-    int reasoning_budget = -1;
-    bool prefill_assistant = true;
+    int reasoning_budget      = -1;
+    bool prefill_assistant    = true;
+    bool dry_run              = false;

    std::vector<std::string> api_keys;

--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@ -224,7 +224,10 @@ void server_context::init() {
            }
        }

-        const bool can_spec = common_speculative_is_compat(ctx);
+        bool can_spec = true;
+        if (!params_base.dry_run) {
+            can_spec = common_speculative_is_compat(ctx);
+        }  
        if (!can_spec) {
            SRV_WRN("%s", "speculative decoding not supported by this context\n");
        }
--- a/include/llama.h
+++ b/include/llama.h
@ -401,6 +401,7 @@ extern "C" {
        bool merge_qkv;     // if true, merge separate Q, K, V tensors into a single, contiguous tensor
        bool merge_up_gate_exps;  // if true, merge ffn_up_exps and ffn_gate_exps tensors into a single, contiguous tensor
        bool mtp;           // if true, load MTP layers if present
+        bool dry_run;       // if true, skip loading tensor data
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -2008,6 +2008,7 @@ static bool llm_load_tensors(
        bool use_mlock,
        bool validate_quants,
        bool mtp,
+        bool dry_run,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    model.t_start_us = ggml_time_us();
@ -2286,11 +2287,13 @@ static bool llm_load_tensors(
    }

    // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
-            return false;
+    if (!dry_run) {
+        for (auto & it : ctx_bufs) {
+            ggml_context * ctx = it.first;
+            auto & bufs = it.second;
+            if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
+                return false;
+            }
        }
    }

@ -2429,7 +2432,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

        if (!llm_load_tensors(
            ml, model, params.n_gpu_layers, params.mla, params.split_mode,  params.main_gpu, params.max_gpu, params.tensor_split,
-            params.use_mlock, params.validate_quants, params.mtp,
+            params.use_mlock, params.validate_quants, params.mtp, params.dry_run,
            params.progress_callback, params.progress_callback_user_data
        )) {
            return -2;
@ -4407,6 +4410,7 @@ struct llama_model_params llama_model_default_params() {
        /*.merge_qkv                   =*/ false,
        /*.merge_up_gate_exps          =*/ false,
        /*.mtp                         =*/ false,
+        /*.dry_run                     =*/ false,
    };

 #ifdef GGML_USE_METAL