app : add the llama download subcommand (#24982)

* app : add the download command (with llama-download) Signed-off-by: Adrien Gallouët <angt@huggingface.co> * Remove llama-download tool for now Signed-off-by: Adrien Gallouët <angt@huggingface.co> --------- Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-27 23:50:20 -05:00 · 2026-06-25 13:36:36 +02:00 · 2026-06-25 13:36:36 +02:00 · 683b04cc4a
commit 683b04cc4a
parent f728adab68
5 changed files with 107 additions and 19 deletions
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -1,6 +1,6 @@
 set(TARGET llama-app)
-add_executable(${TARGET} llama.cpp)
+add_executable(${TARGET} llama.cpp download.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
 target_link_libraries(${TARGET} PRIVATE
--- a/app/download.cpp
+++ b/app/download.cpp
@ -0,0 +1,70 @@
 #include "arg.h"
 #include "common.h"
 #include "download.h"
 #include "log.h"
 #include <cstdio>
 #include <filesystem>
 #include <system_error>
 // Prints example invocations of the download command to stdout.
 //
 // argv[0] is substituted as the program name in each of the four example lines;
 // the argument count is not used.
 static void print_usage(int /*argc*/, char ** argv) {
    const char * prog = argv[0];

    printf(
        "\nexamples:\n"
        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n"
        "  %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n"
        "  %s -hf ggml-org/models -hff model.gguf\n"
        "  %s -mu https://example.com/model.gguf -m model.gguf\n"
        "\n",
        prog, prog, prog, prog
    );
 }
 int llama_download(int argc, char ** argv);
 // Entry point of the download command.
 //
 // Parses the command line with the LLAMA_EXAMPLE_DOWNLOAD argument set, hands the
 // parsed parameters to common_params_handle_models(), and prints the resulting
 // path(s) to stdout, one per line. All diagnostics go to stderr.
 //
 // Returns 0 on success and 1 on failure: argument parsing failed, no model source
 // was given, common_params_handle_models() threw, the model path is empty, or the
 // model file is missing or cannot be inspected.
 int llama_download(int argc, char ** argv) {
    common_init();
    common_params params;
    // NOTE(review): presumably restricts logging to errors so stdout carries only the
    // printed paths - confirm against the logging setup
    params.verbosity = LOG_LEVEL_ERROR;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) {
        return 1;
    }
    // at least one of the supported model sources must be present
    const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() ||
                            !params.model.path.empty()    || !params.model.docker_repo.empty();
    if (!has_source) {
        fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
        return 1;
    }
    try {
        common_params_handle_models(params, LLAMA_EXAMPLE_DOWNLOAD, {});
    } catch (const std::exception & e) {
        fprintf(stderr, "error: %s\n", e.what());
        return 1;
    }
    if (!params.models_preset.empty()) {
        // -hf pointed at a preset repo: print the preset path and stop
        printf("%s\n", params.models_preset.c_str());
        return 0;
    }
    if (params.model.path.empty()) {
        fprintf(stderr, "error: model download failed\n");
        return 1;
    }
    // use the non-throwing overload: the throwing one raises filesystem_error on OS
    // errors (e.g. permission denied on a parent directory), which nothing here catches
    std::error_code ec;
    const bool model_exists = std::filesystem::exists(params.model.path, ec);
    if (ec) {
        fprintf(stderr, "error: cannot access model file: %s (%s)\n", params.model.path.c_str(), ec.message().c_str());
        return 1;
    }
    if (!model_exists) {
        fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str());
        return 1;
    }
    printf("%s\n", params.model.path.c_str());
    // the multimodal projector and draft model paths are printed only when set
    if (!params.mmproj.path.empty()) {
        printf("%s\n", params.mmproj.path.c_str());
    }
    if (!params.speculative.draft.mparams.path.empty()) {
        printf("%s\n", params.speculative.draft.mparams.path.c_str());
    }
    return 0;
 }
--- a/app/llama.cpp
+++ b/app/llama.cpp
@ -19,6 +19,7 @@ int llama_batched_bench(int argc, char ** argv);
 int llama_fit_params(int argc, char ** argv);
 int llama_quantize(int argc, char ** argv);
 int llama_perplexity(int argc, char ** argv);
 int llama_download(int argc, char ** argv);
 // Self-update is only supported for binaries built with llama-install.sh
 static int llama_update(int argc, char ** argv) {
@ -61,6 +62,7 @@ static const command cmds[] = {
    {"serve",         "HTTP API server",                                    {"server"},   false,         llama_server       },
    {"cli",           "Command-line interactive interface",                 {"client"},   false,         llama_cli          },
    {"update",        "Update llama to the latest release",                 {},           UPDATE_HIDDEN, llama_update       },
    {"download",      "Download a model",                                   {"get"},      false,         llama_download     },
    {"completion",    "Text completion",                                    {"complete"}, true,          llama_completion   },
    {"bench",         "Benchmark prompt processing and text generation",    {},           true,          llama_bench        },
    {"batched-bench", "Benchmark batched decoding performance",             {},           true,          llama_batched_bench},
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -594,6 +594,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    const bool skip_model_download =
        // server will call common_params_handle_models() later, so we skip it here
        ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
        // download calls common_params_handle_models() itself and prints the paths
        ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
        // export_graph_ops loads only metadata
        ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
@ -671,15 +673,19 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
            common_options.push_back(&opt);
        }
    }
-    printf("----- common params -----\n\n");
+    bool first = true;
-    print_options(common_options);
+    auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
-    printf("\n\n----- sampling params -----\n\n");
+        if (options.empty()) {
-    print_options(sampling_options);
+            return;
-    printf("\n\n----- speculative params -----\n\n");
+        }
-    print_options(spec_options);
+        printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
-    // TODO: maybe convert enum llama_example to string
+        first = false;
-    printf("\n\n----- example-specific params -----\n\n");
+        print_options(options);
-    print_options(specific_options);
+    };
    print_section("common params",           common_options);
    print_section("sampling params",         sampling_options);
    print_section("speculative params",      spec_options);
    print_section("example-specific params", specific_options);
 }
 static void common_params_print_completion(common_params_context & ctx_arg) {
@ -1079,7 +1085,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
-        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
+        // download only exposes the handful of args explicitly tagged for it
        const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
        if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
@ -1090,7 +1098,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.usage = true;
        }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
@ -2212,7 +2220,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, bool value) {
            params.no_mmproj = !value;
        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
    add_opt(common_arg(
        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
@ -2611,14 +2619,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.path = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.url = value;
        }
-    ).set_env("LLAMA_ARG_MODEL_URL"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
    add_opt(common_arg(
        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
@ -2627,7 +2635,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.docker_repo = value;
        }
-    ).set_env("LLAMA_ARG_DOCKER_REPO"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@ -2637,14 +2645,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.model.hf_repo = value;
        }
-    ).set_env("LLAMA_ARG_HF_REPO"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.hf_file = value;
        }
-    ).set_env("LLAMA_ARG_HF_FILE"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
    add_opt(common_arg(
        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
        "Hugging Face model repository for the vocoder model (default: unused)",
@ -2665,7 +2673,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.hf_token = value;
        }
-    ).set_env("HF_TOKEN"));
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
    add_opt(common_arg(
        {"--mtp"},
        "also download the multi-token prediction (MTP) head, if available (default: unused)",
        [](common_params & params) {
            params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
        }
    ).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
    add_opt(common_arg(
        {"--context-file"}, "FNAME",
        "file to load context from (use comma-separated values to specify multiple files)",
--- a/common/common.h
+++ b/common/common.h
@ -96,6 +96,7 @@ enum llama_example {
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
    LLAMA_EXAMPLE_DOWNLOAD,
    LLAMA_EXAMPLE_COUNT,
 };