server: (router) rework -hf preset repo (#24739)

* server: temporary remove HF remote preset * rework remove preset.ini support * rm unused get_remote_preset_whitelist() * print warning * add docs * rm stray file
2026-06-27 23:50:20 -05:00 · 2026-06-18 12:45:23 +02:00 · 2026-06-18 12:45:23 +02:00 · 552258c535
commit 552258c535
parent 968c43891a
8 changed files with 120 additions and 187 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -285,58 +285,15 @@ static std::string clean_file_name(const std::string & fname) {
    return clean_fname;
 }
 // Try to fetch `preset.ini` from the root of the HF repo named by params.model.hf_repo
 // and, when it exists, apply the section selected by the repo tag onto `params`.
 //
 // Returns true when a preset file was obtained (HTTP status in [200, 400)), false otherwise.
 // Throws std::runtime_error when the file was obtained but has no section for the tag.
 static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
    GGML_ASSERT(!params.model.hf_repo.empty());

    // hf_repo comes back with the tag stripped
    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
    if (hf_tag == "latest") {
        // "latest" (the default when no tag is given) selects the "default" preset
        hf_tag = "default";
    }

    std::string endpoint = common_get_model_endpoint();
    auto remote_url = endpoint + hf_repo + "/resolve/main/preset.ini";

    // the downloaded file is kept in the cache under a name derived from the repo
    auto cache_name = clean_file_name(hf_repo + "_preset.ini");
    auto local_path = fs_get_cache_file(cache_name);

    common_download_opts dl_opts;
    dl_opts.bearer_token = params.hf_token;
    dl_opts.offline      = params.offline;

    LOG_TRC("%s: looking for remote preset at %s\n", __func__, remote_url.c_str());
    const int  http_status = common_download_file_single(remote_url, local_path, dl_opts);
    const bool found       = http_status >= 200 && http_status < 400;

    if (!found) {
        // a missing preset is not an error: remote presets are optional
        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
        return found;
    }

    LOG_TRC("%s: applying remote preset from %s\n", __func__, remote_url.c_str());
    common_preset_context preset_ctx(ex, /* only_remote_allowed */ true);
    common_preset global_section;
    auto presets = preset_ctx.load_from_ini(local_path, global_section);
    presets = preset_ctx.cascade(global_section, presets);

    const auto match = presets.find(hf_tag);
    if (match == presets.end()) {
        throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
    }

    // work on a copy of the selected section
    common_preset selected = match->second;
    LOG_INF("\n%s", selected.to_ini().c_str()); // to_ini() output already ends with a newline
    selected.apply_to_params(params);
    return found;
 }
 // Value returned by common_params_handle_model(): reports which companion artifacts
 // were discovered while resolving/downloading a model.
 struct handle_model_result {
    // mmproj discovery flag and descriptor; the caller consults found_mmproj together with
    // params.mmproj.path / params.mmproj.url being empty
    // NOTE(review): the code that fills these two fields is not visible in this chunk
    bool found_mmproj = false;
    common_params_model mmproj;
    // presumably the "mtp" counterpart of the pair above — TODO confirm against the populating code
    bool found_mtp = false;
    common_params_model mtp;
    // set to true when the download result carries a non-empty preset_path; in that case
    // common_params_handle_model() returns right away and preset_path holds that path
    bool found_preset = false;
    std::string preset_path;
 };
 static handle_model_result common_params_handle_model(struct common_params_model & model,
@ -355,6 +312,12 @@ static handle_model_result common_params_handle_model(struct common_params_model
        common_download_opts hf_opts = opts;
        auto download_result = common_download_model(model, hf_opts);
        if (!download_result.preset_path.empty()) {
            result.found_preset = true;
            result.preset_path = download_result.preset_path;
            return result; // skip everything else if preset.ini is used
        }
        if (download_result.model_path.empty()) {
            throw std::runtime_error("failed to download model from Hugging Face");
        }
@ -454,6 +417,17 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
    try {
        auto res = common_params_handle_model(params.model, opts);
        if (res.found_preset) {
            if (!params.models_preset.empty()) {
                throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
            }
            // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
            params.models_preset_hf = params.model.hf_repo; // only for showing a warning
            params.models_preset    = res.preset_path;
            params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
            return true;
        }
        if (params.no_mmproj) {
            params.mmproj = {};
        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@ -601,30 +575,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    // parse the first time to get -hf option (used for remote preset)
    parse_cli_args();
    // export_graph_ops loads only metadata
    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
    // maybe handle remote preset
    if (!params.model.hf_repo.empty() && !skip_model_download) {
        std::string cli_hf_repo = params.model.hf_repo;
        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
        std::string preset_hf_repo = params.model.hf_repo;
        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
        if (has_preset) {
            // re-parse CLI args to override preset values
            parse_cli_args();
        }
        // preserve hf_repo from preset if needed
        if (preset_has_hf_repo) {
            params.model.hf_repo = preset_hf_repo;
        }
    }
    postprocess_cpu_params(params.cpuparams,       nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
@ -635,15 +585,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }
-    // handle model and download
+    // export_graph_ops loads only metadata
-    if (!skip_model_download) {
+    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
        common_params_handle_models(params, ctx_arg.ex);
    }
-    // model is required (except for server)
+    if (!skip_model_download) {
-    // TODO @ngxson : maybe show a list of available models in CLI in this case
+        // handle model and download
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
+        common_params_handle_models(params, ctx_arg.ex);
-        throw std::invalid_argument("error: --model is required\n");
+
        // model is required (except for server)
        // TODO @ngxson : maybe show a list of available models in CLI in this case
        if (params.model.path.empty()
                && ctx_arg.ex != LLAMA_EXAMPLE_SERVER
                && !params.usage
                && !params.completion) {
            throw std::invalid_argument("error: --model is required\n");
        }
    }
    if (params.escape) {
--- a/common/common.h
+++ b/common/common.h
@ -642,10 +642,11 @@ struct common_params {
    std::vector<std::string> server_tools;
    // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
+    std::string models_dir    = "";     // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
+    std::string models_preset = "";     // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
+    int models_max = 4;                 // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    bool models_autoload = true;        // automatically load models when requested via the router server
    std::string models_preset_hf = "";  // show a warning about remote presets on router loaded (if not empty)
    bool log_json = false;
--- a/common/download.cpp
+++ b/common/download.cpp
@ -696,6 +696,7 @@ struct hf_plan {
    hf_cache::hf_files model_files;
    hf_cache::hf_file mmproj;
    hf_cache::hf_file mtp;
    hf_cache::hf_file preset; // if set, only this file is downloaded
 };
 static hf_plan get_hf_plan(const common_params_model  & model,
@ -717,6 +718,14 @@ static hf_plan get_hf_plan(const common_params_model  & model,
        return plan;
    }
    // if preset.ini exists in the repo root, download only that file
    for (const auto & f : all) {
        if (f.path == "preset.ini") {
            plan.preset = f;
            return plan;
        }
    }
    hf_cache::hf_file primary;
    if (!model.hf_file.empty()) {
@ -794,14 +803,19 @@ common_download_model_result common_download_model(const common_params_model  &
    if (is_hf) {
        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
-        for (const auto & f : hf.model_files) {
+        if (!hf.preset.path.empty()) {
-            tasks.push_back({f.url, f.local_path});
+            // if preset.ini exists, only download that file alone
-        }
+            tasks.push_back({hf.preset.url, hf.preset.local_path});
-        if (!hf.mmproj.path.empty()) {
+        } else {
-            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
+            for (const auto & f : hf.model_files) {
-        }
+                tasks.push_back({f.url, f.local_path});
-        if (!hf.mtp.path.empty()) {
+            }
-            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+            if (!hf.mmproj.path.empty()) {
                tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
            }
            if (!hf.mtp.path.empty()) {
                tasks.push_back({hf.mtp.url, hf.mtp.local_path});
            }
        }
    } else if (!model.url.empty()) {
        tasks = get_url_tasks(model);
@ -835,17 +849,22 @@ common_download_model_result common_download_model(const common_params_model  &
    }
    if (is_hf) {
-        for (const auto & f : hf.model_files) {
+        if (!hf.preset.path.empty()) {
-            hf_cache::finalize_file(f);
+            // if preset.ini is used, do not set other paths
-        }
+            result.preset_path = hf_cache::finalize_file(hf.preset);
-        result.model_path = hf.primary.final_path;
+        } else {
            for (const auto & f : hf.model_files) {
                hf_cache::finalize_file(f);
            }
            result.model_path = hf.primary.final_path;
-        if (!hf.mmproj.path.empty()) {
+            if (!hf.mmproj.path.empty()) {
-            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
+                result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
-        }
+            }
-        if (!hf.mtp.path.empty()) {
+            if (!hf.mtp.path.empty()) {
-            result.mtp_path = hf_cache::finalize_file(hf.mtp);
+                result.mtp_path = hf_cache::finalize_file(hf.mtp);
            }
        }
    } else {
        result.model_path = model.path;
--- a/common/download.h
+++ b/common/download.h
@ -63,6 +63,7 @@ struct common_download_model_result {
    std::string model_path;
    std::string mmproj_path;
    std::string mtp_path;
    std::string preset_path;
 };
 // throw if the file is missing or invalid (e.g. ETag check failed)
--- a/common/preset.cpp
+++ b/common/preset.cpp
@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) {
    return str.substr(pos);
 }
 // Build the set of keys a remote preset is permitted to use.
 // For security reasons only a small subset of args is accepted: do not extend the list
 // unless absolutely necessary, and never add args that write to files.
 //
 // key_to_opt maps an option key to its common_arg; the result contains every accepted key
 // plus its alternative spellings (arg names without leading dashes, and env var names).
 static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
    static const std::set<std::string> allowed_options = {
        "model-url",
        "hf-repo",
        "hf-repo-draft",
        "hf-repo-v", // vocoder
        "hf-file-v", // vocoder
        "mmproj-url",
        "pooling",
        "jinja",
        "batch-size",
        "ubatch-size",
        "cache-reuse",
        "chat-template-kwargs",
        "mmap",
        // note: sampling params do not need to be listed, they are accepted via opt.is_sampling
        // negated args will be added automatically if the positive arg is specified above
    };

    std::set<std::string> whitelist;
    for (const auto & [key, opt] : key_to_opt) {
        const bool listed = allowed_options.count(key) > 0;
        if (!listed && !opt.is_sampling) {
            continue; // neither explicitly listed nor a sampling option
        }

        whitelist.insert(key);

        // accept every CLI spelling of the option, stored without the leading dashes
        for (const auto & arg : opt.get_args()) {
            whitelist.insert(rm_leading_dashes(arg));
        }
        // ...and every environment variable name bound to it
        for (const auto & env : opt.get_env()) {
            whitelist.insert(env);
        }
    }
    return whitelist;
 }
 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
    std::vector<std::string> args;
@ -300,16 +258,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
    return value;
 }
-common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
+common_preset_context::common_preset_context(llama_example ex)
        : ctx_params(common_params_parser_init(default_params, ex)) {
    common_params_add_preset_options(ctx_params.options);
    key_to_opt = get_map_key_opt(ctx_params);
    // setup allowed keys if only_remote_allowed is true
    if (only_remote_allowed) {
        filter_allowed_keys = true;
        allowed_keys = get_remote_preset_whitelist(key_to_opt);
    }
 }
 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
--- a/common/preset.h
+++ b/common/preset.h
@ -60,7 +60,7 @@ struct common_preset_context {
    std::set<std::string> allowed_keys;
    // if only_remote_allowed is true, only accept whitelisted keys
-    common_preset_context(llama_example ex, bool only_remote_allowed = false);
+    common_preset_context(llama_example ex);
    // load presets from INI file
    common_presets load_from_ini(const std::string & path, common_preset & global) const;
--- a/docs/preset.md
+++ b/docs/preset.md
@ -8,55 +8,53 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla
 When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
-### Using a Remote Preset
+### Using a Hugging Face Preset
-> [!NOTE]
+> [!IMPORTANT]
 >
-> This feature is currently only supported via the `-hf` option.
+> Please only use presets that you can trust! Unknown presets may be unsafe.
-For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.
+You can push your preset to the Hugging Face Hub and share it with other users by:
 1. Creating an empty model repository on Hugging Face
 2. Creating a `preset.ini` file in the root directory of the repository
-Example:
+Example of a `preset.ini`:
 ```ini
-hf-repo-draft = username/my-draft-model-GGUF
+[*]
-temp = 0.5
+ctx-size             = 0
-top-k = 20
+mmap                 = 1
-top-p = 0.95
+kv-unified           = 1
 parallel             = 4
 spec-default         = 1
 [Qwen3.5-4B]
 hf                   = unsloth/Qwen3.5-4B-GGUF:Q4_K_M
 ctx-size             = 262144
 batch-size           = 2048
 ubatch-size          = 2048
 top-p                = 1.0
 top-k                = 0
 min-p                = 0.01
 temp                 = 1.0
 [gpt-oss-120b-hf]
 hf                   = ggml-org/gpt-oss-120b-GGUF
 ctx-size             = 262144
 batch-size           = 2048
 ubatch-size          = 2048
 top-p                = 1.0
 top-k                = 0
 min-p                = 0.01
 temp                 = 1.0
 chat-template-kwargs = {"reasoning_effort": "high"}
 ```
-For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
+The preset will be loaded similarly to the `--models-preset` option. Therefore, you can also override certain params via CLI arguments:
 Example usage:
 Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
 ```sh
 llama-cli -hf username/my-model-with-preset
 # This is equivalent to:
 llama-cli -hf username/my-model-with-preset \
  --hf-repo-draft username/my-draft-model-GGUF \
  --temp 0.5 \
  --top-k 20 \
  --top-p 0.95
 ```
 You can also override preset arguments by specifying them on the command line:
 ```sh
 # Force temp = 0.1, overriding the preset value
-llama-cli -hf username/my-model-with-preset --temp 0.1
+llama-cli -hf username/my-preset --temp 0.1
 ```
 If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
 ```ini
 hf-repo = user/my-model-main
 hf-repo-draft = user/my-model-draft
 temp = 0.8
 ctx-size = 1024
 ; (and other configurations)
 ```
 ### Named presets
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -349,6 +349,12 @@ int llama_server(int argc, char ** argv) {
        SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
        SRV_WRN("%s", "NOTE: router mode is experimental\n");
        SRV_WRN("%s", "      it is not recommended to use this mode in untrusted environments\n");
        if (!params.models_preset_hf.empty()) {
            SRV_WRN(      "NOTE: using preset.ini from HF repo '%s'\n", params.models_preset_hf.c_str());
            SRV_WRN("%s", "      please only use presets that you can trust! Unknown presets may be unsafe\n");
        }
        if (ctx_http.thread.joinable()) {
            ctx_http.thread.join(); // keep the main thread alive
        }