From 552258c5350dcf86c6a1e9a2dcd45f06076a4667 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Thu, 18 Jun 2026 12:45:23 +0200
Subject: [PATCH] server: (router) rework -hf preset repo (#24739)

* server: temporary remove HF remote preset

* rework remove preset.ini support

* rm unused get_remote_preset_whitelist()

* print warning

* add docs

* rm stray file
---
 common/arg.cpp          | 112 ++++++++++++----------------------------
 common/common.h         |   9 ++--
 common/download.cpp     |  53 +++++++++++++------
 common/download.h       |   1 +
 common/preset.cpp       |  50 +-----------------
 common/preset.h         |   2 +-
 docs/preset.md          |  74 +++++++++++++-------------
 tools/server/server.cpp |   6 +++
 8 files changed, 120 insertions(+), 187 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8382c1d85d..bd4b113d16 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -285,58 +285,15 @@ static std::string clean_file_name(const std::string & fname) {
     return clean_fname;
 }
 
-static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
-    GGML_ASSERT(!params.model.hf_repo.empty());
-
-    // the returned hf_repo is without tag
-    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
-
-    // "latest" tag (default if not specified) is translated to "default" preset
-    if (hf_tag == "latest") {
-        hf_tag = "default";
-    }
-
-    std::string model_endpoint = common_get_model_endpoint();
-    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
-
-    // prepare local path for caching
-    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
-    auto preset_path = fs_get_cache_file(preset_fname);
-    common_download_opts opts;
-    opts.bearer_token = params.hf_token;
-    opts.offline = params.offline;
-
-    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
-    const int status = common_download_file_single(preset_url, preset_path, opts);
-    const bool has_preset = status >= 200 && status < 400;
-
-    // remote preset is optional, so we don't error out if not found
-    if (has_preset) {
-        LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
-        common_preset_context ctx(ex, /* only_remote_allowed */ true);
-        common_preset global;
-        auto remote_presets = ctx.load_from_ini(preset_path, global);
-        remote_presets = ctx.cascade(global, remote_presets);
-        if (remote_presets.find(hf_tag) != remote_presets.end()) {
-            common_preset preset = remote_presets.at(hf_tag);
-            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
-            preset.apply_to_params(params);
-        } else {
-            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
-        }
-    } else {
-        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
-    }
-
-    return has_preset;
-}
-
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
 
     bool found_mtp = false;
     common_params_model mtp;
+
+    bool found_preset = false;
+    std::string preset_path;
 };
 
 static handle_model_result common_params_handle_model(struct common_params_model & model,
@@ -355,6 +312,12 @@ static handle_model_result common_params_handle_model(struct common_params_model
         common_download_opts hf_opts = opts;
         auto download_result = common_download_model(model, hf_opts);
 
+        if (!download_result.preset_path.empty()) {
+            result.found_preset = true;
+            result.preset_path = download_result.preset_path;
+            return result; // skip everything else if preset.ini is used
+        }
+
         if (download_result.model_path.empty()) {
             throw std::runtime_error("failed to download model from Hugging Face");
         }
@@ -454,6 +417,17 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
 
     try {
         auto res = common_params_handle_model(params.model, opts);
+        if (res.found_preset) {
+            if (!params.models_preset.empty()) {
+                throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
+            }
+            // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
+            params.models_preset_hf = params.model.hf_repo; // only for showing a warning
+            params.models_preset    = res.preset_path;
+            params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
+            return true;
+        }
+
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -601,30 +575,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     // parse the first time to get -hf option (used for remote preset)
     parse_cli_args();
 
-    // export_graph_ops loads only metadata
-    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
-
-    // maybe handle remote preset
-    if (!params.model.hf_repo.empty() && !skip_model_download) {
-        std::string cli_hf_repo = params.model.hf_repo;
-        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
-
-        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
-        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
-        std::string preset_hf_repo = params.model.hf_repo;
-        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
-
-        if (has_preset) {
-            // re-parse CLI args to override preset values
-            parse_cli_args();
-        }
-
-        // preserve hf_repo from preset if needed
-        if (preset_has_hf_repo) {
-            params.model.hf_repo = preset_hf_repo;
-        }
-    }
-
     postprocess_cpu_params(params.cpuparams,       nullptr);
     postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
 
@@ -635,15 +585,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
-    // handle model and download
-    if (!skip_model_download) {
-        common_params_handle_models(params, ctx_arg.ex);
-    }
+    // export_graph_ops loads only metadata
+    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
 
-    // model is required (except for server)
-    // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
-        throw std::invalid_argument("error: --model is required\n");
+    if (!skip_model_download) {
+        // handle model and download
+        common_params_handle_models(params, ctx_arg.ex);
+
+        // model is required (except for server)
+        // TODO @ngxson : maybe show a list of available models in CLI in this case
+        if (params.model.path.empty()
+                && ctx_arg.ex != LLAMA_EXAMPLE_SERVER
+                && !params.usage
+                && !params.completion) {
+            throw std::invalid_argument("error: --model is required\n");
+        }
     }
 
     if (params.escape) {
diff --git a/common/common.h b/common/common.h
index 0b284cbb36..040b9cf233 100644
--- a/common/common.h
+++ b/common/common.h
@@ -642,10 +642,11 @@ struct common_params {
     std::vector<std::string> server_tools;
 
     // router server configs
-    std::string models_dir    = ""; // directory containing models for the router server
-    std::string models_preset = ""; // directory containing model presets for the router server
-    int models_max = 4;             // maximum number of models to load simultaneously
-    bool models_autoload = true;    // automatically load models when requested via the router server
+    std::string models_dir    = "";     // directory containing models for the router server
+    std::string models_preset = "";     // directory containing model presets for the router server
+    int models_max = 4;                 // maximum number of models to load simultaneously
+    bool models_autoload = true;        // automatically load models when requested via the router server
+    std::string models_preset_hf = "";  // show a warning about remote presets on router loaded (if not empty)
 
     bool log_json = false;
 
diff --git a/common/download.cpp b/common/download.cpp
index c3c8ff49bb..f320462753 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -696,6 +696,7 @@ struct hf_plan {
     hf_cache::hf_files model_files;
     hf_cache::hf_file mmproj;
     hf_cache::hf_file mtp;
+    hf_cache::hf_file preset; // if set, only this file is downloaded
 };
 
 static hf_plan get_hf_plan(const common_params_model  & model,
@@ -717,6 +718,14 @@ static hf_plan get_hf_plan(const common_params_model  & model,
         return plan;
     }
 
+    // if preset.ini exists in the repo root, download only that file
+    for (const auto & f : all) {
+        if (f.path == "preset.ini") {
+            plan.preset = f;
+            return plan;
+        }
+    }
+
     hf_cache::hf_file primary;
 
     if (!model.hf_file.empty()) {
@@ -794,14 +803,19 @@ common_download_model_result common_download_model(const common_params_model  &
 
     if (is_hf) {
         hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
-        for (const auto & f : hf.model_files) {
-            tasks.push_back({f.url, f.local_path});
-        }
-        if (!hf.mmproj.path.empty()) {
-            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
-        }
-        if (!hf.mtp.path.empty()) {
-            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+        if (!hf.preset.path.empty()) {
+            // if preset.ini exists, only download that file alone
+            tasks.push_back({hf.preset.url, hf.preset.local_path});
+        } else {
+            for (const auto & f : hf.model_files) {
+                tasks.push_back({f.url, f.local_path});
+            }
+            if (!hf.mmproj.path.empty()) {
+                tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
+            }
+            if (!hf.mtp.path.empty()) {
+                tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+            }
         }
     } else if (!model.url.empty()) {
         tasks = get_url_tasks(model);
@@ -835,17 +849,22 @@ common_download_model_result common_download_model(const common_params_model  &
     }
 
     if (is_hf) {
-        for (const auto & f : hf.model_files) {
-            hf_cache::finalize_file(f);
-        }
-        result.model_path = hf.primary.final_path;
+        if (!hf.preset.path.empty()) {
+            // if preset.ini is used, do not set other paths
+            result.preset_path = hf_cache::finalize_file(hf.preset);
+        } else {
+            for (const auto & f : hf.model_files) {
+                hf_cache::finalize_file(f);
+            }
+            result.model_path = hf.primary.final_path;
 
-        if (!hf.mmproj.path.empty()) {
-            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
-        }
+            if (!hf.mmproj.path.empty()) {
+                result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
+            }
 
-        if (!hf.mtp.path.empty()) {
-            result.mtp_path = hf_cache::finalize_file(hf.mtp);
+            if (!hf.mtp.path.empty()) {
+                result.mtp_path = hf_cache::finalize_file(hf.mtp);
+            }
         }
     } else {
         result.model_path = model.path;
diff --git a/common/download.h b/common/download.h
index 2371797644..8dbf07836f 100644
--- a/common/download.h
+++ b/common/download.h
@@ -63,6 +63,7 @@ struct common_download_model_result {
     std::string model_path;
     std::string mmproj_path;
     std::string mtp_path;
+    std::string preset_path;
 };
 
 // throw if the file is missing or invalid (e.g. ETag check failed)
diff --git a/common/preset.cpp b/common/preset.cpp
index 51ea984d8c..f0cc1fa1a2 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }
 
-// only allow a subset of args for remote presets for security reasons
-// do not add more args unless absolutely necessary
-// args that output to files are strictly prohibited
-static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
-    static const std::set<std::string> allowed_options = {
-        "model-url",
-        "hf-repo",
-        "hf-repo-draft",
-        "hf-repo-v", // vocoder
-        "hf-file-v", // vocoder
-        "mmproj-url",
-        "pooling",
-        "jinja",
-        "batch-size",
-        "ubatch-size",
-        "cache-reuse",
-        "chat-template-kwargs",
-        "mmap",
-        // note: sampling params are automatically allowed by default
-        // negated args will be added automatically if the positive arg is specified above
-    };
-
-    std::set<std::string> allowed_keys;
-
-    for (const auto & it : key_to_opt) {
-        const std::string & key = it.first;
-        const common_arg & opt = it.second;
-        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
-            allowed_keys.insert(key);
-            // also add variant keys (args without leading dashes and env vars)
-            for (const auto & arg : opt.get_args()) {
-                allowed_keys.insert(rm_leading_dashes(arg));
-            }
-            for (const auto & env : opt.get_env()) {
-                allowed_keys.insert(env);
-            }
-        }
-    }
-
-    return allowed_keys;
-}
-
 std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;
 
@@ -300,16 +258,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
+common_preset_context::common_preset_context(llama_example ex)
         : ctx_params(common_params_parser_init(default_params, ex)) {
     common_params_add_preset_options(ctx_params.options);
     key_to_opt = get_map_key_opt(ctx_params);
-
-    // setup allowed keys if only_remote_allowed is true
-    if (only_remote_allowed) {
-        filter_allowed_keys = true;
-        allowed_keys = get_remote_preset_whitelist(key_to_opt);
-    }
 }
 
 common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
diff --git a/common/preset.h b/common/preset.h
index 06f829c3e5..52935ebde8 100644
--- a/common/preset.h
+++ b/common/preset.h
@@ -60,7 +60,7 @@ struct common_preset_context {
     std::set<std::string> allowed_keys;
 
     // if only_remote_allowed is true, only accept whitelisted keys
-    common_preset_context(llama_example ex, bool only_remote_allowed = false);
+    common_preset_context(llama_example ex);
 
     // load presets from INI file
     common_presets load_from_ini(const std::string & path, common_preset & global) const;
diff --git a/docs/preset.md b/docs/preset.md
index d49fb0a1ae..85762a420b 100644
--- a/docs/preset.md
+++ b/docs/preset.md
@@ -8,55 +8,53 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla
 
 When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
 
-### Using a Remote Preset
+### Using a Hugging Face Preset
 
-> [!NOTE]
+> [!IMPORTANT]
 >
-> This feature is currently only supported via the `-hf` option.
+> Please only use presets that you can trust! Unknown presets may be unsafe
 
-For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.
+You can push your preset to Hugging Face Hub and share with other users by:
+1. Creating an empty model repository on Hugging Face
+2. Creating a `preset.ini` file in the root directory of the repository
 
-Example:
+Example of a `preset.ini`:
 
 ```ini
-hf-repo-draft = username/my-draft-model-GGUF
-temp = 0.5
-top-k = 20
-top-p = 0.95
+[*]
+ctx-size             = 0
+mmap                 = 1
+kv-unified           = 1
+parallel             = 4
+spec-default         = 1
+
+[Qwen3.5-4B]
+hf                   = unsloth/Qwen3.5-4B-GGUF:Q4_K_M
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+
+[gpt-oss-120b-hf]
+hf                   = ggml-org/gpt-oss-120b-GGUF
+ctx-size             = 262144
+batch-size           = 2048
+ubatch-size          = 2048
+top-p                = 1.0
+top-k                = 0
+min-p                = 0.01
+temp                 = 1.0
+chat-template-kwargs = {"reasoning_effort": "high"}
 ```
 
-For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
-
-Example usage:
-
-Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
-
-```sh
-llama-cli -hf username/my-model-with-preset
-
-# This is equivalent to:
-llama-cli -hf username/my-model-with-preset \
-  --hf-repo-draft username/my-draft-model-GGUF \
-  --temp 0.5 \
-  --top-k 20 \
-  --top-p 0.95
-```
-
-You can also override preset arguments by specifying them on the command line:
+The preset will be loaded similarly to the `--models-preset` option. Therefore, you can also override certain params via CLI arguments:
 
 ```sh
 # Force temp = 0.1, overriding the preset value
-llama-cli -hf username/my-model-with-preset --temp 0.1
-```
-
-If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
-
-```ini
-hf-repo = user/my-model-main
-hf-repo-draft = user/my-model-draft
-temp = 0.8
-ctx-size = 1024
-; (and other configurations)
+llama-cli -hf username/my-preset --temp 0.1
 ```
 
 ### Named presets
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 0364d7d3b7..78ab0318cf 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -349,6 +349,12 @@ int llama_server(int argc, char ** argv) {
         SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
         SRV_WRN("%s", "NOTE: router mode is experimental\n");
         SRV_WRN("%s", "      it is not recommended to use this mode in untrusted environments\n");
+
+        if (!params.models_preset_hf.empty()) {
+            SRV_WRN(      "NOTE: using preset.ini from HF repo '%s'\n", params.models_preset_hf.c_str());
+            SRV_WRN("%s", "      please only use presets that you can trust! Unknown presets may be unsafe\n");
+        }
+
         if (ctx_http.thread.joinable()) {
             ctx_http.thread.join(); // keep the main thread alive
         }