From 552258c5350dcf86c6a1e9a2dcd45f06076a4667 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 18 Jun 2026 12:45:23 +0200 Subject: [PATCH] server: (router) rework -hf preset repo (#24739) * server: temporary remove HF remote preset * rework remove preset.ini support * rm unused get_remote_preset_whitelist() * print warning * add docs * rm stray file --- common/arg.cpp | 112 ++++++++++++---------------------------- common/common.h | 9 ++-- common/download.cpp | 53 +++++++++++++------ common/download.h | 1 + common/preset.cpp | 50 +----------------- common/preset.h | 2 +- docs/preset.md | 74 +++++++++++++------------- tools/server/server.cpp | 6 +++ 8 files changed, 120 insertions(+), 187 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8382c1d85d..bd4b113d16 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -285,58 +285,15 @@ static std::string clean_file_name(const std::string & fname) { return clean_fname; } -static bool common_params_handle_remote_preset(common_params & params, llama_example ex) { - GGML_ASSERT(!params.model.hf_repo.empty()); - - // the returned hf_repo is without tag - auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo); - - // "latest" tag (default if not specified) is translated to "default" preset - if (hf_tag == "latest") { - hf_tag = "default"; - } - - std::string model_endpoint = common_get_model_endpoint(); - auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini"; - - // prepare local path for caching - auto preset_fname = clean_file_name(hf_repo + "_preset.ini"); - auto preset_path = fs_get_cache_file(preset_fname); - common_download_opts opts; - opts.bearer_token = params.hf_token; - opts.offline = params.offline; - - LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str()); - const int status = common_download_file_single(preset_url, preset_path, opts); - const bool has_preset = status >= 200 && status < 400; - - // remote preset is optional, so 
we don't error out if not found - if (has_preset) { - LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str()); - common_preset_context ctx(ex, /* only_remote_allowed */ true); - common_preset global; - auto remote_presets = ctx.load_from_ini(preset_path, global); - remote_presets = ctx.cascade(global, remote_presets); - if (remote_presets.find(hf_tag) != remote_presets.end()) { - common_preset preset = remote_presets.at(hf_tag); - LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline - preset.apply_to_params(params); - } else { - throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section"); - } - } else { - LOG_TRC("%s: no remote preset found, skipping\n", __func__); - } - - return has_preset; -} - struct handle_model_result { bool found_mmproj = false; common_params_model mmproj; bool found_mtp = false; common_params_model mtp; + + bool found_preset = false; + std::string preset_path; }; static handle_model_result common_params_handle_model(struct common_params_model & model, @@ -355,6 +312,12 @@ static handle_model_result common_params_handle_model(struct common_params_model common_download_opts hf_opts = opts; auto download_result = common_download_model(model, hf_opts); + if (!download_result.preset_path.empty()) { + result.found_preset = true; + result.preset_path = download_result.preset_path; + return result; // skip everything else if preset.ini is used + } + if (download_result.model_path.empty()) { throw std::runtime_error("failed to download model from Hugging Face"); } @@ -454,6 +417,17 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex) try { auto res = common_params_handle_model(params.model, opts); + if (res.found_preset) { + if (!params.models_preset.empty()) { + throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file"); + } + // if HF repo is a preset repo, we simply run server in router mode 
with the preset.ini file + params.models_preset_hf = params.model.hf_repo; // only for showing a warning + params.models_preset = res.preset_path; + params.model = common_params_model{}; // make sure to clear model, so server starts in router mode + return true; + } + if (params.no_mmproj) { params.mmproj = {}; } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { @@ -601,30 +575,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // parse the first time to get -hf option (used for remote preset) parse_cli_args(); - // export_graph_ops loads only metadata - const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; - - // maybe handle remote preset - if (!params.model.hf_repo.empty() && !skip_model_download) { - std::string cli_hf_repo = params.model.hf_repo; - bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex); - - // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value) - // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs) - std::string preset_hf_repo = params.model.hf_repo; - bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo; - - if (has_preset) { - // re-parse CLI args to override preset values - parse_cli_args(); - } - - // preserve hf_repo from preset if needed - if (preset_has_hf_repo) { - params.model.hf_repo = preset_hf_repo; - } - } - postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams); @@ -635,15 +585,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - // handle model and download - if (!skip_model_download) { - common_params_handle_models(params, ctx_arg.ex); - } + // export_graph_ops loads only metadata + const bool skip_model_download = ctx_arg.ex == 
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; - // model is required (except for server) - // TODO @ngxson : maybe show a list of available models in CLI in this case - if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) { - throw std::invalid_argument("error: --model is required\n"); + if (!skip_model_download) { + // handle model and download + common_params_handle_models(params, ctx_arg.ex); + + // model is required (except for server) + // TODO @ngxson : maybe show a list of available models in CLI in this case + if (params.model.path.empty() + && ctx_arg.ex != LLAMA_EXAMPLE_SERVER + && !params.usage + && !params.completion) { + throw std::invalid_argument("error: --model is required\n"); + } } if (params.escape) { diff --git a/common/common.h b/common/common.h index 0b284cbb36..040b9cf233 100644 --- a/common/common.h +++ b/common/common.h @@ -642,10 +642,11 @@ struct common_params { std::vector server_tools; // router server configs - std::string models_dir = ""; // directory containing models for the router server - std::string models_preset = ""; // directory containing model presets for the router server - int models_max = 4; // maximum number of models to load simultaneously - bool models_autoload = true; // automatically load models when requested via the router server + std::string models_dir = ""; // directory containing models for the router server + std::string models_preset = ""; // path to the INI file containing model presets for the router server + int models_max = 4; // maximum number of models to load simultaneously + bool models_autoload = true; // automatically load models when requested via the router server + std::string models_preset_hf = ""; // HF repo that models_preset was downloaded from; if not empty, the router prints a trust warning on startup bool log_json = false; diff --git a/common/download.cpp b/common/download.cpp index c3c8ff49bb..f320462753 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -696,6 
+696,7 @@ struct hf_plan { hf_cache::hf_files model_files; hf_cache::hf_file mmproj; hf_cache::hf_file mtp; + hf_cache::hf_file preset; // if set, only this file is downloaded }; static hf_plan get_hf_plan(const common_params_model & model, @@ -717,6 +718,14 @@ static hf_plan get_hf_plan(const common_params_model & model, return plan; } + // if preset.ini exists in the repo root, download only that file + for (const auto & f : all) { + if (f.path == "preset.ini") { + plan.preset = f; + return plan; + } + } + hf_cache::hf_file primary; if (!model.hf_file.empty()) { @@ -794,14 +803,19 @@ common_download_model_result common_download_model(const common_params_model & if (is_hf) { hf = get_hf_plan(model, opts, download_mmproj, download_mtp); - for (const auto & f : hf.model_files) { - tasks.push_back({f.url, f.local_path}); - } - if (!hf.mmproj.path.empty()) { - tasks.push_back({hf.mmproj.url, hf.mmproj.local_path}); - } - if (!hf.mtp.path.empty()) { - tasks.push_back({hf.mtp.url, hf.mtp.local_path}); + if (!hf.preset.path.empty()) { + // if preset.ini exists, only download that file alone + tasks.push_back({hf.preset.url, hf.preset.local_path}); + } else { + for (const auto & f : hf.model_files) { + tasks.push_back({f.url, f.local_path}); + } + if (!hf.mmproj.path.empty()) { + tasks.push_back({hf.mmproj.url, hf.mmproj.local_path}); + } + if (!hf.mtp.path.empty()) { + tasks.push_back({hf.mtp.url, hf.mtp.local_path}); + } } } else if (!model.url.empty()) { tasks = get_url_tasks(model); @@ -835,17 +849,22 @@ common_download_model_result common_download_model(const common_params_model & } if (is_hf) { - for (const auto & f : hf.model_files) { - hf_cache::finalize_file(f); - } - result.model_path = hf.primary.final_path; + if (!hf.preset.path.empty()) { + // if preset.ini is used, do not set other paths + result.preset_path = hf_cache::finalize_file(hf.preset); + } else { + for (const auto & f : hf.model_files) { + hf_cache::finalize_file(f); + } + result.model_path = 
hf.primary.final_path; - if (!hf.mmproj.path.empty()) { - result.mmproj_path = hf_cache::finalize_file(hf.mmproj); - } + if (!hf.mmproj.path.empty()) { + result.mmproj_path = hf_cache::finalize_file(hf.mmproj); + } - if (!hf.mtp.path.empty()) { - result.mtp_path = hf_cache::finalize_file(hf.mtp); + if (!hf.mtp.path.empty()) { + result.mtp_path = hf_cache::finalize_file(hf.mtp); + } } } else { result.model_path = model.path; diff --git a/common/download.h b/common/download.h index 2371797644..8dbf07836f 100644 --- a/common/download.h +++ b/common/download.h @@ -63,6 +63,7 @@ struct common_download_model_result { std::string model_path; std::string mmproj_path; std::string mtp_path; + std::string preset_path; }; // throw if the file is missing or invalid (e.g. ETag check failed) diff --git a/common/preset.cpp b/common/preset.cpp index 51ea984d8c..f0cc1fa1a2 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) { return str.substr(pos); } -// only allow a subset of args for remote presets for security reasons -// do not add more args unless absolutely necessary -// args that output to files are strictly prohibited -static std::set get_remote_preset_whitelist(const std::map & key_to_opt) { - static const std::set allowed_options = { - "model-url", - "hf-repo", - "hf-repo-draft", - "hf-repo-v", // vocoder - "hf-file-v", // vocoder - "mmproj-url", - "pooling", - "jinja", - "batch-size", - "ubatch-size", - "cache-reuse", - "chat-template-kwargs", - "mmap", - // note: sampling params are automatically allowed by default - // negated args will be added automatically if the positive arg is specified above - }; - - std::set allowed_keys; - - for (const auto & it : key_to_opt) { - const std::string & key = it.first; - const common_arg & opt = it.second; - if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) { - allowed_keys.insert(key); - // also add variant keys (args 
without leading dashes and env vars) - for (const auto & arg : opt.get_args()) { - allowed_keys.insert(rm_leading_dashes(arg)); - } - for (const auto & env : opt.get_env()) { - allowed_keys.insert(env); - } - } - } - - return allowed_keys; -} - std::vector common_preset::to_args(const std::string & bin_path) const { std::vector args; @@ -300,16 +258,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke return value; } -common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed) +common_preset_context::common_preset_context(llama_example ex) : ctx_params(common_params_parser_init(default_params, ex)) { common_params_add_preset_options(ctx_params.options); key_to_opt = get_map_key_opt(ctx_params); - - // setup allowed keys if only_remote_allowed is true - if (only_remote_allowed) { - filter_allowed_keys = true; - allowed_keys = get_remote_preset_whitelist(key_to_opt); - } } common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const { diff --git a/common/preset.h b/common/preset.h index 06f829c3e5..52935ebde8 100644 --- a/common/preset.h +++ b/common/preset.h @@ -60,7 +60,7 @@ struct common_preset_context { std::set allowed_keys; // if only_remote_allowed is true, only accept whitelisted keys - common_preset_context(llama_example ex, bool only_remote_allowed = false); + common_preset_context(llama_example ex); // load presets from INI file common_presets load_from_ini(const std::string & path, common_preset & global) const; diff --git a/docs/preset.md b/docs/preset.md index d49fb0a1ae..85762a420b 100644 --- a/docs/preset.md +++ b/docs/preset.md @@ -8,55 +8,53 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details. 
-### Using a Remote Preset +### Using a Hugging Face Preset -> [!NOTE] +> [!IMPORTANT] > -> This feature is currently only supported via the `-hf` option. +> Please only use presets that you can trust! Unknown presets may be unsafe -For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model. +You can push your preset to Hugging Face Hub and share with other users by: +1. Creating an empty model repository on Hugging Face +2. Creating a `preset.ini` file in the root directory of the repository -Example: +Example of a `preset.ini`: ```ini -hf-repo-draft = username/my-draft-model-GGUF -temp = 0.5 -top-k = 20 -top-p = 0.95 +[*] +ctx-size = 0 +mmap = 1 +kv-unified = 1 +parallel = 4 +spec-default = 1 + +[Qwen3.5-4B] +hf = unsloth/Qwen3.5-4B-GGUF:Q4_K_M +ctx-size = 262144 +batch-size = 2048 +ubatch-size = 2048 +top-p = 1.0 +top-k = 0 +min-p = 0.01 +temp = 1.0 + +[gpt-oss-120b-hf] +hf = ggml-org/gpt-oss-120b-GGUF +ctx-size = 262144 +batch-size = 2048 +ubatch-size = 2048 +top-p = 1.0 +top-k = 0 +min-p = 0.01 +temp = 1.0 +chat-template-kwargs = {"reasoning_effort": "high"} ``` -For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options. - -Example usage: - -Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above: - -```sh -llama-cli -hf username/my-model-with-preset - -# This is equivalent to: -llama-cli -hf username/my-model-with-preset \ - --hf-repo-draft username/my-draft-model-GGUF \ - --temp 0.5 \ - --top-k 20 \ - --top-p 0.95 -``` - -You can also override preset arguments by specifying them on the command line: +The preset will be loaded similarly to the `--models-preset` option. 
Therefore, you can also override certain params via CLI arguments: ```sh # Force temp = 0.1, overriding the preset value -llama-cli -hf username/my-model-with-preset --temp 0.1 -``` - -If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s): - -```ini -hf-repo = user/my-model-main -hf-repo-draft = user/my-model-draft -temp = 0.8 -ctx-size = 1024 -; (and other configurations) +llama-server -hf username/my-preset --temp 0.1 ``` ### Named presets diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0364d7d3b7..78ab0318cf 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -349,6 +349,12 @@ int llama_server(int argc, char ** argv) { SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str()); SRV_WRN("%s", "NOTE: router mode is experimental\n"); SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n"); + + if (!params.models_preset_hf.empty()) { + SRV_WRN( "NOTE: using preset.ini from HF repo '%s'\n", params.models_preset_hf.c_str()); + SRV_WRN("%s", " please only use presets that you can trust! Unknown presets may be unsafe\n"); + } + if (ctx_http.thread.joinable()) { ctx_http.thread.join(); // keep the main thread alive }