server: (router) rework -hf preset repo (#24739)

* server: temporarily remove HF remote preset

* rework remove preset.ini support

* rm unused get_remote_preset_whitelist()

* print warning

* add docs

* rm stray file
This commit is contained in:
Xuan-Son Nguyen 2026-06-18 12:45:23 +02:00 committed by GitHub
parent 968c43891a
commit 552258c535
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 120 additions and 187 deletions

View File

@ -285,58 +285,15 @@ static std::string clean_file_name(const std::string & fname) {
return clean_fname; return clean_fname;
} }
// Try to fetch an optional `preset.ini` from the root of the Hugging Face repo named by
// params.model.hf_repo and apply the section matching the repo tag onto `params`.
//
// Returns true when the download reports a status in [200, 400), i.e. a preset was found;
// returns false otherwise (a missing preset is not an error).
// Throws std::runtime_error when the preset file exists but has no section for the tag.
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
    GGML_ASSERT(!params.model.hf_repo.empty());

    // split "repo:tag" - the repo part is returned without the tag
    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
    if (hf_tag == "latest") {
        // "latest" is the default when no tag is given; it maps to the "default" preset
        hf_tag = "default";
    }

    const std::string endpoint   = common_get_model_endpoint();
    const auto        preset_url = endpoint + hf_repo + "/resolve/main/preset.ini";

    // local cache location for the downloaded preset
    const auto preset_path = fs_get_cache_file(clean_file_name(hf_repo + "_preset.ini"));

    common_download_opts opts;
    opts.bearer_token = params.hf_token;
    opts.offline      = params.offline;

    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
    const int status = common_download_file_single(preset_url, preset_path, opts);

    // the remote preset is optional: bail out quietly when it is not there
    if (status < 200 || status >= 400) {
        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
        return false;
    }

    LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());

    common_preset_context ctx(ex, /* only_remote_allowed */ true);
    common_preset         global;
    auto remote_presets = ctx.load_from_ini(preset_path, global);
    remote_presets      = ctx.cascade(global, remote_presets);

    const auto found = remote_presets.find(hf_tag);
    if (found == remote_presets.end()) {
        throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
    }

    common_preset preset = found->second;
    LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini() output already ends with a newline
    preset.apply_to_params(params);
    return true;
}
struct handle_model_result { struct handle_model_result {
bool found_mmproj = false; bool found_mmproj = false;
common_params_model mmproj; common_params_model mmproj;
bool found_mtp = false; bool found_mtp = false;
common_params_model mtp; common_params_model mtp;
bool found_preset = false;
std::string preset_path;
}; };
static handle_model_result common_params_handle_model(struct common_params_model & model, static handle_model_result common_params_handle_model(struct common_params_model & model,
@ -355,6 +312,12 @@ static handle_model_result common_params_handle_model(struct common_params_model
common_download_opts hf_opts = opts; common_download_opts hf_opts = opts;
auto download_result = common_download_model(model, hf_opts); auto download_result = common_download_model(model, hf_opts);
if (!download_result.preset_path.empty()) {
result.found_preset = true;
result.preset_path = download_result.preset_path;
return result; // skip everything else if preset.ini is used
}
if (download_result.model_path.empty()) { if (download_result.model_path.empty()) {
throw std::runtime_error("failed to download model from Hugging Face"); throw std::runtime_error("failed to download model from Hugging Face");
} }
@ -454,6 +417,17 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
try { try {
auto res = common_params_handle_model(params.model, opts); auto res = common_params_handle_model(params.model, opts);
if (res.found_preset) {
if (!params.models_preset.empty()) {
throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
}
// if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
params.models_preset_hf = params.model.hf_repo; // only for showing a warning
params.models_preset = res.preset_path;
params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
return true;
}
if (params.no_mmproj) { if (params.no_mmproj) {
params.mmproj = {}; params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@ -601,30 +575,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// parse the first time to get -hf option (used for remote preset) // parse the first time to get -hf option (used for remote preset)
parse_cli_args(); parse_cli_args();
// export_graph_ops loads only metadata
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
// maybe handle remote preset
if (!params.model.hf_repo.empty() && !skip_model_download) {
std::string cli_hf_repo = params.model.hf_repo;
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
// special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
// this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
std::string preset_hf_repo = params.model.hf_repo;
bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
if (has_preset) {
// re-parse CLI args to override preset values
parse_cli_args();
}
// preserve hf_repo from preset if needed
if (preset_has_hf_repo) {
params.model.hf_repo = preset_hf_repo;
}
}
postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams, nullptr);
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams); postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
@ -635,15 +585,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
} }
// handle model and download // export_graph_ops loads only metadata
if (!skip_model_download) { const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
common_params_handle_models(params, ctx_arg.ex);
}
// model is required (except for server) if (!skip_model_download) {
// TODO @ngxson : maybe show a list of available models in CLI in this case // handle model and download
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) { common_params_handle_models(params, ctx_arg.ex);
throw std::invalid_argument("error: --model is required\n");
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty()
&& ctx_arg.ex != LLAMA_EXAMPLE_SERVER
&& !params.usage
&& !params.completion) {
throw std::invalid_argument("error: --model is required\n");
}
} }
if (params.escape) { if (params.escape) {

View File

@ -642,10 +642,11 @@ struct common_params {
std::vector<std::string> server_tools; std::vector<std::string> server_tools;
// router server configs // router server configs
std::string models_dir = ""; // directory containing models for the router server std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server bool models_autoload = true; // automatically load models when requested via the router server
std::string models_preset_hf = ""; // show a warning about remote presets on router loaded (if not empty)
bool log_json = false; bool log_json = false;

View File

@ -696,6 +696,7 @@ struct hf_plan {
hf_cache::hf_files model_files; hf_cache::hf_files model_files;
hf_cache::hf_file mmproj; hf_cache::hf_file mmproj;
hf_cache::hf_file mtp; hf_cache::hf_file mtp;
hf_cache::hf_file preset; // if set, only this file is downloaded
}; };
static hf_plan get_hf_plan(const common_params_model & model, static hf_plan get_hf_plan(const common_params_model & model,
@ -717,6 +718,14 @@ static hf_plan get_hf_plan(const common_params_model & model,
return plan; return plan;
} }
// if preset.ini exists in the repo root, download only that file
for (const auto & f : all) {
if (f.path == "preset.ini") {
plan.preset = f;
return plan;
}
}
hf_cache::hf_file primary; hf_cache::hf_file primary;
if (!model.hf_file.empty()) { if (!model.hf_file.empty()) {
@ -794,14 +803,19 @@ common_download_model_result common_download_model(const common_params_model &
if (is_hf) { if (is_hf) {
hf = get_hf_plan(model, opts, download_mmproj, download_mtp); hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
for (const auto & f : hf.model_files) { if (!hf.preset.path.empty()) {
tasks.push_back({f.url, f.local_path}); // if preset.ini exists, only download that file alone
} tasks.push_back({hf.preset.url, hf.preset.local_path});
if (!hf.mmproj.path.empty()) { } else {
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path}); for (const auto & f : hf.model_files) {
} tasks.push_back({f.url, f.local_path});
if (!hf.mtp.path.empty()) { }
tasks.push_back({hf.mtp.url, hf.mtp.local_path}); if (!hf.mmproj.path.empty()) {
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
}
if (!hf.mtp.path.empty()) {
tasks.push_back({hf.mtp.url, hf.mtp.local_path});
}
} }
} else if (!model.url.empty()) { } else if (!model.url.empty()) {
tasks = get_url_tasks(model); tasks = get_url_tasks(model);
@ -835,17 +849,22 @@ common_download_model_result common_download_model(const common_params_model &
} }
if (is_hf) { if (is_hf) {
for (const auto & f : hf.model_files) { if (!hf.preset.path.empty()) {
hf_cache::finalize_file(f); // if preset.ini is used, do not set other paths
} result.preset_path = hf_cache::finalize_file(hf.preset);
result.model_path = hf.primary.final_path; } else {
for (const auto & f : hf.model_files) {
hf_cache::finalize_file(f);
}
result.model_path = hf.primary.final_path;
if (!hf.mmproj.path.empty()) { if (!hf.mmproj.path.empty()) {
result.mmproj_path = hf_cache::finalize_file(hf.mmproj); result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
} }
if (!hf.mtp.path.empty()) { if (!hf.mtp.path.empty()) {
result.mtp_path = hf_cache::finalize_file(hf.mtp); result.mtp_path = hf_cache::finalize_file(hf.mtp);
}
} }
} else { } else {
result.model_path = model.path; result.model_path = model.path;

View File

@ -63,6 +63,7 @@ struct common_download_model_result {
std::string model_path; std::string model_path;
std::string mmproj_path; std::string mmproj_path;
std::string mtp_path; std::string mtp_path;
std::string preset_path;
}; };
// throw if the file is missing or invalid (e.g. ETag check failed) // throw if the file is missing or invalid (e.g. ETag check failed)

View File

@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) {
return str.substr(pos); return str.substr(pos);
} }
// Build the set of keys a remote preset is permitted to use.
// For security reasons only a small subset of args is accepted from remote presets:
//  - do not extend the list unless absolutely necessary
//  - args that write to files are strictly prohibited
// For every permitted option the result holds its map key, each of its arg spellings with
// the leading dashes stripped, and each of its environment variable names.
static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
    static const std::set<std::string> allowed_options = {
        "model-url",
        "hf-repo",
        "hf-repo-draft",
        "hf-repo-v", // vocoder
        "hf-file-v", // vocoder
        "mmproj-url",
        "pooling",
        "jinja",
        "batch-size",
        "ubatch-size",
        "cache-reuse",
        "chat-template-kwargs",
        "mmap",
        // note: sampling params are automatically allowed by default
        // negated args will be added automatically if the positive arg is specified above
    };

    std::set<std::string> allowed_keys;
    for (const auto & [key, opt] : key_to_opt) {
        // sampling options are always accepted; everything else must be listed explicitly
        const bool permitted = opt.is_sampling || allowed_options.count(key) > 0;
        if (!permitted) {
            continue;
        }

        allowed_keys.insert(key);

        // variant spellings: CLI args without their leading dashes ...
        for (const auto & arg : opt.get_args()) {
            allowed_keys.insert(rm_leading_dashes(arg));
        }
        // ... and the matching environment variables
        for (const auto & env : opt.get_env()) {
            allowed_keys.insert(env);
        }
    }
    return allowed_keys;
}
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const { std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
std::vector<std::string> args; std::vector<std::string> args;
@ -300,16 +258,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
return value; return value;
} }
common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed) common_preset_context::common_preset_context(llama_example ex)
: ctx_params(common_params_parser_init(default_params, ex)) { : ctx_params(common_params_parser_init(default_params, ex)) {
common_params_add_preset_options(ctx_params.options); common_params_add_preset_options(ctx_params.options);
key_to_opt = get_map_key_opt(ctx_params); key_to_opt = get_map_key_opt(ctx_params);
// setup allowed keys if only_remote_allowed is true
if (only_remote_allowed) {
filter_allowed_keys = true;
allowed_keys = get_remote_preset_whitelist(key_to_opt);
}
} }
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const { common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {

View File

@ -60,7 +60,7 @@ struct common_preset_context {
std::set<std::string> allowed_keys; std::set<std::string> allowed_keys;
// if only_remote_allowed is true, only accept whitelisted keys // if only_remote_allowed is true, only accept whitelisted keys
common_preset_context(llama_example ex, bool only_remote_allowed = false); common_preset_context(llama_example ex);
// load presets from INI file // load presets from INI file
common_presets load_from_ini(const std::string & path, common_preset & global) const; common_presets load_from_ini(const std::string & path, common_preset & global) const;

View File

@ -8,55 +8,53 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla
When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details. When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
### Using a Remote Preset ### Using a Hugging Face Preset
> [!NOTE] > [!IMPORTANT]
> >
> This feature is currently only supported via the `-hf` option. > Please only use presets that you can trust! Unknown presets may be unsafe
For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model. You can push your preset to Hugging Face Hub and share it with other users by:
1. Creating an empty model repository on Hugging Face
2. Creating a `preset.ini` file in the root directory of the repository
Example: Example of a `preset.ini`:
```ini ```ini
hf-repo-draft = username/my-draft-model-GGUF [*]
temp = 0.5 ctx-size = 0
top-k = 20 mmap = 1
top-p = 0.95 kv-unified = 1
parallel = 4
spec-default = 1
[Qwen3.5-4B]
hf = unsloth/Qwen3.5-4B-GGUF:Q4_K_M
ctx-size = 262144
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
[gpt-oss-120b-hf]
hf = ggml-org/gpt-oss-120b-GGUF
ctx-size = 262144
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
chat-template-kwargs = {"reasoning_effort": "high"}
``` ```
For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options. The preset will be loaded similarly to the `--models-preset` option. Therefore, you can also override certain params via CLI arguments:
Example usage:
Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
```sh
llama-cli -hf username/my-model-with-preset
# This is equivalent to:
llama-cli -hf username/my-model-with-preset \
--hf-repo-draft username/my-draft-model-GGUF \
--temp 0.5 \
--top-k 20 \
--top-p 0.95
```
You can also override preset arguments by specifying them on the command line:
```sh ```sh
# Force temp = 0.1, overriding the preset value # Force temp = 0.1, overriding the preset value
llama-cli -hf username/my-model-with-preset --temp 0.1 llama-cli -hf username/my-preset --temp 0.1
```
If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
```ini
hf-repo = user/my-model-main
hf-repo-draft = user/my-model-draft
temp = 0.8
ctx-size = 1024
; (and other configurations)
``` ```
### Named presets ### Named presets

View File

@ -349,6 +349,12 @@ int llama_server(int argc, char ** argv) {
SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str()); SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
SRV_WRN("%s", "NOTE: router mode is experimental\n"); SRV_WRN("%s", "NOTE: router mode is experimental\n");
SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n"); SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n");
if (!params.models_preset_hf.empty()) {
SRV_WRN( "NOTE: using preset.ini from HF repo '%s'\n", params.models_preset_hf.c_str());
SRV_WRN("%s", " please only use presets that you can trust! Unknown presets may be unsafe\n");
}
if (ctx_http.thread.joinable()) { if (ctx_http.thread.joinable()) {
ctx_http.thread.join(); // keep the main thread alive ctx_http.thread.join(); // keep the main thread alive
} }