mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
server: (router) rework -hf preset repo (#24739)
* server: temporary remove HF remote preset * rework remove preset.ini support * rm unused get_remote_preset_whitelist() * print warning * add docs * rm stray file
This commit is contained in:
parent
968c43891a
commit
552258c535
112
common/arg.cpp
112
common/arg.cpp
@ -285,58 +285,15 @@ static std::string clean_file_name(const std::string & fname) {
|
|||||||
return clean_fname;
|
return clean_fname;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
|
|
||||||
GGML_ASSERT(!params.model.hf_repo.empty());
|
|
||||||
|
|
||||||
// the returned hf_repo is without tag
|
|
||||||
auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
|
|
||||||
|
|
||||||
// "latest" tag (default if not specified) is translated to "default" preset
|
|
||||||
if (hf_tag == "latest") {
|
|
||||||
hf_tag = "default";
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string model_endpoint = common_get_model_endpoint();
|
|
||||||
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
|
|
||||||
|
|
||||||
// prepare local path for caching
|
|
||||||
auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
|
|
||||||
auto preset_path = fs_get_cache_file(preset_fname);
|
|
||||||
common_download_opts opts;
|
|
||||||
opts.bearer_token = params.hf_token;
|
|
||||||
opts.offline = params.offline;
|
|
||||||
|
|
||||||
LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
|
|
||||||
const int status = common_download_file_single(preset_url, preset_path, opts);
|
|
||||||
const bool has_preset = status >= 200 && status < 400;
|
|
||||||
|
|
||||||
// remote preset is optional, so we don't error out if not found
|
|
||||||
if (has_preset) {
|
|
||||||
LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
|
|
||||||
common_preset_context ctx(ex, /* only_remote_allowed */ true);
|
|
||||||
common_preset global;
|
|
||||||
auto remote_presets = ctx.load_from_ini(preset_path, global);
|
|
||||||
remote_presets = ctx.cascade(global, remote_presets);
|
|
||||||
if (remote_presets.find(hf_tag) != remote_presets.end()) {
|
|
||||||
common_preset preset = remote_presets.at(hf_tag);
|
|
||||||
LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
|
|
||||||
preset.apply_to_params(params);
|
|
||||||
} else {
|
|
||||||
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOG_TRC("%s: no remote preset found, skipping\n", __func__);
|
|
||||||
}
|
|
||||||
|
|
||||||
return has_preset;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct handle_model_result {
|
struct handle_model_result {
|
||||||
bool found_mmproj = false;
|
bool found_mmproj = false;
|
||||||
common_params_model mmproj;
|
common_params_model mmproj;
|
||||||
|
|
||||||
bool found_mtp = false;
|
bool found_mtp = false;
|
||||||
common_params_model mtp;
|
common_params_model mtp;
|
||||||
|
|
||||||
|
bool found_preset = false;
|
||||||
|
std::string preset_path;
|
||||||
};
|
};
|
||||||
|
|
||||||
static handle_model_result common_params_handle_model(struct common_params_model & model,
|
static handle_model_result common_params_handle_model(struct common_params_model & model,
|
||||||
@ -355,6 +312,12 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
|||||||
common_download_opts hf_opts = opts;
|
common_download_opts hf_opts = opts;
|
||||||
auto download_result = common_download_model(model, hf_opts);
|
auto download_result = common_download_model(model, hf_opts);
|
||||||
|
|
||||||
|
if (!download_result.preset_path.empty()) {
|
||||||
|
result.found_preset = true;
|
||||||
|
result.preset_path = download_result.preset_path;
|
||||||
|
return result; // skip everything else if preset.ini is used
|
||||||
|
}
|
||||||
|
|
||||||
if (download_result.model_path.empty()) {
|
if (download_result.model_path.empty()) {
|
||||||
throw std::runtime_error("failed to download model from Hugging Face");
|
throw std::runtime_error("failed to download model from Hugging Face");
|
||||||
}
|
}
|
||||||
@ -454,6 +417,17 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex)
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
auto res = common_params_handle_model(params.model, opts);
|
auto res = common_params_handle_model(params.model, opts);
|
||||||
|
if (res.found_preset) {
|
||||||
|
if (!params.models_preset.empty()) {
|
||||||
|
throw std::invalid_argument("cannot use both --models-preset and -hf with a preset.ini file");
|
||||||
|
}
|
||||||
|
// if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
|
||||||
|
params.models_preset_hf = params.model.hf_repo; // only for showing a warning
|
||||||
|
params.models_preset = res.preset_path;
|
||||||
|
params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if (params.no_mmproj) {
|
if (params.no_mmproj) {
|
||||||
params.mmproj = {};
|
params.mmproj = {};
|
||||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||||
@ -601,30 +575,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||||||
// parse the first time to get -hf option (used for remote preset)
|
// parse the first time to get -hf option (used for remote preset)
|
||||||
parse_cli_args();
|
parse_cli_args();
|
||||||
|
|
||||||
// export_graph_ops loads only metadata
|
|
||||||
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
|
|
||||||
|
|
||||||
// maybe handle remote preset
|
|
||||||
if (!params.model.hf_repo.empty() && !skip_model_download) {
|
|
||||||
std::string cli_hf_repo = params.model.hf_repo;
|
|
||||||
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
|
|
||||||
|
|
||||||
// special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
|
|
||||||
// this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
|
|
||||||
std::string preset_hf_repo = params.model.hf_repo;
|
|
||||||
bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
|
|
||||||
|
|
||||||
if (has_preset) {
|
|
||||||
// re-parse CLI args to override preset values
|
|
||||||
parse_cli_args();
|
|
||||||
}
|
|
||||||
|
|
||||||
// preserve hf_repo from preset if needed
|
|
||||||
if (preset_has_hf_repo) {
|
|
||||||
params.model.hf_repo = preset_hf_repo;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
postprocess_cpu_params(params.cpuparams, nullptr);
|
||||||
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
||||||
|
|
||||||
@ -635,15 +585,21 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|||||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// handle model and download
|
// export_graph_ops loads only metadata
|
||||||
if (!skip_model_download) {
|
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
|
||||||
common_params_handle_models(params, ctx_arg.ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
// model is required (except for server)
|
if (!skip_model_download) {
|
||||||
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
// handle model and download
|
||||||
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
|
common_params_handle_models(params, ctx_arg.ex);
|
||||||
throw std::invalid_argument("error: --model is required\n");
|
|
||||||
|
// model is required (except for server)
|
||||||
|
// TODO @ngxson : maybe show a list of available models in CLI in this case
|
||||||
|
if (params.model.path.empty()
|
||||||
|
&& ctx_arg.ex != LLAMA_EXAMPLE_SERVER
|
||||||
|
&& !params.usage
|
||||||
|
&& !params.completion) {
|
||||||
|
throw std::invalid_argument("error: --model is required\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
|
|||||||
@ -642,10 +642,11 @@ struct common_params {
|
|||||||
std::vector<std::string> server_tools;
|
std::vector<std::string> server_tools;
|
||||||
|
|
||||||
// router server configs
|
// router server configs
|
||||||
std::string models_dir = ""; // directory containing models for the router server
|
std::string models_dir = ""; // directory containing models for the router server
|
||||||
std::string models_preset = ""; // directory containing model presets for the router server
|
std::string models_preset = ""; // directory containing model presets for the router server
|
||||||
int models_max = 4; // maximum number of models to load simultaneously
|
int models_max = 4; // maximum number of models to load simultaneously
|
||||||
bool models_autoload = true; // automatically load models when requested via the router server
|
bool models_autoload = true; // automatically load models when requested via the router server
|
||||||
|
std::string models_preset_hf = ""; // show a warning about remote presets on router loaded (if not empty)
|
||||||
|
|
||||||
bool log_json = false;
|
bool log_json = false;
|
||||||
|
|
||||||
|
|||||||
@ -696,6 +696,7 @@ struct hf_plan {
|
|||||||
hf_cache::hf_files model_files;
|
hf_cache::hf_files model_files;
|
||||||
hf_cache::hf_file mmproj;
|
hf_cache::hf_file mmproj;
|
||||||
hf_cache::hf_file mtp;
|
hf_cache::hf_file mtp;
|
||||||
|
hf_cache::hf_file preset; // if set, only this file is downloaded
|
||||||
};
|
};
|
||||||
|
|
||||||
static hf_plan get_hf_plan(const common_params_model & model,
|
static hf_plan get_hf_plan(const common_params_model & model,
|
||||||
@ -717,6 +718,14 @@ static hf_plan get_hf_plan(const common_params_model & model,
|
|||||||
return plan;
|
return plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if preset.ini exists in the repo root, download only that file
|
||||||
|
for (const auto & f : all) {
|
||||||
|
if (f.path == "preset.ini") {
|
||||||
|
plan.preset = f;
|
||||||
|
return plan;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
hf_cache::hf_file primary;
|
hf_cache::hf_file primary;
|
||||||
|
|
||||||
if (!model.hf_file.empty()) {
|
if (!model.hf_file.empty()) {
|
||||||
@ -794,14 +803,19 @@ common_download_model_result common_download_model(const common_params_model &
|
|||||||
|
|
||||||
if (is_hf) {
|
if (is_hf) {
|
||||||
hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
|
hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
|
||||||
for (const auto & f : hf.model_files) {
|
if (!hf.preset.path.empty()) {
|
||||||
tasks.push_back({f.url, f.local_path});
|
// if preset.ini exists, only download that file alone
|
||||||
}
|
tasks.push_back({hf.preset.url, hf.preset.local_path});
|
||||||
if (!hf.mmproj.path.empty()) {
|
} else {
|
||||||
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
|
for (const auto & f : hf.model_files) {
|
||||||
}
|
tasks.push_back({f.url, f.local_path});
|
||||||
if (!hf.mtp.path.empty()) {
|
}
|
||||||
tasks.push_back({hf.mtp.url, hf.mtp.local_path});
|
if (!hf.mmproj.path.empty()) {
|
||||||
|
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
|
||||||
|
}
|
||||||
|
if (!hf.mtp.path.empty()) {
|
||||||
|
tasks.push_back({hf.mtp.url, hf.mtp.local_path});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if (!model.url.empty()) {
|
} else if (!model.url.empty()) {
|
||||||
tasks = get_url_tasks(model);
|
tasks = get_url_tasks(model);
|
||||||
@ -835,17 +849,22 @@ common_download_model_result common_download_model(const common_params_model &
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (is_hf) {
|
if (is_hf) {
|
||||||
for (const auto & f : hf.model_files) {
|
if (!hf.preset.path.empty()) {
|
||||||
hf_cache::finalize_file(f);
|
// if preset.ini is used, do not set other paths
|
||||||
}
|
result.preset_path = hf_cache::finalize_file(hf.preset);
|
||||||
result.model_path = hf.primary.final_path;
|
} else {
|
||||||
|
for (const auto & f : hf.model_files) {
|
||||||
|
hf_cache::finalize_file(f);
|
||||||
|
}
|
||||||
|
result.model_path = hf.primary.final_path;
|
||||||
|
|
||||||
if (!hf.mmproj.path.empty()) {
|
if (!hf.mmproj.path.empty()) {
|
||||||
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
|
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!hf.mtp.path.empty()) {
|
if (!hf.mtp.path.empty()) {
|
||||||
result.mtp_path = hf_cache::finalize_file(hf.mtp);
|
result.mtp_path = hf_cache::finalize_file(hf.mtp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
result.model_path = model.path;
|
result.model_path = model.path;
|
||||||
|
|||||||
@ -63,6 +63,7 @@ struct common_download_model_result {
|
|||||||
std::string model_path;
|
std::string model_path;
|
||||||
std::string mmproj_path;
|
std::string mmproj_path;
|
||||||
std::string mtp_path;
|
std::string mtp_path;
|
||||||
|
std::string preset_path;
|
||||||
};
|
};
|
||||||
|
|
||||||
// throw if the file is missing or invalid (e.g. ETag check failed)
|
// throw if the file is missing or invalid (e.g. ETag check failed)
|
||||||
|
|||||||
@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) {
|
|||||||
return str.substr(pos);
|
return str.substr(pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
// only allow a subset of args for remote presets for security reasons
|
|
||||||
// do not add more args unless absolutely necessary
|
|
||||||
// args that output to files are strictly prohibited
|
|
||||||
static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
|
|
||||||
static const std::set<std::string> allowed_options = {
|
|
||||||
"model-url",
|
|
||||||
"hf-repo",
|
|
||||||
"hf-repo-draft",
|
|
||||||
"hf-repo-v", // vocoder
|
|
||||||
"hf-file-v", // vocoder
|
|
||||||
"mmproj-url",
|
|
||||||
"pooling",
|
|
||||||
"jinja",
|
|
||||||
"batch-size",
|
|
||||||
"ubatch-size",
|
|
||||||
"cache-reuse",
|
|
||||||
"chat-template-kwargs",
|
|
||||||
"mmap",
|
|
||||||
// note: sampling params are automatically allowed by default
|
|
||||||
// negated args will be added automatically if the positive arg is specified above
|
|
||||||
};
|
|
||||||
|
|
||||||
std::set<std::string> allowed_keys;
|
|
||||||
|
|
||||||
for (const auto & it : key_to_opt) {
|
|
||||||
const std::string & key = it.first;
|
|
||||||
const common_arg & opt = it.second;
|
|
||||||
if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
|
|
||||||
allowed_keys.insert(key);
|
|
||||||
// also add variant keys (args without leading dashes and env vars)
|
|
||||||
for (const auto & arg : opt.get_args()) {
|
|
||||||
allowed_keys.insert(rm_leading_dashes(arg));
|
|
||||||
}
|
|
||||||
for (const auto & env : opt.get_env()) {
|
|
||||||
allowed_keys.insert(env);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return allowed_keys;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
|
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
|
||||||
std::vector<std::string> args;
|
std::vector<std::string> args;
|
||||||
|
|
||||||
@ -300,16 +258,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
|
|||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
|
common_preset_context::common_preset_context(llama_example ex)
|
||||||
: ctx_params(common_params_parser_init(default_params, ex)) {
|
: ctx_params(common_params_parser_init(default_params, ex)) {
|
||||||
common_params_add_preset_options(ctx_params.options);
|
common_params_add_preset_options(ctx_params.options);
|
||||||
key_to_opt = get_map_key_opt(ctx_params);
|
key_to_opt = get_map_key_opt(ctx_params);
|
||||||
|
|
||||||
// setup allowed keys if only_remote_allowed is true
|
|
||||||
if (only_remote_allowed) {
|
|
||||||
filter_allowed_keys = true;
|
|
||||||
allowed_keys = get_remote_preset_whitelist(key_to_opt);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
|
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
|
||||||
|
|||||||
@ -60,7 +60,7 @@ struct common_preset_context {
|
|||||||
std::set<std::string> allowed_keys;
|
std::set<std::string> allowed_keys;
|
||||||
|
|
||||||
// if only_remote_allowed is true, only accept whitelisted keys
|
// if only_remote_allowed is true, only accept whitelisted keys
|
||||||
common_preset_context(llama_example ex, bool only_remote_allowed = false);
|
common_preset_context(llama_example ex);
|
||||||
|
|
||||||
// load presets from INI file
|
// load presets from INI file
|
||||||
common_presets load_from_ini(const std::string & path, common_preset & global) const;
|
common_presets load_from_ini(const std::string & path, common_preset & global) const;
|
||||||
|
|||||||
@ -8,55 +8,53 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla
|
|||||||
|
|
||||||
When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
|
When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
|
||||||
|
|
||||||
### Using a Remote Preset
|
### Using a Hugging Face Preset
|
||||||
|
|
||||||
> [!NOTE]
|
> [!IMPORTANT]
|
||||||
>
|
>
|
||||||
> This feature is currently only supported via the `-hf` option.
|
> Only use presets from sources that you trust! A preset from an unknown source may be unsafe.
|
||||||
|
|
||||||
For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.
|
You can publish your preset on the Hugging Face Hub and share it with other users by:
|
||||||
|
1. Creating an empty model repository on Hugging Face
|
||||||
|
2. Creating a `preset.ini` file in the root directory of the repository
|
||||||
|
|
||||||
Example:
|
Example of a `preset.ini`:
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
hf-repo-draft = username/my-draft-model-GGUF
|
[*]
|
||||||
temp = 0.5
|
ctx-size = 0
|
||||||
top-k = 20
|
mmap = 1
|
||||||
top-p = 0.95
|
kv-unified = 1
|
||||||
|
parallel = 4
|
||||||
|
spec-default = 1
|
||||||
|
|
||||||
|
[Qwen3.5-4B]
|
||||||
|
hf = unsloth/Qwen3.5-4B-GGUF:Q4_K_M
|
||||||
|
ctx-size = 262144
|
||||||
|
batch-size = 2048
|
||||||
|
ubatch-size = 2048
|
||||||
|
top-p = 1.0
|
||||||
|
top-k = 0
|
||||||
|
min-p = 0.01
|
||||||
|
temp = 1.0
|
||||||
|
|
||||||
|
[gpt-oss-120b-hf]
|
||||||
|
hf = ggml-org/gpt-oss-120b-GGUF
|
||||||
|
ctx-size = 262144
|
||||||
|
batch-size = 2048
|
||||||
|
ubatch-size = 2048
|
||||||
|
top-p = 1.0
|
||||||
|
top-k = 0
|
||||||
|
min-p = 0.01
|
||||||
|
temp = 1.0
|
||||||
|
chat-template-kwargs = {"reasoning_effort": "high"}
|
||||||
```
|
```
|
||||||
|
|
||||||
For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
|
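When `-hf` points to a repository that contains a `preset.ini`, only that file is downloaded and the server starts in router mode, exactly as if the file had been passed via `--models-preset`. For that reason `-hf` with a preset repository cannot be combined with `--models-preset`. You can still override individual parameters via CLI arguments: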
|
||||||
|
|
||||||
Example usage:
|
|
||||||
|
|
||||||
Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
llama-cli -hf username/my-model-with-preset
|
|
||||||
|
|
||||||
# This is equivalent to:
|
|
||||||
llama-cli -hf username/my-model-with-preset \
|
|
||||||
--hf-repo-draft username/my-draft-model-GGUF \
|
|
||||||
--temp 0.5 \
|
|
||||||
--top-k 20 \
|
|
||||||
--top-p 0.95
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also override preset arguments by specifying them on the command line:
|
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Force temp = 0.1, overriding the preset value
|
# Force temp = 0.1, overriding the preset value
|
||||||
llama-cli -hf username/my-model-with-preset --temp 0.1
|
llama-server -hf username/my-preset --temp 0.1
|
||||||
```
|
|
||||||
|
|
||||||
If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
|
|
||||||
|
|
||||||
```ini
|
|
||||||
hf-repo = user/my-model-main
|
|
||||||
hf-repo-draft = user/my-model-draft
|
|
||||||
temp = 0.8
|
|
||||||
ctx-size = 1024
|
|
||||||
; (and other configurations)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Named presets
|
### Named presets
|
||||||
|
|||||||
@ -349,6 +349,12 @@ int llama_server(int argc, char ** argv) {
|
|||||||
SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
|
SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
|
||||||
SRV_WRN("%s", "NOTE: router mode is experimental\n");
|
SRV_WRN("%s", "NOTE: router mode is experimental\n");
|
||||||
SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n");
|
SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n");
|
||||||
|
|
||||||
|
if (!params.models_preset_hf.empty()) {
|
||||||
|
SRV_WRN( "NOTE: using preset.ini from HF repo '%s'\n", params.models_preset_hf.c_str());
|
||||||
|
SRV_WRN("%s", " please only use presets that you can trust! Unknown presets may be unsafe\n");
|
||||||
|
}
|
||||||
|
|
||||||
if (ctx_http.thread.joinable()) {
|
if (ctx_http.thread.joinable()) {
|
||||||
ctx_http.thread.join(); // keep the main thread alive
|
ctx_http.thread.join(); // keep the main thread alive
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user