mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
server: add --agent arg, remove redundant webui naming compat (#24801)
* server: add --agent arg, remove redundant webui naming compat * correct env * fix the test * llama-gen-docs * nits: wordings
This commit is contained in:
parent
38724ab593
commit
8c2d6f6475
@ -2830,62 +2830,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
params.api_prefix = value;
|
params.api_prefix = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
||||||
// Deprecated: use --ui-config instead (kept for backward compat)
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--webui-config"}, "JSON",
|
{"--ui-config", "--webui-config"}, "JSON",
|
||||||
"[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
params.ui_config_json = value;
|
|
||||||
params.webui_config_json = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
|
|
||||||
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--ui-config"}, "JSON",
|
|
||||||
"JSON that provides default UI settings (overrides UI defaults)",
|
"JSON that provides default UI settings (overrides UI defaults)",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.ui_config_json = value;
|
params.ui_config_json = value;
|
||||||
params.webui_config_json = value;
|
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
|
||||||
|
|
||||||
// Deprecated: use --ui-config-file instead (kept for backward compat)
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--webui-config-file"}, "PATH",
|
{"--ui-config-file", "--webui-config-file"}, "PATH",
|
||||||
"[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
|
|
||||||
[](common_params & params, const std::string & value) {
|
|
||||||
params.ui_config_json = read_file(value);
|
|
||||||
params.webui_config_json = params.ui_config_json;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
|
|
||||||
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--ui-config-file"}, "PATH",
|
|
||||||
"JSON file that provides default UI settings (overrides UI defaults)",
|
"JSON file that provides default UI settings (overrides UI defaults)",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.ui_config_json = read_file(value);
|
params.ui_config_json = read_file(value);
|
||||||
params.webui_config_json = params.ui_config_json;
|
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
|
||||||
|
|
||||||
// Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--webui-mcp-proxy"},
|
{"--ui-mcp-proxy", "--webui-mcp-proxy"},
|
||||||
{"--no-webui-mcp-proxy"},
|
{"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
|
||||||
"[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
|
|
||||||
[](common_params & params, bool value) {
|
|
||||||
params.ui_mcp_proxy = value;
|
|
||||||
params.webui_mcp_proxy = value;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
|
|
||||||
|
|
||||||
add_opt(common_arg(
|
|
||||||
{"--ui-mcp-proxy"},
|
|
||||||
{"--no-ui-mcp-proxy"},
|
|
||||||
"experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
|
"experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
|
||||||
[](common_params & params, bool value) {
|
[](common_params & params, bool value) {
|
||||||
params.ui_mcp_proxy = value;
|
params.ui_mcp_proxy = value;
|
||||||
params.webui_mcp_proxy = value;
|
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
@ -2897,24 +2861,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
params.server_tools = parse_csv_row(value);
|
params.server_tools = parse_csv_row(value);
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
|
||||||
// Deprecated: use --ui/--no-ui instead (kept for backward compat)
|
add_opt(common_arg(
|
||||||
add_opt(common_arg(
|
{"-ag", "--agent"},
|
||||||
{"--webui"},
|
{"-no-ag", "--no-agent"},
|
||||||
{"--no-webui"},
|
"whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
|
||||||
"[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
|
|
||||||
[](common_params & params, bool value) {
|
[](common_params & params, bool value) {
|
||||||
params.ui = value;
|
if (value) {
|
||||||
params.webui = value;
|
params.server_tools = {"all"};
|
||||||
|
params.ui_mcp_proxy = true;
|
||||||
|
} else {
|
||||||
|
params.server_tools.clear();
|
||||||
|
params.ui_mcp_proxy = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
|
||||||
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--ui"},
|
{"--ui", "--webui"},
|
||||||
{"--no-ui"},
|
{"--no-ui", "--no-webui"},
|
||||||
string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
|
string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
|
||||||
[](common_params & params, bool value) {
|
[](common_params & params, bool value) {
|
||||||
params.ui = value;
|
params.ui = value;
|
||||||
params.webui = value;
|
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
|||||||
@ -624,12 +624,6 @@ struct common_params {
|
|||||||
|
|
||||||
// UI configs
|
// UI configs
|
||||||
bool ui = true;
|
bool ui = true;
|
||||||
|
|
||||||
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
|
|
||||||
bool webui = ui;
|
|
||||||
bool webui_mcp_proxy = false;
|
|
||||||
std::string webui_config_json;
|
|
||||||
|
|
||||||
bool ui_mcp_proxy = false;
|
bool ui_mcp_proxy = false;
|
||||||
std::string ui_config_json;
|
std::string ui_config_json;
|
||||||
|
|
||||||
|
|||||||
@ -161,7 +161,7 @@
|
|||||||
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||||
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
|
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
|
||||||
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
|
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
|
||||||
| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
|
| `--image, --audio, --video FILE` | path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files |
|
||||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
||||||
@ -174,6 +174,7 @@
|
|||||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||||
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
||||||
|
| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
|
||||||
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
|
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
|
||||||
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
||||||
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
||||||
|
|||||||
@ -175,13 +175,12 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||||||
| `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
| `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||||
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||||
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
||||||
| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint<br/>(env: LLAMA_ARG_TALKER_MODEL) |
|
|
||||||
| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer<br/>(env: LLAMA_ARG_CODE2WAV_MODEL) |
|
|
||||||
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||||
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
|
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
|
||||||
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
|
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
|
||||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||||
|
| `--mtmd-batch-max-tokens N` | maximum number of image tokens per batch when encoding images (default: 1024)<br/>(env: LLAMA_ARG_MTMD_BATCH_MAX_TOKENS) |
|
||||||
| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
|
| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||||
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
|
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
|
||||||
| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) |
|
| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) |
|
||||||
@ -190,15 +189,12 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||||||
| `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
|
| `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
|
||||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||||
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
||||||
| `--webui-config JSON` | [DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
|
| `--ui-config, --webui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
|
||||||
| `--ui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
|
| `--ui-config-file, --webui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
|
||||||
| `--webui-config-file PATH` | [DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
|
| `--ui-mcp-proxy, --webui-mcp-proxy, --no-ui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
|
||||||
| `--ui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
|
|
||||||
| `--webui-mcp-proxy, --no-webui-mcp-proxy` | [DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy<br/>(env: LLAMA_ARG_WEBUI_MCP_PROXY) |
|
|
||||||
| `--ui-mcp-proxy, --no-ui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
|
|
||||||
| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime<br/>(env: LLAMA_ARG_TOOLS) |
|
| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime<br/>(env: LLAMA_ARG_TOOLS) |
|
||||||
| `--webui, --no-webui` | [DEPRECATED: use --ui/--no-ui] whether to enable the Web UI<br/>(env: LLAMA_ARG_WEBUI) |
|
| `-ag, --agent, -no-ag, --no-agent` | whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_AGENT) |
|
||||||
| `--ui, --no-ui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
|
| `--ui, --webui, --no-ui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
|
||||||
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||||
| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
|
| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
|
||||||
| `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
|
| `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
|
||||||
@ -207,6 +203,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||||||
| `--ssl-cert-file FNAME` | path to a PEM-encoded SSL certificate file<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
|
| `--ssl-cert-file FNAME` | path to a PEM-encoded SSL certificate file<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
|
||||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
||||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 3600)<br/>(env: LLAMA_ARG_TIMEOUT) |
|
| `-to, --timeout N` | server read/write timeout in seconds (default: 3600)<br/>(env: LLAMA_ARG_TIMEOUT) |
|
||||||
|
| `--sse-ping-interval N` | server SSE ping interval in seconds (-1 = disabled, default: 30)<br/>(env: LLAMA_ARG_SSE_PING_INTERVAL) |
|
||||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||||
| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
|
| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
|
||||||
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
|
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
|
||||||
@ -231,6 +228,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
|
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
|
||||||
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
||||||
| `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
|
| `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
|
||||||
|
| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
|
||||||
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
|
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
|
||||||
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
||||||
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
||||||
|
|||||||
@ -1302,11 +1302,8 @@ private:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// populate UI settings (from either new ui_config_json or deprecated webui_config_json)
|
|
||||||
{
|
{
|
||||||
const std::string & cfg = !params_base.ui_config_json.empty()
|
const std::string & cfg = params_base.ui_config_json;
|
||||||
? params_base.ui_config_json
|
|
||||||
: params_base.webui_config_json;
|
|
||||||
if (!cfg.empty()) {
|
if (!cfg.empty()) {
|
||||||
try {
|
try {
|
||||||
json json_settings = json::parse(cfg);
|
json json_settings = json::parse(cfg);
|
||||||
@ -4304,18 +4301,18 @@ void server_routes::init_routes() {
|
|||||||
{ "endpoint_props", params.endpoint_props },
|
{ "endpoint_props", params.endpoint_props },
|
||||||
{ "endpoint_metrics", params.endpoint_metrics },
|
{ "endpoint_metrics", params.endpoint_metrics },
|
||||||
// New keys
|
// New keys
|
||||||
{ "ui", params.ui },
|
{ "ui", params.ui },
|
||||||
{ "ui_settings", meta->json_ui_settings },
|
{ "ui_settings", meta->json_ui_settings },
|
||||||
// Deprecated: use ui/ui_settings instead (kept for backward compat)
|
// Deprecated: use ui/ui_settings instead (kept for backward compat)
|
||||||
{ "webui", params.webui },
|
{ "webui", params.ui },
|
||||||
{ "webui_settings", meta->json_webui_settings },
|
{ "webui_settings", meta->json_ui_settings },
|
||||||
{ "chat_template", tmpl_default },
|
{ "chat_template", tmpl_default },
|
||||||
{ "chat_template_caps", meta->chat_template_caps },
|
{ "chat_template_caps", meta->chat_template_caps },
|
||||||
{ "bos_token", meta->bos_token_str },
|
{ "bos_token", meta->bos_token_str },
|
||||||
{ "eos_token", meta->eos_token_str },
|
{ "eos_token", meta->eos_token_str },
|
||||||
{ "build_info", meta->build_info },
|
{ "build_info", meta->build_info },
|
||||||
{ "is_sleeping", queue_tasks.is_sleeping() },
|
{ "is_sleeping", queue_tasks.is_sleeping() },
|
||||||
{ "cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy },
|
{ "cors_proxy_enabled", params.ui_mcp_proxy },
|
||||||
};
|
};
|
||||||
if (params.use_jinja) {
|
if (params.use_jinja) {
|
||||||
if (!tmpl_tools.empty()) {
|
if (!tmpl_tools.empty()) {
|
||||||
|
|||||||
@ -1462,9 +1462,9 @@ void server_models_routes::init_routes() {
|
|||||||
auto res = std::make_unique<server_http_res>();
|
auto res = std::make_unique<server_http_res>();
|
||||||
res_ok(res, {
|
res_ok(res, {
|
||||||
// TODO: add support for this on web UI
|
// TODO: add support for this on web UI
|
||||||
{"role", "router"},
|
{"role", "router"},
|
||||||
{"max_instances", params.models_max},
|
{"max_instances", params.models_max},
|
||||||
{"models_autoload", params.models_autoload},
|
{"models_autoload", params.models_autoload},
|
||||||
// this is a dummy response to make sure the UI doesn't break
|
// this is a dummy response to make sure the UI doesn't break
|
||||||
{"model_alias", "llama-server"},
|
{"model_alias", "llama-server"},
|
||||||
{"model_path", "none"},
|
{"model_path", "none"},
|
||||||
@ -1473,11 +1473,10 @@ void server_models_routes::init_routes() {
|
|||||||
{"n_ctx", 0},
|
{"n_ctx", 0},
|
||||||
}},
|
}},
|
||||||
// New key
|
// New key
|
||||||
{"ui_settings", ui_settings},
|
{"ui_settings", ui_settings},
|
||||||
// Deprecated: use ui_settings instead (kept for backward compat)
|
{"webui_settings", webui_settings},
|
||||||
{"webui_settings", webui_settings},
|
{"build_info", std::string(llama_build_info())},
|
||||||
{"build_info", std::string(llama_build_info())},
|
{"cors_proxy_enabled", params.ui_mcp_proxy},
|
||||||
{"cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy},
|
|
||||||
});
|
});
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -212,10 +212,7 @@ struct server_models_routes {
|
|||||||
server_models models;
|
server_models models;
|
||||||
server_models_routes(const common_params & params, int argc, char ** argv)
|
server_models_routes(const common_params & params, int argc, char ** argv)
|
||||||
: params(params), models(params, argc, argv) {
|
: params(params), models(params, argc, argv) {
|
||||||
// Support both new ui_config_json and deprecated webui_config_json
|
const std::string & cfg = this->params.ui_config_json;
|
||||||
const std::string & cfg = !this->params.ui_config_json.empty()
|
|
||||||
? this->params.ui_config_json
|
|
||||||
: this->params.webui_config_json;
|
|
||||||
if (!cfg.empty()) {
|
if (!cfg.empty()) {
|
||||||
try {
|
try {
|
||||||
json json_settings = json::parse(cfg);
|
json json_settings = json::parse(cfg);
|
||||||
|
|||||||
@ -227,8 +227,7 @@ int llama_server(int argc, char ** argv) {
|
|||||||
ctx_http.register_gcp_compat();
|
ctx_http.register_gcp_compat();
|
||||||
|
|
||||||
// CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
|
// CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
|
||||||
// Supports both new ui_mcp_proxy and deprecated webui_mcp_proxy fields
|
if (params.ui_mcp_proxy) {
|
||||||
if (params.ui_mcp_proxy || params.webui_mcp_proxy) {
|
|
||||||
SRV_WRN("%s", "-----------------\n");
|
SRV_WRN("%s", "-----------------\n");
|
||||||
SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n");
|
SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n");
|
||||||
SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n");
|
SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n");
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user