From e3a74b299085cd00013804f7fca2e03441b2da20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Tue, 16 Jun 2026 08:26:05 +0200 Subject: [PATCH] bench : add --offline (#24511) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bench : add --offline Signed-off-by: Adrien Gallouët * Add default Signed-off-by: Adrien Gallouët --------- Signed-off-by: Adrien Gallouët --- tools/llama-bench/llama-bench.cpp | 7 +++++++ tools/server/bench/bench.py | 3 +++ 2 files changed, 10 insertions(+) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index a85f86c3ab..55970c0745 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -323,6 +323,7 @@ struct cmd_params { std::vector hf_repo; std::vector hf_file; std::string hf_token; + bool offline; std::vector n_prompt; std::vector n_gen; std::vector> n_pg; @@ -367,6 +368,7 @@ static const cmd_params cmd_params_defaults = { /* hf_repo */ {}, /* hf_file */ {}, /* hf_token */ "", + /* offline */ false, /* n_prompt */ { 512 }, /* n_gen */ { 128 }, /* n_pg */ {}, @@ -437,6 +439,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" (default: unused)\n"); printf(" -hft, --hf-token Hugging Face access token\n"); printf(" (default: value from HF_TOKEN environment variable)\n"); + printf(" --offline Offline mode: forces use of cache, prevents network access\n"); + printf(" (default: disabled)\n"); printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); @@ -558,6 +562,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.hf_token = argv[i]; + } else if (arg == "--offline") { + params.offline = true; } else if (arg == "-p" || arg == "--n-prompt") { if (++i >= argc) { invalid_param = true; @@ -1040,6 +1046,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { common_download_opts opts; opts.bearer_token = params.hf_token; + opts.offline = params.offline; auto download_result = common_download_model(model, opts); if (download_result.model_path.empty()) { fprintf(stderr, "error: failed to download model from HuggingFace\n"); diff --git a/tools/server/bench/bench.py b/tools/server/bench/bench.py index c816816eaf..2c56ab5ebc 100644 --- a/tools/server/bench/bench.py +++ b/tools/server/bench/bench.py @@ -40,6 +40,7 @@ def main(args_in: list[str] | None = None) -> None: required=True) parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True) parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True) + parser.add_argument("--offline", action="store_true", default=False, help="Offline mode: forces use of cache, prevents network access") parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True) parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True) parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True) @@ -268,6 +269,8 @@ def start_server_background(args): ] server_args.extend(['--hf-repo', args.hf_repo]) server_args.extend(['--hf-file', args.hf_file]) + if args.offline: + server_args.append('--offline') server_args.extend(['--n-gpu-layers', args.n_gpu_layers]) server_args.extend(['--ctx-size', args.ctx_size]) server_args.extend(['--parallel', args.parallel])