From 19296c1735147a826d54eb9f53cab658b60449e7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 16:09:09 +0200 Subject: [PATCH] working --- tools/cli/cli-context.cpp | 52 ++++++++++++++++++++++----------------- tools/cli/cli-context.h | 26 +------------------- tools/cli/cli-server.h | 44 ++++++++++++++++++++++++--------- tools/cli/cli-view.h | 44 +++++++++++++++++++++++++-------- tools/server/main.cpp | 14 +++++++++++ tools/server/server.cpp | 50 +++++++++++++++++++++++-------------- 6 files changed, 142 insertions(+), 88 deletions(-) diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp index dfc2c9f6b9..cbfde0c0a3 100644 --- a/tools/cli/cli-context.cpp +++ b/tools/cli/cli-context.cpp @@ -64,6 +64,8 @@ static std::string media_type_from_ext(const std::string & fname) { } bool cli_context::init() { + view::init(params); + std::optional spinner; if (!params.server_base.empty()) { @@ -85,7 +87,7 @@ bool cli_context::init() { return false; } - spinner.emplace("Loading model..."); + spinner.emplace("\n\nLoading model..."); server.emplace(); if (!server->start(params)) { @@ -281,35 +283,35 @@ int cli_context::run() { modalities += ", video"; } - std::vector banner; - banner.push_back("\n"); - banner.push_back(LLAMA_ASCII_LOGO); - banner.push_back("\n"); - banner.push_back("build : " + build_info); - banner.push_back("model : " + model_name); - banner.push_back("modalities : " + modalities); + std::string banner; + banner += "\n"; + banner += LLAMA_ASCII_LOGO; + banner += "\n"; + banner += "build : " + build_info + "\n"; + banner += "model : " + model_name + "\n"; + banner += "modalities : " + modalities + "\n"; if (!params.system_prompt.empty()) { - console::log("using custom system prompt\n"); + banner += "using custom system prompt\n"; } - console::log("\n"); - console::log("available commands:\n"); - console::log(" /exit or Ctrl+C stop or exit\n"); - console::log(" /regen regenerate the last response\n"); - console::log(" /clear clear 
the chat history\n"); - console::log(" /read add a text file\n"); - console::log(" /glob add text files using globbing pattern\n"); + banner += "\n"; + banner += "available commands:\n"; + banner += " /exit or Ctrl+C stop or exit\n"; + banner += " /regen regenerate the last response\n"; + banner += " /clear clear the chat history\n"; + banner += " /read add a text file\n"; + banner += " /glob add text files using globbing pattern\n"; if (has_vision) { - console::log(" /image add an image file\n"); + banner += " /image add an image file\n"; } if (has_audio) { - console::log(" /audio add an audio file\n"); + banner += " /audio add an audio file\n"; } if (has_video) { - console::log(" /video add a video file\n"); + banner += " /video add a video file\n"; } - console::log("\n"); + banner += "\n"; - view::show_banner(banner); + view::show_message(banner); // interactive loop std::string cur_msg; @@ -476,7 +478,11 @@ int cli_context::run() { }); if (params.show_timings) { - // TODO + view::show_info(string_format( + "\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]", + timings.prompt_per_second, + timings.predicted_per_second + )); } if (params.single_turn) { @@ -484,7 +490,7 @@ int cli_context::run() { } } - view::show_message("Exiting..."); + view::show_message("\n\nExiting..."); return 0; } diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h index cbf6729d6d..2c67586d63 100644 --- a/tools/cli/cli-context.h +++ b/tools/cli/cli-context.h @@ -1,9 +1,3 @@ -// controller for llama-cli (the "controller" in MVC) -// -// owns the chat state, drives the view and talks to llama-server through -// cli_client; when no --server-base is given it also manages a local -// llama-server child process via cli_server - #pragma once #include "common.h" @@ -20,25 +14,6 @@ struct cli_timings { double predicted_per_second = 0.0; }; -struct cli_command_info { - std::string usage; // e.g. "/read " - std::string description; // e.g. 
"add a text file" -}; - -// properties of the connected server, shown on startup -struct cli_server_info { - std::string build_info; - std::string model_name; - std::string server_base; - bool is_local_server = false; // server is spawned and managed by llama-cli - bool has_system_prompt = false; - bool has_vision = false; - bool has_audio = false; - bool has_video = false; - - std::vector commands; -}; - // set by the SIGINT handler; cleared once the interrupt has been handled extern std::atomic g_cli_interrupted; @@ -52,6 +27,7 @@ struct cli_context { json pending_media = json::array(); // staged multimodal content parts // properties of the connected server + // will be populated by fetch_server_props() std::string model_name; std::string build_info; bool has_vision = false; diff --git a/tools/cli/cli-server.h b/tools/cli/cli-server.h index 41f860af86..50447f2551 100644 --- a/tools/cli/cli-server.h +++ b/tools/cli/cli-server.h @@ -9,18 +9,22 @@ // llama_server will be available as a dynamic library symbol int llama_server(common_params & params, int argc, char ** argv); +void llama_server_terminate(); struct cli_server { std::thread th; int port = -1; + std::atomic is_alive = false; + std::atomic is_stopping = false; ~cli_server() { stop(); } void stop() { - if (th.joinable()) { - th.detach(); + if (alive() && !is_stopping.exchange(true)) { + llama_server_terminate(); + th.join(); } } @@ -31,12 +35,17 @@ struct cli_server { exit(1); } + is_alive.store(true, std::memory_order_release); + th = std::thread([&]() { + common_params server_params = params; // copy + server_params.port = port; // argc / argv are only used in router mode, we can skip them for now - int res = llama_server(params, 0, nullptr); + int res = llama_server(server_params, 0, nullptr); if (res != 0) { fprintf(stderr, "llama_server exited with code %d\n", res); } + is_alive.store(false, std::memory_order_release); }); return true; @@ -47,17 +56,30 @@ struct cli_server { } bool 
wait_ready(std::function should_stop) { - // while (true) { - // if (should_stop()) { - // break; - // } - // std::this_thread::sleep_for(std::chrono::milliseconds(5000)); - // } - std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + if (!alive()) { + return false; + } + while (!should_stop()) { + auto [cli, parts] = common_http_client(address()); + cli.set_connection_timeout(1, 0); + auto res = cli.Get("/health"); + if (res) { + if (res->status == 200) { + return true; + } + // any other status means the server is up but not ready yet + // (e.g. 503 while the model is still loading) + } + if (!alive()) { + // in case server die permanently + return false; + } + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } return true; } bool alive() const { - return th.joinable(); + return is_alive.load(std::memory_order_acquire); } }; diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h index 6168f6ade3..a44a0ba240 100644 --- a/tools/cli/cli-view.h +++ b/tools/cli/cli-view.h @@ -19,7 +19,9 @@ namespace view { struct spinner { spinner(const std::string & message) { - console::log("%s\n", message.c_str()); + if (!message.empty()) { + console::log("%s ", message.c_str()); + } console::spinner::start(); } ~spinner() { @@ -60,27 +62,49 @@ namespace view { }; struct assistant_turn { assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT; + bool trailing_newline = true; + bool is_inside_reasoning = false; assistant_turn() { console::set_display(DISPLAY_TYPE_RESET); } ~assistant_turn() { console::set_display(DISPLAY_TYPE_RESET); + add_newline_if_needed(); } void push(assistant_display_mode m, const std::string & buffer) { if (m != mode) { + add_newline_if_needed(); switch (m) { case ASSISTANT_DISPLAY_MODE_CONTENT: - console::set_display(DISPLAY_TYPE_RESET); - break; + { + if (is_inside_reasoning) { + console::log("[End thinking]\n\n"); + is_inside_reasoning = false; + } + console::set_display(DISPLAY_TYPE_RESET); + } break; case 
ASSISTANT_DISPLAY_MODE_REASONING: - console::set_display(DISPLAY_TYPE_REASONING); - break; + { + console::set_display(DISPLAY_TYPE_REASONING); + is_inside_reasoning = true; + console::log("\n[Start thinking]\n\n"); + } break; } } mode = m; + if (buffer.empty()) { + return; + } + trailing_newline = buffer.back() == '\n'; console::log("%s", buffer.c_str()); console::flush(); } + void add_newline_if_needed() { + if (!trailing_newline) { + console::log("\n"); + console::flush(); + } + } }; static void show_error(const std::string & title, const std::string & message = "") { @@ -95,9 +119,9 @@ namespace view { console::log("%s\n", message.c_str()); } - static void show_banner(const std::vector & lines) { - for (const auto & line : lines) { - console::log("%s\n", line.c_str()); - } + static void show_info(const std::string & message) { + console::set_display(DISPLAY_TYPE_INFO); + console::log("%s\n", message.c_str()); + console::set_display(DISPLAY_TYPE_RESET); } -}; +} diff --git a/tools/server/main.cpp b/tools/server/main.cpp index 7f17c56a8c..b8d14e3111 100644 --- a/tools/server/main.cpp +++ b/tools/server/main.cpp @@ -3,3 +3,17 @@ int llama_server(int argc, char ** argv); int main(int argc, char ** argv) { return llama_server(argc, argv); } + +// satisfies -Wmissing-declarations +void server_signal_handler(int signal); + +void server_signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice + // this is for better developer experience, we can remove when the server is stable enough + fprintf(stderr, "Received second interrupt, terminating immediately.\n"); + exit(1); + } + + shutdown_handler(signal); +} diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a101df655d..3b55c5f4be 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -21,12 +21,6 @@ #include #endif -// satisfies -Wmissing-declarations (used by llama command) -int llama_server(int 
argc, char ** argv); - -// to be used via CLI (argc / argv are used by router mode only) -int llama_server(common_params & params, int argc, char ** argv); - static std::function shutdown_handler; static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; @@ -41,6 +35,19 @@ static inline void signal_handler(int signal) { shutdown_handler(signal); } +// satisfies -Wmissing-declarations (used by llama command) +int llama_server(int argc, char ** argv); + +// to be used via CLI (argc / argv are used by router mode only) +int llama_server(common_params & params, int argc, char ** argv); +void llama_server_terminate(); +void llama_server_terminate() { + if (shutdown_handler) { + shutdown_handler(0); + } +} + + // wrapper function that handles exceptions and logs errors // this is to make sure handler_t never throws exceptions; instead, it returns an error response static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { @@ -96,8 +103,10 @@ int llama_server(int argc, char ** argv) { } int llama_server(common_params & params, int argc, char ** argv) { + bool is_run_by_cli = (argv == nullptr); + // note: router mode also accepts -hf remote-preset, so we need to check that first - if (!params.model.hf_repo.empty()) { + if (!is_run_by_cli && !params.model.hf_repo.empty()) { try { common_params_handle_models_params handle_params; handle_params.preset_only = true; @@ -279,8 +288,9 @@ int llama_server(common_params & params, int argc, char ** argv) { if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) { return child.run_download(params); - } else if (!is_router_server) { + } else if (!is_router_server && !is_run_by_cli) { // single-model mode (NOT spawned by router) + // if this is invoked by CLI, model downloading should already handled common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {}); } @@ -363,20 +373,22 @@ int llama_server(common_params & params, int argc, char ** argv) { }; } - // TODO: refactor in 
common/console + // register signal handlers only if not run by the CLI + if (!is_run_by_cli) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = signal_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); - sigaction(SIGTERM, &sigint_action, NULL); + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); #elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + } if (is_router_server) { SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());