This commit is contained in:
Xuan Son Nguyen 2026-06-23 16:09:09 +02:00
parent 90c111bf98
commit 19296c1735
6 changed files with 142 additions and 88 deletions

View File

@ -64,6 +64,8 @@ static std::string media_type_from_ext(const std::string & fname) {
}
bool cli_context::init() {
view::init(params);
std::optional<view::spinner> spinner;
if (!params.server_base.empty()) {
@ -85,7 +87,7 @@ bool cli_context::init() {
return false;
}
spinner.emplace("Loading model...");
spinner.emplace("\n\nLoading model...");
server.emplace();
if (!server->start(params)) {
@ -281,35 +283,35 @@ int cli_context::run() {
modalities += ", video";
}
std::vector<std::string> banner;
banner.push_back("\n");
banner.push_back(LLAMA_ASCII_LOGO);
banner.push_back("\n");
banner.push_back("build : " + build_info);
banner.push_back("model : " + model_name);
banner.push_back("modalities : " + modalities);
std::string banner;
banner += "\n";
banner += LLAMA_ASCII_LOGO;
banner += "\n";
banner += "build : " + build_info + "\n";
banner += "model : " + model_name + "\n";
banner += "modalities : " + modalities + "\n";
if (!params.system_prompt.empty()) {
console::log("using custom system prompt\n");
banner += "using custom system prompt\n";
}
console::log("\n");
console::log("available commands:\n");
console::log(" /exit or Ctrl+C stop or exit\n");
console::log(" /regen regenerate the last response\n");
console::log(" /clear clear the chat history\n");
console::log(" /read <file> add a text file\n");
console::log(" /glob <pattern> add text files using globbing pattern\n");
banner += "\n";
banner += "available commands:\n";
banner += " /exit or Ctrl+C stop or exit\n";
banner += " /regen regenerate the last response\n";
banner += " /clear clear the chat history\n";
banner += " /read <file> add a text file\n";
banner += " /glob <pattern> add text files using globbing pattern\n";
if (has_vision) {
console::log(" /image <file> add an image file\n");
banner += " /image <file> add an image file\n";
}
if (has_audio) {
console::log(" /audio <file> add an audio file\n");
banner += " /audio <file> add an audio file\n";
}
if (has_video) {
console::log(" /video <file> add a video file\n");
banner += " /video <file> add a video file\n";
}
console::log("\n");
banner += "\n";
view::show_banner(banner);
view::show_message(banner);
// interactive loop
std::string cur_msg;
@ -476,7 +478,11 @@ int cli_context::run() {
});
if (params.show_timings) {
// TODO
view::show_info(string_format(
"\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]",
timings.prompt_per_second,
timings.predicted_per_second
));
}
if (params.single_turn) {
@ -484,7 +490,7 @@ int cli_context::run() {
}
}
view::show_message("Exiting...");
view::show_message("\n\nExiting...");
return 0;
}

View File

@ -1,9 +1,3 @@
// controller for llama-cli (the "controller" in MVC)
//
// owns the chat state, drives the view and talks to llama-server through
// cli_client; when no --server-base is given it also manages a local
// llama-server child process via cli_server
#pragma once
#include "common.h"
@ -20,25 +14,6 @@ struct cli_timings {
double predicted_per_second = 0.0;
};
struct cli_command_info {
std::string usage; // e.g. "/read <file>"
std::string description; // e.g. "add a text file"
};
// properties of the connected server, shown on startup
struct cli_server_info {
std::string build_info;
std::string model_name;
std::string server_base;
bool is_local_server = false; // server is spawned and managed by llama-cli
bool has_system_prompt = false;
bool has_vision = false;
bool has_audio = false;
bool has_video = false;
std::vector<cli_command_info> commands;
};
// set by the SIGINT handler; cleared once the interrupt has been handled
extern std::atomic<bool> g_cli_interrupted;
@ -52,6 +27,7 @@ struct cli_context {
json pending_media = json::array(); // staged multimodal content parts
// properties of the connected server
// will be populated by fetch_server_props()
std::string model_name;
std::string build_info;
bool has_vision = false;

View File

@ -9,18 +9,22 @@
// llama_server will be available as a dynamic library symbol
int llama_server(common_params & params, int argc, char ** argv);
void llama_server_terminate();
struct cli_server {
std::thread th;
int port = -1;
std::atomic<bool> is_alive = false;
std::atomic<bool> is_stopping = false;
// Destructor: delegates to stop().
// NOTE(review): the stop() variant that checks alive() appears to join the
// thread only while is_alive is still true. If the server thread has already
// returned on its own, `th` may still be joinable when it is destroyed, and
// destroying a joinable std::thread calls std::terminate — confirm.
~cli_server() {
stop();
}
void stop() {
if (th.joinable()) {
th.detach();
if (alive() && !is_stopping.exchange(true)) {
llama_server_terminate();
th.join();
}
}
@ -31,12 +35,17 @@ struct cli_server {
exit(1);
}
is_alive.store(true, std::memory_order_release);
th = std::thread([&]() {
common_params server_params = params; // copy
server_params.port = port;
// argc / argv are only used in router mode, we can skip them for now
int res = llama_server(params, 0, nullptr);
int res = llama_server(server_params, 0, nullptr);
if (res != 0) {
fprintf(stderr, "llama_server exited with code %d\n", res);
}
is_alive.store(false, std::memory_order_release);
});
return true;
@ -47,17 +56,30 @@ struct cli_server {
}
bool wait_ready(std::function<bool()> should_stop) {
// while (true) {
// if (should_stop()) {
// break;
// }
// std::this_thread::sleep_for(std::chrono::milliseconds(5000));
// }
std::this_thread::sleep_for(std::chrono::milliseconds(5000));
if (!alive()) {
return false;
}
while (!should_stop()) {
auto [cli, parts] = common_http_client(address());
cli.set_connection_timeout(1, 0);
auto res = cli.Get("/health");
if (res) {
if (res->status == 200) {
return true;
}
// any other status means the server is up but not ready yet
// (e.g. 503 while the model is still loading)
}
if (!alive()) {
// the server has died permanently, so stop waiting
return false;
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
}
return true;
}
bool alive() const {
return th.joinable();
return is_alive.load(std::memory_order_acquire);
}
};

View File

@ -19,7 +19,9 @@ namespace view {
struct spinner {
spinner(const std::string & message) {
console::log("%s\n", message.c_str());
if (!message.empty()) {
console::log("%s ", message.c_str());
}
console::spinner::start();
}
~spinner() {
@ -60,27 +62,49 @@ namespace view {
};
struct assistant_turn {
assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT;
bool trailing_newline = true;
bool is_inside_reasoning = false;
assistant_turn() {
console::set_display(DISPLAY_TYPE_RESET);
}
~assistant_turn() {
console::set_display(DISPLAY_TYPE_RESET);
add_newline_if_needed();
}
void push(assistant_display_mode m, const std::string & buffer) {
if (m != mode) {
add_newline_if_needed();
switch (m) {
case ASSISTANT_DISPLAY_MODE_CONTENT:
console::set_display(DISPLAY_TYPE_RESET);
break;
{
if (is_inside_reasoning) {
console::log("[End thinking]\n\n");
is_inside_reasoning = false;
}
console::set_display(DISPLAY_TYPE_RESET);
} break;
case ASSISTANT_DISPLAY_MODE_REASONING:
console::set_display(DISPLAY_TYPE_REASONING);
break;
{
console::set_display(DISPLAY_TYPE_REASONING);
is_inside_reasoning = true;
console::log("\n[Start thinking]\n\n");
} break;
}
}
mode = m;
if (buffer.empty()) {
return;
}
trailing_newline = buffer.back() == '\n';
console::log("%s", buffer.c_str());
console::flush();
}
void add_newline_if_needed() {
    // Terminate the current output line, but only if the last non-empty
    // buffer written by push() did not already end with '\n'.
    if (trailing_newline) {
        return;
    }
    console::log("\n");
    console::flush();
    // The output now ends with a newline. Record that, so a second call with
    // no intervening non-empty push() (e.g. two consecutive mode switches
    // with empty buffers, or a mode switch followed by the destructor) does
    // not print an extra blank line.
    trailing_newline = true;
}
};
static void show_error(const std::string & title, const std::string & message = "") {
@ -95,9 +119,9 @@ namespace view {
console::log("%s\n", message.c_str());
}
static void show_banner(const std::vector<std::string> & lines) {
for (const auto & line : lines) {
console::log("%s\n", line.c_str());
}
// Print `message` followed by a newline while the console display type is
// DISPLAY_TYPE_INFO, then switch the display back to DISPLAY_TYPE_RESET.
static void show_info(const std::string & message) {
console::set_display(DISPLAY_TYPE_INFO);
console::log("%s\n", message.c_str());
console::set_display(DISPLAY_TYPE_RESET);
}
};
}

View File

@ -3,3 +3,17 @@ int llama_server(int argc, char ** argv);
// Program entry point: forwards the command line unchanged to llama_server()
// and uses its return value as the process exit code.
int main(int argc, char ** argv) {
return llama_server(argc, argv);
}
// forward declaration; satisfies -Wmissing-declarations
void server_signal_handler(int signal);
// Signal handler for the server.
// The first call forwards `signal` to shutdown_handler. A call made after
// is_terminating has already been set prints a message and exits with code 1.
// NOTE(review): shutdown_handler is invoked here without an emptiness check,
// unlike llama_server_terminate() — confirm it is always assigned before this
// handler can run.
void server_signal_handler(int signal) {
// test_and_set() returns the previous value of the flag, so this branch is
// taken only when the flag was already set before this call
if (is_terminating.test_and_set()) {
// in case it hangs, we can force terminate the server by hitting Ctrl+C twice
// this is for better developer experience, we can remove when the server is stable enough
fprintf(stderr, "Received second interrupt, terminating immediately.\n");
exit(1);
}
shutdown_handler(signal);
}

View File

@ -21,12 +21,6 @@
#include <windows.h>
#endif
// satisfies -Wmissing-declarations (used by llama command)
int llama_server(int argc, char ** argv);
// to be used via CLI (argc / argv are used by router mode only)
int llama_server(common_params & params, int argc, char ** argv);
static std::function<void(int)> shutdown_handler;
static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
@ -41,6 +35,19 @@ static inline void signal_handler(int signal) {
shutdown_handler(signal);
}
// satisfies -Wmissing-declarations (used by llama command)
int llama_server(int argc, char ** argv);
// to be used via CLI (argc / argv are used by router mode only)
int llama_server(common_params & params, int argc, char ** argv);
// Ask the running server to shut down by calling shutdown_handler with an
// argument of 0. Does nothing when shutdown_handler holds no callable.
void llama_server_terminate();
void llama_server_terminate() {
// a std::function converts to false when it has no target assigned
if (shutdown_handler) {
shutdown_handler(0);
}
}
// wrapper function that handles exceptions and logs errors
// this is to make sure handler_t never throws exceptions; instead, it returns an error response
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
@ -96,8 +103,10 @@ int llama_server(int argc, char ** argv) {
}
int llama_server(common_params & params, int argc, char ** argv) {
bool is_run_by_cli = (argv == nullptr);
// note: router mode also accepts -hf remote-preset, so we need to check that first
if (!params.model.hf_repo.empty()) {
if (!is_run_by_cli && !params.model.hf_repo.empty()) {
try {
common_params_handle_models_params handle_params;
handle_params.preset_only = true;
@ -279,8 +288,9 @@ int llama_server(common_params & params, int argc, char ** argv) {
if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) {
return child.run_download(params);
} else if (!is_router_server) {
} else if (!is_router_server && !is_run_by_cli) {
// single-model mode (NOT spawned by router)
// if this is invoked by the CLI, model downloading should already be handled
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
}
@ -363,20 +373,22 @@ int llama_server(common_params & params, int argc, char ** argv) {
};
}
// TODO: refactor in common/console
// register signal handlers only when not run by the CLI
if (!is_run_by_cli) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
}
if (is_router_server) {
SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());