working

2026-06-27 23:50:20 -05:00 · 2026-06-23 16:09:09 +02:00 · 2026-06-23 16:09:09 +02:00 · 19296c1735
commit 19296c1735
parent 90c111bf98
6 changed files with 142 additions and 88 deletions
--- a/tools/cli/cli-context.cpp
+++ b/tools/cli/cli-context.cpp
@ -64,6 +64,8 @@ static std::string media_type_from_ext(const std::string & fname) {
 }

 bool cli_context::init() {
+    view::init(params);
+
    std::optional<view::spinner> spinner;

    if (!params.server_base.empty()) {
@ -85,7 +87,7 @@ bool cli_context::init() {
            return false;
        }

-        spinner.emplace("Loading model...");
+        spinner.emplace("\n\nLoading model...");

        server.emplace();
        if (!server->start(params)) {
@ -281,35 +283,35 @@ int cli_context::run() {
        modalities += ", video";
    }

-    std::vector<std::string> banner;
-    banner.push_back("\n");
-    banner.push_back(LLAMA_ASCII_LOGO);
-    banner.push_back("\n");
-    banner.push_back("build      : " + build_info);
-    banner.push_back("model      : " + model_name);
-    banner.push_back("modalities : " + modalities);
+    std::string banner;
+    banner += "\n";
+    banner += LLAMA_ASCII_LOGO;
+    banner += "\n";
+    banner += "build      : " + build_info + "\n";
+    banner += "model      : " + model_name + "\n";
+    banner += "modalities : " + modalities + "\n";
    if (!params.system_prompt.empty()) {
-        console::log("using custom system prompt\n");
+        banner += "using custom system prompt\n";
    }
-    console::log("\n");
-    console::log("available commands:\n");
-    console::log("  /exit or Ctrl+C     stop or exit\n");
-    console::log("  /regen              regenerate the last response\n");
-    console::log("  /clear              clear the chat history\n");
-    console::log("  /read <file>        add a text file\n");
-    console::log("  /glob <pattern>     add text files using globbing pattern\n");
+    banner += "\n";
+    banner += "available commands:\n";
+    banner += "  /exit or Ctrl+C     stop or exit\n";
+    banner += "  /regen              regenerate the last response\n";
+    banner += "  /clear              clear the chat history\n";
+    banner += "  /read <file>        add a text file\n";
+    banner += "  /glob <pattern>     add text files using globbing pattern\n";
    if (has_vision) {
-        console::log("  /image <file>       add an image file\n");
+        banner += "  /image <file>       add an image file\n";
    }
    if (has_audio) {
-        console::log("  /audio <file>       add an audio file\n");
+        banner += "  /audio <file>       add an audio file\n";
    }
    if (has_video) {
-        console::log("  /video <file>       add a video file\n");
+        banner += "  /video <file>       add a video file\n";
    }
-    console::log("\n");
+    banner += "\n";

-    view::show_banner(banner);
+    view::show_message(banner);

    // interactive loop
    std::string cur_msg;
@ -476,7 +478,11 @@ int cli_context::run() {
        });

        if (params.show_timings) {
-            // TODO
+            view::show_info(string_format(
+                "\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]",
+                timings.prompt_per_second,
+                timings.predicted_per_second
+            ));
        }

        if (params.single_turn) {
@ -484,7 +490,7 @@ int cli_context::run() {
        }
    }

-    view::show_message("Exiting...");
+    view::show_message("\n\nExiting...");

    return 0;
 }
--- a/tools/cli/cli-context.h
+++ b/tools/cli/cli-context.h
@ -1,9 +1,3 @@
-// controller for llama-cli (the "controller" in MVC)
-//
-// owns the chat state, drives the view and talks to llama-server through
-// cli_client; when no --server-base is given it also manages a local
-// llama-server child process via cli_server
-
 #pragma once

 #include "common.h"
@ -20,25 +14,6 @@ struct cli_timings {
    double predicted_per_second = 0.0;
 };

-struct cli_command_info {
-    std::string usage;       // e.g. "/read <file>"
-    std::string description; // e.g. "add a text file"
-};
-
-// properties of the connected server, shown on startup
-struct cli_server_info {
-    std::string build_info;
-    std::string model_name;
-    std::string server_base;
-    bool is_local_server   = false; // server is spawned and managed by llama-cli
-    bool has_system_prompt = false;
-    bool has_vision        = false;
-    bool has_audio         = false;
-    bool has_video         = false;
-
-    std::vector<cli_command_info> commands;
-};
-
 // set by the SIGINT handler; cleared once the interrupt has been handled
 extern std::atomic<bool> g_cli_interrupted;

@ -52,6 +27,7 @@ struct cli_context {
    json pending_media = json::array(); // staged multimodal content parts

    // properties of the connected server
+    // will be populated by fetch_server_props()
    std::string model_name;
    std::string build_info;
    bool has_vision = false;
--- a/tools/cli/cli-server.h
+++ b/tools/cli/cli-server.h
@ -9,18 +9,22 @@

 // llama_server will be available as a dynamic library symbol
 int llama_server(common_params & params, int argc, char ** argv);
+void llama_server_terminate();

 struct cli_server {
    std::thread th;
    int port = -1;
+    std::atomic<bool> is_alive = false;
+    std::atomic<bool> is_stopping = false;

    ~cli_server() {
        stop();
    }

    void stop() {
-        if (th.joinable()) {
-            th.detach();
+        if (alive() && !is_stopping.exchange(true)) {
+            llama_server_terminate();
+            th.join();
        }
    }

@ -31,12 +35,17 @@ struct cli_server {
            exit(1);
        }

+        is_alive.store(true, std::memory_order_release);
+
        th = std::thread([&]() {
+            common_params server_params = params; // copy
+            server_params.port = port;
            // argc / argv are only used in router mode, we can skip them for now
-            int res = llama_server(params, 0, nullptr);
+            int res = llama_server(server_params, 0, nullptr);
            if (res != 0) {
                fprintf(stderr, "llama_server exited with code %d\n", res);
            }
+            is_alive.store(false, std::memory_order_release);
        });

        return true;
@ -47,17 +56,30 @@ struct cli_server {
    }

    bool wait_ready(std::function<bool()> should_stop) {
-        // while (true) {
-        //     if (should_stop()) {
-        //         break;
-        //     }
-        //     std::this_thread::sleep_for(std::chrono::milliseconds(5000));
-        // }
-        std::this_thread::sleep_for(std::chrono::milliseconds(5000));
+        if (!alive()) {
+            return false;
+        }
+        while (!should_stop()) {
+            auto [cli, parts] = common_http_client(address());
+            cli.set_connection_timeout(1, 0);
+            auto res = cli.Get("/health");
+            if (res) {
+                if (res->status == 200) {
+                    return true;
+                }
+                // any other status means the server is up but not ready yet
+                // (e.g. 503 while the model is still loading)
+            }
+            if (!alive()) {
+                // the server thread has exited for good; stop waiting
+                return false;
+            }
+            std::this_thread::sleep_for(std::chrono::milliseconds(200));
+        }
        return true;
    }

    bool alive() const {
-        return th.joinable();
+        return is_alive.load(std::memory_order_acquire);
    }
 };
--- a/tools/cli/cli-view.h
+++ b/tools/cli/cli-view.h
@ -19,7 +19,9 @@ namespace view {

    struct spinner {
        spinner(const std::string & message) {
-            console::log("%s\n", message.c_str());
+            if (!message.empty()) {
+                console::log("%s ", message.c_str());
+            }
            console::spinner::start();
        }
        ~spinner() {
@ -60,27 +62,49 @@ namespace view {
    };
    struct assistant_turn {
        assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT;
+        bool trailing_newline = true;
+        bool is_inside_reasoning = false;
        assistant_turn() {
            console::set_display(DISPLAY_TYPE_RESET);
        }
        ~assistant_turn() {
            console::set_display(DISPLAY_TYPE_RESET);
+            add_newline_if_needed();
        }
        void push(assistant_display_mode m, const std::string & buffer) {
            if (m != mode) {
+                add_newline_if_needed();
                switch (m) {
                    case ASSISTANT_DISPLAY_MODE_CONTENT:
-                        console::set_display(DISPLAY_TYPE_RESET);
-                        break;
+                        {
+                            if (is_inside_reasoning) {
+                                console::log("[End thinking]\n\n");
+                                is_inside_reasoning = false;
+                            }
+                            console::set_display(DISPLAY_TYPE_RESET);
+                        } break;
                    case ASSISTANT_DISPLAY_MODE_REASONING:
-                        console::set_display(DISPLAY_TYPE_REASONING);
-                        break;
+                        {
+                            console::set_display(DISPLAY_TYPE_REASONING);
+                            is_inside_reasoning = true;
+                            console::log("\n[Start thinking]\n\n");
+                        } break;
                }
            }
            mode = m;
+            if (buffer.empty()) {
+                return;
+            }
+            trailing_newline = buffer.back() == '\n';
            console::log("%s", buffer.c_str());
            console::flush();
        }
+        void add_newline_if_needed() {
+            if (!trailing_newline) {
+                console::log("\n");
+                console::flush();
+            }
+        }
    };

    static void show_error(const std::string & title, const std::string & message = "") {
@ -95,9 +119,9 @@ namespace view {
        console::log("%s\n", message.c_str());
    }

-    static void show_banner(const std::vector<std::string> & lines) {
-        for (const auto & line : lines) {
-            console::log("%s\n", line.c_str());
-        }
+    static void show_info(const std::string & message) {
+        console::set_display(DISPLAY_TYPE_INFO);
+        console::log("%s\n", message.c_str());
+        console::set_display(DISPLAY_TYPE_RESET);
    }
-};
+}
--- a/tools/server/main.cpp
+++ b/tools/server/main.cpp
@ -3,3 +3,17 @@ int llama_server(int argc, char ** argv);
 int main(int argc, char ** argv) {
    return llama_server(argc, argv);
 }
+
+// satisfies -Wmissing-declarations
+void server_signal_handler(int signal);
+
+void server_signal_handler(int signal) {
+    if (is_terminating.test_and_set()) {
+        // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
+        // this is for better developer experience, we can remove when the server is stable enough
+        fprintf(stderr, "Received second interrupt, terminating immediately.\n");
+        exit(1);
+    }
+
+    shutdown_handler(signal);
+}
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -21,12 +21,6 @@
 #include <windows.h>
 #endif

-// satisfies -Wmissing-declarations (used by llama command)
-int llama_server(int argc, char ** argv);
-
-// to be used via CLI (argc / argv are used by router mode only)
-int llama_server(common_params & params, int argc, char ** argv);
-
 static std::function<void(int)> shutdown_handler;
 static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;

@ -41,6 +35,19 @@ static inline void signal_handler(int signal) {
    shutdown_handler(signal);
 }

+// satisfies -Wmissing-declarations (used by llama command)
+int llama_server(int argc, char ** argv);
+
+// to be used via CLI (argc / argv are used by router mode only)
+int llama_server(common_params & params, int argc, char ** argv);
+void llama_server_terminate();
+void llama_server_terminate() {
+    if (shutdown_handler) {
+        shutdown_handler(0);
+    }
+}
+
+
 // wrapper function that handles exceptions and logs errors
 // this is to make sure handler_t never throws exceptions; instead, it returns an error response
 static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
@ -96,8 +103,10 @@ int llama_server(int argc, char ** argv) {
 }

 int llama_server(common_params & params, int argc, char ** argv) {
+    bool is_run_by_cli = (argv == nullptr);
+
    // note: router mode also accepts -hf remote-preset, so we need to check that first
-    if (!params.model.hf_repo.empty()) {
+    if (!is_run_by_cli && !params.model.hf_repo.empty()) {
        try {
            common_params_handle_models_params handle_params;
            handle_params.preset_only = true;
@ -279,8 +288,9 @@ int llama_server(common_params & params, int argc, char ** argv) {

    if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) {
        return child.run_download(params);
-    } else if (!is_router_server) {
+    } else if (!is_router_server && !is_run_by_cli) {
        // single-model mode (NOT spawned by router)
+        // if this is invoked by the CLI, model downloading should already have been handled
        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
    }

@ -363,20 +373,22 @@ int llama_server(common_params & params, int argc, char ** argv) {
        };
    }

-    // TODO: refactor in common/console
+    // register signal handlers only if not run by the CLI
+    if (!is_run_by_cli) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-    struct sigaction sigint_action;
-    sigint_action.sa_handler = signal_handler;
-    sigemptyset (&sigint_action.sa_mask);
-    sigint_action.sa_flags = 0;
-    sigaction(SIGINT, &sigint_action, NULL);
-    sigaction(SIGTERM, &sigint_action, NULL);
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = signal_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+        sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
-    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-    };
-    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
+    }

    if (is_router_server) {
        SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());