From 19296c1735147a826d54eb9f53cab658b60449e7 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Tue, 23 Jun 2026 16:09:09 +0200
Subject: [PATCH] cli: join in-process server on exit, poll /health, tidy output

---
 tools/cli/cli-context.cpp | 52 ++++++++++++++++++++++-----------------
 tools/cli/cli-context.h   | 26 +-------------------
 tools/cli/cli-server.h    | 44 ++++++++++++++++++++++++---------
 tools/cli/cli-view.h      | 44 +++++++++++++++++++++++++--------
 tools/server/main.cpp     | 14 +++++++++++
 tools/server/server.cpp   | 50 +++++++++++++++++++++++--------------
 6 files changed, 142 insertions(+), 88 deletions(-)
diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp
index dfc2c9f6b9..cbfde0c0a3 100644
--- a/tools/cli/cli-context.cpp
+++ b/tools/cli/cli-context.cpp
@@ -64,6 +64,8 @@ static std::string media_type_from_ext(const std::string & fname) {
 }
 
 bool cli_context::init() {
+    view::init(params);
+
     std::optional<view::spinner> spinner;
 
     if (!params.server_base.empty()) {
@@ -85,7 +87,7 @@ bool cli_context::init() {
             return false;
         }
 
-        spinner.emplace("Loading model...");
+        spinner.emplace("\n\nLoading model...");
 
         server.emplace();
         if (!server->start(params)) {
@@ -281,35 +283,35 @@ int cli_context::run() {
         modalities += ", video";
     }
 
-    std::vector<std::string> banner;
-    banner.push_back("\n");
-    banner.push_back(LLAMA_ASCII_LOGO);
-    banner.push_back("\n");
-    banner.push_back("build      : " + build_info);
-    banner.push_back("model      : " + model_name);
-    banner.push_back("modalities : " + modalities);
+    std::string banner;
+    banner += "\n";
+    banner += LLAMA_ASCII_LOGO;
+    banner += "\n";
+    banner += "build      : " + build_info + "\n";
+    banner += "model      : " + model_name + "\n";
+    banner += "modalities : " + modalities + "\n";
     if (!params.system_prompt.empty()) {
-        console::log("using custom system prompt\n");
+        banner += "using custom system prompt\n";
     }
-    console::log("\n");
-    console::log("available commands:\n");
-    console::log("  /exit or Ctrl+C     stop or exit\n");
-    console::log("  /regen              regenerate the last response\n");
-    console::log("  /clear              clear the chat history\n");
-    console::log("  /read <file>        add a text file\n");
-    console::log("  /glob <pattern>     add text files using globbing pattern\n");
+    banner += "\n";
+    banner += "available commands:\n";
+    banner += "  /exit or Ctrl+C     stop or exit\n";
+    banner += "  /regen              regenerate the last response\n";
+    banner += "  /clear              clear the chat history\n";
+    banner += "  /read <file>        add a text file\n";
+    banner += "  /glob <pattern>     add text files using globbing pattern\n";
     if (has_vision) {
-        console::log("  /image <file>       add an image file\n");
+        banner += "  /image <file>       add an image file\n";
     }
     if (has_audio) {
-        console::log("  /audio <file>       add an audio file\n");
+        banner += "  /audio <file>       add an audio file\n";
     }
     if (has_video) {
-        console::log("  /video <file>       add a video file\n");
+        banner += "  /video <file>       add a video file\n";
     }
-    console::log("\n");
+    banner += "\n";
 
-    view::show_banner(banner);
+    view::show_message(banner);
 
     // interactive loop
     std::string cur_msg;
@@ -476,7 +478,11 @@ int cli_context::run() {
         });
 
         if (params.show_timings) {
-            // TODO
+            view::show_info(string_format(
+                "\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]",
+                timings.prompt_per_second,
+                timings.predicted_per_second
+            ));
         }
 
         if (params.single_turn) {
@@ -484,7 +490,7 @@ int cli_context::run() {
         }
     }
 
-    view::show_message("Exiting...");
+    view::show_message("\n\nExiting...");
 
     return 0;
 }
diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h
index cbf6729d6d..2c67586d63 100644
--- a/tools/cli/cli-context.h
+++ b/tools/cli/cli-context.h
@@ -1,9 +1,3 @@
-// controller for llama-cli (the "controller" in MVC)
-//
-// owns the chat state, drives the view and talks to llama-server through
-// cli_client; when no --server-base is given it also manages a local
-// llama-server child process via cli_server
-
 #pragma once
 
 #include "common.h"
@@ -20,25 +14,6 @@ struct cli_timings {
     double predicted_per_second = 0.0;
 };
 
-struct cli_command_info {
-    std::string usage;       // e.g. "/read <file>"
-    std::string description; // e.g. "add a text file"
-};
-
-// properties of the connected server, shown on startup
-struct cli_server_info {
-    std::string build_info;
-    std::string model_name;
-    std::string server_base;
-    bool is_local_server   = false; // server is spawned and managed by llama-cli
-    bool has_system_prompt = false;
-    bool has_vision        = false;
-    bool has_audio         = false;
-    bool has_video         = false;
-
-    std::vector<cli_command_info> commands;
-};
-
 // set by the SIGINT handler; cleared once the interrupt has been handled
 extern std::atomic<bool> g_cli_interrupted;
 
@@ -52,6 +27,7 @@ struct cli_context {
     json pending_media = json::array(); // staged multimodal content parts
 
     // properties of the connected server
+    // will be populated by fetch_server_props()
     std::string model_name;
     std::string build_info;
     bool has_vision = false;
diff --git a/tools/cli/cli-server.h b/tools/cli/cli-server.h
index 41f860af86..50447f2551 100644
--- a/tools/cli/cli-server.h
+++ b/tools/cli/cli-server.h
@@ -9,18 +9,22 @@
 
 // llama_server will be available as a dynamic library symbol
 int llama_server(common_params & params, int argc, char ** argv);
+void llama_server_terminate();
 
 struct cli_server {
     std::thread th;
     int port = -1;
+    std::atomic<bool> is_alive = false;
+    std::atomic<bool> is_stopping = false;
 
     ~cli_server() {
         stop();
     }
 
     void stop() {
-        if (th.joinable()) {
-            th.detach();
+        if (alive() && !is_stopping.exchange(true)) {
+            llama_server_terminate();
+            th.join();
         }
     }
 
@@ -31,12 +35,17 @@ struct cli_server {
             exit(1);
         }
 
+        is_alive.store(true, std::memory_order_release);
+
         th = std::thread([&]() {
+            common_params server_params = params; // copy
+            server_params.port = port;
             // argc / argv are only used in router mode, we can skip them for now
-            int res = llama_server(params, 0, nullptr);
+            int res = llama_server(server_params, 0, nullptr);
             if (res != 0) {
                 fprintf(stderr, "llama_server exited with code %d\n", res);
             }
+            is_alive.store(false, std::memory_order_release);
         });
 
         return true;
@@ -47,17 +56,30 @@ struct cli_server {
     }
 
     bool wait_ready(std::function<bool()> should_stop) {
-        // while (true) {
-        //     if (should_stop()) {
-        //         break;
-        //     }
-        //     std::this_thread::sleep_for(std::chrono::milliseconds(5000));
-        // }
-        std::this_thread::sleep_for(std::chrono::milliseconds(5000));
+        if (!alive()) {
+            return false;
+        }
+        while (!should_stop()) {
+            auto [cli, parts] = common_http_client(address());
+            cli.set_connection_timeout(1, 0);
+            auto res = cli.Get("/health");
+            if (res) {
+                if (res->status == 200) {
+                    return true;
+                }
+                // any other status means the server is up but not ready yet
+                // (e.g. 503 while the model is still loading)
+            }
+            if (!alive()) {
+                // the server thread has exited, so it will never become ready
+                return false;
+            }
+            std::this_thread::sleep_for(std::chrono::milliseconds(200));
+        }
         return true;
     }
 
     bool alive() const {
-        return th.joinable();
+        return is_alive.load(std::memory_order_acquire);
     }
 };
diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h
index 6168f6ade3..a44a0ba240 100644
--- a/tools/cli/cli-view.h
+++ b/tools/cli/cli-view.h
@@ -19,7 +19,9 @@ namespace view {
 
     struct spinner {
         spinner(const std::string & message) {
-            console::log("%s\n", message.c_str());
+            if (!message.empty()) {
+                console::log("%s ", message.c_str());
+            }
             console::spinner::start();
         }
         ~spinner() {
@@ -60,27 +62,49 @@ namespace view {
     };
     struct assistant_turn {
         assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT;
+        bool trailing_newline = true;
+        bool is_inside_reasoning = false;
         assistant_turn() {
             console::set_display(DISPLAY_TYPE_RESET);
         }
         ~assistant_turn() {
             console::set_display(DISPLAY_TYPE_RESET);
+            add_newline_if_needed();
         }
         void push(assistant_display_mode m, const std::string & buffer) {
             if (m != mode) {
+                add_newline_if_needed();
                 switch (m) {
                     case ASSISTANT_DISPLAY_MODE_CONTENT:
-                        console::set_display(DISPLAY_TYPE_RESET);
-                        break;
+                        {
+                            if (is_inside_reasoning) {
+                                console::log("[End thinking]\n\n");
+                                is_inside_reasoning = false;
+                            }
+                            console::set_display(DISPLAY_TYPE_RESET);
+                        } break;
                     case ASSISTANT_DISPLAY_MODE_REASONING:
-                        console::set_display(DISPLAY_TYPE_REASONING);
-                        break;
+                        {
+                            console::set_display(DISPLAY_TYPE_REASONING);
+                            is_inside_reasoning = true;
+                            console::log("\n[Start thinking]\n\n");
+                        } break;
                 }
             }
             mode = m;
+            if (buffer.empty()) {
+                return;
+            }
+            trailing_newline = buffer.back() == '\n';
             console::log("%s", buffer.c_str());
             console::flush();
         }
+        void add_newline_if_needed() {
+            if (!trailing_newline) {
+                console::log("\n");
+                console::flush();
+            }
+        }
     };
 
     static void show_error(const std::string & title, const std::string & message = "") {
@@ -95,9 +119,9 @@ namespace view {
         console::log("%s\n", message.c_str());
     }
 
-    static void show_banner(const std::vector<std::string> & lines) {
-        for (const auto & line : lines) {
-            console::log("%s\n", line.c_str());
-        }
+    static void show_info(const std::string & message) {
+        console::set_display(DISPLAY_TYPE_INFO);
+        console::log("%s\n", message.c_str());
+        console::set_display(DISPLAY_TYPE_RESET);
     }
-};
+}
diff --git a/tools/server/main.cpp b/tools/server/main.cpp
index 7f17c56a8c..b8d14e3111 100644
--- a/tools/server/main.cpp
+++ b/tools/server/main.cpp
@@ -3,3 +3,17 @@ int llama_server(int argc, char ** argv);
 int main(int argc, char ** argv) {
     return llama_server(argc, argv);
 }
+
+// satisfies -Wmissing-declarations
+void server_signal_handler(int signal);
+
+void server_signal_handler(int signal) {
+    if (is_terminating.test_and_set()) {
+        // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
+        // this is for better developer experience, we can remove when the server is stable enough
+        fprintf(stderr, "Received second interrupt, terminating immediately.\n");
+        exit(1);
+    }
+
+    shutdown_handler(signal);
+}
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index a101df655d..3b55c5f4be 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -21,12 +21,6 @@
 #include <windows.h>
 #endif
 
-// satisfies -Wmissing-declarations (used by llama command)
-int llama_server(int argc, char ** argv);
-
-// to be used via CLI (argc / argv are used by router mode only)
-int llama_server(common_params & params, int argc, char ** argv);
-
 static std::function<void(int)> shutdown_handler;
 static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
 
@@ -41,6 +35,19 @@ static inline void signal_handler(int signal) {
     shutdown_handler(signal);
 }
 
+// satisfies -Wmissing-declarations (used by llama command)
+int llama_server(int argc, char ** argv);
+
+// to be used via CLI (argc / argv are used by router mode only)
+int llama_server(common_params & params, int argc, char ** argv);
+void llama_server_terminate();
+void llama_server_terminate() {
+    if (shutdown_handler) {
+        shutdown_handler(0);
+    }
+}
+
+
 // wrapper function that handles exceptions and logs errors
 // this is to make sure handler_t never throws exceptions; instead, it returns an error response
 static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
@@ -96,8 +103,10 @@ int llama_server(int argc, char ** argv) {
 }
 
 int llama_server(common_params & params, int argc, char ** argv) {
+    bool is_run_by_cli = (argv == nullptr);
+
     // note: router mode also accepts -hf remote-preset, so we need to check that first
-    if (!params.model.hf_repo.empty()) {
+    if (!is_run_by_cli && !params.model.hf_repo.empty()) {
         try {
             common_params_handle_models_params handle_params;
             handle_params.preset_only = true;
@@ -279,8 +288,9 @@ int llama_server(common_params & params, int argc, char ** argv) {
 
     if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) {
         return child.run_download(params);
-    } else if (!is_router_server) {
+    } else if (!is_router_server && !is_run_by_cli) {
         // single-model mode (NOT spawned by router)
+        // when invoked by the CLI, model downloading should already have been handled
         common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
     }
 
@@ -363,20 +373,22 @@ int llama_server(common_params & params, int argc, char ** argv) {
         };
     }
 
-    // TODO: refactor in common/console
+    // register signal handlers only when not run by the CLI
+    if (!is_run_by_cli) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-    struct sigaction sigint_action;
-    sigint_action.sa_handler = signal_handler;
-    sigemptyset (&sigint_action.sa_mask);
-    sigint_action.sa_flags = 0;
-    sigaction(SIGINT, &sigint_action, NULL);
-    sigaction(SIGTERM, &sigint_action, NULL);
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = signal_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+        sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
-    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-    };
-    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
+    }
 
     if (is_router_server) {
         SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());