mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
working
This commit is contained in:
parent
90c111bf98
commit
19296c1735
@ -64,6 +64,8 @@ static std::string media_type_from_ext(const std::string & fname) {
|
||||
}
|
||||
|
||||
bool cli_context::init() {
|
||||
view::init(params);
|
||||
|
||||
std::optional<view::spinner> spinner;
|
||||
|
||||
if (!params.server_base.empty()) {
|
||||
@ -85,7 +87,7 @@ bool cli_context::init() {
|
||||
return false;
|
||||
}
|
||||
|
||||
spinner.emplace("Loading model...");
|
||||
spinner.emplace("\n\nLoading model...");
|
||||
|
||||
server.emplace();
|
||||
if (!server->start(params)) {
|
||||
@ -281,35 +283,35 @@ int cli_context::run() {
|
||||
modalities += ", video";
|
||||
}
|
||||
|
||||
std::vector<std::string> banner;
|
||||
banner.push_back("\n");
|
||||
banner.push_back(LLAMA_ASCII_LOGO);
|
||||
banner.push_back("\n");
|
||||
banner.push_back("build : " + build_info);
|
||||
banner.push_back("model : " + model_name);
|
||||
banner.push_back("modalities : " + modalities);
|
||||
std::string banner;
|
||||
banner += "\n";
|
||||
banner += LLAMA_ASCII_LOGO;
|
||||
banner += "\n";
|
||||
banner += "build : " + build_info + "\n";
|
||||
banner += "model : " + model_name + "\n";
|
||||
banner += "modalities : " + modalities + "\n";
|
||||
if (!params.system_prompt.empty()) {
|
||||
console::log("using custom system prompt\n");
|
||||
banner += "using custom system prompt\n";
|
||||
}
|
||||
console::log("\n");
|
||||
console::log("available commands:\n");
|
||||
console::log(" /exit or Ctrl+C stop or exit\n");
|
||||
console::log(" /regen regenerate the last response\n");
|
||||
console::log(" /clear clear the chat history\n");
|
||||
console::log(" /read <file> add a text file\n");
|
||||
console::log(" /glob <pattern> add text files using globbing pattern\n");
|
||||
banner += "\n";
|
||||
banner += "available commands:\n";
|
||||
banner += " /exit or Ctrl+C stop or exit\n";
|
||||
banner += " /regen regenerate the last response\n";
|
||||
banner += " /clear clear the chat history\n";
|
||||
banner += " /read <file> add a text file\n";
|
||||
banner += " /glob <pattern> add text files using globbing pattern\n";
|
||||
if (has_vision) {
|
||||
console::log(" /image <file> add an image file\n");
|
||||
banner += " /image <file> add an image file\n";
|
||||
}
|
||||
if (has_audio) {
|
||||
console::log(" /audio <file> add an audio file\n");
|
||||
banner += " /audio <file> add an audio file\n";
|
||||
}
|
||||
if (has_video) {
|
||||
console::log(" /video <file> add a video file\n");
|
||||
banner += " /video <file> add a video file\n";
|
||||
}
|
||||
console::log("\n");
|
||||
banner += "\n";
|
||||
|
||||
view::show_banner(banner);
|
||||
view::show_message(banner);
|
||||
|
||||
// interactive loop
|
||||
std::string cur_msg;
|
||||
@ -476,7 +478,11 @@ int cli_context::run() {
|
||||
});
|
||||
|
||||
if (params.show_timings) {
|
||||
// TODO
|
||||
view::show_info(string_format(
|
||||
"\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]",
|
||||
timings.prompt_per_second,
|
||||
timings.predicted_per_second
|
||||
));
|
||||
}
|
||||
|
||||
if (params.single_turn) {
|
||||
@ -484,7 +490,7 @@ int cli_context::run() {
|
||||
}
|
||||
}
|
||||
|
||||
view::show_message("Exiting...");
|
||||
view::show_message("\n\nExiting...");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1,9 +1,3 @@
|
||||
// controller for llama-cli (the "controller" in MVC)
|
||||
//
|
||||
// owns the chat state, drives the view and talks to llama-server through
|
||||
// cli_client; when no --server-base is given it also manages a local
|
||||
// llama-server child process via cli_server
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
@ -20,25 +14,6 @@ struct cli_timings {
|
||||
double predicted_per_second = 0.0;
|
||||
};
|
||||
|
||||
struct cli_command_info {
|
||||
std::string usage; // e.g. "/read <file>"
|
||||
std::string description; // e.g. "add a text file"
|
||||
};
|
||||
|
||||
// properties of the connected server, shown on startup
|
||||
struct cli_server_info {
|
||||
std::string build_info;
|
||||
std::string model_name;
|
||||
std::string server_base;
|
||||
bool is_local_server = false; // server is spawned and managed by llama-cli
|
||||
bool has_system_prompt = false;
|
||||
bool has_vision = false;
|
||||
bool has_audio = false;
|
||||
bool has_video = false;
|
||||
|
||||
std::vector<cli_command_info> commands;
|
||||
};
|
||||
|
||||
// set by the SIGINT handler; cleared once the interrupt has been handled
|
||||
extern std::atomic<bool> g_cli_interrupted;
|
||||
|
||||
@ -52,6 +27,7 @@ struct cli_context {
|
||||
json pending_media = json::array(); // staged multimodal content parts
|
||||
|
||||
// properties of the connected server
|
||||
// will be populated by fetch_server_props()
|
||||
std::string model_name;
|
||||
std::string build_info;
|
||||
bool has_vision = false;
|
||||
|
||||
@ -9,18 +9,22 @@
|
||||
|
||||
// llama_server will be available as a dynamic library symbol
|
||||
int llama_server(common_params & params, int argc, char ** argv);
|
||||
void llama_server_terminate();
|
||||
|
||||
struct cli_server {
|
||||
std::thread th;
|
||||
int port = -1;
|
||||
std::atomic<bool> is_alive = false;
|
||||
std::atomic<bool> is_stopping = false;
|
||||
|
||||
~cli_server() {
|
||||
stop();
|
||||
}
|
||||
|
||||
void stop() {
|
||||
if (th.joinable()) {
|
||||
th.detach();
|
||||
if (alive() && !is_stopping.exchange(true)) {
|
||||
llama_server_terminate();
|
||||
th.join();
|
||||
}
|
||||
}
|
||||
|
||||
@ -31,12 +35,17 @@ struct cli_server {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
is_alive.store(true, std::memory_order_release);
|
||||
|
||||
th = std::thread([&]() {
|
||||
common_params server_params = params; // copy
|
||||
server_params.port = port;
|
||||
// argc / argv are only used in router mode, we can skip them for now
|
||||
int res = llama_server(params, 0, nullptr);
|
||||
int res = llama_server(server_params, 0, nullptr);
|
||||
if (res != 0) {
|
||||
fprintf(stderr, "llama_server exited with code %d\n", res);
|
||||
}
|
||||
is_alive.store(false, std::memory_order_release);
|
||||
});
|
||||
|
||||
return true;
|
||||
@ -47,17 +56,30 @@ struct cli_server {
|
||||
}
|
||||
|
||||
bool wait_ready(std::function<bool()> should_stop) {
|
||||
// while (true) {
|
||||
// if (should_stop()) {
|
||||
// break;
|
||||
// }
|
||||
// std::this_thread::sleep_for(std::chrono::milliseconds(5000));
|
||||
// }
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(5000));
|
||||
if (!alive()) {
|
||||
return false;
|
||||
}
|
||||
while (!should_stop()) {
|
||||
auto [cli, parts] = common_http_client(address());
|
||||
cli.set_connection_timeout(1, 0);
|
||||
auto res = cli.Get("/health");
|
||||
if (res) {
|
||||
if (res->status == 200) {
|
||||
return true;
|
||||
}
|
||||
// any other status means the server is up but not ready yet
|
||||
// (e.g. 503 while the model is still loading)
|
||||
}
|
||||
if (!alive()) {
|
||||
// in case server die permanently
|
||||
return false;
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(200));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool alive() const {
|
||||
return th.joinable();
|
||||
return is_alive.load(std::memory_order_acquire);
|
||||
}
|
||||
};
|
||||
|
||||
@ -19,7 +19,9 @@ namespace view {
|
||||
|
||||
struct spinner {
|
||||
spinner(const std::string & message) {
|
||||
console::log("%s\n", message.c_str());
|
||||
if (!message.empty()) {
|
||||
console::log("%s ", message.c_str());
|
||||
}
|
||||
console::spinner::start();
|
||||
}
|
||||
~spinner() {
|
||||
@ -60,27 +62,49 @@ namespace view {
|
||||
};
|
||||
struct assistant_turn {
|
||||
assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT;
|
||||
bool trailing_newline = true;
|
||||
bool is_inside_reasoning = false;
|
||||
assistant_turn() {
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
}
|
||||
~assistant_turn() {
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
add_newline_if_needed();
|
||||
}
|
||||
void push(assistant_display_mode m, const std::string & buffer) {
|
||||
if (m != mode) {
|
||||
add_newline_if_needed();
|
||||
switch (m) {
|
||||
case ASSISTANT_DISPLAY_MODE_CONTENT:
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
break;
|
||||
{
|
||||
if (is_inside_reasoning) {
|
||||
console::log("[End thinking]\n\n");
|
||||
is_inside_reasoning = false;
|
||||
}
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
} break;
|
||||
case ASSISTANT_DISPLAY_MODE_REASONING:
|
||||
console::set_display(DISPLAY_TYPE_REASONING);
|
||||
break;
|
||||
{
|
||||
console::set_display(DISPLAY_TYPE_REASONING);
|
||||
is_inside_reasoning = true;
|
||||
console::log("\n[Start thinking]\n\n");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
mode = m;
|
||||
if (buffer.empty()) {
|
||||
return;
|
||||
}
|
||||
trailing_newline = buffer.back() == '\n';
|
||||
console::log("%s", buffer.c_str());
|
||||
console::flush();
|
||||
}
|
||||
void add_newline_if_needed() {
|
||||
if (!trailing_newline) {
|
||||
console::log("\n");
|
||||
console::flush();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static void show_error(const std::string & title, const std::string & message = "") {
|
||||
@ -95,9 +119,9 @@ namespace view {
|
||||
console::log("%s\n", message.c_str());
|
||||
}
|
||||
|
||||
static void show_banner(const std::vector<std::string> & lines) {
|
||||
for (const auto & line : lines) {
|
||||
console::log("%s\n", line.c_str());
|
||||
}
|
||||
static void show_info(const std::string & message) {
|
||||
console::set_display(DISPLAY_TYPE_INFO);
|
||||
console::log("%s\n", message.c_str());
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@ -3,3 +3,17 @@ int llama_server(int argc, char ** argv);
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_server(argc, argv);
|
||||
}
|
||||
|
||||
// satisfies -Wmissing-declarations
|
||||
void server_signal_handler(int signal);
|
||||
|
||||
void server_signal_handler(int signal) {
|
||||
if (is_terminating.test_and_set()) {
|
||||
// in case it hangs, we can force terminate the server by hitting Ctrl+C twice
|
||||
// this is for better developer experience, we can remove when the server is stable enough
|
||||
fprintf(stderr, "Received second interrupt, terminating immediately.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
shutdown_handler(signal);
|
||||
}
|
||||
|
||||
@ -21,12 +21,6 @@
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// satisfies -Wmissing-declarations (used by llama command)
|
||||
int llama_server(int argc, char ** argv);
|
||||
|
||||
// to be used via CLI (argc / argv are used by router mode only)
|
||||
int llama_server(common_params & params, int argc, char ** argv);
|
||||
|
||||
static std::function<void(int)> shutdown_handler;
|
||||
static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
|
||||
|
||||
@ -41,6 +35,19 @@ static inline void signal_handler(int signal) {
|
||||
shutdown_handler(signal);
|
||||
}
|
||||
|
||||
// satisfies -Wmissing-declarations (used by llama command)
|
||||
int llama_server(int argc, char ** argv);
|
||||
|
||||
// to be used via CLI (argc / argv are used by router mode only)
|
||||
int llama_server(common_params & params, int argc, char ** argv);
|
||||
void llama_server_terminate();
|
||||
void llama_server_terminate() {
|
||||
if (shutdown_handler) {
|
||||
shutdown_handler(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// wrapper function that handles exceptions and logs errors
|
||||
// this is to make sure handler_t never throws exceptions; instead, it returns an error response
|
||||
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
|
||||
@ -96,8 +103,10 @@ int llama_server(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
int llama_server(common_params & params, int argc, char ** argv) {
|
||||
bool is_run_by_cli = (argv == nullptr);
|
||||
|
||||
// note: router mode also accepts -hf remote-preset, so we need to check that first
|
||||
if (!params.model.hf_repo.empty()) {
|
||||
if (!is_run_by_cli && !params.model.hf_repo.empty()) {
|
||||
try {
|
||||
common_params_handle_models_params handle_params;
|
||||
handle_params.preset_only = true;
|
||||
@ -279,8 +288,9 @@ int llama_server(common_params & params, int argc, char ** argv) {
|
||||
|
||||
if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) {
|
||||
return child.run_download(params);
|
||||
} else if (!is_router_server) {
|
||||
} else if (!is_router_server && !is_run_by_cli) {
|
||||
// single-model mode (NOT spawned by router)
|
||||
// if this is invoked by CLI, model downloading should already handled
|
||||
common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
|
||||
}
|
||||
|
||||
@ -363,20 +373,22 @@ int llama_server(common_params & params, int argc, char ** argv) {
|
||||
};
|
||||
}
|
||||
|
||||
// TODO: refactor in common/console
|
||||
// register signal handler is not running by CLI
|
||||
if (!is_run_by_cli) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
sigaction(SIGTERM, &sigint_action, NULL);
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
sigaction(SIGTERM, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (is_router_server) {
|
||||
SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user