server: re-inject subcommand when router spawns children under unified binary (#23442)

2026-06-27 23:50:20 -05:00 · 2026-05-21 10:09:19 +02:00 · 2026-05-21 10:09:19 +02:00 · c9021714e8
commit c9021714e8
parent 1d7ab2b947
2 changed files with 17 additions and 0 deletions
--- a/app/llama.cpp
+++ b/app/llama.cpp
@ -1,6 +1,7 @@
 #include "build-info.h"

 #include <cstdio>
+#include <cstdlib>
 #include <string>
 #include <vector>

@ -77,6 +78,14 @@ int main(int argc, char ** argv) {

    for (const auto & cmd : cmds) {
        if (matches(arg, cmd)) {
+
+            // the router spawns children through this same binary, so it needs
+            // the subcommand to relaunch as 'llama serve' rather than bare options
+#ifdef _WIN32
+            _putenv_s("LLAMA_APP_CMD", cmd.name);
+#else
+            setenv("LLAMA_APP_CMD", cmd.name, 1);
+#endif
            return cmd.func(argc - 1, argv + 1);
        }
    }
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@ -14,6 +14,7 @@
 #include <mutex>
 #include <condition_variable>
 #include <cstring>
+#include <cstdlib>
 #include <atomic>
 #include <chrono>
 #include <queue>
@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
    // TODO: maybe validate preset before rendering ?
    // render args
    args = preset.to_args(bin_path);
+
+    // the unified binary dispatches by subcommand, so re-inject it right after
+    // the binary path; the child then starts as 'llama serve ...' not 'llama ...'
+    const char * app_cmd = std::getenv("LLAMA_APP_CMD");
+    if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) {
+        args.insert(args.begin() + 1, app_cmd);
+    }
 }

 void server_model_meta::update_caps() {