NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Record boundaries are restored below and all
tokens are unchanged, but:
  - indentation of context/changed lines is reconstructed (4-space steps) and
    must be confirmed against the target file before applying;
  - hunks 2-4 show fewer old-side lines than their "@@ -N,7" headers declare,
    so context lines appear to be missing -- regenerate the patch from the
    working tree if `git apply` rejects it;
  - `std::function&` in the first hunk's trailing context looks like it lost
    its template argument list -- TODO confirm against the target file.

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f65b4ad0..b51f8e4f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -330,6 +330,18 @@ struct server_response_reader {
         return !cancelled && received_count < id_tasks.size();
     }
 
+    // cancel-cascade fix: true only if one of THIS reader's tasks is on a
+    // slot (the active decode). Used to gate llama_decode_stop() so a queued/
+    // deferred task's disconnect cannot abort another task's active decode via
+    // the process-global stop_internal_decode flag. Best-effort cross-thread
+    // read (slots are not resized at runtime; same race class as the global).
+    bool any_task_on_slot() const {
+        for (const auto & slot : ctx_server.slots) {
+            if (slot.is_processing() && id_tasks.count(slot.id_task)) return true;
+        }
+        return false;
+    }
+
     // return nullptr if should_stop() is true before receiving a result
     // note: if one error is received, it will stop further processing and return error result
     server_task_result_ptr next(const std::function& should_stop) {
@@ -1127,7 +1139,7 @@ int main(int argc, char ** argv) {
             // non-stream, wait for the results
             auto all_results = rd->wait_for_all(is_connection_closed);
             if (all_results.is_terminated) {
-                llama_decode_stop(); // send a signal to stop decode process
+                if (rd->any_task_on_slot()) llama_decode_stop(); // cancel-cascade fix: stop only if OUR task is the active decode
                 return; // connection is closed
             } else if (all_results.error) {
@@ -1150,7 +1162,7 @@ int main(int argc, char ** argv) {
             // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
             server_task_result_ptr first_result = rd->next(is_connection_closed);
             if (first_result == nullptr) {
-                llama_decode_stop(); // send a signal to stop decode process
+                if (rd->any_task_on_slot()) llama_decode_stop(); // cancel-cascade fix: stop only if OUR task is the active decode
                 return; // connection is closed
             } else if (first_result->is_error()) {
@@ -1480,7 +1492,7 @@ int main(int argc, char ** argv) {
         // collect results
         if (all_results.is_terminated) {
-            llama_decode_stop();
+            if (rd.any_task_on_slot()) llama_decode_stop(); // cancel-cascade fix: stop only if OUR task is the active decode
             return; // connection is closed
         } else if (all_results.error) {