poc: threadpool sampling

2026-06-27 23:50:20 -05:00 · 2026-06-20 22:08:42 +02:00 · 2026-06-20 22:08:42 +02:00 · 447b0c3646
commit 447b0c3646
parent a527509d0f
1 changed files with 101 additions and 4 deletions
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -24,6 +24,11 @@
 #include <filesystem>
 #include <utility>
 #include <fstream>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <queue>
+#include <functional>

 // fix problem with std::min and std::max
 #if defined(_WIN32)
@ -3616,6 +3621,67 @@ private:
        return true;
    }

+    struct sampler_task {
+        server_slot * slot;
+        int32_t tok_idx;
+    };
+
+    struct sampler_threadpool {
+        std::vector<std::thread> threads;
+        std::queue<std::function<void()>> tasks;
+        std::mutex mtx;
+        std::condition_variable cv;
+        std::condition_variable cv_done;
+        int pending = 0;
+        bool stop = false;
+
+        ~sampler_threadpool() {
+            {
+                std::lock_guard<std::mutex> lock(mtx);
+                stop = true;
+            }
+            cv.notify_all();
+            for (auto & t : threads) t.join();
+        }
+
+        void init(int n) {
+            for (int i = 0; i < n; i++) {
+                threads.emplace_back([this]() {
+                    while (true) {
+                        std::function<void()> task;
+                        {
+                            std::unique_lock<std::mutex> lock(mtx);
+                            cv.wait(lock, [this]() { return stop || !tasks.empty(); });
+                            if (stop && tasks.empty()) return;
+                            task = std::move(tasks.front());
+                            tasks.pop();
+                        }
+                        task();
+                        {
+                            std::lock_guard<std::mutex> lock(mtx);
+                            pending--;
+                        }
+                        cv_done.notify_one();
+                    }
+                });
+            }
+        }
+
+        void enqueue(std::function<void()> fn) {
+            {
+                std::lock_guard<std::mutex> lock(mtx);
+                tasks.push(std::move(fn));
+                pending++;
+            }
+            cv.notify_one();
+        }
+
+        void wait_all() {
+            std::unique_lock<std::mutex> lock(mtx);
+            cv_done.wait(lock, [this]() { return pending == 0; });
+        }
+    };
+
    void post_decode(int32_t n_batch_tokens, int32_t off, llama_batch & batch_view) {
        // for checking if a given batch index is inside batch_view
        auto is_inside_view = [&](int32_t idx) {
@ -3637,6 +3703,8 @@ private:
                slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end();
        };

+        std::vector<sampler_task> smpl_tasks;
+
        iterate(slots, [&](server_slot & slot) {
            // optionally send prompt processing progress
            if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) {
@ -3684,13 +3752,42 @@ private:

            // shifted according to the current sub-batch
            const int tok_idx = slot.i_batch - off;
+            smpl_tasks.push_back({&slot, tok_idx});
+        });

-            llama_token id;
-            {
-                scoped_timer timer(t_sampl, n_sampl);
-                id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx);
+        std::unordered_map<server_slot *, llama_token> sampled_token;
+
+        // run common_sampler_sample in a thread pool to sample all tokens in parallel
+        if (!smpl_tasks.empty()) {
+            scoped_timer timer(t_sampl, n_sampl);
+            static sampler_threadpool pool;
+            if (pool.threads.empty()) {
+                pool.init(params_base.n_parallel);
            }

+            std::vector<std::pair<server_slot *, llama_token>> results(smpl_tasks.size());
+            for (size_t i = 0; i < smpl_tasks.size(); i++) {
+                const auto & task = smpl_tasks[i];
+                pool.enqueue([&results, i, slot_ptr = task.slot, tok_idx = task.tok_idx]() {
+                    results[i] = {slot_ptr, common_sampler_sample(slot_ptr->smpl.get(), slot_ptr->ctx_tgt, tok_idx)};
+                });
+            }
+            pool.wait_all();
+
+            for (const auto & [slot_ptr, id] : results) {
+                sampled_token[slot_ptr] = id;
+            }
+        }
+
+        iterate(slots, [&](server_slot & slot) {
+            const int tok_idx = slot.i_batch - off;
+            auto it = sampled_token.find(&slot);
+            if (it == sampled_token.end()) {
+                // no token sampled for this slot, skip
+                return;
+            }
+            auto id = it->second;
+
            slot.i_batch = -1;

            common_sampler_accept(slot.smpl.get(), id, true);