From a527509d0f05e2f9228704aee29ce467b563d3b9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 20 Jun 2026 20:22:31 +0200
Subject: [PATCH] debug: force llama_synchronize and time sampling for accurate timings

---
 tools/server/server-context.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 62eb660d7f..a0d78a5dae 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2629,9 +2629,11 @@ private:
     int64_t t_pre_decode  = 0;
     int64_t t_decode      = 0;
     int64_t t_post_decode = 0;
+    int64_t t_sampl       = 0;
     int64_t n_pre_decode  = 0;
     int64_t n_decode      = 0;
     int64_t n_post_decode = 0;
+    int64_t n_sampl       = 0;
 #define DEBUG_TIMINGS
 #ifdef DEBUG_TIMINGS
     struct scoped_timer {
@@ -2663,6 +2665,7 @@ private:
             SRV_INF("avg t_pre_decode  = %f ms\n", (double) t_pre_decode / n_pre_decode / 1000.0);
             SRV_INF("avg t_decode      = %f ms\n", (double) t_decode / n_decode / 1000.0);
             SRV_INF("avg t_post_decode = %f ms\n", (double) t_post_decode / n_post_decode / 1000.0);
+            SRV_INF("avg t_sampl       = %f ms\n", (double) t_sampl / n_sampl / 1000.0);
         }
 #endif
 
@@ -2710,6 +2713,9 @@ private:
 
                 batch_view = batch.get_view(off, n_tokens);
                 bool ok = decode(n_batch, off, batch_view);
+#ifdef DEBUG_TIMINGS
+                llama_synchronize(ctx_tgt);
+#endif
 
                 if (ok) {
                     // move the head of the batch forward with the number of tokens we just processed
@@ -3679,7 +3685,11 @@ private:
             // shifted according to the current sub-batch
             const int tok_idx = slot.i_batch - off;
 
-            llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx);
+            llama_token id;
+            {
+                scoped_timer timer(t_sampl, n_sampl);
+                id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx);
+            }
 
             slot.i_batch = -1;