From a527509d0f05e2f9228704aee29ce467b563d3b9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 20 Jun 2026 20:22:31 +0200
Subject: [PATCH] debug: force llama_synchronize for accurate timings

---
 tools/server/server-context.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 62eb660d7f..a0d78a5dae 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2629,9 +2629,11 @@ private:
     int64_t t_pre_decode = 0;
     int64_t t_decode = 0;
     int64_t t_post_decode = 0;
+    int64_t t_sampl = 0;
     int64_t n_pre_decode = 0;
     int64_t n_decode = 0;
     int64_t n_post_decode = 0;
+    int64_t n_sampl = 0;
 #define DEBUG_TIMINGS
 #ifdef DEBUG_TIMINGS
     struct scoped_timer {
@@ -2663,6 +2665,7 @@ private:
         SRV_INF("avg t_pre_decode = %f ms\n", (double) t_pre_decode / n_pre_decode / 1000.0);
         SRV_INF("avg t_decode = %f ms\n", (double) t_decode / n_decode / 1000.0);
         SRV_INF("avg t_post_decode = %f ms\n", (double) t_post_decode / n_post_decode / 1000.0);
+        SRV_INF("avg t_sampl = %f ms\n", (double) t_sampl / n_sampl / 1000.0);
     }
 #endif

@@ -2710,6 +2713,9 @@ private:
             batch_view = batch.get_view(off, n_tokens);

             bool ok = decode(n_batch, off, batch_view);
+#ifdef DEBUG_TIMINGS
+            llama_synchronize(ctx_tgt);
+#endif

             if (ok) {
                 // move the head of the batch forward with the number of tokens we just processed
@@ -3679,7 +3685,11 @@ private:
                 // shifted according to the current sub-batch
                 const int tok_idx = slot.i_batch - off;

-                llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx);
+                llama_token id;
+                {
+                    scoped_timer timer(t_sampl, n_sampl);
+                    id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx);
+                }

                 slot.i_batch = -1;