debug: force llama_synchronize after decode for accurate timings; add sampling timer (t_sampl)

2026-06-27 23:50:20 -05:00 · 2026-06-20 20:22:31 +02:00 · 2026-06-20 20:22:31 +02:00 · a527509d0f
commit a527509d0f
parent 7486a39756
1 changed files with 11 additions and 1 deletions
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -2629,9 +2629,11 @@ private:
    int64_t t_pre_decode  = 0;
    int64_t t_decode      = 0;
    int64_t t_post_decode = 0;
+    int64_t t_sampl       = 0;
    int64_t n_pre_decode  = 0;
    int64_t n_decode      = 0;
    int64_t n_post_decode = 0;
+    int64_t n_sampl       = 0;
 #define DEBUG_TIMINGS
 #ifdef DEBUG_TIMINGS
    struct scoped_timer {
@ -2663,6 +2665,7 @@ private:
            SRV_INF("avg t_pre_decode  = %f ms\n", (double) t_pre_decode / n_pre_decode / 1000.0);
            SRV_INF("avg t_decode      = %f ms\n", (double) t_decode / n_decode / 1000.0);
            SRV_INF("avg t_post_decode = %f ms\n", (double) t_post_decode / n_post_decode / 1000.0);
+            SRV_INF("avg t_sampl       = %f ms\n", (double) t_sampl / n_sampl / 1000.0);
        }
 #endif

@ -2710,6 +2713,9 @@ private:

                batch_view = batch.get_view(off, n_tokens);
                bool ok = decode(n_batch, off, batch_view);
+#ifdef DEBUG_TIMINGS
+                llama_synchronize(ctx_tgt);
+#endif

                if (ok) {
                    // move the head of the batch forward with the number of tokens we just processed
@ -3679,7 +3685,11 @@ private:
            // shifted according to the current sub-batch
            const int tok_idx = slot.i_batch - off;

-            llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx);
+            llama_token id;
+            {
+                scoped_timer timer(t_sampl, n_sampl);
+                id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx);
+            }

            slot.i_batch = -1;