spec: add spec metrics: mean acceptance length and acceptance rate per position (#24536)

* spec: add spec metrics: mean acceptance length and acceptance rate per position

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Ruixiang Wang 2026-06-16 09:23:09 +02:00 committed by GitHub
parent e3a74b2990
commit 635b65ad7a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 57 additions and 4 deletions

View File

@ -140,6 +140,8 @@ struct common_speculative_impl {
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
@ -2059,6 +2061,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
}
for (size_t i = 0; i < n_accepted; ++i) {
impl->n_acc_tokens_per_pos[i]++;
}
if (n_accepted > 0) {
impl->n_acc_drafts++;
impl->n_acc_tokens += n_accepted;
@ -2093,13 +2104,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
std::string str_stats;
if (impl->n_call_accept > 0) {
const double mean =
1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
std::ostringstream tmp;
tmp << std::fixed << std::setprecision(3);
for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
if (i > 0) {
tmp << ", ";
}
tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
}
std::ostringstream oss;
oss << std::fixed << std::setprecision(2) << mean;
str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
impl->n_gen_drafts,
impl->n_acc_drafts,
impl->n_gen_tokens,
impl->n_acc_tokens,
str_stats.c_str(),
str_perf.c_str());
}
}

View File

@ -201,6 +201,8 @@ struct server_slot {
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position
void reset() {
SLT_DBG(*this, "%s", "\n");
@ -227,6 +229,8 @@ struct server_slot {
// clear speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
n_draft_verif_steps = 0;
n_accepted_per_pos.clear();
task_prev = std::move(task);
task.reset();
@ -509,10 +513,22 @@ struct server_slot {
llama_perf_context(ctx_tgt).n_reused);
if (n_draft_total > 0) {
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
std::string acceptance_rates_per_pos;
if (n_draft_verif_steps > 0) {
for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
if (i > 0) {
acceptance_rates_per_pos += ", ";
}
acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
}
}
SLT_INF(*this,
"draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
draft_ratio, n_draft_accepted, n_draft_total);
"draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
}
common_speculative_print_stats(spec);
@ -3543,6 +3559,14 @@ private:
// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;
slot.n_draft_verif_steps += 1;
if (slot.n_accepted_per_pos.empty()) {
slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
}
for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
slot.n_accepted_per_pos[i]++;
}
// add accepted tokens to the prompt
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);