diff --git a/common/speculative.cpp b/common/speculative.cpp
index d874315559..a744c79ae5 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -140,6 +140,8 @@ struct common_speculative_impl {
     size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
     size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
 
+    std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
+
     // TODO: track performance of most recent calls
     const bool gen_perf = true; // whether to generate performance stats.
 
@@ -2059,6 +2061,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
 
     {
         common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
+
+        if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
+            impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
+        }
+
+        for (size_t i = 0; i < n_accepted; ++i) {
+            impl->n_acc_tokens_per_pos[i]++;
+        }
+
         if (n_accepted > 0) {
             impl->n_acc_drafts++;
             impl->n_acc_tokens += n_accepted;
@@ -2093,13 +2104,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
             str_perf = "";
         }
 
-        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
+        std::string str_stats;
+        if (impl->n_call_accept > 0) {
+            const double mean =
+                1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
+            std::ostringstream tmp;
+            tmp << std::fixed << std::setprecision(3);
+            for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
+                if (i > 0) {
+                    tmp << ", ";
+                }
+                tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
+            }
+            std::ostringstream oss;
+            oss << std::fixed << std::setprecision(2) << mean;
+            str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
+        }
+
+        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
                 common_speculative_type_to_str(impl->type).c_str(),
                 impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
                 impl->n_gen_drafts,
                 impl->n_acc_drafts,
                 impl->n_gen_tokens,
                 impl->n_acc_tokens,
+                str_stats.c_str(),
                 str_perf.c_str());
     }
 }
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index bcae39a109..da6a47586d 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -201,6 +201,8 @@ struct server_slot {
     // Speculative decoding stats
     int32_t n_draft_total = 0; // Total draft tokens generated
     int32_t n_draft_accepted = 0; // Draft tokens actually accepted
+    int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
+    std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position
 
     void reset() {
         SLT_DBG(*this, "%s", "\n");
@@ -227,6 +229,8 @@ struct server_slot {
         // clear speculative decoding stats
         n_draft_total = 0;
         n_draft_accepted = 0;
+        n_draft_verif_steps = 0;
+        n_accepted_per_pos.clear();
 
         task_prev = std::move(task);
         task.reset();
@@ -509,10 +513,22 @@ struct server_slot {
                 llama_perf_context(ctx_tgt).n_reused);
 
         if (n_draft_total > 0) {
-            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            const float  draft_ratio  = (float) n_draft_accepted / n_draft_total;
+            const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
+
+            std::string acceptance_rates_per_pos;
+            if (n_draft_verif_steps > 0) {
+                for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
+                    if (i > 0) {
+                        acceptance_rates_per_pos += ", ";
+                    }
+                    acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
+                }
+            }
+
             SLT_INF(*this,
-                    "draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
-                    draft_ratio, n_draft_accepted, n_draft_total);
+                    "draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
         }
 
         common_speculative_print_stats(spec);
@@ -3543,6 +3559,14 @@ private:
 
                 // update how many tokens out of those tested were accepted
                 slot.n_draft_accepted += ids.size() - 1;
+                slot.n_draft_verif_steps += 1;
+
+                if (slot.n_accepted_per_pos.empty()) {
+                    slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
+                }
+                for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
+                    slot.n_accepted_per_pos[i]++;
+                }
 
                 // add accepted tokens to the prompt
                 slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);