diff --git a/common/speculative.cpp b/common/speculative.cpp index e8291727..bf78c33b 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -471,16 +471,20 @@ struct common_speculative_state_draft : public common_speculative_state { common_sampler_accept(smpl, nullptr, id, true); + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + if (i == 0) { + result.push_back(id); + } + break; + } + result.push_back(id); if (params.n_max <= (int) result.size()) { break; } - // only collect very high-confidence draft tokens - if (cur_p->data[0].p < params.p_min) { - break; - } common_batch_add(batch, id, n_past + i + 1, { 0 }, true); diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index ca5d426f..b2da0f66 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -5032,7 +5032,7 @@ void server_context::update_slots() { // start populating the batch for this iteration common_batch_clear(batch); - // frist, add sampled tokens from any ongoing sequences + // first, add sampled tokens from any ongoing sequences add_sampled_tokens(); // Prepare batch for inference // process in chunks of params.n_batch