Disallow speculation for hybrid/recurrent models (#1645)

This commit is contained in:
Kawrakow 2026-04-16 17:21:44 +02:00 committed by GitHub
parent 8df5cbc0b3
commit 539d1cf989
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 13 additions and 4 deletions

View File

@ -111,6 +111,15 @@ bool server_context::load_model(const gpt_params& params_) {
}
// Load draft model for speculative decoding if specified
if (has_draft_model) {
// Refuse to set up a draft model when the main model is recurrent/hybrid:
// print a clearly delimited warning banner, then abort instead of continuing
// with the draft-model load that follows.
if (llama_model_has_recurrent(model)) {
    LLAMA_LOG_WARN("\n=======================================================================\n");
    LLAMA_LOG_WARN(" Speculative decoding is not supported for recurrent/hybrid models\n");
    LLAMA_LOG_WARN(" --> bailing out\n");
    LLAMA_LOG_WARN("========================================================================\n\n");
    GGML_ABORT("Fatal error");
}
LLAMA_LOG_INFO("\n\n==================================loading DRAFT model==================================\n\n");
gpt_params params_dft;
@ -1470,9 +1479,9 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
//
// TODO: try to make this conditional on the context or the memory module, instead of the model type
params_base.do_checkpoint = do_checkpoint;
if (slot.n_buffer != 0) {
LLAMA_LOG_WARN("banned strings is not supported by recurrent model, it will be disabled.\n");
}
if (slot.n_buffer != 0) {
LLAMA_LOG_WARN("banned strings is not supported by recurrent model, it will be disabled.\n");
}
if (params_base.ctx_shift) {
params_base.ctx_shift = false;
LOG_WARNING("%s\n", "ctx_shift is not supported by recurrent model, it will be disabled");

View File

@ -4130,7 +4130,7 @@ static int llama_decode_internal(
if (n_outputs_new) {
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
if (res->ne[1] == n_tokens && n_outputs_new < n_tokens) {
int32_t i_out = 0;
if (u_batch.logits && !embd_pooled) {