From 1a2dea29b9cf95416db358c5ffb02018a7cb04fb Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Wed, 17 Jun 2026 16:29:49 +0200 Subject: [PATCH] spec: fix segfault error on long prompts for eagle3 (#24707) --- src/llama-context.cpp | 2 +- src/llama-hparams.cpp | 4 ++++ src/llama-hparams.h | 7 +++++++ src/models/eagle3.cpp | 8 ++++---- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 168dbabd76..529bc4a5e9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1382,7 +1382,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim - const int64_t n_embd = hparams.n_embd_inp(); + const int64_t n_embd = hparams.n_embd_inp_enc(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 2bf5768738..9d0683d2fe 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -104,6 +104,10 @@ uint32_t llama_hparams::n_embd_inp() const { return n_embd_inp; } +uint32_t llama_hparams::n_embd_inp_enc() const { + return n_embd_inp_enc_impl > 0 ? n_embd_inp_enc_impl : n_embd_inp(); +} + uint32_t llama_hparams::n_embd_out() const { return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index d045059a63..2eadeb2148 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -189,6 +189,10 @@ struct llama_hparams { // input embedding dimension (0 = use n_embd) uint32_t n_embd_inp_impl = 0; + // encoder input embedding dimension (0 = use n_embd_inp()) + // e.g. 
the eagle3 encoder fuses target_layers * target_hidden features + uint32_t n_embd_inp_enc_impl = 0; + // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; @@ -305,6 +309,9 @@ struct llama_hparams { // dimension of main + auxiliary input embeddings uint32_t n_embd_inp() const; + // dimension of the encoder input embeddings + uint32_t n_embd_inp_enc() const; + // dimension of output embeddings uint32_t n_embd_out() const; diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 3321b39051..9d96fae594 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -19,7 +19,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt); LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd); - hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; + hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; // eagle3 norm_before_residual (optional, default false) // compatible with Readhat eagle3 speculator model @@ -34,7 +34,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; - const int64_t n_embd_inp = hparams.n_embd_inp(); + const int64_t n_embd_inp = hparams.n_embd_inp_enc(); const int64_t n_embd_attn_input = 2 * n_embd; // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target) @@ -109,8 +109,8 @@ ggml_tensor * llama_model_eagle3::graph::build_inp_embd_enc() const { // Input: Target model features (3 layers concatenated: low, mid, high) // Data will be provided via ubatch->embd in encode_eagle3_features() - auto inp_target = std::make_unique(hparams.n_embd_inp()); - inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens); + auto inp_target = 
std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp_enc()); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens); ggml_set_input(inp_target->embd); cur = inp_target->embd;