mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
spec: add EAGLE3 speculative decoding support (#18039)
* llama : enable layer input extraction * spec: support eagle3 * eagle3: fix params bug * eagle3: support Gemma4 eagle3 from RedHatAI * eagle3: set sync when get features from target Co-authored-by: tnhnyzc <115956684+tnhnyzc@users.noreply.github.com> * eagle3 : fix ubatch handling in embd_layer_inp extraction and encoder Co-authored-by: Doğaç Eldenk <dogacel@gmail.com> * eagle3: adapt to upstream changes * eagle3: fix rebase issues and adapt to upstream changes * eagle3:exclude the eagle3 arch from test-llama-archs * eagle3: fix editorconfig check failures * eagle3: fix multi-seq issue in d2t vocab mapping * cont : minor style / clean-up * spec : remove `common_speculative_setup_draft_model()` * llama : clean-up unused API * eagle3: set d2t vocab mapping in decode graph * cont : assert layer inputs are configured * hparams : use n_embd_inp instead of n_embd_target_features * eagle3: make output.weight optional and inherit from target model when needed * haparams : generic norm-before-residual param * llama-ext : consistent names * cont : fix * hparams : remove target_hidden_size * cparams : rename output_layer_inp -> embeddings_layer_inp * arch : reuse ATTN_NORM_2 instead of adding new hidden norm * llama : clean-up names * cont : add assert + comment * Update conversion/llama.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: tnhnyzc <115956684+tnhnyzc@users.noreply.github.com> Co-authored-by: Doğaç Eldenk <dogacel@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
85f99dca8b
commit
88a39274ec
@ -375,31 +375,437 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// EAGLE3 speculative decoding state
|
||||
//
|
||||
// Input of draft decoder: (This is different compared to MTP)
|
||||
// At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P.
|
||||
// - t_{P+1} = token at sequence pos P+1 (the *next* token after P)
|
||||
// - g_P = encoder output = projection of target's extracted hidden states at P
|
||||
//
|
||||
// Deferred boundary (MTP doesn't have this issue):
|
||||
// Within a single process() call with n_tokens, we can only write decoder KV for
|
||||
// training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens}
|
||||
// which lies *outside* this batch — it is either the token the target will sample next or the first token of the next ubatch.
|
||||
// So the last training pos of each process() call is *deferred* to whichever next call has
|
||||
// the missing token in hand:
|
||||
// - multi-ubatch prefill: the next process()'s first token completes the pair
|
||||
// (handled by the per-seq "cross-ubatch bridge")
|
||||
// - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last"
|
||||
// (target's freshest sample) to complete the pair
|
||||
//
|
||||
// Per-seq carry-over state:
|
||||
// pending_g_last [n_embd_dec] ┐ the deferred boundary's (g, pos). Set by
|
||||
// pending_pos_last llama_pos ┘ process() at end of ubatch (= last row);
|
||||
// rebased by accept() to first-non-accepted pos.
|
||||
// verify_g [N × n_embd_dec] snapshot of process()'s encoder output;
|
||||
// verify_pos_first llama_pos consumed by accept() to recover the right
|
||||
// verify_g_rows int32_t pending_g_last row for any n_accepted value.
|
||||
//
|
||||
// Performance is good overall, but there is waste in the verify cycle:
|
||||
// process() runs encoder + decoder on the *full* verify batch including rows for
|
||||
// rejected drafts. The KV at those positions is then dropped.
|
||||
//
|
||||
// TODO: Not sure if we need optimization for this waste?
|
||||
// If so, we may need a hybrid stash:
|
||||
// in verify mode, have process() only stash features and let draft()'s seed step run
|
||||
// encoder+decoder on n_accepted+1 rows.
|
||||
struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
//common_params_speculative_eagle3 params;
|
||||
common_params_speculative_draft params;
|
||||
llama_batch batch;
|
||||
|
||||
std::vector<common_sampler_ptr> smpls;
|
||||
|
||||
int32_t n_embd_dec = 0; // draft hidden size
|
||||
int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size
|
||||
int32_t n_embd_tgt = 0; // target model hidden size
|
||||
|
||||
const int32_t * target_layer_ids = nullptr; // model_dft's extract layer indices
|
||||
uint32_t target_layer_ids_n = 0;
|
||||
|
||||
// [per-seq] deferred boundary state
|
||||
std::vector<std::vector<float>> pending_g_last;
|
||||
std::vector<llama_pos> pending_pos_last;
|
||||
|
||||
// [per-seq] snapshot of the most recent process()'s encoder output
|
||||
std::vector<std::vector<float>> verify_g; // [n_seq][n_rows * n_embd_dec]
|
||||
std::vector<llama_pos> verify_pos_first; // [n_seq] — pos of verify_g[seq][0]
|
||||
std::vector<int32_t> verify_g_rows; // [n_seq] — number of rows
|
||||
|
||||
// scratch buffer for concatenated target features [n_tokens, n_embd_enc]
|
||||
std::vector<float> features_buf;
|
||||
std::vector<float> g_embd_buf;
|
||||
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
|
||||
, params(params.draft)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
|
||||
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set");
|
||||
|
||||
const llama_model * model_dft = llama_get_model(ctx_dft);
|
||||
const llama_model * model_tgt = llama_get_model(ctx_tgt);
|
||||
|
||||
target_layer_ids = llama_model_target_layer_ids (model_dft);
|
||||
target_layer_ids_n = llama_model_target_layer_ids_n(model_dft);
|
||||
if (target_layer_ids_n != 3) {
|
||||
throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " +
|
||||
std::to_string(target_layer_ids_n) + ")");
|
||||
}
|
||||
|
||||
n_embd_tgt = llama_model_n_embd(model_tgt);
|
||||
n_embd_dec = llama_model_n_embd(model_dft);
|
||||
n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt;
|
||||
|
||||
const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
|
||||
batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
|
||||
// llama_batch_init allocates only one of token/embd; eagle3 decoder needs both.
|
||||
// TODO: fix, how to call without malloc
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b);
|
||||
|
||||
smpls.resize(n_seq);
|
||||
for (auto & s : smpls) {
|
||||
common_params_sampling sparams;
|
||||
sparams.no_perf = false;
|
||||
sparams.top_k = 10;
|
||||
sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
|
||||
// turn on extraction of the target layers' input embeddings
|
||||
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
|
||||
llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
|
||||
}
|
||||
|
||||
// turn on extraction of the draft model's pre-norm hidden state
|
||||
// (used both for the encoder output g_embd and the decoder pre-norm output).
|
||||
llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
|
||||
pending_pos_last.assign(n_seq, -1);
|
||||
|
||||
verify_g.assign(n_seq, std::vector<float>());
|
||||
verify_pos_first.assign(n_seq, -1);
|
||||
verify_g_rows.assign(n_seq, 0);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
~common_speculative_impl_draft_eagle3() override {
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
}
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
bool process(const llama_batch & /*batch*/) override {
|
||||
// TODO: implement
|
||||
void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
|
||||
const int32_t N = (int32_t) prompt.size();
|
||||
if (N <= 0) {
|
||||
return;
|
||||
}
|
||||
// expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to
|
||||
// draft()'s seed step). Warn only if more than one position is missing.
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pos_max < N - 2) {
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
|
||||
"Drafts may degrade.\n",
|
||||
__func__, (int) pos_max, N - 2);
|
||||
}
|
||||
}
|
||||
|
||||
bool process(const llama_batch & batch_in) override {
|
||||
if (batch_in.n_tokens <= 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (batch_in.token == nullptr || batch_in.embd != nullptr) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const int32_t n_tokens = batch_in.n_tokens;
|
||||
|
||||
// i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's
|
||||
// first/last token in batch_in. Assumes per-seq tokens are contiguous within
|
||||
// the ubatch (server's default ordering).
|
||||
std::vector<int32_t> i_batch_beg(n_seq, -1);
|
||||
std::vector<int32_t> i_batch_end(n_seq, -1);
|
||||
for (int k = 0; k < n_tokens; ++k) {
|
||||
GGML_ASSERT(batch_in.n_seq_id[k] == 1);
|
||||
const llama_seq_id seq_id = batch_in.seq_id[k][0];
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
continue;
|
||||
}
|
||||
i_batch_end[seq_id] = k;
|
||||
if (i_batch_beg[seq_id] < 0) {
|
||||
i_batch_beg[seq_id] = k;
|
||||
}
|
||||
}
|
||||
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
|
||||
// Interleave each extract_layer's hidden state into a contiguous buffer of
|
||||
// shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder
|
||||
// to get one g_embd row per token.
|
||||
features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
|
||||
|
||||
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
|
||||
const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]);
|
||||
if (!layer) {
|
||||
GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]);
|
||||
}
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt;
|
||||
const float * src = layer + (size_t) i * n_embd_tgt;
|
||||
std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
g_embd_buf.resize((size_t) n_tokens * n_embd_dec);
|
||||
|
||||
// llama_encode() requires the full encoder batch to fit in n_ubatch.
|
||||
// Allow batch > ubatch: eagle3's per-token encoder can be chunked safely.
|
||||
const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft);
|
||||
for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) {
|
||||
const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i);
|
||||
|
||||
llama_batch enc_batch = {
|
||||
/*.n_tokens =*/ n_chunk,
|
||||
/*.token =*/ nullptr,
|
||||
/*.embd =*/ features_buf.data() + (size_t) i * n_embd_enc,
|
||||
/*.pos =*/ nullptr,
|
||||
/*.n_seq_id =*/ nullptr,
|
||||
/*.seq_id =*/ nullptr,
|
||||
/*.logits =*/ nullptr,
|
||||
};
|
||||
const int32_t rc = llama_encode(ctx_dft, enc_batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
|
||||
__func__, rc, (int) n_chunk, (int) i);
|
||||
return false;
|
||||
}
|
||||
|
||||
// g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
|
||||
const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft);
|
||||
GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
|
||||
std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
|
||||
g_embd_chunk,
|
||||
(size_t) n_chunk * n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
const float * g_embd = g_embd_buf.data();
|
||||
|
||||
const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
|
||||
|
||||
// EAGLE3 decoder input convention: at memory pos P the input pair is
|
||||
// (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd.
|
||||
//
|
||||
// Per seq, in order:
|
||||
// (a) cross-ubatch bridge — when applicable, write the previously-deferred
|
||||
// pos using this ubatch's first token + pending_g_last.
|
||||
// (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k])
|
||||
// at pos[k]. The last training pos (k=end) is left unwritten = new
|
||||
// deferred boundary, completed by the next process() or draft() call.
|
||||
// (c) refresh deferred state — stash this ubatch's full g_embd into verify_g,
|
||||
// update pending_g_last / pending_pos_last to the last row.
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
const int32_t beg = i_batch_beg[seq_id];
|
||||
const int32_t end = i_batch_end[seq_id];
|
||||
if (beg < 0 || end < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// cross-ubatch bridge — complete the prior ubatch's deferred boundary.
|
||||
// Fires iff all three preconditions hold:
|
||||
// 1) pending_pos_last >= 0
|
||||
// 2) pending_pos_last + 1 == pos[beg]
|
||||
// 3) pending_pos_last > dft_pos_max // TODO: is this check needed?
|
||||
const llama_pos pending_pos = pending_pos_last[seq_id];
|
||||
if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) {
|
||||
const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pending_pos > dft_pos_max) {
|
||||
common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
pending_g_last[seq_id].data(), row_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t k = beg; k < end; ++k) {
|
||||
common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
g_embd + (size_t) k * n_embd_dec, row_bytes);
|
||||
}
|
||||
|
||||
// refresh deferred state
|
||||
const int32_t n_rows = end - beg + 1;
|
||||
verify_pos_first[seq_id] = batch_in.pos[beg];
|
||||
pending_pos_last[seq_id] = batch_in.pos[end];
|
||||
verify_g_rows[seq_id] = n_rows;
|
||||
verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f);
|
||||
std::memcpy(verify_g[seq_id].data(), g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows);
|
||||
std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens > 0) {
|
||||
const int32_t rc = llama_decode(ctx_dft, batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
|
||||
__func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void draft(common_speculative_draft_params_vec & /*dparams*/) override {
|
||||
// TODO: implement
|
||||
void draft(common_speculative_draft_params_vec & dparams) override {
|
||||
auto & ctx_dft = params.ctx_dft;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
// keep track of which sequences are still drafting
|
||||
int n_drafting = 0;
|
||||
std::vector<bool> drafting(n_seq);
|
||||
|
||||
const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
|
||||
|
||||
// Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory
|
||||
// pos pending_pos_last. dp.id_last is target's freshest sample (= corrected
|
||||
// token after verify, or first generated token after prefill), matching the
|
||||
// EAGLE3 input convention (token[P+1], g_embd[P]) at pos P.
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
auto & dp = dparams[seq_id];
|
||||
|
||||
if (!dp.drafting) {
|
||||
continue;
|
||||
}
|
||||
if (pending_pos_last[seq_id] < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
n_drafting++;
|
||||
drafting[seq_id] = true;
|
||||
common_sampler_reset(smpls[seq_id].get());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1);
|
||||
|
||||
common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
|
||||
pending_g_last[seq_id].data(),
|
||||
row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
|
||||
return;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
|
||||
while (n_drafting > 0) {
|
||||
int i_batch = 0;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
if (!drafting[seq_id]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto * smpl = smpls[seq_id].get();
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, i_batch, true);
|
||||
// pre-norm hidden state of this position becomes g_embd for the next step
|
||||
const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
|
||||
++i_batch;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||
|
||||
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||
LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
|
||||
common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||
}
|
||||
|
||||
const llama_token id = cur_p->data[0].id;
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
// (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop)
|
||||
if (cur_p->data[0].p < params.p_min) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
auto & dp = dparams.at(seq_id);
|
||||
auto & result = *dp.result;
|
||||
|
||||
result.push_back(id);
|
||||
|
||||
if (params.n_max <= (int) result.size()) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
continue;
|
||||
}
|
||||
|
||||
common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
++i;
|
||||
}
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
auto & dp = dparams[seq_id];
|
||||
if (!dp.drafting) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (dp.result->size() < (size_t) params.n_min) {
|
||||
dp.result->clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t n_rows = verify_g_rows[seq_id];
|
||||
if (n_rows <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t i_g = std::min<int32_t>(n_accepted, n_rows - 1);
|
||||
pending_pos_last[seq_id] = verify_pos_first[seq_id] + i_g;
|
||||
std::memcpy(pending_g_last[seq_id].data(),
|
||||
verify_g[seq_id].data() + (size_t) i_g * n_embd_dec,
|
||||
(size_t) n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
bool need_embd() const override {
|
||||
@ -1370,9 +1776,11 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
|
||||
|
||||
bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
|
||||
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
|
||||
bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
|
||||
bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
|
||||
|
||||
|
||||
|
||||
bool has_ngram_cache = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE));
|
||||
bool has_ngram_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE));
|
||||
bool has_ngram_map_k = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K));
|
||||
|
||||
@ -130,6 +130,9 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"LlamaBidirectionalModel": "llama",
|
||||
"LlamaForCausalLM": "llama",
|
||||
"LlamaModel": "llama",
|
||||
"Eagle3DraftModel": "llama",
|
||||
"Eagle3Speculator": "llama",
|
||||
"LlamaForCausalLMEagle3": "llama",
|
||||
"LlavaForConditionalGeneration": "llama",
|
||||
"LlavaStableLMEpochForCausalLM": "stablelm",
|
||||
"MPTForCausalLM": "mpt",
|
||||
|
||||
@ -94,6 +94,7 @@ class ModelBase:
|
||||
metadata: gguf.Metadata
|
||||
dir_model_card: Path
|
||||
remote_hf_model_id: str | None
|
||||
target_model_dir: Path | None
|
||||
|
||||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
@ -119,6 +120,7 @@ class ModelBase:
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
|
||||
disable_mistral_community_chat_template: bool = False,
|
||||
sentence_transformers_dense_modules: bool = False,
|
||||
target_model_dir: Path | None = None,
|
||||
fuse_gate_up_exps: bool = False,
|
||||
fp8_as_q8: bool = False):
|
||||
if type(self) is ModelBase or \
|
||||
@ -139,6 +141,7 @@ class ModelBase:
|
||||
self.dry_run = dry_run
|
||||
self.remote_hf_model_id = remote_hf_model_id
|
||||
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
|
||||
self.target_model_dir = target_model_dir
|
||||
self.fuse_gate_up_exps = fuse_gate_up_exps
|
||||
self._gate_exp_buffer: dict[int, Tensor] = {}
|
||||
self._up_exp_buffer: dict[int, Tensor] = {}
|
||||
@ -2481,6 +2484,7 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
torch.float16: np.float16,
|
||||
torch.float32: np.float32,
|
||||
torch.uint8: np.uint8,
|
||||
torch.int64: np.int64,
|
||||
}
|
||||
|
||||
# only used when byteswapping data. Only correct size is needed
|
||||
|
||||
@ -5,12 +5,13 @@ import math
|
||||
|
||||
from typing import Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import ModelBase, TextModel, gguf
|
||||
from .base import ModelBase, TextModel, gguf, logger
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
@ -21,6 +22,9 @@ from .base import ModelBase, TextModel, gguf
|
||||
"VLlama3ForCausalLM",
|
||||
"LlavaForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
"LlamaForCausalLMEagle3",
|
||||
"Eagle3Speculator",
|
||||
"Eagle3DraftModel",
|
||||
"IQuestCoderForCausalLM",
|
||||
"LlamaModel")
|
||||
class LlamaModel(TextModel):
|
||||
@ -39,7 +43,61 @@ class LlamaModel(TextModel):
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
self.is_eagle3 = True
|
||||
self.model_arch = gguf.MODEL_ARCH.EAGLE3
|
||||
logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
|
||||
# Re-initialize tensor_map with eagle3 architecture
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
# Update gguf_writer architecture
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
if self.target_model_dir is None:
|
||||
raise ValueError(
|
||||
"EAGLE-3 model requires --target-model-dir to be specified. "
|
||||
"Please provide the path to the target model directory to read config.json"
|
||||
)
|
||||
# Read both eagle3 raw config and target model config
|
||||
with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
|
||||
eagle3_raw_config = json.load(f)
|
||||
with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
|
||||
target_config = json.load(f)
|
||||
|
||||
if "text_config" in target_config:
|
||||
target_config = {**target_config, **target_config["text_config"]}
|
||||
self.target_vocab_size = target_config["vocab_size"]
|
||||
|
||||
# target_layers: derived from target model layer count (low/mid/high)
|
||||
target_num_layers = target_config["num_hidden_layers"]
|
||||
target_layers = [2, target_num_layers // 2, target_num_layers - 3]
|
||||
logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
|
||||
self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
|
||||
|
||||
# target_hidden_size: prefer eagle3 config, fallback to target config
|
||||
if eagle3_raw_config.get("target_hidden_size") is not None:
|
||||
target_hidden_size = eagle3_raw_config["target_hidden_size"]
|
||||
src = "EAGLE-3 config"
|
||||
else:
|
||||
target_hidden_size = target_config["hidden_size"]
|
||||
src = "target model config"
|
||||
logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
|
||||
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
|
||||
|
||||
# norm_before_residual (RedHat-style eagle3 specific)
|
||||
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
|
||||
logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
|
||||
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
|
||||
|
||||
def set_vocab(self):
|
||||
# eagle3: use tokenizer from target model if provided
|
||||
original_dir_model = None
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
assert self.target_model_dir is not None
|
||||
logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
|
||||
original_dir_model = self.dir_model
|
||||
self.dir_model = self.target_model_dir
|
||||
|
||||
if self.origin_hf_arch == "GlmasrModel":
|
||||
return self._set_vocab_glmedge()
|
||||
|
||||
@ -85,6 +143,10 @@ class LlamaModel(TextModel):
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
# eagle3: Restore original dir_model
|
||||
if original_dir_model is not None:
|
||||
self.dir_model = original_dir_model
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
@ -129,7 +191,49 @@ class LlamaModel(TextModel):
|
||||
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
tensors = super().index_tensors(remote_hf_model_id)
|
||||
|
||||
# Handle Eagle3Speculator nested config
|
||||
if "transformer_layer_config" in self.hparams:
|
||||
self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
|
||||
|
||||
# eagle3 detection
|
||||
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
|
||||
logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
|
||||
new_tensors = {}
|
||||
for name, gen in tensors.items():
|
||||
if name.startswith("midlayer."):
|
||||
new_name = "model.layers.0." + name[len("midlayer."):]
|
||||
new_tensors[new_name] = gen
|
||||
elif name.startswith("layers.0."): # Eagle3Speculator format
|
||||
new_name = "model." + name
|
||||
new_tensors[new_name] = gen
|
||||
else:
|
||||
new_tensors[name] = gen
|
||||
return new_tensors
|
||||
|
||||
return tensors
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# eagle3: special tensors that bypass standard llama mapping
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
if name == "fc.weight":
|
||||
yield (name, data_torch)
|
||||
return
|
||||
if name == "d2t":
|
||||
# store for manual int64 handling in prepare_tensors (avoid F32 conversion)
|
||||
if not hasattr(self, '_eagle3_int_tensors'):
|
||||
self._eagle3_int_tensors = {}
|
||||
self._eagle3_int_tensors[name] = data_torch
|
||||
return
|
||||
if name == "t2d":
|
||||
# not used at runtime, skip
|
||||
return
|
||||
if name.endswith(".hidden_norm.weight"):
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
|
||||
return
|
||||
|
||||
n_head = self.find_hparam(["n_heads", "num_attention_heads"])
|
||||
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
|
||||
|
||||
@ -205,8 +309,33 @@ class LlamaModel(TextModel):
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||
|
||||
def prepare_tensors(self):
|
||||
# eagle3: collect d2t original dtype before parent converts tensors to F32
|
||||
eagle3_original_dtypes = {}
|
||||
if getattr(self, 'is_eagle3', False):
|
||||
for name, data_torch in self.get_tensors():
|
||||
if name == "d2t":
|
||||
eagle3_original_dtypes[name] = data_torch.dtype
|
||||
|
||||
super().prepare_tensors()
|
||||
|
||||
# eagle3: write d2t as absolute target token ids
|
||||
if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
|
||||
for name, data_torch in self._eagle3_int_tensors.items():
|
||||
old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
|
||||
data = data_torch.to(torch.int64).cpu().numpy()
|
||||
if name == "d2t":
|
||||
data = data.reshape(-1)
|
||||
data = data + np.arange(data.size, dtype=np.int64)
|
||||
if np.any((data < 0) | (data >= self.target_vocab_size)):
|
||||
raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
|
||||
if np.unique(data).size != data.size:
|
||||
raise ValueError("EAGLE-3 d2t contains duplicate target ids")
|
||||
data_qtype = gguf.GGMLQuantizationType.I64
|
||||
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
|
||||
logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
|
||||
@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace:
|
||||
help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--target-model-dir", type=str, default=None,
|
||||
help=(
|
||||
"path to the target model directory; required when converting a standalone draft model "
|
||||
"(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
|
||||
"layer count to populate its GGUF."
|
||||
),
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.print_supported_models and args.model is None:
|
||||
parser.error("the following arguments are required: model")
|
||||
@ -269,6 +278,7 @@ def main() -> None:
|
||||
small_first_shard=args.no_tensor_first_split,
|
||||
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
|
||||
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
|
||||
target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
|
||||
fuse_gate_up_exps=args.fuse_gate_up_exps,
|
||||
fp8_as_q8=args.fp8_as_q8,
|
||||
)
|
||||
|
||||
@ -154,6 +154,9 @@ class Keys:
|
||||
HIDDEN_ACT = "{arch}.hidden_activation"
|
||||
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
|
||||
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
|
||||
TARGET_LAYERS = "{arch}.target_layers"
|
||||
TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
|
||||
NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@ -511,6 +514,7 @@ class MODEL_ARCH(IntEnum):
|
||||
RND1 = auto()
|
||||
PANGU_EMBED = auto()
|
||||
MISTRAL3 = auto()
|
||||
EAGLE3 = auto()
|
||||
MISTRAL4 = auto()
|
||||
PADDLEOCR = auto()
|
||||
MIMO2 = auto()
|
||||
@ -901,14 +905,17 @@ class MODEL_TENSOR(IntEnum):
|
||||
A_PER_DIM_K_SCALE = auto() # gemma4
|
||||
A_PER_DIM_SCALE = auto() # gemma4
|
||||
# nextn/mtp
|
||||
NEXTN_PROJ_PRE = auto()
|
||||
NEXTN_PROJ_POST = auto()
|
||||
NEXTN_EH_PROJ = auto()
|
||||
NEXTN_EMBED_TOKENS = auto()
|
||||
NEXTN_ENORM = auto()
|
||||
NEXTN_HNORM = auto()
|
||||
NEXTN_PROJ_PRE = auto()
|
||||
NEXTN_PROJ_POST = auto()
|
||||
NEXTN_EH_PROJ = auto()
|
||||
NEXTN_EMBED_TOKENS = auto()
|
||||
NEXTN_ENORM = auto()
|
||||
NEXTN_HNORM = auto()
|
||||
NEXTN_SHARED_HEAD_HEAD = auto()
|
||||
NEXTN_SHARED_HEAD_NORM = auto()
|
||||
# eagle3
|
||||
FC = auto() # feature fusion layer
|
||||
D2T = auto() # draft to target vocabulary mapping
|
||||
# lfm2 audio
|
||||
A_ENC_NORM_CONV = auto()
|
||||
A_ENC_LINEAR_POS = auto()
|
||||
@ -1063,6 +1070,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.RND1: "rnd1",
|
||||
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
|
||||
MODEL_ARCH.MISTRAL3: "mistral3",
|
||||
MODEL_ARCH.EAGLE3: "eagle3",
|
||||
MODEL_ARCH.MISTRAL4: "mistral4",
|
||||
MODEL_ARCH.PADDLEOCR: "paddleocr",
|
||||
MODEL_ARCH.MIMO2: "mimo2",
|
||||
@ -1095,8 +1103,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
||||
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
||||
@ -1488,6 +1496,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
|
||||
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
|
||||
MODEL_TENSOR.FC: "fc",
|
||||
MODEL_TENSOR.D2T: "d2t",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
@ -4028,6 +4038,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.EAGLE3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM_2,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FC,
|
||||
MODEL_TENSOR.D2T,
|
||||
],
|
||||
MODEL_ARCH.MISTRAL4: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
@ -128,6 +127,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_RND1, "rnd1" },
|
||||
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_EAGLE3, "eagle3" },
|
||||
{ LLM_ARCH_MISTRAL4, "mistral4" },
|
||||
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
|
||||
{ LLM_ARCH_MIMO2, "mimo2" },
|
||||
@ -292,12 +292,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
|
||||
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
|
||||
|
||||
{ LLM_KV_TARGET_LAYERS, "%s.target_layers" },
|
||||
{ LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
|
||||
{ LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },
|
||||
|
||||
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
|
||||
// sentence-transformers dense modules feature dims
|
||||
{ LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
|
||||
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
|
||||
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
|
||||
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
|
||||
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
|
||||
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
|
||||
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
|
||||
|
||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
||||
@ -562,6 +566,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
||||
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
|
||||
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
|
||||
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
|
||||
{ LLM_TENSOR_FC, "fc" },
|
||||
{ LLM_TENSOR_D2T, "d2t" },
|
||||
};
|
||||
|
||||
// declare information about the model weight tensors:
|
||||
@ -788,6 +794,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||
// eagle3
|
||||
{LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
||||
};
|
||||
|
||||
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
||||
|
||||
@ -141,6 +141,7 @@ enum llm_arch {
|
||||
LLM_ARCH_KIMI_LINEAR,
|
||||
LLM_ARCH_TALKIE,
|
||||
LLM_ARCH_MELLUM,
|
||||
LLM_ARCH_EAGLE3,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@ -337,6 +338,10 @@ enum llm_kv {
|
||||
|
||||
LLM_KV_CLASSIFIER_OUTPUT_LABELS,
|
||||
|
||||
LLM_KV_TARGET_LAYERS,
|
||||
LLM_KV_TARGET_HIDDEN_SIZE,
|
||||
LLM_KV_NORM_BEFORE_RESIDUAL,
|
||||
|
||||
LLM_KV_SHORTCONV_L_CACHE,
|
||||
|
||||
LLM_KV_XIELU_ALPHA_N,
|
||||
@ -569,6 +574,8 @@ enum llm_tensor {
|
||||
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
|
||||
LLM_TENSOR_MASKED_EMBD_CENTROIDS,
|
||||
LLM_TENSOR_MASKED_EMBD_ORDERING,
|
||||
LLM_TENSOR_FC,
|
||||
LLM_TENSOR_D2T,
|
||||
};
|
||||
|
||||
|
||||
|
||||
@ -71,6 +71,9 @@ llama_context::llama_context(
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.warmup = false;
|
||||
|
||||
cparams.embeddings_layer_inp.resize(hparams.n_layer(), false);
|
||||
embd_layer_inp.resize(hparams.n_layer());
|
||||
|
||||
cparams.ctx_type = params.ctx_type;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
|
||||
@ -91,12 +94,21 @@ llama_context::llama_context(
|
||||
if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
if (params.ctx_other == nullptr) {
|
||||
// TODO: change from runtime_error to llama_exception to avoid printing error message
|
||||
throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)");
|
||||
throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)");
|
||||
}
|
||||
|
||||
cparams.ctx_other = params.ctx_other;
|
||||
}
|
||||
|
||||
if (model.arch == LLM_ARCH_EAGLE3) {
|
||||
if (model.tok_embd == nullptr || model.output == nullptr) {
|
||||
if (params.ctx_other == nullptr) {
|
||||
throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)");
|
||||
}
|
||||
cparams.ctx_other = params.ctx_other;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize backend samplers here so they are part of the sampling graph
|
||||
// before the reserve passes run later in this function. This avoids a later
|
||||
// re-reserve when graph nodes change.
|
||||
@ -194,7 +206,7 @@ llama_context::llama_context(
|
||||
|
||||
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
||||
|
||||
cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
|
||||
cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max;
|
||||
|
||||
cparams.op_offload = params.op_offload;
|
||||
cparams.kv_unified = params.kv_unified;
|
||||
@ -938,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the host-side buffer that holds the captured input embeddings of layer `lid`.
// Aborts via GGML_ASSERT when `lid` is out of range or no buffer is attached for that layer.
float * llama_context::get_embeddings_layer_inp(uint32_t lid) {
    // run the output reordering step before exposing any output data
    output_reorder();

    GGML_ASSERT(lid < embd_layer_inp.size() && embd_layer_inp[lid].has_data());

    const auto & layer_view = embd_layer_inp[lid];

    return layer_view.data;
}
|
||||
|
||||
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
|
||||
output_reorder();
|
||||
|
||||
@ -1125,6 +1145,17 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) {
|
||||
cparams.embeddings_nextn_masked = masked;
|
||||
}
|
||||
|
||||
// Enable or disable extraction of the input embeddings of layer `lid`.
//
// lid    - layer index; must be smaller than model.hparams.n_layer() (asserted)
// enable - true to capture the layer input on subsequent graph runs, false to stop
//
// Also flags the scheduler for a re-reserve, see the note below.
void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
    // lid is unsigned, so it is printed with %u
    LLAMA_LOG_DEBUG("%s: lid = %u, enable = %d\n", __func__, lid, enable);

    GGML_ASSERT(lid < model.hparams.n_layer());

    cparams.embeddings_layer_inp[lid] = enable;

    // note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected
    sched_need_reserve = true;
}
|
||||
|
||||
void llama_context::set_causal_attn(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
@ -1350,7 +1381,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
// eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
const int64_t n_vocab = model.vocab.n_tokens();
|
||||
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
@ -1925,6 +1957,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
}
|
||||
}
|
||||
|
||||
extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens);
|
||||
|
||||
// extract nextn embeddings before
|
||||
// only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
|
||||
{
|
||||
@ -2029,6 +2063,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_embd_out = hparams.n_embd_out();
|
||||
|
||||
bool has_logits = true;
|
||||
@ -2041,9 +2076,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
has_embd = true;
|
||||
}
|
||||
|
||||
|
||||
size_t backend_float_count = 0;
|
||||
size_t backend_token_count = 0;
|
||||
size_t embd_layer_inp_float_count = 0;
|
||||
|
||||
logits.size = has_logits ? n_vocab*n_outputs_max : 0;
|
||||
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
|
||||
@ -2055,6 +2090,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
embd_nextn.size = (size_t) n_embd_out * n_batch;
|
||||
}
|
||||
|
||||
for (bool enabled : cparams.embeddings_layer_inp) {
|
||||
if (enabled) {
|
||||
embd_layer_inp_float_count += (size_t) n_embd * n_batch;
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate backend sampling output buffers if there are backend samplers configured.
|
||||
const bool has_sampling = !sampling.samplers.empty();
|
||||
if (has_sampling) {
|
||||
@ -2069,8 +2110,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
|
||||
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
|
||||
const size_t new_size =
|
||||
(logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) +
|
||||
( backend_token_count) * sizeof(llama_token);
|
||||
(logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) +
|
||||
( backend_token_count) * sizeof(llama_token);
|
||||
|
||||
// alloc only when more than the current capacity is required
|
||||
// TODO: also consider shrinking the buffer
|
||||
@ -2087,6 +2128,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
logits.data = nullptr;
|
||||
embd.data = nullptr;
|
||||
embd_nextn.data = nullptr;
|
||||
for (auto & layer_inp : embd_layer_inp) {
|
||||
layer_inp = {nullptr, 0};
|
||||
}
|
||||
}
|
||||
|
||||
auto * buft = ggml_backend_cpu_buffer_type();
|
||||
@ -2118,6 +2162,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
embd_nextn = has_embd_nextn ? buffer_view<float>{(float *) (base + offset), embd_nextn.size} : buffer_view<float>{nullptr, 0};
|
||||
offset += embd_nextn.size * sizeof(float);
|
||||
|
||||
for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) {
|
||||
if (cparams.embeddings_layer_inp[il]) {
|
||||
embd_layer_inp[il] = buffer_view<float>{(float *) (base + offset), (size_t) n_embd * n_batch};
|
||||
offset += embd_layer_inp[il].size * sizeof(float);
|
||||
} else {
|
||||
embd_layer_inp[il] = buffer_view<float>{nullptr, 0};
|
||||
}
|
||||
}
|
||||
|
||||
if (has_sampling) {
|
||||
sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
|
||||
offset += sampling.logits.size * sizeof(float);
|
||||
@ -2164,6 +2217,34 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
return n_outputs_max;
|
||||
}
|
||||
|
||||
void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) {
|
||||
for (uint32_t il = 0; il < cparams.embeddings_layer_inp.size(); ++il) {
|
||||
if (!cparams.embeddings_layer_inp[il]) {
|
||||
continue;
|
||||
}
|
||||
if (!embd_layer_inp[il].has_data()) {
|
||||
GGML_ABORT("output layer input buffer not allocated");
|
||||
}
|
||||
ggml_tensor * t = res->get_layer_inp((int) il);
|
||||
if (!t) {
|
||||
GGML_ABORT("layer input tensor not found");
|
||||
}
|
||||
|
||||
const size_t nbytes = ggml_nbytes(t);
|
||||
const size_t nfloats = nbytes / sizeof(float);
|
||||
GGML_ASSERT(n_tokens > 0);
|
||||
GGML_ASSERT(nfloats % n_tokens == 0);
|
||||
|
||||
const size_t row_floats = nfloats / n_tokens;
|
||||
const size_t dst_offset = token_offset * row_floats;
|
||||
GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size);
|
||||
|
||||
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t);
|
||||
GGML_ASSERT(backend != nullptr);
|
||||
ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_context::output_reorder() {
|
||||
const uint64_t n_vocab = model.vocab.n_tokens();
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
@ -2190,6 +2271,16 @@ void llama_context::output_reorder() {
|
||||
}
|
||||
}
|
||||
|
||||
if (embd_layer_inp.size() > 0) {
|
||||
for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) {
|
||||
if (embd_layer_inp[lid].size > 0) {
|
||||
for (uint64_t k = 0; k < n_embd; ++k) {
|
||||
std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!sampling.samplers.empty()) {
|
||||
assert(sampling.logits.size > 0);
|
||||
assert(sampling.probs.size > 0);
|
||||
@ -3604,6 +3695,10 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) {
|
||||
ctx->set_embeddings_nextn(value, masked);
|
||||
}
|
||||
|
||||
// Public API wrapper: enable or disable capture of the input embeddings of layer `lid` on `ctx`.
// Forwards directly to llama_context::set_embeddings_layer_inp; `ctx` is dereferenced without a null check.
void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) {
|
||||
ctx->set_embeddings_layer_inp(lid, value);
|
||||
}
|
||||
|
||||
llama_memory_t llama_get_memory(const struct llama_context * ctx) {
|
||||
if (!ctx) {
|
||||
return nullptr;
|
||||
@ -3624,6 +3719,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) {
|
||||
return ctx->get_embeddings_nextn_ith(i);
|
||||
}
|
||||
|
||||
// Public API wrapper: returns the host buffer with the captured input embeddings of layer `lid`.
// `ctx` is dereferenced without a null check.
float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) {
|
||||
// synchronize the context before the buffer is handed out
ctx->synchronize();
|
||||
|
||||
return ctx->get_embeddings_layer_inp(lid);
|
||||
}
|
||||
|
||||
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
|
||||
return ctx->set_sampler(seq_id, smpl);
|
||||
}
|
||||
|
||||
@ -88,6 +88,8 @@ struct llama_context {
|
||||
float * get_embeddings_nextn();
|
||||
float * get_embeddings_nextn_ith(int32_t i);
|
||||
|
||||
float * get_embeddings_layer_inp(uint32_t lid);
|
||||
|
||||
llama_token * get_sampled_tokens() const;
|
||||
llama_token get_sampled_token_ith(int32_t idx);
|
||||
|
||||
@ -112,6 +114,7 @@ struct llama_context {
|
||||
|
||||
void set_embeddings (bool value);
|
||||
void set_embeddings_nextn(bool value, bool masked);
|
||||
void set_embeddings_layer_inp(uint32_t lid, bool enable);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
|
||||
@ -226,6 +229,10 @@ private:
|
||||
// map the output row index `i` to batch index
|
||||
int64_t output_resolve_row(int32_t i) const;
|
||||
|
||||
// async-copy enabled layer-input tensors (per cparams.embeddings_layer_inp)
|
||||
// from backend into host-side embd_layer_inp buffers
|
||||
void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens);
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
@ -288,6 +295,10 @@ private:
|
||||
// sets llm_graph_result::t_h_nextn
|
||||
buffer_view<float> embd_nextn = {nullptr, 0};
|
||||
|
||||
// host buffers for output layer input embeddings, per layer
|
||||
// populated when cparams.embeddings_layer_inp[il] is true
|
||||
std::vector<buffer_view<float>> embd_layer_inp;
|
||||
|
||||
struct sampling_info {
|
||||
// !samplers.empty() to check if any samplers are active
|
||||
std::map<llama_seq_id, llama_sampler *> samplers;
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#define LLAMA_MAX_SEQ 256
|
||||
|
||||
@ -44,6 +45,8 @@ struct llama_cparams {
|
||||
bool kv_unified;
|
||||
bool pipeline_parallel;
|
||||
|
||||
std::vector<bool> embeddings_layer_inp; // [n_layer()] extract input embeddings for layer
|
||||
|
||||
enum llama_context_type ctx_type;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
|
||||
@ -101,4 +101,20 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
|
||||
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
// Set whether the context outputs the input embeddings of a specific layer
|
||||
LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value);
|
||||
|
||||
// mirrors:
|
||||
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid);
|
||||
|
||||
LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// model/context data extraction
|
||||
//
|
||||
|
||||
// returns pointer to the target-model layer indices
|
||||
LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model);
|
||||
// returns the number of extracted layers from target model
|
||||
LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model);
|
||||
|
||||
@ -904,6 +904,10 @@ void llm_graph_result::reset() {
|
||||
t_logits = nullptr;
|
||||
t_embd = nullptr;
|
||||
t_embd_pooled = nullptr;
|
||||
|
||||
t_layer_inp.resize(LLAMA_MAX_LAYERS);
|
||||
std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
|
||||
|
||||
t_sampled.clear();
|
||||
t_sampled_probs.clear();
|
||||
t_sampled_logits.clear();
|
||||
@ -932,7 +936,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_result::set_outputs() {
|
||||
void llm_graph_result::set_outputs(const llm_graph_params & params) {
|
||||
if (t_logits != nullptr) {
|
||||
ggml_set_output(t_logits);
|
||||
}
|
||||
@ -945,6 +949,15 @@ void llm_graph_result::set_outputs() {
|
||||
if (t_h_nextn != nullptr) {
|
||||
ggml_set_output(t_h_nextn);
|
||||
}
|
||||
{
|
||||
const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp;
|
||||
for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) {
|
||||
if (embeddings_layer_inp[il]) {
|
||||
GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null");
|
||||
ggml_set_output(t_layer_inp[il]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto & [seq_id, t] : t_sampled) {
|
||||
if (t != nullptr) {
|
||||
ggml_set_output(t);
|
||||
|
||||
@ -705,6 +705,8 @@ public:
|
||||
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
|
||||
ggml_tensor * get_h_nextn() const { return t_h_nextn; }
|
||||
|
||||
// returns the tensor stored in t_layer_inp for layer `il`; `il` is not bounds-checked here
ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; }
|
||||
|
||||
ggml_cgraph * get_gf() const { return gf; }
|
||||
ggml_context * get_ctx() const { return ctx_compute.get(); }
|
||||
|
||||
@ -713,7 +715,7 @@ public:
|
||||
void reset();
|
||||
|
||||
void set_inputs(const llama_ubatch * ubatch);
|
||||
void set_outputs();
|
||||
void set_outputs(const llm_graph_params & params);
|
||||
|
||||
// try to update the existing graph result using the new graph parameters in order to reuse it
|
||||
// this can only be done if we determine that the resulting graph using the new graph parameters
|
||||
@ -734,10 +736,12 @@ public:
|
||||
ggml_tensor * t_embd_pooled = nullptr;
|
||||
ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm
|
||||
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_candidates;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
|
||||
std::vector<ggml_tensor *> t_layer_inp;
|
||||
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_candidates;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled;
|
||||
std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
|
||||
|
||||
std::vector<llm_graph_input_ptr> inputs;
|
||||
|
||||
|
||||
@ -45,6 +45,7 @@ struct llama_hparams {
|
||||
bool rope_finetuned;
|
||||
bool use_par_res;
|
||||
bool swin_norm;
|
||||
bool norm_before_residual = false;
|
||||
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
|
||||
@ -394,6 +394,7 @@ namespace GGUFMeta {
|
||||
|
||||
template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
|
||||
template bool llama_model_loader::get_arr<std::array<int32_t, 512>>(enum llm_kv kid, std::array<int32_t, 512> & result, bool required);
|
||||
template bool llama_model_loader::get_arr<std::vector<int32_t>>(enum llm_kv kid, std::vector<int32_t> & result, bool required);
|
||||
|
||||
template<typename T>
|
||||
bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
|
||||
|
||||
@ -287,6 +287,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
|
||||
return new llama_model_qwen35moe(params);
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
return new llama_model_mistral3(params);
|
||||
case LLM_ARCH_EAGLE3:
|
||||
return new llama_model_eagle3(params);
|
||||
case LLM_ARCH_MIMO2:
|
||||
return new llama_model_mimo2(params);
|
||||
case LLM_ARCH_KIMI_LINEAR:
|
||||
@ -2238,7 +2240,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
// TODO: move reranking logic here and generalize
|
||||
llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
|
||||
|
||||
llm->res->set_outputs();
|
||||
llm->res->set_outputs(params);
|
||||
|
||||
return llm->res->get_gf();
|
||||
}
|
||||
@ -2406,6 +2408,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_ERNIE4_5_MOE:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
case LLM_ARCH_EAGLE3:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
case LLM_ARCH_LLAMA_EMBED:
|
||||
case LLM_ARCH_MAINCODER:
|
||||
@ -2600,8 +2603,9 @@ uint64_t llama_model_n_params(const llama_model * model) {
|
||||
|
||||
bool llama_model_has_encoder(const llama_model * model) {
|
||||
switch (model->arch) {
|
||||
case LLM_ARCH_T5: return true;
|
||||
case LLM_ARCH_T5ENCODER: return true;
|
||||
case LLM_ARCH_T5:
|
||||
case LLM_ARCH_T5ENCODER:
|
||||
case LLM_ARCH_EAGLE3: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
@ -2687,3 +2691,12 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
|
||||
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
|
||||
const int32_t * llama_model_target_layer_ids(const struct llama_model * model) {
|
||||
const auto & v = model->target_layer_ids;
|
||||
return v.empty() ? nullptr : v.data();
|
||||
}
|
||||
|
||||
uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) {
|
||||
return (uint32_t) model->target_layer_ids.size();
|
||||
}
|
||||
|
||||
@ -569,6 +569,13 @@ struct llama_model {
|
||||
struct ggml_tensor * per_layer_model_proj = nullptr;
|
||||
struct ggml_tensor * per_layer_proj_norm = nullptr;
|
||||
|
||||
// eagle3
|
||||
struct ggml_tensor * fc = nullptr; // feature fusion layer
|
||||
struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping
|
||||
|
||||
// unified vector to store target-model extracted layer ids in eagle3, dflash, etc.
|
||||
std::vector<int32_t> target_layer_ids;
|
||||
|
||||
std::vector<llama_layer> layers;
|
||||
|
||||
//Dense linear projections for SentenceTransformers models like embeddinggemma
|
||||
|
||||
323
src/models/eagle3.cpp
Normal file
323
src/models/eagle3.cpp
Normal file
@ -0,0 +1,323 @@
|
||||
#include "models.h"
|
||||
|
||||
// Load the EAGLE3-specific hyperparameters from the GGUF metadata.
//
// Reads:
//   - the RMS norm epsilon
//   - the list of target-model layer ids (LLM_KV_TARGET_LAYERS, "<arch>.target_layers");
//     exactly 3 entries are required
//   - the target hidden size (LLM_KV_TARGET_HIDDEN_SIZE), used to derive the input
//     embedding width as n_layers_extracted * n_embd_tgt
//   - the optional norm_before_residual flag
//
// Throws std::runtime_error when the target layer list is missing or does not have 3 entries.
void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

    // the messages below name the key that is actually read ("target_layers")
    if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
        throw std::runtime_error("EAGLE3 model requires 'target_layers' in GGUF metadata");
    }
    if (target_layer_ids.size() != 3) {
        throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'target_layers'");
    }
    LLAMA_LOG_INFO("%s: EAGLE3 target_layers = [%d, %d, %d]\n", __func__,
            target_layer_ids[0],
            target_layer_ids[1],
            target_layer_ids[2]);

    uint32_t n_embd_tgt = 0;

    ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
    LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);

    // the encoder input is the concatenation of the extracted target layers
    hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;

    // eagle3 norm_before_residual (optional, default false)
    // compatible with the Red Hat eagle3 speculator model
    ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
    if (hparams.norm_before_residual) {
        LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
    }

    type = LLM_TYPE_UNKNOWN;
}
|
||||
|
||||
// Create the EAGLE3 draft-model tensors: the optional d2t vocabulary mapping, the
// feature fusion layer, the output head, the optional token embeddings and the
// per-layer attention / FFN weights.
void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
    LLAMA_LOAD_LOCALS;

    const int64_t n_embd_inp     = hparams.n_embd_inp();
    const int64_t n_embd_attn_in = 2 * n_embd;

    // d2t: draft to target vocabulary mapping
    // when present, its length defines the draft vocab size; otherwise the draft shares n_vocab
    int64_t n_draft_vocab = n_vocab;
    const struct ggml_tensor * d2t_info = ml->get_tensor_meta("d2t");
    if (d2t_info == nullptr) {
        d2t = nullptr;
        LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
    } else {
        n_draft_vocab = d2t_info->ne[0];
        d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0);
        LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
    }

    // feature fusion: maps the concatenated target features (n_embd_inp) to the draft width (n_embd)
    fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0);

    // output head, sized by the draft vocab; output.weight may be absent from the file
    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED);

    // token embeddings are only created when the file ships them
    const struct ggml_tensor * tok_embd_info = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
    if (tok_embd_info != nullptr) {
        const int64_t n_tok_vocab = tok_embd_info->ne[1];
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_tok_vocab}, 0);
        LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_tok_vocab);
    }

    // decoder layers
    for (int il = 0; il < n_layer; ++il) {
        auto & cur_layer = layers[il];

        // norm over the token-embedding half of the attention input
        cur_layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), {n_embd}, 0);

        // second norm, used by eagle3 for the fused target features
        cur_layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", il), {n_embd}, 0);

        // q/k/v take the 2*n_embd wide concatenated input
        cur_layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", il), {n_embd_attn_in, n_embd_head_k * n_head}, 0);
        cur_layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", il), {n_embd_attn_in, n_embd_k_gqa}, 0);
        cur_layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", il), {n_embd_attn_in, n_embd_v_gqa}, 0);
        cur_layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), {n_embd_head_k * n_head, n_embd}, 0);

        cur_layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", il), {n_embd}, 0);
        cur_layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd,   n_ff}, 0);
        cur_layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), {  n_ff, n_embd}, 0);
        cur_layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", il), {n_embd,   n_ff}, 0);

        // optional rope frequency factors
        cur_layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", il), {n_rot/2}, TENSOR_NOT_REQUIRED);
    }
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
|
||||
switch (params.gtype) {
|
||||
case LLM_GRAPH_TYPE_ENCODER:
|
||||
return std::make_unique<graph<true>>(*this, params);
|
||||
case LLM_GRAPH_TYPE_DEFAULT:
|
||||
case LLM_GRAPH_TYPE_DECODER:
|
||||
return std::make_unique<graph<false>>(*this, params);
|
||||
default:
|
||||
GGML_ABORT("invalid graph type");
|
||||
};
|
||||
}
|
||||
|
||||
template <>
|
||||
ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
|
||||
ggml_tensor * cur = nullptr;
|
||||
|
||||
// Input: Target model features (3 layers concatenated: low, mid, high)
|
||||
// Data will be provided via ubatch->embd in encode_eagle3_features()
|
||||
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
|
||||
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
|
||||
ggml_set_input(inp_target->embd);
|
||||
|
||||
cur = inp_target->embd;
|
||||
cb(cur, "inp_embd", -1);
|
||||
|
||||
res->add_input(std::move(inp_target));
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// eagle3 Encoder: processes target model features through feature fusion layer
|
||||
// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high
|
||||
// Output: g_embeddings e.g. [4096, n_tokens] stored in context
|
||||
template <>
|
||||
llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
ggml_tensor * cur = nullptr;
|
||||
|
||||
cur = build_inp_embd_enc();
|
||||
|
||||
// Feature fusion layer
|
||||
cur = build_lora_mm(model.fc, cur);
|
||||
cb(cur, "fc_out", -1);
|
||||
|
||||
// Output: g_embeddings e.g. [4096, n_tokens]
|
||||
// store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft)
|
||||
ggml_set_output(cur);
|
||||
res->t_h_nextn = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
// eagle3 Decoder: processes draft tokens using g_embeddings from encoder
|
||||
// Input: draft tokens + g_embeddings from encoder
|
||||
// Output: draft logits
|
||||
template <>
|
||||
llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
// eagle3 Decoder receives:
|
||||
// 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
|
||||
// 2. g_embeddings from encoder
|
||||
auto * tok_embd = model.tok_embd;
|
||||
if (model.tok_embd == nullptr) {
|
||||
GGML_ASSERT(cparams.ctx_other != nullptr);
|
||||
const auto * model_other = llama_get_model(cparams.ctx_other);
|
||||
|
||||
GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
|
||||
tok_embd = model_other->tok_embd;
|
||||
}
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
|
||||
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
ggml_set_input(inp->tokens);
|
||||
|
||||
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
||||
ggml_set_input(inp->embd);
|
||||
|
||||
ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
|
||||
cb(inp_embd, "inp_embd", -1);
|
||||
|
||||
ggml_tensor * inp_g = inp->embd;
|
||||
cb(inp_g, "inp_g_embeddings", -1);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
inpL = inp_g;
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
|
||||
|
||||
// Single decoder layer (il = 0)
|
||||
const int il = 0;
|
||||
{
|
||||
// Apply input_layernorm to the token embeddings
|
||||
ggml_tensor * embd_norm = build_norm(inp_embd,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(embd_norm, "embd_norm", il);
|
||||
|
||||
// Apply hidden_norm to inp_g
|
||||
ggml_tensor * g_norm = build_norm(inp_g,
|
||||
model.layers[il].attn_norm_2, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(g_norm, "g_norm", il);
|
||||
|
||||
// norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model)
|
||||
// - false (default): use raw inp_g for residual
|
||||
// - true: use normalized g_norm for residual
|
||||
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
|
||||
ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;
|
||||
|
||||
// Concatenate normalized inp_embd and normalized inp_g
|
||||
cur = ggml_concat(ctx0, embd_norm, g_norm, il);
|
||||
cb(cur, "concat_embd", il);
|
||||
|
||||
// Self-attention with concatenated input
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// rope freq factors, returns nullptr if not available
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// RoPE
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
cb(Kcur, "Kcur_rope", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
|
||||
// Add residual and update it
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// Apply FFN norm to the sum
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "post_attn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// Output norm with residual
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "eagle3_prenorm", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
// Output prenorm state (for next token's g_embeddings in autoregressive generation)
|
||||
ggml_set_output(cur);
|
||||
res->t_h_nextn = cur;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head - projects to draft vocabulary
|
||||
// if the draft has no own output projection, inherit the target model's lm_head
|
||||
auto * output = model.output;
|
||||
if (output == nullptr) {
|
||||
GGML_ASSERT(cparams.ctx_other != nullptr);
|
||||
const auto * model_other = llama_get_model(cparams.ctx_other);
|
||||
|
||||
GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
|
||||
output = model_other->output;
|
||||
}
|
||||
cur = build_lora_mm(output, cur);
|
||||
|
||||
if (model.d2t) {
|
||||
const int64_t n_draft_vocab = cur->ne[0];
|
||||
const int64_t n_outputs = cur->ne[1];
|
||||
const int64_t n_vocab = (int64_t) model.vocab.n_tokens();
|
||||
|
||||
GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
|
||||
GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);
|
||||
|
||||
ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
|
||||
cur = ggml_set_rows(ctx0, logits,
|
||||
ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs),
|
||||
ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1));
|
||||
cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
const int n_rot_l = hparams.n_rot(il);
|
||||
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
@ -124,6 +124,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@ -1089,6 +1089,21 @@ struct llama_model_glm_dsa : public llama_model_base {
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
struct llama_model_eagle3 : public llama_model_base {
|
||||
llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
void load_arch_hparams(llama_model_loader & ml) override;
|
||||
void load_arch_tensors(llama_model_loader & ml) override;
|
||||
|
||||
template <bool is_enc>
|
||||
struct graph : public llm_graph_context {
|
||||
graph(const llama_model & model, const llm_graph_params & params);
|
||||
|
||||
ggml_tensor * build_inp_embd_enc() const;
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
|
||||
struct llama_model_mistral4 : public llama_model_deepseek2 {
|
||||
llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {}
|
||||
|
||||
@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
|
||||
|
||||
@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@ -173,7 +173,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
|
||||
@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
|
||||
@ -450,6 +450,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
|
||||
if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
continue; // FIXME: ISWA KV cache initialization needs more fixture params
|
||||
}
|
||||
if (arch == LLM_ARCH_EAGLE3) {
|
||||
continue;
|
||||
}
|
||||
for (bool moe : {false, true}) {
|
||||
if (moe && !moe_implemented(arch)) {
|
||||
continue;
|
||||
@ -553,6 +556,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
|
||||
if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
continue; // FIXME: ISWA KV cache initialization needs more fixture params
|
||||
}
|
||||
if (arch == LLM_ARCH_EAGLE3) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
|
||||
for (bool moe : {false, true}) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user