diff --git a/conversion/__init__.py b/conversion/__init__.py index a587b1c37e..5aad203e53 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -136,6 +136,7 @@ TEXT_MODEL_MAP: dict[str, str] = { "LlamaModel": "llama", "Eagle3DraftModel": "llama", "Eagle3Speculator": "llama", + "Eagle3LlamaForCausalLM": "llama", "LlamaForCausalLMEagle3": "llama", "LlavaForConditionalGeneration": "llama", "LlavaStableLMEpochForCausalLM": "stablelm", diff --git a/conversion/llama.py b/conversion/llama.py index a0d39472eb..b43cc994aa 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -23,6 +23,7 @@ from .base import ModelBase, TextModel, gguf, logger "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", "LlamaForCausalLMEagle3", + "Eagle3LlamaForCausalLM", "Eagle3Speculator", "Eagle3DraftModel", "IQuestCoderForCausalLM", diff --git a/docs/speculative.md b/docs/speculative.md index 43d1818589..8f91256c4a 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -13,6 +13,45 @@ The `llama-server` application supports several implementations of speculative d A much smaller model (called the _draft model_) generates drafts. A draft model is the most used approach in speculative decoding. +### EAGLE-3 (`draft-eagle3`) + +EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it +reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer +trained for a specific target model; it shares the target model's tokenizer and, optionally, uses a reduced draft +vocabulary with its own `lm_head`, which is mapped back using a `d2t` table. + +Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer +indices to read. 
Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM` +checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3) +for `Qwen/Qwen3-4B`): + +```bash +python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \ + --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf + +llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3 +``` + +Supported EAGLE-3 draft models include: + +- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B) +- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B) +- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3) +- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3) +- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3) +- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3) +- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3) +- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3) +- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3) +- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3) +- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3) +- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3) +- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3) +- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16) +- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context) + +For the full and up-to-date list of supported models, see 
[#18039](https://github.com/ggml-org/llama.cpp/pull/18039). + ### n-gram Cache (`ngram-cache`) An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences. @@ -108,7 +147,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] +--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) @@ -247,6 +286,7 @@ Specifies a comma-separated list of speculative decoding types to use. |------|-------------| | `none` | No speculative decoding (default) | | `draft-simple` | Use a simple draft model for speculation | +| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states | | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching |