Standardize speculative decoding arguments on the server (#1908)

* refactor spec args

* add shell-safe quoting of string-valued stage keys in speculative decoding
This commit is contained in:
Samuel Oliveira Alves 2026-06-04 10:44:57 -03:00 committed by GitHub
parent 6c0180d702
commit 007d640098
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 275 additions and 213 deletions

View File

@ -124,7 +124,16 @@ static int32_t common_speculative_stage_effective_n_min(
std::vector<common_speculative_stage_params> common_params_speculative::get_resolved_stages() const {
if (!stages.empty()) {
return stages;
std::vector<common_speculative_stage_params> resolved;
resolved.reserve(stages.size());
for (const auto & stage : stages) {
if (stage.type != COMMON_SPECULATIVE_TYPE_NONE) {
resolved.push_back(stage);
}
}
return resolved;
}
if (type == COMMON_SPECULATIVE_TYPE_NONE) {
@ -164,6 +173,9 @@ common_params_speculative common_params_speculative::with_stage_overrides(const
if (stage.has_suffix_max_depth_override()) {
result.suffix_max_depth = stage.suffix_max_depth;
}
if (stage.has_suffix_corpus_override()) {
result.suffix_corpus = stage.suffix_corpus;
}
result.n_max = std::max(result.n_max, 0);
result.n_min = std::max(0, std::min(result.n_min, result.n_max));
@ -612,28 +624,20 @@ static void common_speculative_finalize_stages(gpt_params & params) {
auto & spec = params.speculative;
if (!spec.stages.empty()) {
spec.type = spec.stages.front().type;
const auto resolved = spec.get_resolved_stages();
if (resolved.size() != spec.stages.size()) {
spec.stages = resolved;
}
spec.type = resolved.empty() ? COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type;
params.has_mtp = spec.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
return;
}
const bool wants_mtp = params.has_mtp;
const bool wants_draft = spec.has_dft();
if (spec.type != COMMON_SPECULATIVE_TYPE_NONE) {
spec.stages.push_back({ .type = spec.type });
if (common_speculative_type_is_self_spec(spec.type)) {
if (wants_mtp) {
} else if (params.has_mtp) {
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP });
} else if (wants_draft) {
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT });
}
}
} else if (wants_mtp) {
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP });
} else if (wants_draft) {
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT });
}
spec.type = spec.stages.empty() ? COMMON_SPECULATIVE_TYPE_NONE : spec.stages.front().type;
@ -827,11 +831,14 @@ static std::string common_normalize_spec_stage_key(std::string key) {
std::replace(key.begin(), key.end(), '-', '_');
if (key.rfind("spec_", 0) == 0) {
key.erase(0, 5);
return key;
}
return key;
// Builds the exception reported when a retired speculative-decoding flag is used.
//   arg         - the legacy command-line option as the user typed it
//   replacement - guidance text describing what to use instead
// The exception is returned, not thrown; the caller decides when to throw it.
static std::invalid_argument common_speculative_legacy_option_error(
        const std::string & arg,
        const std::string & replacement) {
    std::string message = "legacy speculative option '";
    message += arg;
    message += "' is disabled; use ";
    message += replacement;
    return std::invalid_argument(message);
}
static void common_speculative_remove_explicit_stage(common_params_speculative & params, common_speculative_type type) {
@ -850,21 +857,21 @@ static void common_speculative_stage_apply_kv(
const std::string & value_raw) {
const std::string key = common_normalize_spec_stage_key(key_raw);
if (key == "draft" || key == "draft_max" || key == "draft_n" || key == "n_max") {
if (key == "n_max") {
stage.n_max = std::stoi(value_raw);
if (stage.n_max < 0) {
throw std::invalid_argument("speculative stage n_max must be >= 0");
}
return;
}
if (key == "draft_min" || key == "draft_n_min" || key == "n_min") {
if (key == "n_min") {
stage.n_min = std::stoi(value_raw);
if (stage.n_min < 0) {
throw std::invalid_argument("speculative stage n_min must be >= 0");
}
return;
}
if (key == "draft_p_min" || key == "p_min") {
if (key == "p_min") {
stage.p_min = std::stof(value_raw);
if (stage.p_min < 0.0f) {
throw std::invalid_argument("speculative stage p_min must be >= 0");
@ -892,7 +899,7 @@ static void common_speculative_stage_apply_kv(
}
return;
}
if (key == "suffix_min_match_len" || key == "suffix_pattern_len") {
if (key == "suffix_min_match_len") {
stage.suffix_min_match_len = std::stoi(value_raw);
if (stage.suffix_min_match_len < 1) {
throw std::invalid_argument("speculative stage suffix_min_match_len must be at least 1");
@ -906,10 +913,100 @@ static void common_speculative_stage_apply_kv(
}
return;
}
if (key == "suffix_corpus") {
stage.suffix_corpus = value_raw;
if (stage.suffix_corpus.empty()) {
throw std::invalid_argument("speculative stage suffix_corpus must not be empty");
}
return;
}
throw std::invalid_argument("unknown speculative stage parameter: " + key_raw);
}
// Splits the option payload of a speculative stage into raw "key=value" pieces.
//
// Pieces are separated by commas, except that a comma does not split when it is
//   - preceded by a backslash, or
//   - inside a single- or double-quoted region; a quote only opens such a
//     region when it directly follows '='.
// Backslashes and quote characters are kept verbatim in the returned pieces.
// The final piece is always emitted, so an empty input yields one empty string.
//
// Throws std::invalid_argument if a quoted region is still open at end of input.
static std::vector<std::string> common_speculative_stage_split_kvs(const std::string & values) {
    std::vector<std::string> pieces;
    std::string token;
    char open_quote = '\0';        // active quote character, '\0' when outside quotes
    bool after_backslash = false;  // previous character was an unescaped backslash

    for (size_t pos = 0; pos < values.size(); ++pos) {
        const char c = values[pos];

        if (after_backslash) {
            // Whatever follows a backslash is copied literally.
            after_backslash = false;
            token.push_back(c);
        } else if (c == '\\') {
            after_backslash = true;
            token.push_back(c);
        } else if (open_quote != '\0') {
            token.push_back(c);
            if (c == open_quote) {
                open_quote = '\0';
            }
        } else if (c == ',') {
            pieces.push_back(token);
            token.clear();
        } else {
            // Inspect the token before appending: the quote must directly follow '='.
            const bool is_quote_char = (c == '\'' || c == '"');
            if (is_quote_char && !token.empty() && token.back() == '=') {
                open_quote = c;
            }
            token.push_back(c);
        }
    }

    if (open_quote != '\0') {
        throw std::invalid_argument("invalid speculative stage option list: unterminated quote");
    }

    pieces.push_back(token);
    return pieces;
}
// Converts a raw stage option value into its final string form.
//
// Step 1: if the value is at least two characters long and both ends are the
//         same quote character (' or "), that outer pair is dropped.
// Step 2: the escape sequences \\ \, \' and \" collapse to their second
//         character. Any other backslash, including a trailing one, is kept.
static std::string common_speculative_stage_unescape_value(const std::string & value_raw) {
    // [begin, end) is the window of value_raw that remains after quote stripping.
    size_t begin = 0;
    size_t end = value_raw.size();
    if (end >= 2) {
        const char head = value_raw[0];
        const char tail = value_raw[end - 1];
        if (head == tail && (head == '\'' || head == '"')) {
            ++begin;
            --end;
        }
    }

    std::string out;
    out.reserve(end - begin);

    size_t pos = begin;
    while (pos < end) {
        const char c = value_raw[pos];
        if (c == '\\' && pos + 1 < end) {
            const char following = value_raw[pos + 1];
            const bool is_escapable =
                following == '\\' || following == ',' || following == '\'' || following == '"';
            if (is_escapable) {
                out.push_back(following);
                pos += 2;
                continue;
            }
        }
        out.push_back(c);
        ++pos;
    }
    return out;
}
static common_speculative_stage_params common_speculative_stage_from_arg(const std::string & value) {
const auto spec_pos = value.find(':');
const std::string type_name = value.substr(0, spec_pos);
@ -924,15 +1021,13 @@ static common_speculative_stage_params common_speculative_stage_from_arg(const s
return stage;
}
std::stringstream ss(value.substr(spec_pos + 1));
std::string kv;
while (std::getline(ss, kv, ',')) {
for (const std::string & kv : common_speculative_stage_split_kvs(value.substr(spec_pos + 1))) {
const auto eq_pos = kv.find('=');
if (eq_pos == std::string::npos) {
throw std::invalid_argument("invalid speculative stage option: " + kv);
}
common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), kv.substr(eq_pos + 1));
common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), common_speculative_stage_unescape_value(kv.substr(eq_pos + 1)));
}
return stage;
@ -1379,18 +1474,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--draft" || arg == "--draft-max" || arg == "--draft-n") {
CHECK_ARG
params.speculative.n_max = std::stoi(argv[i]);
return true;
throw common_speculative_legacy_option_error(arg,
"the value inside the relevant repeated --spec-type entry, e.g. --spec-type mtp:n_max=" + std::string(argv[i]) + ",p_min=0.0 or --spec-type draft:n_max=" + std::string(argv[i]) + ",p_min=0.0");
}
if (arg == "--draft-min" || arg == "--draft-n-min") {
CHECK_ARG
params.speculative.n_min = std::stoi(argv[i]);
return true;
throw common_speculative_legacy_option_error(arg,
"the value inside the relevant repeated --spec-type entry using the canonical key n_min, e.g. --spec-type ngram-mod:n_min=" + std::string(argv[i]));
}
if (arg == "--draft-p-min") {
CHECK_ARG
params.speculative.p_min = std::stof(argv[i]);
return true;
throw common_speculative_legacy_option_error(arg,
"the value inside the relevant repeated --spec-type entry using the canonical key p_min, e.g. --spec-type mtp:p_min=" + std::string(argv[i]));
}
if (arg == "--recurrent-ckpt-mode") {
CHECK_ARG
@ -1445,89 +1540,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--spec-stage") {
CHECK_ARG
if (params.speculative.stages.empty()) {
if (params.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
throw std::invalid_argument("--spec-stage cannot be combined with --spec-type; use only --spec-stage for explicit stage chains");
}
if (params.has_mtp) {
throw std::invalid_argument("--spec-stage cannot be combined with -mtp/--multi-token-prediction; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]");
}
}
params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i]));
if (params.speculative.stages.size() == 1) {
params.speculative.type = params.speculative.stages.front().type;
}
params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
return true;
throw common_speculative_legacy_option_error(arg,
"repeated --spec-type SPEC[:k=v,...] entries, e.g. --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0");
}
if (arg == "--spec-type") {
CHECK_ARG
if (!params.speculative.stages.empty()) {
throw std::invalid_argument("--spec-type cannot be combined with --spec-stage; use only --spec-stage for explicit stage chains");
}
const auto type = common_speculative_type_from_name(argv[i]);
if (type == COMMON_SPECULATIVE_TYPE_NONE || type == COMMON_SPECULATIVE_TYPE_MTP || common_speculative_type_is_self_spec(type)) {
params.speculative.type = type;
if (type == COMMON_SPECULATIVE_TYPE_MTP) {
params.has_mtp = true;
}
} else {
throw std::invalid_argument("unknown speculative decoding type");
}
params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i]));
const auto resolved = params.speculative.get_resolved_stages();
params.speculative.type = resolved.empty() ? COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type;
params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
return true;
}
if (arg == "--spec-ngram-size-n") {
CHECK_ARG
int value = std::stoi(argv[i]);
if (value < 1 || value > 1024) {
throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
}
params.speculative.ngram_size_n = value;
return true;
throw common_speculative_legacy_option_error(arg,
"the canonical stage key inside --spec-type, e.g. --spec-type ngram-mod:ngram_size_n=" + std::string(argv[i]));
}
if (arg == "--spec-ngram-size-m") {
CHECK_ARG
int value = std::stoi(argv[i]);
if (value < 1 || value > 1024) {
throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
}
params.speculative.ngram_size_m = value;
return true;
throw common_speculative_legacy_option_error(arg,
"the canonical stage key inside --spec-type, e.g. --spec-type ngram-map-k4v:ngram_size_m=" + std::string(argv[i]));
}
if (arg == "--spec-ngram-min-hits") {
CHECK_ARG
int value = std::stoi(argv[i]);
if (value < 1) {
throw std::invalid_argument("ngram min hits must be at least 1");
}
params.speculative.ngram_min_hits = value;
return true;
throw common_speculative_legacy_option_error(arg,
"the canonical stage key inside --spec-type, e.g. --spec-type ngram-map-k4v:ngram_min_hits=" + std::string(argv[i]));
}
if (arg == "--suffix-pattern-len") {
CHECK_ARG
int value = std::stoi(argv[i]);
if (value < 1) {
throw std::invalid_argument("suffix pattern length must be at least 1");
}
params.speculative.suffix_min_match_len = value;
return true;
throw common_speculative_legacy_option_error(arg,
"the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_min_match_len=" + std::string(argv[i]));
}
if (arg == "--suffix-max-depth") {
CHECK_ARG
int value = std::stoi(argv[i]);
if (value < 1) {
throw std::invalid_argument("suffix max depth must be at least 1");
}
params.speculative.suffix_max_depth = value;
return true;
throw common_speculative_legacy_option_error(arg,
"the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_max_depth=" + std::string(argv[i]));
}
if (arg == "--suffix-corpus") {
CHECK_ARG
params.speculative.suffix_corpus = argv[i];
return true;
throw common_speculative_legacy_option_error(arg,
"the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_corpus=" + std::string(argv[i]));
}
if (arg == "-a" || arg == "--alias") {
CHECK_ARG
@ -1976,17 +2028,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-mtp" || arg == "--multi-token-prediction") {
if (!params.speculative.stages.empty()) {
throw std::invalid_argument("-mtp/--multi-token-prediction cannot be combined with --spec-stage; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]");
}
params.has_mtp = true;
return true;
throw common_speculative_legacy_option_error(arg,
"--spec-type mtp:n_max=1,p_min=0.0");
}
if (arg == "-no-mtp" || arg == "--no-multi-token-prediction") {
params.has_mtp = false;
common_speculative_remove_explicit_stage(params.speculative, COMMON_SPECULATIVE_TYPE_MTP);
return true;
throw common_speculative_legacy_option_error(arg,
"remove the mtp entry from repeated --spec-type arguments");
}
if (arg == "-draft" || arg == "--draft-params") {
CHECK_ARG
@ -3172,29 +3219,20 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
options.push_back({ "*", "-mtp, --multi-token-prediction", "legacy shortcut for enabling MTP when --spec-stage is not used (default: %s)", params.has_mtp ? "true" : "false" });
options.push_back({ "*", "-no-mtp, --no-multi-token-prediction", "disable the legacy MTP shortcut or remove an explicit MTP stage (default: %s)", !params.has_mtp ? "true" : "false" });
options.push_back({ "*", "--draft-max, --draft, --draft-n N",
"global default number of tokens to draft for speculative decoding or for stages without an explicit n_max override (default: %d)", params.speculative.n_max });
options.push_back({ "*", "--draft-min, --draft-n-min N", "global default minimum draft threshold or fallback threshold for stages without an explicit n_min override" });
options.push_back({ "*", "--draft-p-min P", "global default minimum speculative decoding probability (greedy) for stages without an explicit p_min override (default: %.1f)", (double)params.speculative.p_min });
options.push_back({ "*", "--recurrent-ckpt-mode MODE", "checkpoint strategy for recurrent/hybrid speculative decoding\n"
" auto auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise (default)\n"
" per-step save SSM state per draft step in VRAM; no re-decode on rejection\n"
" gpu-fallback copy state to GPU buffer; re-decode on rejection\n"
" cpu serialise state via llama_state_seq; re-decode on rejection" });
options.push_back({ "*", "--spec-stage SPEC[:k=v,...]", "explicit speculative stage. repeat once for a supported two-stage chain.\n"
"examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1\n"
"supported two-stage shape in this PR: self-spec first, then mtp or draft fallback" });
options.push_back({ "*", "--spec-type Name [none | mtp | ngram-cache | ngram-simple | ngram-map-k | ngram-map-k4v | ngram-mod | suffix]", "single-stage speculative selection when --spec-stage is not used (default: %d)\n", (int)params.speculative.type});
options.push_back({ "*", "--spec-ngram-size-n N", "ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)\n",params.speculative.ngram_size_n });
options.push_back({ "*", "--spec-ngram-size-m N", "ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)\n", params.speculative.ngram_size_m });
options.push_back({ "*", "--spec-ngram-min-hits N", "minimum hits for ngram-map speculative decoding (default: %d)\n", params.speculative.ngram_min_hits });
options.push_back({ "*", "--suffix-pattern-len N", "minimum context match length for suffix decoding (default: %d)", params.speculative.suffix_min_match_len });
options.push_back({ "*", "--suffix-max-depth N", "suffix tree maximum depth for suffix decoding (default: %d)", params.speculative.suffix_max_depth });
options.push_back({ "*", "--suffix-corpus PATH", "corpus file to pre-warm the suffix tree: .json (array of strings or conversation messages) or .bin (raw int32 token IDs)" });
options.push_back({ "*", "--spec-type SPEC[:k=v,...]", "canonical speculative stage entry; repeat for a supported two-stage chain.\n"
"types: none, draft, mtp, ngram-cache, ngram-simple, ngram-map-k, ngram-map-k4v, ngram-mod, suffix\n"
"canonical keys: n_max,n_min,p_min,ngram_size_n,ngram_size_m,ngram_min_hits,suffix_min_match_len,suffix_max_depth,suffix_corpus\n"
"for comma-bearing string values, quote the value inside the stage payload for normal shell use\n"
"if argv is passed directly without shell unescaping, the parser also accepts escaped commas as \\,\n"
"examples: --spec-type mtp:n_max=1,p_min=0.0\n"
" --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0\n"
" --spec-type \"suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'\"\n"
"legacy --spec-stage, --draft-*, --spec-ngram-*, --suffix-* and -mtp flags are rejected" });
options.push_back({ "*", "--spec-autotune", "automatically tune speculative params to maximize tokens/sec" });
options.push_back({ "retrieval" });

View File

@ -169,6 +169,7 @@ struct common_speculative_stage_params {
int32_t suffix_min_match_len = -1;
int32_t suffix_max_depth = -1;
std::string suffix_corpus;
bool has_n_max_override() const { return n_max >= 0; }
bool has_n_min_override() const { return n_min >= 0; }
@ -178,6 +179,7 @@ struct common_speculative_stage_params {
bool has_ngram_min_hits_override() const { return ngram_min_hits > 0; }
bool has_suffix_min_match_len_override() const { return suffix_min_match_len >= 0; }
bool has_suffix_max_depth_override() const { return suffix_max_depth >= 0; }
bool has_suffix_corpus_override() const { return !suffix_corpus.empty(); }
};
struct common_params_model {

View File

@ -357,20 +357,15 @@ void spec_tuner::print_best() const {
{
std::ostringstream oss;
oss << "Autotune reuse: ";
oss << "Autotune reuse: --spec-type " << common_speculative_type_to_str(spec_type);
bool first_kv = true;
for (const auto & coord : coords) {
bool is_int = (coord.name != "p_min");
if (coord.name == "n_max") oss << "--draft-max ";
else if (coord.name == "p_min") oss << "--draft-p-min ";
else if (coord.name == "n_min") oss << "--draft-min ";
else if (coord.name == "ngram_size_n") oss << "--spec-ngram-size-n ";
else if (coord.name == "ngram_size_m") oss << "--spec-ngram-size-m ";
else if (coord.name == "ngram_min_hits") oss << "--spec-ngram-min-hits ";
else if (coord.name == "suffix_min_match_len") oss << "--suffix-pattern-len ";
else oss << "--" << coord.name << " ";
oss << (first_kv ? ':' : ',') << coord.name << '=';
first_kv = false;
if (is_int) oss << (int)coord.arms[coord.best_idx].value << " ";
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value << " ";
if (is_int) oss << (int)coord.arms[coord.best_idx].value;
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value;
}
LOG_INF("%s\n", oss.str().c_str());
}

View File

@ -1160,7 +1160,7 @@ common_speculative * common_speculative_init(
});
if (has_draft_stage) {
LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use -mtp/--spec-stage mtp for assistant-backed MTP\n", __func__);
LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use --spec-type mtp:n_max=1,p_min=0.0 for assistant-backed MTP\n", __func__);
return nullptr;
}
}

View File

@ -209,7 +209,7 @@ static bool suffix_corpus_check_limit(const std::string & path, size_t n_tokens,
return true;
}
LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or --suffix-max-depth\n",
LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or lower suffix_max_depth inside --spec-type suffix:suffix_max_depth=...\n",
path.c_str(),
(unsigned long long) estimated_work,
(unsigned long long) SUFFIX_CORPUS_MAX_INSERT_WORK,

View File

@ -120,21 +120,13 @@ Check the details [here](./speculative.md).
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model | - | For draft model, see: `-ctk` |
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model | - | For draft model, see: `-ctk` |
| `-draft, --draft-params` | Comma-separated list of draft model parameters | - | |
| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram| 12 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram | 48 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding | 1 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
| `--spec-type Name` | Comma-separated list of draft model parameters | - | none / ngram - cache / ngram - simple / ngram - map - k / ngram - map - k4v / ngram - mod / suffix [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) [PR 1646](https://github.com/ikawrakow/ik_llama.cpp/pull/1646) |
| `--spec-stage SPEC[:k=v,...]` | Add an explicit speculative stage; repeat once for a supported two-stage chain | - | Supported two-stage shape: self-spec first, then `mtp` or `draft` fallback. [PR 1789](https://github.com/ikawrakow/ik_llama.cpp/pull/1789) |
| `-mtp, --multi-token-prediction` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
| `-no-mtp, --no-multi-token-prediction` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
| `--draft-max` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
| `--draft-p-min` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat to configure the supported two-stage chain | - | Types: `none`, `draft`, `mtp`, `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, `suffix`. Canonical keys: `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, `suffix_corpus`. String values may escape commas as `\,` or quote the value inside the stage payload. Example: `--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0` |
| `--spec-autotune` | Automatically tune speculative params to maximize tokens/sec | - | Automatically determines the near-optimal arguments for the type of speculation being performed [PR 1595](https://github.com/ikawrakow/ik_llama.cpp/pull/1595) |
| `--recurrent-ckpt-mode MODE` | Checkpoint strategy for recurrent/hybrid speculative decoding | auto | One of: - `auto` auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise - `per-step` save SSM state per draft step in VRAM; no re-decode on rejection - `gpu-fallback` copy state to GPU buffer; re-decode on rejection - `cpu` serialise state via llama_state_seq; re-decode on rejection [PR 1669](https://github.com/ikawrakow/ik_llama.cpp/pull/1669) [PR 1774](https://github.com/ikawrakow/ik_llama.cpp/pull/1774) |
Notes:
- `--spec-type` cannot be combined with `--spec-stage`.
- Legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with replacement guidance.
- Explicit stage chains currently support at most two stages.
- Supported self-spec stage names are `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, and `suffix`.
- Composite stage chains disable speculative autotune.
@ -378,9 +370,7 @@ WIP
| `--override-kv KEY=TYPE:VALUE` | Override model metadata by key | - | Advanced option to override model metadata by key. May be specified multiple times. types: int, float, bool, str. Example: `--override-kv tokenizer.ggml.add_bos_token=bool:false` |
| `-m, --model FNAME` | Model path | models/$filename | Mandatory, the GGUF model file to be served. |
| `-md, --model-draft FNAME` | Draft model for speculative decoding | unused | Required when an explicit `draft` stage is used. |
| `--draft-max, --draft, --draft-n N` | Global speculative draft cap, or fallback value for stages without an explicit `n_max` override | 16 | Also used by single-stage MTP and draft-model speculation. |
| `--draft-min, --draft-n-min N` | Global minimum speculative draft threshold, or fallback value for stages without an explicit `n_min` override | 0 | |
| `--draft-p-min P` | Global minimum speculative decoding probability (greedy), or fallback value for stages without an explicit `p_min` override | 0.8 | |
| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat for the supported two-stage chain | none | Use stage-local keys like `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, and `suffix_corpus`. |
### Request-Level Speculative Overrides

View File

@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
```
llama-server [...] --spec-type ngram-simple --draft-max 64
llama-server [...] --spec-type ngram-simple:n_max=64
```
#### n-gram Map Key (`ngram-map-k`)
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (stage key `ngram_min_hits`, default is 1) before generating drafts.
The number of accepted tokens is stored for each used n-gram.
**Example:**
```
llama-server [...] --spec-type ngram-map-k --draft-max 64
llama-server [...] --spec-type ngram-map-k:n_max=64,ngram_min_hits=1
```
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.
**Example:** Server options to be used if there are a lot of longer repetitions.
```
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
llama-server [...] --spec-type ngram-map-k4v:n_max=64,ngram_size_n=8,ngram_size_m=8,ngram_min_hits=2
```
### n-gram Mod (`ngram-mod`)
@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
# notes:
# - small `n` are not recommended
# - MoEs require long drafts
# - dense models: can reduce `--draft-min` and `--draft-max`
# - dense models: can reduce `n_min` and `n_max`
llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
llama-server ... --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24
```
Applications:
@ -103,57 +103,78 @@ Example Video:
## Command-Line Options
If a draft model is combined with draftless decoding, the draftless decoding has higher precedence.
The canonical startup surface is repeated `--spec-type SPEC[:k=v,...]`. Legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with replacement guidance.
```
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX)
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
(default: 0)
(env: LLAMA_ARG_DRAFT_MIN)
[...]
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
type of speculative decoding to use when no draft model is provided
(default: none)
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
of lookup n-gram (default: 12)
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
of draft m-gram (default: 48)
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
```
### `--spec-type SPEC[:k=v,...]`
### `--spec-type TYPE`
Specifies a type of speculative decoding without draft model.
Each `--spec-type` entry defines one speculative stage. Repeat it to configure the supported two-stage path.
| Type | Description |
|------|-------------|
| `none` | No speculative decoding (default) |
| `none` | No speculative decoding |
| `draft` | Draft-model speculative decoding; pair with `-md/--model-draft` |
| `mtp` | Embedded or assistant-backed MTP |
| `ngram-cache` | Use n-gram cache lookup |
| `ngram-simple` | Use simple n-gram pattern matching |
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
| `ngram-map-k` | Use n-gram pattern matching with n-gram keys |
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram keys and up to four m-gram values |
| `ngram-mod` | Use the shared n-gram hasher |
| `suffix` | Use suffix-tree speculative decoding |
Canonical stage keys:
| Key | Meaning |
|-----|---------|
| `n_max` | Maximum drafted tokens for that stage |
| `n_min` | Minimum usable drafted tokens for that stage |
| `p_min` | Minimum speculative probability threshold |
| `ngram_size_n` | Lookup n-gram size |
| `ngram_size_m` | Draft m-gram size |
| `ngram_min_hits` | Minimum matching hits for n-gram map stages |
| `suffix_min_match_len` | Minimum suffix context match length |
| `suffix_max_depth` | Maximum suffix-tree depth |
| `suffix_corpus` | Optional suffix corpus file for pre-warming |
String-valued stage keys such as `suffix_corpus` need shell-safe quoting when the value contains commas. From a normal shell, quote the value inside the stage payload so the parser sees the comma as part of the string value.
Example shell-safe form:
**Example:** Server-instance used to refactor source code.
```bash
./llama-server [...] --spec-type ngram-simple
./llama-server [...] \
--spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'"
```
### `--spec-ngram-size-n N`
If you are constructing `argv` directly without shell unescaping, the parser also accepts escaped commas as `\,`.
Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.
Examples:
### `--spec-ngram-size-m M`
```bash
# Single-stage MTP
./llama-server [...] --spec-type mtp:n_max=1,p_min=0.0
Sets the size M of the draft m-gram for n-gram map based speculative decoding.
The m-gram size determines how many tokens to draft when a match is found.
Larger values can provide more speedup but may reduce acceptance rate.
# Single-stage ngram-mod
./llama-server [...] --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24
### `--spec-ngram-min-hits H`
# Draft-model speculation
./llama-server [...] --model-draft draft.gguf --spec-type draft:n_max=4,p_min=0.0
This option defines how often a key has to appear in the token history to be used as a draft (default is 1).
# Two-stage self-spec -> MTP fallback
./llama-server [...] \
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
--spec-type mtp:n_max=1,p_min=0.0
# Suffix stage with pre-warmed corpus
./llama-server [...] \
--spec-type suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus=/path/to/corpus.json
# Suffix stage with a comma-bearing corpus path from a normal shell
./llama-server [...] \
--spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'"
```
### `--spec-autotune`
Autotunes the active stage parameters and reports the best configuration back as a canonical `--spec-type ...` snippet.
## Statistics
Each speculative decoding implementation prints statistics.
@ -180,4 +201,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
- `#acc tokens`: number of tokens accepted by the main model
- `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance).

View File

@ -1232,7 +1232,7 @@ int main(int argc, char ** argv) {
}
if (!use_paired_gemma4_mtp && llama_model_is_gemma4_mtp_assistant(model) && !params.process_output) {
fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> -mtp' for meaningful calibration.\n", __func__);
fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> --spec-type mtp:n_max=1,p_min=0.0' for meaningful calibration.\n", __func__);
}
const int n_ctx_train = llama_n_ctx_train(model);

View File

@ -210,10 +210,10 @@ model:
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
--spec-stage SPEC[:k=v,...]
explicit speculative stage. repeat once for a supported two-stage chain
examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1
supported two-stage shape: self-spec first, then mtp or draft fallback
--spec-type SPEC[:k=v,...]
canonical speculative stage entry; repeat for a supported two-stage chain
examples: --spec-type mtp:n_max=1,p_min=0.0
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0
-mu, --model-url MODEL_URL model download url (default: unused)
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
-hff, --hf-file FILE Hugging Face model file (default: unused)
@ -966,15 +966,15 @@ To know the `id` of the adapter, use GET `/lora-adapters`
### Composite speculative decoding
Use `--spec-stage` for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback.
Use repeated `--spec-type SPEC[:k=v,...]` entries for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback.
Example with `ngram-mod` plus MTP fallback:
```bash
./build/bin/llama-server \
--model /models/target-mtp.gguf \
--spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
--spec-stage mtp:n_max=1,p_min=0.0
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
--spec-type mtp:n_max=1,p_min=0.0
```
Example with `ngram-mod` plus draft-model fallback:
@ -983,14 +983,13 @@ Example with `ngram-mod` plus draft-model fallback:
./build/bin/llama-server \
--model /models/target.gguf \
--model-draft /models/draft.gguf \
--spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
--spec-stage draft:n_max=4,p_min=0.0
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
--spec-type draft:n_max=4,p_min=0.0
```
Notes:
- Use `--spec-type` when you want a single self-spec stage only.
- `--spec-type` cannot be combined with `--spec-stage`.
- Use `--spec-type` for both single-stage and two-stage startup configuration.
- Explicit stage chains currently support at most two stages.
### Change system prompt on runtime

View File

@ -166,7 +166,8 @@ static void server_reject_dead_speculative_request_overrides(const json & data)
json_value_ptr(data, "speculative.ngram_size_m") != nullptr ||
json_value_ptr(data, "speculative.ngram_min_hits") != nullptr ||
json_value_ptr(data, "speculative.suffix_min_match_len") != nullptr ||
json_value_ptr(data, "speculative.suffix_max_depth") != nullptr) {
json_value_ptr(data, "speculative.suffix_max_depth") != nullptr ||
json_value_ptr(data, "speculative.suffix_corpus") != nullptr) {
throw std::runtime_error("Error: structural speculative overrides are startup-only; per-request overrides only support speculative.n_max, speculative.n_min, speculative.p_min, and speculative.stages");
}
}
@ -284,7 +285,6 @@ bool server_context::load_model(const gpt_params& params_) {
});
params_base.has_mtp = false;
server_remove_speculative_stage(params_base.speculative, COMMON_SPECULATIVE_TYPE_MTP);
if (!server_speculative_needs_draft_model(params_base.speculative)) {
@ -1251,6 +1251,10 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
// speculative decoding parameters
try {
slot.params.speculative = defaults.speculative;
const bool has_flat_n_max = json_value_ptr(data, "speculative.n_max") != nullptr;
const bool has_flat_n_min = json_value_ptr(data, "speculative.n_min") != nullptr;
const bool has_flat_p_min = json_value_ptr(data, "speculative.p_min") != nullptr;
slot.params.speculative.n_max = json_value(data, "speculative.n_max", params_base.speculative.n_max);
slot.params.speculative.n_min = json_value(data, "speculative.n_min", params_base.speculative.n_min);
slot.params.speculative.p_min = json_value(data, "speculative.p_min", params_base.speculative.p_min);
@ -1258,6 +1262,20 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
server_reject_dead_speculative_request_overrides(data);
const json stages = json_value(data, "speculative.stages", json());
if (stages.is_null() && !slot.params.speculative.stages.empty()) {
for (auto & stage : slot.params.speculative.stages) {
if (has_flat_n_max) {
stage.n_max = -1;
}
if (has_flat_n_min) {
stage.n_min = -1;
}
if (has_flat_p_min) {
stage.p_min = -1.0f;
}
}
}
if (!stages.is_null()) {
if (!stages.is_array()) {
throw std::runtime_error("Error: speculative.stages must be an array");
@ -1296,11 +1314,11 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
if (slot.can_speculate() &&
llama_model_has_recurrent(model) &&
slot.params.speculative.n_max > params_base.speculative.n_max) {
slot.params.speculative.get_max_stage_n_max() > params_base.speculative.get_max_stage_n_max()) {
send_error(task,
"Error: speculative.n_max=" + std::to_string(slot.params.speculative.n_max) +
" exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.n_max) +
"; restart the server with a higher --draft-max to reserve checkpoint capacity",
"Error: speculative n_max=" + std::to_string(slot.params.speculative.get_max_stage_n_max()) +
" exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.get_max_stage_n_max()) +
"; restart the server with a higher n_max inside the configured --spec-type stages to reserve checkpoint capacity",
ERROR_TYPE_INVALID_REQUEST);
return false;
}