mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Standardize speculative decoding arguments on the server (#1908)
* refactor spec args * add shell-safe quoting of string-valued stage keys in speculative decoding
This commit is contained in:
parent
6c0180d702
commit
007d640098
@ -124,7 +124,16 @@ static int32_t common_speculative_stage_effective_n_min(
|
||||
|
||||
std::vector<common_speculative_stage_params> common_params_speculative::get_resolved_stages() const {
|
||||
if (!stages.empty()) {
|
||||
return stages;
|
||||
std::vector<common_speculative_stage_params> resolved;
|
||||
resolved.reserve(stages.size());
|
||||
|
||||
for (const auto & stage : stages) {
|
||||
if (stage.type != COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
resolved.push_back(stage);
|
||||
}
|
||||
}
|
||||
|
||||
return resolved;
|
||||
}
|
||||
|
||||
if (type == COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
@ -164,6 +173,9 @@ common_params_speculative common_params_speculative::with_stage_overrides(const
|
||||
if (stage.has_suffix_max_depth_override()) {
|
||||
result.suffix_max_depth = stage.suffix_max_depth;
|
||||
}
|
||||
if (stage.has_suffix_corpus_override()) {
|
||||
result.suffix_corpus = stage.suffix_corpus;
|
||||
}
|
||||
|
||||
result.n_max = std::max(result.n_max, 0);
|
||||
result.n_min = std::max(0, std::min(result.n_min, result.n_max));
|
||||
@ -612,28 +624,20 @@ static void common_speculative_finalize_stages(gpt_params & params) {
|
||||
auto & spec = params.speculative;
|
||||
|
||||
if (!spec.stages.empty()) {
|
||||
spec.type = spec.stages.front().type;
|
||||
const auto resolved = spec.get_resolved_stages();
|
||||
if (resolved.size() != spec.stages.size()) {
|
||||
spec.stages = resolved;
|
||||
}
|
||||
|
||||
spec.type = resolved.empty() ? COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type;
|
||||
params.has_mtp = spec.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
|
||||
return;
|
||||
}
|
||||
|
||||
const bool wants_mtp = params.has_mtp;
|
||||
const bool wants_draft = spec.has_dft();
|
||||
|
||||
if (spec.type != COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
spec.stages.push_back({ .type = spec.type });
|
||||
|
||||
if (common_speculative_type_is_self_spec(spec.type)) {
|
||||
if (wants_mtp) {
|
||||
} else if (params.has_mtp) {
|
||||
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP });
|
||||
} else if (wants_draft) {
|
||||
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT });
|
||||
}
|
||||
}
|
||||
} else if (wants_mtp) {
|
||||
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP });
|
||||
} else if (wants_draft) {
|
||||
spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT });
|
||||
}
|
||||
|
||||
spec.type = spec.stages.empty() ? COMMON_SPECULATIVE_TYPE_NONE : spec.stages.front().type;
|
||||
@ -827,13 +831,16 @@ static std::string common_normalize_spec_stage_key(std::string key) {
|
||||
|
||||
std::replace(key.begin(), key.end(), '-', '_');
|
||||
|
||||
if (key.rfind("spec_", 0) == 0) {
|
||||
key.erase(0, 5);
|
||||
}
|
||||
|
||||
return key;
|
||||
}
|
||||
|
||||
// Builds the exception reported when a retired speculative-decoding flag is
// used on the command line.
//
//   arg          the legacy flag exactly as the user typed it
//   replacement  text describing what to use instead; appended verbatim
//
// The exception is returned (not thrown) so the call site can `throw` it.
static std::invalid_argument common_speculative_legacy_option_error(
        const std::string & arg,
        const std::string & replacement) {
    std::string message = "legacy speculative option '";
    message += arg;
    message += "' is disabled; use ";
    message += replacement;
    return std::invalid_argument(message);
}
|
||||
|
||||
static void common_speculative_remove_explicit_stage(common_params_speculative & params, common_speculative_type type) {
|
||||
params.stages.erase(std::remove_if(params.stages.begin(), params.stages.end(), [type](const common_speculative_stage_params & stage) {
|
||||
return stage.type == type;
|
||||
@ -850,21 +857,21 @@ static void common_speculative_stage_apply_kv(
|
||||
const std::string & value_raw) {
|
||||
const std::string key = common_normalize_spec_stage_key(key_raw);
|
||||
|
||||
if (key == "draft" || key == "draft_max" || key == "draft_n" || key == "n_max") {
|
||||
if (key == "n_max") {
|
||||
stage.n_max = std::stoi(value_raw);
|
||||
if (stage.n_max < 0) {
|
||||
throw std::invalid_argument("speculative stage n_max must be >= 0");
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (key == "draft_min" || key == "draft_n_min" || key == "n_min") {
|
||||
if (key == "n_min") {
|
||||
stage.n_min = std::stoi(value_raw);
|
||||
if (stage.n_min < 0) {
|
||||
throw std::invalid_argument("speculative stage n_min must be >= 0");
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (key == "draft_p_min" || key == "p_min") {
|
||||
if (key == "p_min") {
|
||||
stage.p_min = std::stof(value_raw);
|
||||
if (stage.p_min < 0.0f) {
|
||||
throw std::invalid_argument("speculative stage p_min must be >= 0");
|
||||
@ -892,7 +899,7 @@ static void common_speculative_stage_apply_kv(
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (key == "suffix_min_match_len" || key == "suffix_pattern_len") {
|
||||
if (key == "suffix_min_match_len") {
|
||||
stage.suffix_min_match_len = std::stoi(value_raw);
|
||||
if (stage.suffix_min_match_len < 1) {
|
||||
throw std::invalid_argument("speculative stage suffix_min_match_len must be at least 1");
|
||||
@ -906,10 +913,100 @@ static void common_speculative_stage_apply_kv(
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (key == "suffix_corpus") {
|
||||
stage.suffix_corpus = value_raw;
|
||||
if (stage.suffix_corpus.empty()) {
|
||||
throw std::invalid_argument("speculative stage suffix_corpus must not be empty");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
throw std::invalid_argument("unknown speculative stage parameter: " + key_raw);
|
||||
}
|
||||
|
||||
// Splits the option payload of a speculative stage argument (the text after
// "TYPE:") into its comma-separated "key=value" segments.
//
// Splitting rules:
//   - A backslash escapes the next character. Both the backslash and the
//     escaped character are kept verbatim in the segment, so an escaped comma
//     ("\,") does not split.
//   - A single or double quote that directly follows '=' opens a quoted value,
//     which runs to the next unescaped matching quote. Commas inside it do not
//     split. The quote characters themselves are kept in the segment.
//   - Any other comma ends the current segment.
//   - The final segment is emitted only when it is non-empty. An empty payload
//     ("") or a trailing comma ("n_max=1,") therefore produces no trailing
//     empty entry, matching std::getline(..., ',') tokenization. Empty
//     segments in the middle of the list ("a=1,,b=2") are still emitted so the
//     caller can reject them.
//
// Throws std::invalid_argument when a quoted value is never closed.
static std::vector<std::string> common_speculative_stage_split_kvs(const std::string & values) {
    std::vector<std::string> result;
    std::string current;
    char quote = '\0';     // active quote character, '\0' when outside quotes
    bool escaped = false;  // true when the previous character was an unescaped backslash

    for (const char ch : values) {
        if (escaped) {
            // Character following a backslash: always literal.
            current += ch;
            escaped = false;
            continue;
        }

        if (ch == '\\') {
            // Keep the backslash; removing escapes is not this function's job.
            current += ch;
            escaped = true;
            continue;
        }

        if (quote != '\0') {
            if (ch == quote) {
                quote = '\0';
            }
            current += ch;
            continue;
        }

        // A quote only starts a quoted value when it is the first character of the value.
        if ((ch == '\'' || ch == '"') && !current.empty() && current.back() == '=') {
            quote = ch;
            current += ch;
            continue;
        }

        if (ch == ',') {
            result.push_back(current);
            current.clear();
            continue;
        }

        current += ch;
    }

    if (quote != '\0') {
        throw std::invalid_argument("invalid speculative stage option list: unterminated quote");
    }

    // Skip the trailing segment when empty (empty payload or trailing comma).
    if (!current.empty()) {
        result.push_back(current);
    }
    return result;
}
|
||||
|
||||
// Turns the raw text of a stage option value into its final form.
//
//   1. If the value is at least two characters long and is wrapped in a
//      matching pair of single or double quotes, that outer pair is removed.
//   2. In what remains, a backslash followed by one of  \  ,  '  "  is replaced
//      by that second character. Any other backslash (including one at the
//      very end) is kept as-is.
static std::string common_speculative_stage_unescape_value(const std::string & value_raw) {
    // Work on the window [lo, hi) of value_raw instead of copying a substring.
    size_t lo = 0;
    size_t hi = value_raw.size();

    if (hi >= 2) {
        const char head = value_raw[lo];
        const char tail = value_raw[hi - 1];
        const bool head_is_quote = head == '\'' || head == '"';
        if (head_is_quote && head == tail) {
            ++lo;
            --hi;
        }
    }

    std::string unescaped;
    unescaped.reserve(hi - lo);

    size_t pos = lo;
    while (pos < hi) {
        const char cur = value_raw[pos++];

        // pos now indexes the character after cur (if any)
        if (cur == '\\' && pos < hi) {
            switch (value_raw[pos]) {
                case '\\':
                case ',':
                case '\'':
                case '"':
                    unescaped += value_raw[pos++];
                    continue;
                default:
                    break;
            }
        }

        unescaped += cur;
    }

    return unescaped;
}
|
||||
|
||||
static common_speculative_stage_params common_speculative_stage_from_arg(const std::string & value) {
|
||||
const auto spec_pos = value.find(':');
|
||||
const std::string type_name = value.substr(0, spec_pos);
|
||||
@ -924,15 +1021,13 @@ static common_speculative_stage_params common_speculative_stage_from_arg(const s
|
||||
return stage;
|
||||
}
|
||||
|
||||
std::stringstream ss(value.substr(spec_pos + 1));
|
||||
std::string kv;
|
||||
while (std::getline(ss, kv, ',')) {
|
||||
for (const std::string & kv : common_speculative_stage_split_kvs(value.substr(spec_pos + 1))) {
|
||||
const auto eq_pos = kv.find('=');
|
||||
if (eq_pos == std::string::npos) {
|
||||
throw std::invalid_argument("invalid speculative stage option: " + kv);
|
||||
}
|
||||
|
||||
common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), kv.substr(eq_pos + 1));
|
||||
common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), common_speculative_stage_unescape_value(kv.substr(eq_pos + 1)));
|
||||
}
|
||||
|
||||
return stage;
|
||||
@ -1379,18 +1474,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
if (arg == "--draft" || arg == "--draft-max" || arg == "--draft-n") {
|
||||
CHECK_ARG
|
||||
params.speculative.n_max = std::stoi(argv[i]);
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the value inside the relevant repeated --spec-type entry, e.g. --spec-type mtp:n_max=" + std::string(argv[i]) + ",p_min=0.0 or --spec-type draft:n_max=" + std::string(argv[i]) + ",p_min=0.0");
|
||||
}
|
||||
if (arg == "--draft-min" || arg == "--draft-n-min") {
|
||||
CHECK_ARG
|
||||
params.speculative.n_min = std::stoi(argv[i]);
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the value inside the relevant repeated --spec-type entry using the canonical key n_min, e.g. --spec-type ngram-mod:n_min=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "--draft-p-min") {
|
||||
CHECK_ARG
|
||||
params.speculative.p_min = std::stof(argv[i]);
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the value inside the relevant repeated --spec-type entry using the canonical key p_min, e.g. --spec-type mtp:p_min=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "--recurrent-ckpt-mode") {
|
||||
CHECK_ARG
|
||||
@ -1445,89 +1540,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
if (arg == "--spec-stage") {
|
||||
CHECK_ARG
|
||||
|
||||
if (params.speculative.stages.empty()) {
|
||||
if (params.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
throw std::invalid_argument("--spec-stage cannot be combined with --spec-type; use only --spec-stage for explicit stage chains");
|
||||
}
|
||||
if (params.has_mtp) {
|
||||
throw std::invalid_argument("--spec-stage cannot be combined with -mtp/--multi-token-prediction; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]");
|
||||
}
|
||||
}
|
||||
|
||||
params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i]));
|
||||
if (params.speculative.stages.size() == 1) {
|
||||
params.speculative.type = params.speculative.stages.front().type;
|
||||
}
|
||||
params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"repeated --spec-type SPEC[:k=v,...] entries, e.g. --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0");
|
||||
}
|
||||
if (arg == "--spec-type") {
|
||||
CHECK_ARG
|
||||
if (!params.speculative.stages.empty()) {
|
||||
throw std::invalid_argument("--spec-type cannot be combined with --spec-stage; use only --spec-stage for explicit stage chains");
|
||||
}
|
||||
|
||||
const auto type = common_speculative_type_from_name(argv[i]);
|
||||
if (type == COMMON_SPECULATIVE_TYPE_NONE || type == COMMON_SPECULATIVE_TYPE_MTP || common_speculative_type_is_self_spec(type)) {
|
||||
params.speculative.type = type;
|
||||
if (type == COMMON_SPECULATIVE_TYPE_MTP) {
|
||||
params.has_mtp = true;
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("unknown speculative decoding type");
|
||||
}
|
||||
params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i]));
|
||||
const auto resolved = params.speculative.get_resolved_stages();
|
||||
params.speculative.type = resolved.empty() ? COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type;
|
||||
params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--spec-ngram-size-n") {
|
||||
CHECK_ARG
|
||||
int value = std::stoi(argv[i]);
|
||||
if (value < 1 || value > 1024) {
|
||||
throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
|
||||
}
|
||||
params.speculative.ngram_size_n = value;
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the canonical stage key inside --spec-type, e.g. --spec-type ngram-mod:ngram_size_n=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "--spec-ngram-size-m") {
|
||||
CHECK_ARG
|
||||
int value = std::stoi(argv[i]);
|
||||
if (value < 1 || value > 1024) {
|
||||
throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
|
||||
}
|
||||
params.speculative.ngram_size_m = value;
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the canonical stage key inside --spec-type, e.g. --spec-type ngram-map-k4v:ngram_size_m=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "--spec-ngram-min-hits") {
|
||||
CHECK_ARG
|
||||
int value = std::stoi(argv[i]);
|
||||
if (value < 1) {
|
||||
throw std::invalid_argument("ngram min hits must be at least 1");
|
||||
}
|
||||
params.speculative.ngram_min_hits = value;
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the canonical stage key inside --spec-type, e.g. --spec-type ngram-map-k4v:ngram_min_hits=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "--suffix-pattern-len") {
|
||||
CHECK_ARG
|
||||
int value = std::stoi(argv[i]);
|
||||
if (value < 1) {
|
||||
throw std::invalid_argument("suffix pattern length must be at least 1");
|
||||
}
|
||||
params.speculative.suffix_min_match_len = value;
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_min_match_len=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "--suffix-max-depth") {
|
||||
CHECK_ARG
|
||||
int value = std::stoi(argv[i]);
|
||||
if (value < 1) {
|
||||
throw std::invalid_argument("suffix max depth must be at least 1");
|
||||
}
|
||||
params.speculative.suffix_max_depth = value;
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_max_depth=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "--suffix-corpus") {
|
||||
CHECK_ARG
|
||||
params.speculative.suffix_corpus = argv[i];
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_corpus=" + std::string(argv[i]));
|
||||
}
|
||||
if (arg == "-a" || arg == "--alias") {
|
||||
CHECK_ARG
|
||||
@ -1976,17 +2028,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
return true;
|
||||
}
|
||||
if (arg == "-mtp" || arg == "--multi-token-prediction") {
|
||||
if (!params.speculative.stages.empty()) {
|
||||
throw std::invalid_argument("-mtp/--multi-token-prediction cannot be combined with --spec-stage; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]");
|
||||
}
|
||||
|
||||
params.has_mtp = true;
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"--spec-type mtp:n_max=1,p_min=0.0");
|
||||
}
|
||||
if (arg == "-no-mtp" || arg == "--no-multi-token-prediction") {
|
||||
params.has_mtp = false;
|
||||
common_speculative_remove_explicit_stage(params.speculative, COMMON_SPECULATIVE_TYPE_MTP);
|
||||
return true;
|
||||
throw common_speculative_legacy_option_error(arg,
|
||||
"remove the mtp entry from repeated --spec-type arguments");
|
||||
}
|
||||
if (arg == "-draft" || arg == "--draft-params") {
|
||||
CHECK_ARG
|
||||
@ -3172,29 +3219,20 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
|
||||
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
|
||||
options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
|
||||
options.push_back({ "*", "-mtp, --multi-token-prediction", "legacy shortcut for enabling MTP when --spec-stage is not used (default: %s)", params.has_mtp ? "true" : "false" });
|
||||
options.push_back({ "*", "-no-mtp, --no-multi-token-prediction", "disable the legacy MTP shortcut or remove an explicit MTP stage (default: %s)", !params.has_mtp ? "true" : "false" });
|
||||
options.push_back({ "*", "--draft-max, --draft, --draft-n N",
|
||||
"global default number of tokens to draft for speculative decoding or for stages without an explicit n_max override (default: %d)", params.speculative.n_max });
|
||||
options.push_back({ "*", "--draft-min, --draft-n-min N", "global default minimum draft threshold or fallback threshold for stages without an explicit n_min override" });
|
||||
options.push_back({ "*", "--draft-p-min P", "global default minimum speculative decoding probability (greedy) for stages without an explicit p_min override (default: %.1f)", (double)params.speculative.p_min });
|
||||
options.push_back({ "*", "--recurrent-ckpt-mode MODE", "checkpoint strategy for recurrent/hybrid speculative decoding\n"
|
||||
" auto auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise (default)\n"
|
||||
" per-step save SSM state per draft step in VRAM; no re-decode on rejection\n"
|
||||
" gpu-fallback copy state to GPU buffer; re-decode on rejection\n"
|
||||
" cpu serialise state via llama_state_seq; re-decode on rejection" });
|
||||
options.push_back({ "*", "--spec-stage SPEC[:k=v,...]", "explicit speculative stage. repeat once for a supported two-stage chain.\n"
|
||||
"examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1\n"
|
||||
"supported two-stage shape in this PR: self-spec first, then mtp or draft fallback" });
|
||||
options.push_back({ "*", "--spec-type Name [none | mtp | ngram-cache | ngram-simple | ngram-map-k | ngram-map-k4v | ngram-mod | suffix]", "single-stage speculative selection when --spec-stage is not used (default: %d)\n", (int)params.speculative.type});
|
||||
options.push_back({ "*", "--spec-ngram-size-n N", "ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)\n",params.speculative.ngram_size_n });
|
||||
|
||||
options.push_back({ "*", "--spec-ngram-size-m N", "ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)\n", params.speculative.ngram_size_m });
|
||||
|
||||
options.push_back({ "*", "--spec-ngram-min-hits N", "minimum hits for ngram-map speculative decoding (default: %d)\n", params.speculative.ngram_min_hits });
|
||||
options.push_back({ "*", "--suffix-pattern-len N", "minimum context match length for suffix decoding (default: %d)", params.speculative.suffix_min_match_len });
|
||||
options.push_back({ "*", "--suffix-max-depth N", "suffix tree maximum depth for suffix decoding (default: %d)", params.speculative.suffix_max_depth });
|
||||
options.push_back({ "*", "--suffix-corpus PATH", "corpus file to pre-warm the suffix tree: .json (array of strings or conversation messages) or .bin (raw int32 token IDs)" });
|
||||
options.push_back({ "*", "--spec-type SPEC[:k=v,...]", "canonical speculative stage entry; repeat for a supported two-stage chain.\n"
|
||||
"types: none, draft, mtp, ngram-cache, ngram-simple, ngram-map-k, ngram-map-k4v, ngram-mod, suffix\n"
|
||||
"canonical keys: n_max,n_min,p_min,ngram_size_n,ngram_size_m,ngram_min_hits,suffix_min_match_len,suffix_max_depth,suffix_corpus\n"
|
||||
"for comma-bearing string values, quote the value inside the stage payload for normal shell use\n"
|
||||
"if argv is passed directly without shell unescaping, the parser also accepts escaped commas as \\,\n"
|
||||
"examples: --spec-type mtp:n_max=1,p_min=0.0\n"
|
||||
" --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0\n"
|
||||
" --spec-type \"suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'\"\n"
|
||||
"legacy --spec-stage, --draft-*, --spec-ngram-*, --suffix-* and -mtp flags are rejected" });
|
||||
options.push_back({ "*", "--spec-autotune", "automatically tune speculative params to maximize tokens/sec" });
|
||||
|
||||
options.push_back({ "retrieval" });
|
||||
|
||||
@ -169,6 +169,7 @@ struct common_speculative_stage_params {
|
||||
|
||||
int32_t suffix_min_match_len = -1;
|
||||
int32_t suffix_max_depth = -1;
|
||||
std::string suffix_corpus;
|
||||
|
||||
bool has_n_max_override() const { return n_max >= 0; }
|
||||
bool has_n_min_override() const { return n_min >= 0; }
|
||||
@ -178,6 +179,7 @@ struct common_speculative_stage_params {
|
||||
bool has_ngram_min_hits_override() const { return ngram_min_hits > 0; }
|
||||
bool has_suffix_min_match_len_override() const { return suffix_min_match_len >= 0; }
|
||||
bool has_suffix_max_depth_override() const { return suffix_max_depth >= 0; }
|
||||
bool has_suffix_corpus_override() const { return !suffix_corpus.empty(); }
|
||||
};
|
||||
|
||||
struct common_params_model {
|
||||
|
||||
@ -357,20 +357,15 @@ void spec_tuner::print_best() const {
|
||||
|
||||
{
|
||||
std::ostringstream oss;
|
||||
oss << "Autotune reuse: ";
|
||||
oss << "Autotune reuse: --spec-type " << common_speculative_type_to_str(spec_type);
|
||||
bool first_kv = true;
|
||||
for (const auto & coord : coords) {
|
||||
bool is_int = (coord.name != "p_min");
|
||||
if (coord.name == "n_max") oss << "--draft-max ";
|
||||
else if (coord.name == "p_min") oss << "--draft-p-min ";
|
||||
else if (coord.name == "n_min") oss << "--draft-min ";
|
||||
else if (coord.name == "ngram_size_n") oss << "--spec-ngram-size-n ";
|
||||
else if (coord.name == "ngram_size_m") oss << "--spec-ngram-size-m ";
|
||||
else if (coord.name == "ngram_min_hits") oss << "--spec-ngram-min-hits ";
|
||||
else if (coord.name == "suffix_min_match_len") oss << "--suffix-pattern-len ";
|
||||
else oss << "--" << coord.name << " ";
|
||||
oss << (first_kv ? ':' : ',') << coord.name << '=';
|
||||
first_kv = false;
|
||||
|
||||
if (is_int) oss << (int)coord.arms[coord.best_idx].value << " ";
|
||||
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value << " ";
|
||||
if (is_int) oss << (int)coord.arms[coord.best_idx].value;
|
||||
else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value;
|
||||
}
|
||||
LOG_INF("%s\n", oss.str().c_str());
|
||||
}
|
||||
|
||||
@ -1160,7 +1160,7 @@ common_speculative * common_speculative_init(
|
||||
});
|
||||
|
||||
if (has_draft_stage) {
|
||||
LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use -mtp/--spec-stage mtp for assistant-backed MTP\n", __func__);
|
||||
LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use --spec-type mtp:n_max=1,p_min=0.0 for assistant-backed MTP\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -209,7 +209,7 @@ static bool suffix_corpus_check_limit(const std::string & path, size_t n_tokens,
|
||||
return true;
|
||||
}
|
||||
|
||||
LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or --suffix-max-depth\n",
|
||||
LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or lower suffix_max_depth inside --spec-type suffix:suffix_max_depth=...\n",
|
||||
path.c_str(),
|
||||
(unsigned long long) estimated_work,
|
||||
(unsigned long long) SUFFIX_CORPUS_MAX_INSERT_WORK,
|
||||
|
||||
@ -120,21 +120,13 @@ Check the details [here](./speculative.md).
|
||||
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model | - | For draft model, see: `-ctk` |
|
||||
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model | - | For draft model, see: `-ctk` |
|
||||
| `-draft, --draft-params` | Comma-separated list of draft model parameters | - | |
|
||||
| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram| 12 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
|
||||
| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram | 48 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
|
||||
| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding | 1 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
|
||||
| `--spec-type Name` | Comma-separated list of draft model parameters | - | none / ngram - cache / ngram - simple / ngram - map - k / ngram - map - k4v / ngram - mod / suffix [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) [PR 1646](https://github.com/ikawrakow/ik_llama.cpp/pull/1646) |
|
||||
| `--spec-stage SPEC[:k=v,...]` | Add an explicit speculative stage; repeat once for a supported two-stage chain | - | Supported two-stage shape: self-spec first, then `mtp` or `draft` fallback. [PR 1789](https://github.com/ikawrakow/ik_llama.cpp/pull/1789) |
|
||||
| `-mtp, --multi-token-prediction` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
|
||||
| `-no-mtp, --no-multi-token-prediction` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
|
||||
| `--draft-max` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
|
||||
| `--draft-p-min` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
|
||||
| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat to configure the supported two-stage chain | - | Types: `none`, `draft`, `mtp`, `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, `suffix`. Canonical keys: `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, `suffix_corpus`. String values may escape commas as `\,` or quote the value inside the stage payload. Example: `--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0` |
|
||||
| `--spec-autotune` | Automatically tune speculative params to maximize tokens/sec | - | Automatically determines the near-optimal arguments for the type of speculation being performed [PR 1595](https://github.com/ikawrakow/ik_llama.cpp/pull/1595) |
|
||||
| `--recurrent-ckpt-mode MODE` | Checkpoint strategy for recurrent/hybrid speculative decoding | auto | One of: - `auto` auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise - `per-step` save SSM state per draft step in VRAM; no re-decode on rejection - `gpu-fallback` copy state to GPU buffer; re-decode on rejection - `cpu` serialise state via llama_state_seq; re-decode on rejection [PR 1669](https://github.com/ikawrakow/ik_llama.cpp/pull/1669) [PR 1774](https://github.com/ikawrakow/ik_llama.cpp/pull/1774) |
|
||||
|
||||
Notes:
|
||||
|
||||
- `--spec-type` cannot be combined with `--spec-stage`.
|
||||
- Legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with replacement guidance.
|
||||
- Explicit stage chains currently support at most two stages.
|
||||
- Supported self-spec stage names are `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, and `suffix`.
|
||||
- Composite stage chains disable speculative autotune.
|
||||
@ -378,9 +370,7 @@ WIP
|
||||
| `--override-kv KEY=TYPE:VALUE` | Override model metadata by key | - | Advanced option to override model metadata by key. May be specified multiple times. types: int, float, bool, str. Example: `--override-kv tokenizer.ggml.add_bos_token=bool:false` |
|
||||
| `-m, --model FNAME` | Model path | models/$filename | Mandatory, the GGUF model file to be served. |
|
||||
| `-md, --model-draft FNAME` | Draft model for speculative decoding | unused | Required when an explicit `draft` stage is used. |
|
||||
| `--draft-max, --draft, --draft-n N` | Global speculative draft cap, or fallback value for stages without an explicit `n_max` override | 16 | Also used by single-stage MTP and draft-model speculation. |
|
||||
| `--draft-min, --draft-n-min N` | Global minimum speculative draft threshold, or fallback value for stages without an explicit `n_min` override | 0 | |
|
||||
| `--draft-p-min P` | Global minimum speculative decoding probability (greedy), or fallback value for stages without an explicit `p_min` override | 0.8 | |
|
||||
| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat for the supported two-stage chain | none | Use stage-local keys like `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, and `suffix_corpus`. |
|
||||
|
||||
### Request-Level Speculative Overrides
|
||||
|
||||
|
||||
@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
|
||||
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
|
||||
|
||||
```
|
||||
llama-server [...] --spec-type ngram-simple --draft-max 64
|
||||
llama-server [...] --spec-type ngram-simple:n_max=64
|
||||
```
|
||||
|
||||
#### n-gram Map Key (`ngram-map-k`)
|
||||
|
||||
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
|
||||
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (stage key `ngram_min_hits`, default is 1) before generating drafts.
|
||||
|
||||
The number of accepted tokens is stored for each used n-gram.
|
||||
|
||||
**Example:**
|
||||
```
|
||||
llama-server [...] --spec-type ngram-map-k --draft-max 64
|
||||
llama-server [...] --spec-type ngram-map-k:n_max=64,ngram_min_hits=1
|
||||
```
|
||||
|
||||
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
|
||||
@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.
|
||||
|
||||
**Example:** Server options to be used if there are a lot of longer repetitions.
|
||||
```
|
||||
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
|
||||
llama-server [...] --spec-type ngram-map-k4v:n_max=64,ngram_size_n=8,ngram_size_m=8,ngram_min_hits=2
|
||||
```
|
||||
|
||||
### n-gram Mod (`ngram-mod`)
|
||||
@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
|
||||
# notes:
|
||||
# - small `n` are not recommended
|
||||
# - MoEs require long drafts
|
||||
# - dense models: can reduce `--draft-min` and `--draft-max`
|
||||
# - dense models: can reduce `n_min` and `n_max`
|
||||
|
||||
llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
|
||||
llama-server ... --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24
|
||||
```
|
||||
|
||||
Applications:
|
||||
@ -103,57 +103,78 @@ Example Video:
|
||||
|
||||
## Command-Line Options
|
||||
|
||||
If a draft model is combined with a draftless decoding method, the draftless method takes precedence.
|
||||
The canonical way to configure speculative decoding at startup is `--spec-type SPEC[:k=v,...]`, repeated once per stage. The legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with a message that points to the replacement.
|
||||
|
||||
```
|
||||
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
|
||||
(env: LLAMA_ARG_DRAFT_MAX)
|
||||
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
|
||||
(default: 0)
|
||||
(env: LLAMA_ARG_DRAFT_MIN)
|
||||
[...]
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
type of speculative decoding to use when no draft model is provided
|
||||
(default: none)
|
||||
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
|
||||
of lookup n-gram (default: 12)
|
||||
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
|
||||
of draft m-gram (default: 48)
|
||||
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
|
||||
```
|
||||
### `--spec-type SPEC[:k=v,...]`
|
||||
|
||||
### `--spec-type TYPE`
|
||||
|
||||
Specifies a type of speculative decoding without draft model.
|
||||
Each `--spec-type` entry defines one speculative stage. Repeat it to configure the supported two-stage path.
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `none` | No speculative decoding |
|
||||
| `draft` | Draft-model speculative decoding; pair with `-md/--model-draft` |
|
||||
| `mtp` | Embedded or assistant-backed MTP |
|
||||
| `ngram-cache` | Use n-gram cache lookup |
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
|
||||
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
|
||||
| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
|
||||
| `ngram-map-k` | Use n-gram pattern matching with n-gram keys |
|
||||
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram keys and up to four m-gram values |
|
||||
| `ngram-mod` | Use the shared n-gram hasher |
|
||||
| `suffix` | Use suffix-tree speculative decoding |
|
||||
|
||||
Canonical stage keys:
|
||||
|
||||
| Key | Meaning |
|
||||
|-----|---------|
|
||||
| `n_max` | Maximum drafted tokens for that stage |
|
||||
| `n_min` | Minimum usable drafted tokens for that stage |
|
||||
| `p_min` | Minimum speculative probability threshold |
|
||||
| `ngram_size_n` | Lookup n-gram size |
|
||||
| `ngram_size_m` | Draft m-gram size |
|
||||
| `ngram_min_hits` | Minimum matching hits for n-gram map stages |
|
||||
| `suffix_min_match_len` | Minimum suffix context match length |
|
||||
| `suffix_max_depth` | Maximum suffix-tree depth |
|
||||
| `suffix_corpus` | Optional suffix corpus file for pre-warming |
|
||||
|
||||
String-valued stage keys such as `suffix_corpus` need shell-safe quoting when the value contains commas. From a normal shell, quote the value inside the stage payload so the parser sees the comma as part of the string value.
|
||||
|
||||
Example shell-safe form:
|
||||
|
||||
**Example:** Server-instance used to refactor source code.
|
||||
```bash
|
||||
./llama-server [...] --spec-type ngram-simple
|
||||
./llama-server [...] \
|
||||
--spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'"
|
||||
```
|
||||
|
||||
### `--spec-ngram-size-n N`
|
||||
If you are constructing `argv` directly without shell unescaping, the parser also accepts escaped commas as `\,`.
|
||||
|
||||
Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
|
||||
The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.
|
||||
Examples:
|
||||
|
||||
### `--spec-ngram-size-m M`
|
||||
```bash
|
||||
# Single-stage MTP
|
||||
./llama-server [...] --spec-type mtp:n_max=1,p_min=0.0
|
||||
|
||||
Sets the size M of the draft m-gram for n-gram map based speculative decoding.
|
||||
The m-gram size determines how many tokens to draft when a match is found.
|
||||
Larger values can provide more speedup but may reduce acceptance rate.
|
||||
# Single-stage ngram-mod
|
||||
./llama-server [...] --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24
|
||||
|
||||
### `--spec-ngram-min-hits H`
|
||||
# Draft-model speculation
|
||||
./llama-server [...] --model-draft draft.gguf --spec-type draft:n_max=4,p_min=0.0
|
||||
|
||||
This option defines how often a key has to appear in the token history to be used as a draft (default is 1).
|
||||
# Two-stage self-spec -> MTP fallback
|
||||
./llama-server [...] \
|
||||
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
|
||||
--spec-type mtp:n_max=1,p_min=0.0
|
||||
|
||||
# Suffix stage with pre-warmed corpus
|
||||
./llama-server [...] \
|
||||
--spec-type suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus=/path/to/corpus.json
|
||||
|
||||
# Suffix stage with a comma-bearing corpus path from a normal shell
|
||||
./llama-server [...] \
|
||||
--spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'"
|
||||
```
|
||||
|
||||
### `--spec-autotune`
|
||||
|
||||
Autotunes the active stage parameters and reports the best configuration back as a canonical `--spec-type ...` snippet.
|
||||
|
||||
## Statistics
|
||||
Each speculative decoding implementation prints statistics.
|
||||
@ -180,4 +201,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
|
||||
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
|
||||
- `#acc tokens`: number of tokens accepted by the main model
|
||||
- `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance).
|
||||
|
||||
|
||||
@ -1232,7 +1232,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (!use_paired_gemma4_mtp && llama_model_is_gemma4_mtp_assistant(model) && !params.process_output) {
|
||||
fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> -mtp' for meaningful calibration.\n", __func__);
|
||||
fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> --spec-type mtp:n_max=1,p_min=0.0' for meaningful calibration.\n", __func__);
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
|
||||
@ -210,10 +210,10 @@ model:
|
||||
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
|
||||
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
|
||||
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
|
||||
--spec-stage SPEC[:k=v,...]
|
||||
explicit speculative stage. repeat once for a supported two-stage chain
|
||||
examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1
|
||||
supported two-stage shape: self-spec first, then mtp or draft fallback
|
||||
--spec-type SPEC[:k=v,...]
|
||||
canonical speculative stage entry; repeat for a supported two-stage chain
|
||||
examples: --spec-type mtp:n_max=1,p_min=0.0
|
||||
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0
|
||||
-mu, --model-url MODEL_URL model download url (default: unused)
|
||||
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
|
||||
-hff, --hf-file FILE Hugging Face model file (default: unused)
|
||||
@ -966,15 +966,15 @@ To know the `id` of the adapter, use GET `/lora-adapters`
|
||||
|
||||
### Composite speculative decoding
|
||||
|
||||
Use `--spec-stage` for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback.
|
||||
Use repeated `--spec-type SPEC[:k=v,...]` entries for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback.
|
||||
|
||||
Example with `ngram-mod` plus MTP fallback:
|
||||
|
||||
```bash
|
||||
./build/bin/llama-server \
|
||||
--model /models/target-mtp.gguf \
|
||||
--spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
|
||||
--spec-stage mtp:n_max=1,p_min=0.0
|
||||
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
|
||||
--spec-type mtp:n_max=1,p_min=0.0
|
||||
```
|
||||
|
||||
Example with `ngram-mod` plus draft-model fallback:
|
||||
@ -983,14 +983,13 @@ Example with `ngram-mod` plus draft-model fallback:
|
||||
./build/bin/llama-server \
|
||||
--model /models/target.gguf \
|
||||
--model-draft /models/draft.gguf \
|
||||
--spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
|
||||
--spec-stage draft:n_max=4,p_min=0.0
|
||||
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
|
||||
--spec-type draft:n_max=4,p_min=0.0
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- Use `--spec-type` when you want a single self-spec stage only.
|
||||
- `--spec-type` cannot be combined with `--spec-stage`.
|
||||
- Use `--spec-type` for both single-stage and two-stage startup configuration.
|
||||
- Explicit stage chains currently support at most two stages.
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
@ -166,7 +166,8 @@ static void server_reject_dead_speculative_request_overrides(const json & data)
|
||||
json_value_ptr(data, "speculative.ngram_size_m") != nullptr ||
|
||||
json_value_ptr(data, "speculative.ngram_min_hits") != nullptr ||
|
||||
json_value_ptr(data, "speculative.suffix_min_match_len") != nullptr ||
|
||||
json_value_ptr(data, "speculative.suffix_max_depth") != nullptr) {
|
||||
json_value_ptr(data, "speculative.suffix_max_depth") != nullptr ||
|
||||
json_value_ptr(data, "speculative.suffix_corpus") != nullptr) {
|
||||
throw std::runtime_error("Error: structural speculative overrides are startup-only; per-request overrides only support speculative.n_max, speculative.n_min, speculative.p_min, and speculative.stages");
|
||||
}
|
||||
}
|
||||
@ -284,7 +285,6 @@ bool server_context::load_model(const gpt_params& params_) {
|
||||
});
|
||||
|
||||
params_base.has_mtp = false;
|
||||
|
||||
server_remove_speculative_stage(params_base.speculative, COMMON_SPECULATIVE_TYPE_MTP);
|
||||
|
||||
if (!server_speculative_needs_draft_model(params_base.speculative)) {
|
||||
@ -1251,6 +1251,10 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
|
||||
// speculative decoding parameters
|
||||
try {
|
||||
slot.params.speculative = defaults.speculative;
|
||||
const bool has_flat_n_max = json_value_ptr(data, "speculative.n_max") != nullptr;
|
||||
const bool has_flat_n_min = json_value_ptr(data, "speculative.n_min") != nullptr;
|
||||
const bool has_flat_p_min = json_value_ptr(data, "speculative.p_min") != nullptr;
|
||||
|
||||
slot.params.speculative.n_max = json_value(data, "speculative.n_max", params_base.speculative.n_max);
|
||||
slot.params.speculative.n_min = json_value(data, "speculative.n_min", params_base.speculative.n_min);
|
||||
slot.params.speculative.p_min = json_value(data, "speculative.p_min", params_base.speculative.p_min);
|
||||
@ -1258,6 +1262,20 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
|
||||
server_reject_dead_speculative_request_overrides(data);
|
||||
|
||||
const json stages = json_value(data, "speculative.stages", json());
|
||||
if (stages.is_null() && !slot.params.speculative.stages.empty()) {
|
||||
for (auto & stage : slot.params.speculative.stages) {
|
||||
if (has_flat_n_max) {
|
||||
stage.n_max = -1;
|
||||
}
|
||||
if (has_flat_n_min) {
|
||||
stage.n_min = -1;
|
||||
}
|
||||
if (has_flat_p_min) {
|
||||
stage.p_min = -1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!stages.is_null()) {
|
||||
if (!stages.is_array()) {
|
||||
throw std::runtime_error("Error: speculative.stages must be an array");
|
||||
@ -1296,11 +1314,11 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
|
||||
|
||||
if (slot.can_speculate() &&
|
||||
llama_model_has_recurrent(model) &&
|
||||
slot.params.speculative.n_max > params_base.speculative.n_max) {
|
||||
slot.params.speculative.get_max_stage_n_max() > params_base.speculative.get_max_stage_n_max()) {
|
||||
send_error(task,
|
||||
"Error: speculative.n_max=" + std::to_string(slot.params.speculative.n_max) +
|
||||
" exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.n_max) +
|
||||
"; restart the server with a higher --draft-max to reserve checkpoint capacity",
|
||||
"Error: speculative n_max=" + std::to_string(slot.params.speculative.get_max_stage_n_max()) +
|
||||
" exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.get_max_stage_n_max()) +
|
||||
"; restart the server with a higher n_max inside the configured --spec-type stages to reserve checkpoint capacity",
|
||||
ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user