diff --git a/common/common.cpp b/common/common.cpp index 4261865a..7677d614 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -124,7 +124,16 @@ static int32_t common_speculative_stage_effective_n_min( std::vector common_params_speculative::get_resolved_stages() const { if (!stages.empty()) { - return stages; + std::vector resolved; + resolved.reserve(stages.size()); + + for (const auto & stage : stages) { + if (stage.type != COMMON_SPECULATIVE_TYPE_NONE) { + resolved.push_back(stage); + } + } + + return resolved; } if (type == COMMON_SPECULATIVE_TYPE_NONE) { @@ -164,6 +173,9 @@ common_params_speculative common_params_speculative::with_stage_overrides(const if (stage.has_suffix_max_depth_override()) { result.suffix_max_depth = stage.suffix_max_depth; } + if (stage.has_suffix_corpus_override()) { + result.suffix_corpus = stage.suffix_corpus; + } result.n_max = std::max(result.n_max, 0); result.n_min = std::max(0, std::min(result.n_min, result.n_max)); @@ -612,28 +624,20 @@ static void common_speculative_finalize_stages(gpt_params & params) { auto & spec = params.speculative; if (!spec.stages.empty()) { - spec.type = spec.stages.front().type; + const auto resolved = spec.get_resolved_stages(); + if (resolved.size() != spec.stages.size()) { + spec.stages = resolved; + } + + spec.type = resolved.empty() ? 
COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type; params.has_mtp = spec.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP); return; } - const bool wants_mtp = params.has_mtp; - const bool wants_draft = spec.has_dft(); - if (spec.type != COMMON_SPECULATIVE_TYPE_NONE) { spec.stages.push_back({ .type = spec.type }); - - if (common_speculative_type_is_self_spec(spec.type)) { - if (wants_mtp) { - spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP }); - } else if (wants_draft) { - spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT }); - } - } - } else if (wants_mtp) { + } else if (params.has_mtp) { spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP }); - } else if (wants_draft) { - spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT }); } spec.type = spec.stages.empty() ? COMMON_SPECULATIVE_TYPE_NONE : spec.stages.front().type; @@ -827,13 +831,16 @@ static std::string common_normalize_spec_stage_key(std::string key) { std::replace(key.begin(), key.end(), '-', '_'); - if (key.rfind("spec_", 0) == 0) { - key.erase(0, 5); - } - return key; } +static std::invalid_argument common_speculative_legacy_option_error( + const std::string & arg, + const std::string & replacement) { + return std::invalid_argument( + "legacy speculative option '" + arg + "' is disabled; use " + replacement); +} + static void common_speculative_remove_explicit_stage(common_params_speculative & params, common_speculative_type type) { params.stages.erase(std::remove_if(params.stages.begin(), params.stages.end(), [type](const common_speculative_stage_params & stage) { return stage.type == type; @@ -850,21 +857,21 @@ static void common_speculative_stage_apply_kv( const std::string & value_raw) { const std::string key = common_normalize_spec_stage_key(key_raw); - if (key == "draft" || key == "draft_max" || key == "draft_n" || key == "n_max") { + if (key == "n_max") { stage.n_max = std::stoi(value_raw); if (stage.n_max < 0) { throw std::invalid_argument("speculative stage 
n_max must be >= 0"); } return; } - if (key == "draft_min" || key == "draft_n_min" || key == "n_min") { + if (key == "n_min") { stage.n_min = std::stoi(value_raw); if (stage.n_min < 0) { throw std::invalid_argument("speculative stage n_min must be >= 0"); } return; } - if (key == "draft_p_min" || key == "p_min") { + if (key == "p_min") { stage.p_min = std::stof(value_raw); if (stage.p_min < 0.0f) { throw std::invalid_argument("speculative stage p_min must be >= 0"); @@ -892,7 +899,7 @@ static void common_speculative_stage_apply_kv( } return; } - if (key == "suffix_min_match_len" || key == "suffix_pattern_len") { + if (key == "suffix_min_match_len") { stage.suffix_min_match_len = std::stoi(value_raw); if (stage.suffix_min_match_len < 1) { throw std::invalid_argument("speculative stage suffix_min_match_len must be at least 1"); @@ -906,10 +913,100 @@ static void common_speculative_stage_apply_kv( } return; } + if (key == "suffix_corpus") { + stage.suffix_corpus = value_raw; + if (stage.suffix_corpus.empty()) { + throw std::invalid_argument("speculative stage suffix_corpus must not be empty"); + } + return; + } throw std::invalid_argument("unknown speculative stage parameter: " + key_raw); } +static std::vector common_speculative_stage_split_kvs(const std::string & values) { + std::vector result; + std::string current; + char quote = '\0'; + bool escaped = false; + + for (char ch : values) { + if (escaped) { + current += ch; + escaped = false; + continue; + } + + if (ch == '\\') { + current += ch; + escaped = true; + continue; + } + + if (quote != '\0') { + if (ch == quote) { + quote = '\0'; + } + current += ch; + continue; + } + + if ((ch == '\'' || ch == '"') && !current.empty() && current.back() == '=') { + quote = ch; + current += ch; + continue; + } + + if (ch == ',') { + result.push_back(current); + current.clear(); + continue; + } + + current += ch; + } + + if (quote != '\0') { + throw std::invalid_argument("invalid speculative stage option list: unterminated 
quote"); + } + + result.push_back(current); + return result; +} + +static std::string common_speculative_stage_unescape_value(const std::string & value_raw) { + std::string value = value_raw; + if (value.size() >= 2) { + const char first = value.front(); + const char last = value.back(); + if ((first == '\'' && last == '\'') || (first == '"' && last == '"')) { + value = value.substr(1, value.size() - 2); + } + } + + std::string result; + result.reserve(value.size()); + + for (size_t i = 0; i < value.size(); ++i) { + const char ch = value[i]; + if (ch != '\\' || i + 1 >= value.size()) { + result += ch; + continue; + } + + const char next = value[i + 1]; + if (next == '\\' || next == ',' || next == '\'' || next == '"') { + result += next; + ++i; + continue; + } + + result += ch; + } + + return result; +} + static common_speculative_stage_params common_speculative_stage_from_arg(const std::string & value) { const auto spec_pos = value.find(':'); const std::string type_name = value.substr(0, spec_pos); @@ -924,15 +1021,13 @@ static common_speculative_stage_params common_speculative_stage_from_arg(const s return stage; } - std::stringstream ss(value.substr(spec_pos + 1)); - std::string kv; - while (std::getline(ss, kv, ',')) { + for (const std::string & kv : common_speculative_stage_split_kvs(value.substr(spec_pos + 1))) { const auto eq_pos = kv.find('='); if (eq_pos == std::string::npos) { throw std::invalid_argument("invalid speculative stage option: " + kv); } - common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), kv.substr(eq_pos + 1)); + common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), common_speculative_stage_unescape_value(kv.substr(eq_pos + 1))); } return stage; @@ -1379,18 +1474,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--draft" || arg == "--draft-max" || arg == "--draft-n") { CHECK_ARG - params.speculative.n_max = std::stoi(argv[i]); - return true; + throw 
common_speculative_legacy_option_error(arg, + "the value inside the relevant repeated --spec-type entry, e.g. --spec-type mtp:n_max=" + std::string(argv[i]) + ",p_min=0.0 or --spec-type draft:n_max=" + std::string(argv[i]) + ",p_min=0.0"); } if (arg == "--draft-min" || arg == "--draft-n-min") { CHECK_ARG - params.speculative.n_min = std::stoi(argv[i]); - return true; + throw common_speculative_legacy_option_error(arg, + "the value inside the relevant repeated --spec-type entry using the canonical key n_min, e.g. --spec-type ngram-mod:n_min=" + std::string(argv[i])); } if (arg == "--draft-p-min") { CHECK_ARG - params.speculative.p_min = std::stof(argv[i]); - return true; + throw common_speculative_legacy_option_error(arg, + "the value inside the relevant repeated --spec-type entry using the canonical key p_min, e.g. --spec-type mtp:p_min=" + std::string(argv[i])); } if (arg == "--recurrent-ckpt-mode") { CHECK_ARG @@ -1445,89 +1540,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--spec-stage") { CHECK_ARG - - if (params.speculative.stages.empty()) { - if (params.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) { - throw std::invalid_argument("--spec-stage cannot be combined with --spec-type; use only --spec-stage for explicit stage chains"); - } - if (params.has_mtp) { - throw std::invalid_argument("--spec-stage cannot be combined with -mtp/--multi-token-prediction; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]"); - } - } - - params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i])); - if (params.speculative.stages.size() == 1) { - params.speculative.type = params.speculative.stages.front().type; - } - params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP); - return true; + throw common_speculative_legacy_option_error(arg, + "repeated --spec-type SPEC[:k=v,...] entries, e.g. 
--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0"); } if (arg == "--spec-type") { CHECK_ARG - if (!params.speculative.stages.empty()) { - throw std::invalid_argument("--spec-type cannot be combined with --spec-stage; use only --spec-stage for explicit stage chains"); - } - - const auto type = common_speculative_type_from_name(argv[i]); - if (type == COMMON_SPECULATIVE_TYPE_NONE || type == COMMON_SPECULATIVE_TYPE_MTP || common_speculative_type_is_self_spec(type)) { - params.speculative.type = type; - if (type == COMMON_SPECULATIVE_TYPE_MTP) { - params.has_mtp = true; - } - } else { - throw std::invalid_argument("unknown speculative decoding type"); - } + params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i])); + const auto resolved = params.speculative.get_resolved_stages(); + params.speculative.type = resolved.empty() ? COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type; + params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP); return true; } if (arg == "--spec-ngram-size-n") { CHECK_ARG - int value = std::stoi(argv[i]); - if (value < 1 || value > 1024) { - throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive"); - } - params.speculative.ngram_size_n = value; - return true; + throw common_speculative_legacy_option_error(arg, + "the canonical stage key inside --spec-type, e.g. --spec-type ngram-mod:ngram_size_n=" + std::string(argv[i])); } if (arg == "--spec-ngram-size-m") { CHECK_ARG - int value = std::stoi(argv[i]); - if (value < 1 || value > 1024) { - throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive"); - } - params.speculative.ngram_size_m = value; - return true; + throw common_speculative_legacy_option_error(arg, + "the canonical stage key inside --spec-type, e.g. 
--spec-type ngram-map-k4v:ngram_size_m=" + std::string(argv[i])); } if (arg == "--spec-ngram-min-hits") { CHECK_ARG - int value = std::stoi(argv[i]); - if (value < 1) { - throw std::invalid_argument("ngram min hits must be at least 1"); - } - params.speculative.ngram_min_hits = value; - return true; + throw common_speculative_legacy_option_error(arg, + "the canonical stage key inside --spec-type, e.g. --spec-type ngram-map-k4v:ngram_min_hits=" + std::string(argv[i])); } if (arg == "--suffix-pattern-len") { CHECK_ARG - int value = std::stoi(argv[i]); - if (value < 1) { - throw std::invalid_argument("suffix pattern length must be at least 1"); - } - params.speculative.suffix_min_match_len = value; - return true; + throw common_speculative_legacy_option_error(arg, + "the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_min_match_len=" + std::string(argv[i])); } if (arg == "--suffix-max-depth") { CHECK_ARG - int value = std::stoi(argv[i]); - if (value < 1) { - throw std::invalid_argument("suffix max depth must be at least 1"); - } - params.speculative.suffix_max_depth = value; - return true; + throw common_speculative_legacy_option_error(arg, + "the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_max_depth=" + std::string(argv[i])); } if (arg == "--suffix-corpus") { CHECK_ARG - params.speculative.suffix_corpus = argv[i]; - return true; + throw common_speculative_legacy_option_error(arg, + "the canonical stage key inside --spec-type, e.g. 
--spec-type suffix:suffix_corpus=" + std::string(argv[i])); } if (arg == "-a" || arg == "--alias") { CHECK_ARG @@ -1976,17 +2028,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-mtp" || arg == "--multi-token-prediction") { - if (!params.speculative.stages.empty()) { - throw std::invalid_argument("-mtp/--multi-token-prediction cannot be combined with --spec-stage; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]"); - } - - params.has_mtp = true; - return true; + throw common_speculative_legacy_option_error(arg, + "--spec-type mtp:n_max=1,p_min=0.0"); } if (arg == "-no-mtp" || arg == "--no-multi-token-prediction") { - params.has_mtp = false; - common_speculative_remove_explicit_stage(params.speculative, COMMON_SPECULATIVE_TYPE_MTP); - return true; + throw common_speculative_legacy_option_error(arg, + "remove the mtp entry from repeated --spec-type arguments"); } if (arg == "-draft" || arg == "--draft-params") { CHECK_ARG @@ -3172,29 +3219,20 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" }); - options.push_back({ "*", "-mtp, --multi-token-prediction", "legacy shortcut for enabling MTP when --spec-stage is not used (default: %s)", params.has_mtp ? "true" : "false" }); - options.push_back({ "*", "-no-mtp, --no-multi-token-prediction", "disable the legacy MTP shortcut or remove an explicit MTP stage (default: %s)", !params.has_mtp ? 
"true" : "false" }); - options.push_back({ "*", "--draft-max, --draft, --draft-n N", - "global default number of tokens to draft for speculative decoding or for stages without an explicit n_max override (default: %d)", params.speculative.n_max }); - options.push_back({ "*", "--draft-min, --draft-n-min N", "global default minimum draft threshold or fallback threshold for stages without an explicit n_min override" }); - options.push_back({ "*", "--draft-p-min P", "global default minimum speculative decoding probability (greedy) for stages without an explicit p_min override (default: %.1f)", (double)params.speculative.p_min }); options.push_back({ "*", "--recurrent-ckpt-mode MODE", "checkpoint strategy for recurrent/hybrid speculative decoding\n" " auto auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise (default)\n" " per-step save SSM state per draft step in VRAM; no re-decode on rejection\n" " gpu-fallback copy state to GPU buffer; re-decode on rejection\n" " cpu serialise state via llama_state_seq; re-decode on rejection" }); - options.push_back({ "*", "--spec-stage SPEC[:k=v,...]", "explicit speculative stage. 
repeat once for a supported two-stage chain.\n" - "examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1\n" - "supported two-stage shape in this PR: self-spec first, then mtp or draft fallback" }); - options.push_back({ "*", "--spec-type Name [none | mtp | ngram-cache | ngram-simple | ngram-map-k | ngram-map-k4v | ngram-mod | suffix]", "single-stage speculative selection when --spec-stage is not used (default: %d)\n", (int)params.speculative.type}); - options.push_back({ "*", "--spec-ngram-size-n N", "ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)\n",params.speculative.ngram_size_n }); - - options.push_back({ "*", "--spec-ngram-size-m N", "ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)\n", params.speculative.ngram_size_m }); - - options.push_back({ "*", "--spec-ngram-min-hits N", "minimum hits for ngram-map speculative decoding (default: %d)\n", params.speculative.ngram_min_hits }); - options.push_back({ "*", "--suffix-pattern-len N", "minimum context match length for suffix decoding (default: %d)", params.speculative.suffix_min_match_len }); - options.push_back({ "*", "--suffix-max-depth N", "suffix tree maximum depth for suffix decoding (default: %d)", params.speculative.suffix_max_depth }); - options.push_back({ "*", "--suffix-corpus PATH", "corpus file to pre-warm the suffix tree: .json (array of strings or conversation messages) or .bin (raw int32 token IDs)" }); + options.push_back({ "*", "--spec-type SPEC[:k=v,...]", "canonical speculative stage entry; repeat for a supported two-stage chain.\n" + "types: none, draft, mtp, ngram-cache, ngram-simple, ngram-map-k, ngram-map-k4v, ngram-mod, suffix\n" + "canonical keys: n_max,n_min,p_min,ngram_size_n,ngram_size_m,ngram_min_hits,suffix_min_match_len,suffix_max_depth,suffix_corpus\n" + "for comma-bearing string values, quote the value inside the stage payload for normal shell use\n" + "if 
argv is passed directly without shell unescaping, the parser also accepts escaped commas as \\,\n" + "examples: --spec-type mtp:n_max=1,p_min=0.0\n" + " --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0\n" + " --spec-type \"suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'\"\n" + "legacy --spec-stage, --draft-*, --spec-ngram-*, --suffix-* and -mtp flags are rejected" }); options.push_back({ "*", "--spec-autotune", "automatically tune speculative params to maximize tokens/sec" }); options.push_back({ "retrieval" }); diff --git a/common/common.h b/common/common.h index 1ec0f095..0125ccd5 100644 --- a/common/common.h +++ b/common/common.h @@ -169,6 +169,7 @@ struct common_speculative_stage_params { int32_t suffix_min_match_len = -1; int32_t suffix_max_depth = -1; + std::string suffix_corpus; bool has_n_max_override() const { return n_max >= 0; } bool has_n_min_override() const { return n_min >= 0; } @@ -178,6 +179,7 @@ struct common_speculative_stage_params { bool has_ngram_min_hits_override() const { return ngram_min_hits > 0; } bool has_suffix_min_match_len_override() const { return suffix_min_match_len >= 0; } bool has_suffix_max_depth_override() const { return suffix_max_depth >= 0; } + bool has_suffix_corpus_override() const { return !suffix_corpus.empty(); } }; struct common_params_model { @@ -517,7 +519,7 @@ struct gpt_params { bool do_checkpoint = false; // do checkpoint for recurrent models only int32_t ctx_checkpoints_n = 32; // max number of context checkpoints per slot int32_t ctx_checkpoints_interval = 512; // minimum number of tokens between each context checkpoints - int32_t ctx_checkpoints_tolerance = 5; // the number of tokens before the full prompt to create the checkpoint + int32_t ctx_checkpoints_tolerance = 5; // the number of tokens before the full prompt to create the checkpoint int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, 
etc. int32_t cache_ram_n_min = 0; // min number of tokens required to save in the ram float cache_ram_similarity = 0.5f; // similarity of tokens to cached tokens diff --git a/common/spec-tuner.cpp b/common/spec-tuner.cpp index 80427d41..52733877 100644 --- a/common/spec-tuner.cpp +++ b/common/spec-tuner.cpp @@ -357,20 +357,15 @@ void spec_tuner::print_best() const { { std::ostringstream oss; - oss << "Autotune reuse: "; + oss << "Autotune reuse: --spec-type " << common_speculative_type_to_str(spec_type); + bool first_kv = true; for (const auto & coord : coords) { bool is_int = (coord.name != "p_min"); - if (coord.name == "n_max") oss << "--draft-max "; - else if (coord.name == "p_min") oss << "--draft-p-min "; - else if (coord.name == "n_min") oss << "--draft-min "; - else if (coord.name == "ngram_size_n") oss << "--spec-ngram-size-n "; - else if (coord.name == "ngram_size_m") oss << "--spec-ngram-size-m "; - else if (coord.name == "ngram_min_hits") oss << "--spec-ngram-min-hits "; - else if (coord.name == "suffix_min_match_len") oss << "--suffix-pattern-len "; - else oss << "--" << coord.name << " "; + oss << (first_kv ? 
':' : ',') << coord.name << '='; + first_kv = false; - if (is_int) oss << (int)coord.arms[coord.best_idx].value << " "; - else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value << " "; + if (is_int) oss << (int)coord.arms[coord.best_idx].value; + else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value; } LOG_INF("%s\n", oss.str().c_str()); } diff --git a/common/speculative.cpp b/common/speculative.cpp index 2341bb6c..758202ac 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1160,7 +1160,7 @@ common_speculative * common_speculative_init( }); if (has_draft_stage) { - LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use -mtp/--spec-stage mtp for assistant-backed MTP\n", __func__); + LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use --spec-type mtp:n_max=1,p_min=0.0 for assistant-backed MTP\n", __func__); return nullptr; } } diff --git a/common/suffix-tree.cpp b/common/suffix-tree.cpp index 6e0c0691..09c2ff3c 100644 --- a/common/suffix-tree.cpp +++ b/common/suffix-tree.cpp @@ -209,7 +209,7 @@ static bool suffix_corpus_check_limit(const std::string & path, size_t n_tokens, return true; } - LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or --suffix-max-depth\n", + LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or lower suffix_max_depth inside --spec-type suffix:suffix_max_depth=...\n", path.c_str(), (unsigned long long) estimated_work, (unsigned long long) SUFFIX_CORPUS_MAX_INSERT_WORK, diff --git a/docs/parameters.md b/docs/parameters.md index 77ec4fa2..b7847ded 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -65,7 +65,7 @@ Some often used terms. 
| - | - | - | - | | `-h, --help, --usage` | Print usage and exit | - | - | | `--fit` | Automatically fit to available VRAM | off | Loads as many tensors to the GPU(s) as available VRAM will permit. [PR 1501](https://github.com/ikawrakow/ik_llama.cpp/pull/1501) [PR 1504](https://github.com/ikawrakow/ik_llama.cpp/pull/1504) | -| `--fit-margin N` | Safety VRAM margin in MiB when using `--fit` | 1024 | Increase this value in case of CUDA OOM when loading the model. Decrease to less than 1024 if the model loads successfully and you feel that too much VRAM has been left unused | +| `--fit-margin N` | Safety VRAM margin in MiB when using `--fit` | 1024 | Increase this value in case of CUDA OOM when loading the model. Decrease to less than 1024 if the model loads successfully and you feel that too much VRAM has been left unused | | `-wgt, --worst-graph-tokens N` | Number of tokens to use for worst-case graph | - | Control compute buffer sizes for large batches. Provided "as is" for users that understand the limitations, please don't open issues when using this. [PR 1560](https://github.com/ikawrakow/ik_llama.cpp/pull/1560) | | `-t, --threads N` | Number of threads to use during generation | 4 | Try to match the number of physical CPU cores. Avoid odd numbers (e.g. 1,3,...). | | `-tb, --threads-batch N` | Number of threads to use during batch and prompt processing | Same as `--threads` | Same as `--threads` When doing full GPU offload, use a lower number (e.g. 2) | @@ -120,21 +120,13 @@ Check the details [here](./speculative.md). 
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model | - | For draft model, see: `-ctk` | | `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model | - | For draft model, see: `-ctk` | | `-draft, --draft-params` | Comma-separated list of draft model parameters | - | | -| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram| 12 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) | -| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram | 48 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) | -| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding | 1 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) | -| `--spec-type Name` | Comma-separated list of draft model parameters | - | none / ngram - cache / ngram - simple / ngram - map - k / ngram - map - k4v / ngram - mod / suffix [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) [PR 1646](https://github.com/ikawrakow/ik_llama.cpp/pull/1646) | -| `--spec-stage SPEC[:k=v,...]` | Add an explicit speculative stage; repeat once for a supported two-stage chain | - | Supported two-stage shape: self-spec first, then `mtp` or `draft` fallback. 
[PR 1789](https://github.com/ikawrakow/ik_llama.cpp/pull/1789) | -| `-mtp, --multi-token-prediction` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) | -| `-no-mtp, --no-multi-token-prediction` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) | -| `--draft-max` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) | -| `--draft-p-min` | | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) | +| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat to configure the supported two-stage chain | - | Types: `none`, `draft`, `mtp`, `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, `suffix`. Canonical keys: `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, `suffix_corpus`. String values may escape commas as `\,` or quote the value inside the stage payload. 
Example: `--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0` | | `--spec-autotune` | Automatically tune speculative params to maximize tokens/sec | - | Automatically determines the near-optimal arguments for the type of speculation being performed [PR 1595](https://github.com/ikawrakow/ik_llama.cpp/pull/1595) | | `--recurrent-ckpt-mode MODE` | Checkpoint strategy for recurrent/hybrid speculative decoding | auto | One of: - `auto` auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise - `per-step` save SSM state per draft step in VRAM; no re-decode on rejection - `gpu-fallback` copy state to GPU buffer; re-decode on rejection - `cpu` serialise state via llama_state_seq; re-decode on rejection [PR 1669](https://github.com/ikawrakow/ik_llama.cpp/pull/1669) [PR 1774](https://github.com/ikawrakow/ik_llama.cpp/pull/1774) | Notes: -- `--spec-type` cannot be combined with `--spec-stage`. +- Legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with replacement guidance. - Explicit stage chains currently support at most two stages. - Supported self-spec stage names are `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, and `suffix`. - Composite stage chains disable speculative autotune. @@ -318,7 +310,7 @@ python3 gguf-py/scripts/gguf_dump.py /models/Qwen_Qwen3-0.6B-IQ4_NL.gguf - `-ngl`, `-ot`, `--cpu-moe`, `--n-cpu-moe N` - For MoE models, use a number greater than the number of model layers with `-ngl`. If unsure, use a large number like `-ngl 999`. - - It's good to explicitly put up/down/gate onto the GPU for speedups. + - It's good to explicitly put up/down/gate onto the GPU for speedups. - Up/Gate shouldn't be on separate GPU devices because it might cause a bit of a deadlock. - For models with shared experts (like GPT-OSS), they should end up on GPU. 
- In some quants the layers aren't uniform so it can be better to skip larger layers if more smaller blocks will fit without empty space where nothing fits. @@ -328,7 +320,7 @@ python3 gguf-py/scripts/gguf_dump.py /models/Qwen_Qwen3-0.6B-IQ4_NL.gguf - In general, in a single GPU + CPU system, you just do something like this: `-ngl 999` To put all layers in VRAM by default - + `-ot "blk.(?:[0-9]|[1-7][0-9]|[8][0-7]).ffn._exps.=CPU"` To create exceptions and put back in ram anything that has "ffn" and "_exps" in its name, and that sits in layers called "blk.n", where "n" (the lawyer number) is any match between 0 and 9, or between 1 to 7 + 0 to 9 (aka a number between 10 and 79), or 8 + 0 to 7 (aka a number between 80 and 87). Basically a complicated way of saying put all experts from layer 0 to 87 in ram. Experts from layer 88 to 93 (there's 93 layers in qwen3vl 235b) can sit in VRAM still. (Thats all I can load on a 5090). @@ -342,7 +334,7 @@ C. Other tips - If you are not happy with the allocations done by `--fit` across GPUs, use `-ts` to manually tweak. - Look for `ReBAR`/`Resizable BAR` support for your Motherboard, CPU, BIOS/UEFI and GPU. Then for the "patched driver" for your GPUs to enable GPU to GPU direct communication. -### Common GPU configurations and popular models +### Common GPU configurations and popular models WIP @@ -378,9 +370,7 @@ WIP | `--override-kv KEY=TYPE:VALUE` | Override model metadata by key | - | Advanced option to override model metadata by key. May be specified multiple times. types: int, float, bool, str. Example: `--override-kv tokenizer.ggml.add_bos_token=bool:false` | | `-m, --model FNAME` | Model path | models/$filename | Mandatory, the GGUF model file to be served. | | `-md, --model-draft FNAME` | Draft model for speculative decoding | unused | Required when an explicit `draft` stage is used. 
| -| `--draft-max, --draft, --draft-n N` | Global speculative draft cap, or fallback value for stages without an explicit `n_max` override | 16 | Also used by single-stage MTP and draft-model speculation. | -| `--draft-min, --draft-n-min N` | Global minimum speculative draft threshold, or fallback value for stages without an explicit `n_min` override | 0 | | -| `--draft-p-min P` | Global minimum speculative decoding probability (greedy), or fallback value for stages without an explicit `p_min` override | 0.8 | | +| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat for the supported two-stage chain | none | Use stage-local keys like `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, and `suffix_corpus`. | ### Request-Level Speculative Overrides diff --git a/docs/speculative.md b/docs/speculative.md index 29da3328..6665dc3a 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM. This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead. ``` -llama-server [...] --spec-type ngram-simple --draft-max 64 +llama-server [...] --spec-type ngram-simple:n_max=64 ``` #### n-gram Map Key (`ngram-map-k`) -This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts. +This implementation looks for the current n-gram of size n (called the _key_) in the token history. 
If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (stage key `ngram_min_hits`, default is 1) before generating drafts. The number of accepted tokens is stored for each used n-gram. **Example:** ``` -llama-server [...] --spec-type ngram-map-k --draft-max 64 +llama-server [...] --spec-type ngram-map-k:n_max=64,ngram_min_hits=1 ``` #### n-gram Map Key-4-Values (`ngram-map-k4v`) @@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram. **Example:** Server options to be used if there are a lot of longer repetitions. ``` -llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64 +llama-server [...] --spec-type ngram-map-k4v:n_max=64,ngram_size_n=8,ngram_size_m=8,ngram_min_hits=2 ``` ### n-gram Mod (`ngram-mod`) @@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re # notes: # - small `n` are not recommended # - MoEs require long drafts -# - dense models: can reduce `--draft-min` and `--draft-max` +# - dense models: can reduce `n_min` and `n_max` -llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64 +llama-server ... --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24 ``` Applications: @@ -103,57 +103,78 @@ Example Video: ## Command-Line Options -If a draft model is combined with a draftless decoding the draftless decoding has higher precedence. +The canonical startup surface is repeated `--spec-type SPEC[:k=v,...]`. Legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with replacement guidance. 
-``` ---draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16) - (env: LLAMA_ARG_DRAFT_MAX) ---draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding - (default: 0) - (env: LLAMA_ARG_DRAFT_MIN) -[...] ---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] - type of speculative decoding to use when no draft model is provided - (default: none) ---spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length - of lookup n-gram (default: 12) ---spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length - of draft m-gram (default: 48) ---spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1) -``` +### `--spec-type SPEC[:k=v,...]` -### `--spec-type TYPE` - -Specifies a type of speculative decoding without draft model. +Each `--spec-type` entry defines one speculative stage. Repeat it to configure the supported two-stage path. 
| Type | Description | |------|-------------| -| `none` | No speculative decoding (default) | +| `none` | No speculative decoding | +| `draft` | Draft-model speculative decoding; pair with `-md/--model-draft` | +| `mtp` | Embedded or assistant-backed MTP | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | -| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys | -| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) | -| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool | +| `ngram-map-k` | Use n-gram pattern matching with n-gram keys | +| `ngram-map-k4v` | Use n-gram pattern matching with n-gram keys and up to four m-gram values | +| `ngram-mod` | Use the shared n-gram hasher | +| `suffix` | Use suffix-tree speculative decoding | + +Canonical stage keys: + +| Key | Meaning | +|-----|---------| +| `n_max` | Maximum drafted tokens for that stage | +| `n_min` | Minimum usable drafted tokens for that stage | +| `p_min` | Minimum speculative probability threshold | +| `ngram_size_n` | Lookup n-gram size | +| `ngram_size_m` | Draft m-gram size | +| `ngram_min_hits` | Minimum matching hits for n-gram map stages | +| `suffix_min_match_len` | Minimum suffix context match length | +| `suffix_max_depth` | Maximum suffix-tree depth | +| `suffix_corpus` | Optional suffix corpus file for pre-warming | + +String-valued stage keys such as `suffix_corpus` need shell-safe quoting when the value contains commas. From a normal shell, quote the value inside the stage payload so the parser sees the comma as part of the string value. + +Example shell-safe form: -**Example:** Server-instance used to refactor source code. ```bash -./llama-server [...] --spec-type ngram-simple +./llama-server [...] 
\ + --spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'" ``` -### `--spec-ngram-size-n N` +If you are constructing `argv` directly without shell unescaping, the parser also accepts escaped commas as `\,`. -Sets the size N of the lookup n-gram for n-gram map based speculative decoding. -The n-gram size N determines how many tokens in a row to look back when searching for matching patterns. +Examples: -### `--spec-ngram-size-m M` +```bash +# Single-stage MTP +./llama-server [...] --spec-type mtp:n_max=1,p_min=0.0 -Sets the size M of the draft m-gram for n-gram map based speculative decoding. -The m-gram size determines how many tokens to draft when a match is found. -Larger values can provide more speedup but may reduce acceptance rate. +# Single-stage ngram-mod +./llama-server [...] --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24 -### `--spec-ngram-min-hits H` +# Draft-model speculation +./llama-server [...] --model-draft draft.gguf --spec-type draft:n_max=4,p_min=0.0 -This option defines how often a key has to appear in the token history to be used as a draft (default is 1). +# Two-stage self-spec -> MTP fallback +./llama-server [...] \ + --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \ + --spec-type mtp:n_max=1,p_min=0.0 + +# Suffix stage with pre-warmed corpus +./llama-server [...] \ + --spec-type suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus=/path/to/corpus.json + +# Suffix stage with a comma-bearing corpus path from a normal shell +./llama-server [...] \ + --spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'" +``` + +### `--spec-autotune` + +Autotunes the active stage parameters and reports the best configuration back as a canonical `--spec-type ...` snippet. ## Statistics Each speculative decoding implementation prints statistics. 
@@ -180,4 +201,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens) - `#acc tokens`: number of tokens accepted by the main model - `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance). - diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index cff41565..0cf70853 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1232,7 +1232,7 @@ int main(int argc, char ** argv) { } if (!use_paired_gemma4_mtp && llama_model_is_gemma4_mtp_assistant(model) && !params.process_output) { - fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m -md -mtp' for meaningful calibration.\n", __func__); + fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m -md --spec-type mtp:n_max=1,p_min=0.0' for meaningful calibration.\n", __func__); } const int n_ctx_train = llama_n_ctx_train(model); diff --git a/examples/server/README.md b/examples/server/README.md index be3a7f74..a56aeecd 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -210,10 +210,10 @@ model: -m, --model FNAME model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) -md, --model-draft FNAME draft model for speculative decoding (default: unused) - --spec-stage SPEC[:k=v,...] - explicit speculative stage. repeat once for a supported two-stage chain - examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1 - supported two-stage shape: self-spec first, then mtp or draft fallback + --spec-type SPEC[:k=v,...]
+ canonical speculative stage entry; repeat for a supported two-stage chain + examples: --spec-type mtp:n_max=1,p_min=0.0 + --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0 -mu, --model-url MODEL_URL model download url (default: unused) -hfr, --hf-repo REPO Hugging Face model repository (default: unused) -hff, --hf-file FILE Hugging Face model file (default: unused) @@ -966,15 +966,15 @@ To know the `id` of the adapter, use GET `/lora-adapters` ### Composite speculative decoding -Use `--spec-stage` for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback. +Use repeated `--spec-type SPEC[:k=v,...]` entries for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback. Example with `ngram-mod` plus MTP fallback: ```bash ./build/bin/llama-server \ --model /models/target-mtp.gguf \ - --spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \ - --spec-stage mtp:n_max=1,p_min=0.0 + --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \ + --spec-type mtp:n_max=1,p_min=0.0 ``` Example with `ngram-mod` plus draft-model fallback: @@ -983,14 +983,13 @@ Example with `ngram-mod` plus draft-model fallback: ./build/bin/llama-server \ --model /models/target.gguf \ --model-draft /models/draft.gguf \ - --spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \ - --spec-stage draft:n_max=4,p_min=0.0 + --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \ + --spec-type draft:n_max=4,p_min=0.0 ``` Notes: -- Use `--spec-type` when you want a single self-spec stage only. -- `--spec-type` cannot be combined with `--spec-stage`. +- Use `--spec-type` for both single-stage and two-stage startup configuration. - Explicit stage chains currently support at most two stages. 
### Change system prompt on runtime diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index 444ed2ce..4b1499f1 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -166,7 +166,8 @@ static void server_reject_dead_speculative_request_overrides(const json & data) json_value_ptr(data, "speculative.ngram_size_m") != nullptr || json_value_ptr(data, "speculative.ngram_min_hits") != nullptr || json_value_ptr(data, "speculative.suffix_min_match_len") != nullptr || - json_value_ptr(data, "speculative.suffix_max_depth") != nullptr) { + json_value_ptr(data, "speculative.suffix_max_depth") != nullptr || + json_value_ptr(data, "speculative.suffix_corpus") != nullptr) { throw std::runtime_error("Error: structural speculative overrides are startup-only; per-request overrides only support speculative.n_max, speculative.n_min, speculative.p_min, and speculative.stages"); } } @@ -284,7 +285,6 @@ bool server_context::load_model(const gpt_params& params_) { }); params_base.has_mtp = false; - server_remove_speculative_stage(params_base.speculative, COMMON_SPECULATIVE_TYPE_MTP); if (!server_speculative_needs_draft_model(params_base.speculative)) { @@ -1251,6 +1251,10 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) // speculative decoding parameters try { slot.params.speculative = defaults.speculative; + const bool has_flat_n_max = json_value_ptr(data, "speculative.n_max") != nullptr; + const bool has_flat_n_min = json_value_ptr(data, "speculative.n_min") != nullptr; + const bool has_flat_p_min = json_value_ptr(data, "speculative.p_min") != nullptr; + slot.params.speculative.n_max = json_value(data, "speculative.n_max", params_base.speculative.n_max); slot.params.speculative.n_min = json_value(data, "speculative.n_min", params_base.speculative.n_min); slot.params.speculative.p_min = json_value(data, "speculative.p_min", params_base.speculative.p_min); @@ -1258,6 +1262,20 @@ bool 
server_context::launch_slot_with_task(server_slot& slot, server_task& task) server_reject_dead_speculative_request_overrides(data); const json stages = json_value(data, "speculative.stages", json()); + if (stages.is_null() && !slot.params.speculative.stages.empty()) { + for (auto & stage : slot.params.speculative.stages) { + if (has_flat_n_max) { + stage.n_max = -1; + } + if (has_flat_n_min) { + stage.n_min = -1; + } + if (has_flat_p_min) { + stage.p_min = -1.0f; + } + } + } + if (!stages.is_null()) { if (!stages.is_array()) { throw std::runtime_error("Error: speculative.stages must be an array"); @@ -1296,11 +1314,11 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) if (slot.can_speculate() && llama_model_has_recurrent(model) && - slot.params.speculative.n_max > params_base.speculative.n_max) { + slot.params.speculative.get_max_stage_n_max() > params_base.speculative.get_max_stage_n_max()) { send_error(task, - "Error: speculative.n_max=" + std::to_string(slot.params.speculative.n_max) + - " exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.n_max) + - "; restart the server with a higher --draft-max to reserve checkpoint capacity", + "Error: speculative n_max=" + std::to_string(slot.params.speculative.get_max_stage_n_max()) + + " exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.get_max_stage_n_max()) + + "; restart the server with a higher n_max inside the configured --spec-type stages to reserve checkpoint capacity", ERROR_TYPE_INVALID_REQUEST); return false; }