Standardize speculative decoding arguments on the server (#1908)

* refactor spec args
* add shell-safe quoting of string-valued stage keys in speculative decoding
2026-06-28 04:30:15 -05:00 · 2026-06-04 10:44:57 -03:00 · 2026-06-04 10:44:57 -03:00 · 007d640098
commit 007d640098
parent 6c0180d702
10 changed files with 275 additions and 213 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -124,7 +124,16 @@ static int32_t common_speculative_stage_effective_n_min(

 std::vector<common_speculative_stage_params> common_params_speculative::get_resolved_stages() const {
    if (!stages.empty()) {
-        return stages;
+        std::vector<common_speculative_stage_params> resolved;
+        resolved.reserve(stages.size());
+
+        for (const auto & stage : stages) {
+            if (stage.type != COMMON_SPECULATIVE_TYPE_NONE) {
+                resolved.push_back(stage);
+            }
+        }
+
+        return resolved;
    }

    if (type == COMMON_SPECULATIVE_TYPE_NONE) {
@ -164,6 +173,9 @@ common_params_speculative common_params_speculative::with_stage_overrides(const
    if (stage.has_suffix_max_depth_override()) {
        result.suffix_max_depth = stage.suffix_max_depth;
    }
+    if (stage.has_suffix_corpus_override()) {
+        result.suffix_corpus = stage.suffix_corpus;
+    }

    result.n_max = std::max(result.n_max, 0);
    result.n_min = std::max(0, std::min(result.n_min, result.n_max));
@ -612,28 +624,20 @@ static void common_speculative_finalize_stages(gpt_params & params) {
    auto & spec = params.speculative;

    if (!spec.stages.empty()) {
-        spec.type = spec.stages.front().type;
+        const auto resolved = spec.get_resolved_stages();
+        if (resolved.size() != spec.stages.size()) {
+            spec.stages = resolved;
+        }
+
+        spec.type = resolved.empty() ? COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type;
        params.has_mtp = spec.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
        return;
    }

-    const bool wants_mtp = params.has_mtp;
-    const bool wants_draft = spec.has_dft();
-
    if (spec.type != COMMON_SPECULATIVE_TYPE_NONE) {
        spec.stages.push_back({ .type = spec.type });
-
-        if (common_speculative_type_is_self_spec(spec.type)) {
-            if (wants_mtp) {
+    } else if (params.has_mtp) {
        spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP });
-            } else if (wants_draft) {
-                spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT });
-            }
-        }
-    } else if (wants_mtp) {
-        spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_MTP });
-    } else if (wants_draft) {
-        spec.stages.push_back({ .type = COMMON_SPECULATIVE_TYPE_DRAFT });
    }

    spec.type = spec.stages.empty() ? COMMON_SPECULATIVE_TYPE_NONE : spec.stages.front().type;
@ -827,11 +831,14 @@ static std::string common_normalize_spec_stage_key(std::string key) {

    std::replace(key.begin(), key.end(), '-', '_');

-    if (key.rfind("spec_", 0) == 0) {
-        key.erase(0, 5);
+    return key;
 }

-    return key;
+static std::invalid_argument common_speculative_legacy_option_error( // Builds (does not throw) the exception describing a disabled legacy speculative option.
+        const std::string & arg, // option name, quoted verbatim in the message
+        const std::string & replacement) { // replacement guidance, appended at the end of the message
+    return std::invalid_argument(
+        "legacy speculative option '" + arg + "' is disabled; use " + replacement);
 }

 static void common_speculative_remove_explicit_stage(common_params_speculative & params, common_speculative_type type) {
@ -850,21 +857,21 @@ static void common_speculative_stage_apply_kv(
        const std::string & value_raw) {
    const std::string key = common_normalize_spec_stage_key(key_raw);

-    if (key == "draft" || key == "draft_max" || key == "draft_n" || key == "n_max") {
+    if (key == "n_max") {
        stage.n_max = std::stoi(value_raw);
        if (stage.n_max < 0) {
            throw std::invalid_argument("speculative stage n_max must be >= 0");
        }
        return;
    }
-    if (key == "draft_min" || key == "draft_n_min" || key == "n_min") {
+    if (key == "n_min") {
        stage.n_min = std::stoi(value_raw);
        if (stage.n_min < 0) {
            throw std::invalid_argument("speculative stage n_min must be >= 0");
        }
        return;
    }
-    if (key == "draft_p_min" || key == "p_min") {
+    if (key == "p_min") {
        stage.p_min = std::stof(value_raw);
        if (stage.p_min < 0.0f) {
            throw std::invalid_argument("speculative stage p_min must be >= 0");
@ -892,7 +899,7 @@ static void common_speculative_stage_apply_kv(
        }
        return;
    }
-    if (key == "suffix_min_match_len" || key == "suffix_pattern_len") {
+    if (key == "suffix_min_match_len") {
        stage.suffix_min_match_len = std::stoi(value_raw);
        if (stage.suffix_min_match_len < 1) {
            throw std::invalid_argument("speculative stage suffix_min_match_len must be at least 1");
@ -906,10 +913,100 @@ static void common_speculative_stage_apply_kv(
        }
        return;
    }
+    if (key == "suffix_corpus") {
+        stage.suffix_corpus = value_raw;
+        if (stage.suffix_corpus.empty()) {
+            throw std::invalid_argument("speculative stage suffix_corpus must not be empty");
+        }
+        return;
+    }

    throw std::invalid_argument("unknown speculative stage parameter: " + key_raw);
 }

+static std::vector<std::string> common_speculative_stage_split_kvs(const std::string & values) { // Splits a stage option list on commas, keeping quoted or backslash-escaped commas inside their entry.
+    std::vector<std::string> result;
+    std::string current;
+    char quote = '\0';
+    bool escaped = false; // true when the previous character was an unescaped backslash
+
+    for (char ch : values) {
+        if (escaped) { // the character after a backslash is copied as-is, even a comma or a quote
+            current += ch;
+            escaped = false;
+            continue;
+        }
+
+        if (ch == '\\') {
+            current += ch; // the backslash itself stays in the entry; nothing is unescaped in this function
+            escaped = true;
+            continue;
+        }
+
+        if (quote != '\0') {
+            if (ch == quote) { // matching closing quote ends the quoted run
+                quote = '\0';
+            }
+            current += ch; // quote characters are kept in the entry
+            continue;
+        }
+
+        if ((ch == '\'' || ch == '"') && !current.empty() && current.back() == '=') {
+            quote = ch; // a quote only opens a quoted run when it directly follows an equals sign
+            current += ch;
+            continue;
+        }
+
+        if (ch == ',') {
+            result.push_back(current); // an unquoted, unescaped comma ends the current entry
+            current.clear();
+            continue;
+        }
+
+        current += ch;
+    }
+
+    if (quote != '\0') {
+        throw std::invalid_argument("invalid speculative stage option list: unterminated quote");
+    }
+
+    result.push_back(current); // NOTE(review): the final entry is appended even when empty, so empty input or a trailing comma yields an empty entry - confirm this is intended
+    return result;
+}
+
+static std::string common_speculative_stage_unescape_value(const std::string & value_raw) { // Strips one pair of matching surrounding quotes, then resolves backslash escapes in a stage option value.
+    std::string value = value_raw;
+    if (value.size() >= 2) { // a single character is never treated as a quoted value
+        const char first = value.front();
+        const char last = value.back();
+        if ((first == '\'' && last == '\'') || (first == '"' && last == '"')) {
+            value = value.substr(1, value.size() - 2); // drop the opening and closing quote
+        }
+    }
+
+    std::string result;
+    result.reserve(value.size());
+
+    for (size_t i = 0; i < value.size(); ++i) {
+        const char ch = value[i];
+        if (ch != '\\' || i + 1 >= value.size()) {
+            result += ch; // ordinary character, or a trailing backslash with nothing after it
+            continue;
+        }
+
+        const char next = value[i + 1];
+        if (next == '\\' || next == ',' || next == '\'' || next == '"') {
+            result += next; // recognised escape (backslash, comma, single or double quote): keep only the escaped character
+            ++i;
+            continue;
+        }
+
+        result += ch; // unrecognised escape: the backslash is kept and the following character is handled on the next iteration
+    }
+
+    return result;
+}
+
 static common_speculative_stage_params common_speculative_stage_from_arg(const std::string & value) {
    const auto spec_pos = value.find(':');
    const std::string type_name = value.substr(0, spec_pos);
@ -924,15 +1021,13 @@ static common_speculative_stage_params common_speculative_stage_from_arg(const s
        return stage;
    }

-    std::stringstream ss(value.substr(spec_pos + 1));
-    std::string kv;
-    while (std::getline(ss, kv, ',')) {
+    for (const std::string & kv : common_speculative_stage_split_kvs(value.substr(spec_pos + 1))) {
        const auto eq_pos = kv.find('=');
        if (eq_pos == std::string::npos) {
            throw std::invalid_argument("invalid speculative stage option: " + kv);
        }

-        common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), kv.substr(eq_pos + 1));
+        common_speculative_stage_apply_kv(stage, kv.substr(0, eq_pos), common_speculative_stage_unescape_value(kv.substr(eq_pos + 1)));
    }

    return stage;
@ -1379,18 +1474,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    }
    if (arg == "--draft" || arg == "--draft-max" || arg == "--draft-n") {
        CHECK_ARG
-        params.speculative.n_max = std::stoi(argv[i]);
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the value inside the relevant repeated --spec-type entry, e.g. --spec-type mtp:n_max=" + std::string(argv[i]) + ",p_min=0.0 or --spec-type draft:n_max=" + std::string(argv[i]) + ",p_min=0.0");
    }
    if (arg == "--draft-min" || arg == "--draft-n-min") {
        CHECK_ARG
-        params.speculative.n_min = std::stoi(argv[i]);
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the value inside the relevant repeated --spec-type entry using the canonical key n_min, e.g. --spec-type ngram-mod:n_min=" + std::string(argv[i]));
    }
    if (arg == "--draft-p-min") {
        CHECK_ARG
-        params.speculative.p_min = std::stof(argv[i]);
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the value inside the relevant repeated --spec-type entry using the canonical key p_min, e.g. --spec-type mtp:p_min=" + std::string(argv[i]));
    }
    if (arg == "--recurrent-ckpt-mode") {
        CHECK_ARG
@ -1445,89 +1540,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    }
    if (arg == "--spec-stage") {
        CHECK_ARG
-
-        if (params.speculative.stages.empty()) {
-            if (params.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
-                throw std::invalid_argument("--spec-stage cannot be combined with --spec-type; use only --spec-stage for explicit stage chains");
-            }
-            if (params.has_mtp) {
-                throw std::invalid_argument("--spec-stage cannot be combined with -mtp/--multi-token-prediction; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]");
-            }
-        }
-
-        params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i]));
-        if (params.speculative.stages.size() == 1) {
-            params.speculative.type = params.speculative.stages.front().type;
-        }
-        params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "repeated --spec-type SPEC[:k=v,...] entries, e.g. --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0");
    }
    if (arg == "--spec-type") {
        CHECK_ARG
-        if (!params.speculative.stages.empty()) {
-            throw std::invalid_argument("--spec-type cannot be combined with --spec-stage; use only --spec-stage for explicit stage chains");
-        }
-
-        const auto type = common_speculative_type_from_name(argv[i]);
-        if (type == COMMON_SPECULATIVE_TYPE_NONE || type == COMMON_SPECULATIVE_TYPE_MTP || common_speculative_type_is_self_spec(type)) {
-            params.speculative.type = type;
-            if (type == COMMON_SPECULATIVE_TYPE_MTP) {
-                params.has_mtp = true;
-            }
-        } else {
-            throw std::invalid_argument("unknown speculative decoding type");
-        }
+        params.speculative.stages.push_back(common_speculative_stage_from_arg(argv[i]));
+        const auto resolved = params.speculative.get_resolved_stages();
+        params.speculative.type = resolved.empty() ? COMMON_SPECULATIVE_TYPE_NONE : resolved.front().type;
+        params.has_mtp = params.speculative.has_stage_type(COMMON_SPECULATIVE_TYPE_MTP);
        return true;
    }
    if (arg == "--spec-ngram-size-n") {
        CHECK_ARG
-        int value = std::stoi(argv[i]);
-        if (value < 1 || value > 1024) {
-            throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
-        }
-        params.speculative.ngram_size_n = value;
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the canonical stage key inside --spec-type, e.g. --spec-type ngram-mod:ngram_size_n=" + std::string(argv[i]));
    }
    if (arg == "--spec-ngram-size-m") {
        CHECK_ARG
-        int value = std::stoi(argv[i]);
-        if (value < 1 || value > 1024) {
-            throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
-        }
-        params.speculative.ngram_size_m = value;
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the canonical stage key inside --spec-type, e.g. --spec-type ngram-map-k4v:ngram_size_m=" + std::string(argv[i]));
    }
    if (arg == "--spec-ngram-min-hits") {
        CHECK_ARG
-        int value = std::stoi(argv[i]);
-        if (value < 1) {
-            throw std::invalid_argument("ngram min hits must be at least 1");
-        }
-        params.speculative.ngram_min_hits = value;
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the canonical stage key inside --spec-type, e.g. --spec-type ngram-map-k4v:ngram_min_hits=" + std::string(argv[i]));
    }
    if (arg == "--suffix-pattern-len") {
        CHECK_ARG
-        int value = std::stoi(argv[i]);
-        if (value < 1) {
-            throw std::invalid_argument("suffix pattern length must be at least 1");
-        }
-        params.speculative.suffix_min_match_len = value;
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_min_match_len=" + std::string(argv[i]));
    }
    if (arg == "--suffix-max-depth") {
        CHECK_ARG
-        int value = std::stoi(argv[i]);
-        if (value < 1) {
-            throw std::invalid_argument("suffix max depth must be at least 1");
-        }
-        params.speculative.suffix_max_depth = value;
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_max_depth=" + std::string(argv[i]));
    }
    if (arg == "--suffix-corpus") {
        CHECK_ARG
-        params.speculative.suffix_corpus = argv[i];
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "the canonical stage key inside --spec-type, e.g. --spec-type suffix:suffix_corpus=" + std::string(argv[i]));
    }
    if (arg == "-a" || arg == "--alias") {
        CHECK_ARG
@ -1976,17 +2028,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        return true;
    }
    if (arg == "-mtp" || arg == "--multi-token-prediction") {
-        if (!params.speculative.stages.empty()) {
-            throw std::invalid_argument("-mtp/--multi-token-prediction cannot be combined with --spec-stage; add the mtp fallback explicitly with --spec-stage mtp[:k=v,...]");
-        }
-
-        params.has_mtp = true;
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "--spec-type mtp:n_max=1,p_min=0.0");
    }
    if (arg == "-no-mtp" || arg == "--no-multi-token-prediction") {
-        params.has_mtp = false;
-        common_speculative_remove_explicit_stage(params.speculative, COMMON_SPECULATIVE_TYPE_MTP);
-        return true;
+        throw common_speculative_legacy_option_error(arg,
+            "remove the mtp entry from repeated --spec-type arguments");
    }
    if (arg == "-draft" || arg == "--draft-params") {
        CHECK_ARG
@ -3172,29 +3219,20 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "-hfr,  --hf-repo REPO",         "Hugging Face model repository (default: unused)" });
    options.push_back({ "*",           "-hff,  --hf-file FILE",         "Hugging Face model file (default: unused)" });
    options.push_back({ "*",           "-hft,  --hf-token TOKEN",       "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
-    options.push_back({ "*", "-mtp, --multi-token-prediction",          "legacy shortcut for enabling MTP when --spec-stage is not used (default: %s)", params.has_mtp ? "true" : "false" });
-    options.push_back({ "*", "-no-mtp, --no-multi-token-prediction",    "disable the legacy MTP shortcut or remove an explicit MTP stage (default: %s)", !params.has_mtp ? "true" : "false" });
-    options.push_back({ "*", "--draft-max, --draft, --draft-n N",
-                                                                        "global default number of tokens to draft for speculative decoding or for stages without an explicit n_max override (default: %d)", params.speculative.n_max });
-    options.push_back({ "*", "--draft-min, --draft-n-min N",   "global default minimum draft threshold or fallback threshold for stages without an explicit n_min override" });
-    options.push_back({ "*", "--draft-p-min P",                "global default minimum speculative decoding probability (greedy) for stages without an explicit p_min override (default: %.1f)", (double)params.speculative.p_min });
    options.push_back({ "*", "--recurrent-ckpt-mode MODE",    "checkpoint strategy for recurrent/hybrid speculative decoding\n"
                                                              "  auto         auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise (default)\n"
                                                              "  per-step     save SSM state per draft step in VRAM; no re-decode on rejection\n"
                                                              "  gpu-fallback copy state to GPU buffer; re-decode on rejection\n"
                                                              "  cpu          serialise state via llama_state_seq; re-decode on rejection" });
-    options.push_back({ "*", "--spec-stage SPEC[:k=v,...]",    "explicit speculative stage. repeat once for a supported two-stage chain.\n"
-                                                              "examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1\n"
-                                                              "supported two-stage shape in this PR: self-spec first, then mtp or draft fallback" });
-    options.push_back({ "*", "--spec-type Name [none | mtp | ngram-cache | ngram-simple | ngram-map-k | ngram-map-k4v | ngram-mod | suffix]", "single-stage speculative selection when --spec-stage is not used (default: %d)\n", (int)params.speculative.type});
-    options.push_back({ "*", "--spec-ngram-size-n N", "ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)\n",params.speculative.ngram_size_n });
-
-    options.push_back({ "*", "--spec-ngram-size-m N", "ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)\n", params.speculative.ngram_size_m });
-
-    options.push_back({ "*", "--spec-ngram-min-hits N", "minimum hits for ngram-map speculative decoding (default: %d)\n", params.speculative.ngram_min_hits });
-    options.push_back({ "*", "--suffix-pattern-len N",   "minimum context match length for suffix decoding (default: %d)", params.speculative.suffix_min_match_len });
-    options.push_back({ "*", "--suffix-max-depth N",     "suffix tree maximum depth for suffix decoding (default: %d)",    params.speculative.suffix_max_depth });
-    options.push_back({ "*", "--suffix-corpus PATH",     "corpus file to pre-warm the suffix tree: .json (array of strings or conversation messages) or .bin (raw int32 token IDs)" });
+    options.push_back({ "*", "--spec-type SPEC[:k=v,...]",      "canonical speculative stage entry; repeat for a supported two-stage chain.\n"
+                                                              "types: none, draft, mtp, ngram-cache, ngram-simple, ngram-map-k, ngram-map-k4v, ngram-mod, suffix\n"
+                                                              "canonical keys: n_max,n_min,p_min,ngram_size_n,ngram_size_m,ngram_min_hits,suffix_min_match_len,suffix_max_depth,suffix_corpus\n"
+                                                              "for comma-bearing string values, quote the value inside the stage payload for normal shell use\n"
+                                                              "if argv is passed directly without shell unescaping, the parser also accepts escaped commas as \\,\n"
+                                                              "examples: --spec-type mtp:n_max=1,p_min=0.0\n"
+                                                              "          --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0\n"
+                                                              "          --spec-type \"suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'\"\n"
+                                                              "legacy --spec-stage, --draft-*, --spec-ngram-*, --suffix-* and -mtp flags are rejected" });
    options.push_back({ "*", "--spec-autotune",          "automatically tune speculative params to maximize tokens/sec" });

    options.push_back({ "retrieval" });
--- a/common/common.h
+++ b/common/common.h
@ -169,6 +169,7 @@ struct common_speculative_stage_params {

    int32_t suffix_min_match_len = -1;
    int32_t suffix_max_depth = -1;
+    std::string suffix_corpus;

    bool has_n_max_override() const { return n_max >= 0; }
    bool has_n_min_override() const { return n_min >= 0; }
@ -178,6 +179,7 @@ struct common_speculative_stage_params {
    bool has_ngram_min_hits_override() const { return ngram_min_hits > 0; }
    bool has_suffix_min_match_len_override() const { return suffix_min_match_len >= 0; }
    bool has_suffix_max_depth_override() const { return suffix_max_depth >= 0; }
+    bool has_suffix_corpus_override() const { return !suffix_corpus.empty(); }
 };

 struct common_params_model {
--- a/common/spec-tuner.cpp
+++ b/common/spec-tuner.cpp
@ -357,20 +357,15 @@ void spec_tuner::print_best() const {

    {
        std::ostringstream oss;
-        oss << "Autotune reuse: ";
+        oss << "Autotune reuse: --spec-type " << common_speculative_type_to_str(spec_type);
+        bool first_kv = true;
        for (const auto & coord : coords) {
            bool is_int = (coord.name != "p_min");
-            if      (coord.name == "n_max")             oss << "--draft-max ";
-            else if (coord.name == "p_min")             oss << "--draft-p-min ";
-            else if (coord.name == "n_min")             oss << "--draft-min ";
-            else if (coord.name == "ngram_size_n")      oss << "--spec-ngram-size-n ";
-            else if (coord.name == "ngram_size_m")      oss << "--spec-ngram-size-m ";
-            else if (coord.name == "ngram_min_hits")    oss << "--spec-ngram-min-hits ";
-            else if (coord.name == "suffix_min_match_len") oss << "--suffix-pattern-len ";
-            else                                        oss << "--" << coord.name << " ";
+            oss << (first_kv ? ':' : ',') << coord.name << '=';
+            first_kv = false;

-            if (is_int) oss << (int)coord.arms[coord.best_idx].value << " ";
-            else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value << " ";
+            if (is_int) oss << (int)coord.arms[coord.best_idx].value;
+            else oss << std::fixed << std::setprecision(2) << coord.arms[coord.best_idx].value;
        }
        LOG_INF("%s\n", oss.str().c_str());
    }
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@ -1160,7 +1160,7 @@ common_speculative * common_speculative_init(
        });

        if (has_draft_stage) {
-            LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use -mtp/--spec-stage mtp for assistant-backed MTP\n", __func__);
+            LOG_ERR("%s: Gemma4 assistant models only support MTP stages; omit -md for self-spec-only runs or use --spec-type mtp:n_max=1,p_min=0.0 for assistant-backed MTP\n", __func__);
            return nullptr;
        }
    }
--- a/common/suffix-tree.cpp
+++ b/common/suffix-tree.cpp
@ -209,7 +209,7 @@ static bool suffix_corpus_check_limit(const std::string & path, size_t n_tokens,
        return true;
    }

-    LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or --suffix-max-depth\n",
+    LOG_ERR("load_corpus: refusing suffix corpus '%s' - estimated insert work %llu exceeds limit %llu (tokens=%zu, depth=%d); reduce corpus size or lower suffix_max_depth inside --spec-type suffix:suffix_max_depth=...\n",
            path.c_str(),
            (unsigned long long) estimated_work,
            (unsigned long long) SUFFIX_CORPUS_MAX_INSERT_WORK,
--- a/docs/parameters.md
+++ b/docs/parameters.md
@ -120,21 +120,13 @@ Check the details [here](./speculative.md).
 | `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model | - | For draft model, see: `-ctk` |
 | `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model | - | For draft model, see: `-ctk` |
 | `-draft, --draft-params` | Comma-separated list of draft model parameters | - |  |
-| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram| 12 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
-| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram | 48 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
-| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding | 1 | [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) |
-| `--spec-type Name` | Comma-separated list of draft model parameters | - | none / ngram - cache / ngram - simple / ngram - map - k / ngram - map - k4v / ngram - mod / suffix [PR 1261](https://github.com/ikawrakow/ik_llama.cpp/pull/1261) [PR 1646](https://github.com/ikawrakow/ik_llama.cpp/pull/1646) |
-| `--spec-stage SPEC[:k=v,...]` | Add an explicit speculative stage; repeat once for a supported two-stage chain | - | Supported two-stage shape: self-spec first, then `mtp` or `draft` fallback. [PR 1789](https://github.com/ikawrakow/ik_llama.cpp/pull/1789) |
-| `-mtp, --multi-token-prediction` |  | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
-| `-no-mtp, --no-multi-token-prediction` |  | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
-| `--draft-max` |  | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
-| `--draft-p-min` |  | - | MTP decoding [PR 1270](https://github.com/ikawrakow/ik_llama.cpp/pull/1270) [1698](https://github.com/ikawrakow/ik_llama.cpp/pull/1698) |
+| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat to configure the supported two-stage chain | - | Types: `none`, `draft`, `mtp`, `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, `suffix`. Canonical keys: `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, `suffix_corpus`. String values may escape commas as `\,` or quote the value inside the stage payload. Example: `--spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0` |
 | `--spec-autotune` | Automatically tune speculative params to maximize tokens/sec | - | Automatically determines the near-optimal arguments for the type of speculation being performed [PR 1595](https://github.com/ikawrakow/ik_llama.cpp/pull/1595) |
 | `--recurrent-ckpt-mode MODE` | Checkpoint strategy for recurrent/hybrid speculative decoding | auto | One of: - `auto` auto-select: per-step if CUDA full-GPU, gpu-fallback otherwise - `per-step` save SSM state per draft step in VRAM; no re-decode on rejection - `gpu-fallback` copy state to GPU buffer; re-decode on rejection - `cpu` serialise state via llama_state_seq; re-decode on rejection [PR 1669](https://github.com/ikawrakow/ik_llama.cpp/pull/1669) [PR 1774](https://github.com/ikawrakow/ik_llama.cpp/pull/1774) |

 Notes:

- `--spec-type` cannot be combined with `--spec-stage`.
+- Legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with replacement guidance.
 - Explicit stage chains currently support at most two stages.
 - Supported self-spec stage names are `ngram-cache`, `ngram-simple`, `ngram-map-k`, `ngram-map-k4v`, `ngram-mod`, and `suffix`.
 - Composite stage chains disable speculative autotune.
@ -378,9 +370,7 @@ WIP
 | `--override-kv KEY=TYPE:VALUE` | Override model metadata by key | - | Advanced option to override model metadata by key. May be specified multiple times. types: int, float, bool, str. Example: `--override-kv tokenizer.ggml.add_bos_token=bool:false` |
 | `-m, --model FNAME` | Model path | models/$filename | Mandatory, the GGUF model file to be served. |
 | `-md, --model-draft FNAME` | Draft model for speculative decoding | unused | Required when an explicit `draft` stage is used. |
-| `--draft-max, --draft, --draft-n N` | Global speculative draft cap, or fallback value for stages without an explicit `n_max` override | 16 | Also used by single-stage MTP and draft-model speculation. |
-| `--draft-min, --draft-n-min N` | Global minimum speculative draft threshold, or fallback value for stages without an explicit `n_min` override | 0 |  |
-| `--draft-p-min P` | Global minimum speculative decoding probability (greedy), or fallback value for stages without an explicit `p_min` override | 0.8 |  |
+| `--spec-type SPEC[:k=v,...]` | Canonical speculative stage entry; repeat for the supported two-stage chain | none | Use stage-local keys like `n_max`, `n_min`, `p_min`, `ngram_size_n`, `ngram_size_m`, `ngram_min_hits`, `suffix_min_match_len`, `suffix_max_depth`, and `suffix_corpus`. |

 ### Request-Level Speculative Overrides

--- a/docs/speculative.md
+++ b/docs/speculative.md
@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
 This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.

 ```
-llama-server [...] --spec-type ngram-simple --draft-max 64
+llama-server [...] --spec-type ngram-simple:n_max=64
 ```

 #### n-gram Map Key (`ngram-map-k`)

-This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
+This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (stage key `ngram_min_hits`, default is 1) before generating drafts.

 The number of accepted tokens is stored for each used n-gram.

 **Example:**
 ```
-llama-server [...] --spec-type ngram-map-k --draft-max 64
+llama-server [...] --spec-type ngram-map-k:n_max=64,ngram_min_hits=1
 ```

 #### n-gram Map Key-4-Values (`ngram-map-k4v`)
@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.

 **Example:** Server options to be used if there are a lot of longer repetitions.
 ```
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
+llama-server [...] --spec-type ngram-map-k4v:n_max=64,ngram_size_n=8,ngram_size_m=8,ngram_min_hits=2
 ```

 ### n-gram Mod (`ngram-mod`)
@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
 # notes:
 # - small `n` are not recommended
 # - MoEs require long drafts
-# - dense models: can reduce `--draft-min` and `--draft-max`
+# - dense models: can reduce `n_min` and `n_max`

-llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
+llama-server ... --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24
 ```

 Applications:
@ -103,57 +103,78 @@ Example Video:

 ## Command-Line Options

-If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.
+Speculative decoding is configured at startup with one or more `--spec-type SPEC[:k=v,...]` entries. The legacy `--spec-stage`, `--draft-*`, `--spec-ngram-*`, `--suffix-*`, and `-mtp` flags are rejected with replacement guidance.

-```
--draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
-                                        (env: LLAMA_ARG_DRAFT_MAX)
--draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
-                                        (default: 0)
-                                        (env: LLAMA_ARG_DRAFT_MIN)
-[...]
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
-                                        type of speculative decoding to use when no draft model is provided
-                                        (default: none)
--spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
-                                        of lookup n-gram (default: 12)
--spec-ngram-size-m N                   ngram size M for ngram-simple/ngram-map speculative decoding, length
-                                        of draft m-gram (default: 48)
--spec-ngram-min-hits N                 minimum hits for ngram-map speculative decoding (default: 1)
-```
+### `--spec-type SPEC[:k=v,...]`

-### `--spec-type TYPE`
-
-Specifies a type of speculative decoding without draft model.
+Each `--spec-type` entry defines one speculative stage. Repeat it to configure the supported two-stage path.

 | Type | Description |
 |------|-------------|
-| `none` | No speculative decoding (default) |
+| `none` | No speculative decoding |
+| `draft` | Draft-model speculative decoding; pair with `-md/--model-draft` |
+| `mtp` | Embedded or assistant-backed MTP |
 | `ngram-cache` | Use n-gram cache lookup |
 | `ngram-simple` | Use simple n-gram pattern matching |
-| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
-| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
-| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
+| `ngram-map-k` | Use n-gram pattern matching with n-gram keys |
+| `ngram-map-k4v` | Use n-gram pattern matching with n-gram keys and up to four m-gram values |
+| `ngram-mod` | Use the shared n-gram hasher |
+| `suffix` | Use suffix-tree speculative decoding |
+
+Canonical stage keys:
+
+| Key | Meaning |
+|-----|---------|
+| `n_max` | Maximum drafted tokens for that stage |
+| `n_min` | Minimum usable drafted tokens for that stage |
+| `p_min` | Minimum speculative probability threshold |
+| `ngram_size_n` | Lookup n-gram size |
+| `ngram_size_m` | Draft m-gram size |
+| `ngram_min_hits` | Minimum matching hits for n-gram map stages |
+| `suffix_min_match_len` | Minimum suffix context match length |
+| `suffix_max_depth` | Maximum suffix-tree depth |
+| `suffix_corpus` | Optional suffix corpus file for pre-warming |
+
+Commas separate the `k=v` pairs of a stage, so a string-valued stage key such as `suffix_corpus` needs quoting when its value contains a comma. From a normal shell, wrap the whole `--spec-type` argument in shell quotes and quote the value inside the stage payload so the parser sees the comma as part of the string value.
+
+Example shell-safe form:

-**Example:** Server-instance used to refactor source code.
 ```bash
-./llama-server [...] --spec-type ngram-simple
+./llama-server [...] \
+    --spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'"
 ```

-### `--spec-ngram-size-n N`
+If you are constructing `argv` directly without shell unescaping, the parser also accepts escaped commas as `\,`.

-Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
-The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.
+Examples:

-### `--spec-ngram-size-m M`
+```bash
+# Single-stage MTP
+./llama-server [...] --spec-type mtp:n_max=1,p_min=0.0

-Sets the size M of the draft m-gram for n-gram map based speculative decoding.
-The m-gram size determines how many tokens to draft when a match is found.
-Larger values can provide more speedup but may reduce acceptance rate.
+# Single-stage ngram-mod
+./llama-server [...] --spec-type ngram-mod:n_max=64,n_min=48,ngram_size_n=24

-### `--spec-ngram-min-hits H`
+# Draft-model speculation
+./llama-server [...] --model-draft draft.gguf --spec-type draft:n_max=4,p_min=0.0

-This option defines how often a key has to appear in the token history to be used as a draft (default is 1).
+# Two-stage self-spec -> MTP fallback
+./llama-server [...] \
+    --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
+    --spec-type mtp:n_max=1,p_min=0.0
+
+# Suffix stage with pre-warmed corpus
+./llama-server [...] \
+    --spec-type suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus=/path/to/corpus.json
+
+# Suffix stage with a comma-bearing corpus path from a normal shell
+./llama-server [...] \
+    --spec-type "suffix:n_max=16,n_min=2,suffix_min_match_len=5,suffix_max_depth=64,suffix_corpus='/tmp/spec,type-corpus.json'"
+```
+
+### `--spec-autotune`
+
+Autotunes the active stage parameters and reports the best configuration back as a canonical `--spec-type ...` snippet.

 ## Statistics
 Each speculative decoding implementation prints statistics.
@ -180,4 +201,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
 - `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance).
-
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -1232,7 +1232,7 @@ int main(int argc, char ** argv) {
    }

    if (!use_paired_gemma4_mtp && llama_model_is_gemma4_mtp_assistant(model) && !params.process_output) {
-        fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> -mtp' for meaningful calibration.\n", __func__);
+        fprintf(stderr, "%s: warning: standalone Gemma 4 assistant imatrix does not exercise the assistant layers. Use '-m <target> -md <assistant> --spec-type mtp:n_max=1,p_min=0.0' for meaningful calibration.\n", __func__);
    }

    const int n_ctx_train = llama_n_ctx_train(model);
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -210,10 +210,10 @@ model:
  -m,    --model FNAME            model path (default: models/$filename with filename from --hf-file
                                  or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
  -md,   --model-draft FNAME      draft model for speculative decoding (default: unused)
-         --spec-stage SPEC[:k=v,...]
-                                  explicit speculative stage. repeat once for a supported two-stage chain
-                                  examples: --spec-stage ngram-mod:n_max=64,n_min=2 --spec-stage mtp:n_max=1
-                                  supported two-stage shape: self-spec first, then mtp or draft fallback
+         --spec-type SPEC[:k=v,...]
+                                  canonical speculative stage entry; repeat for a supported two-stage chain
+                                  examples: --spec-type mtp:n_max=1,p_min=0.0
+                                            --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 --spec-type mtp:n_max=1,p_min=0.0
  -mu,   --model-url MODEL_URL    model download url (default: unused)
  -hfr,  --hf-repo REPO           Hugging Face model repository (default: unused)
  -hff,  --hf-file FILE           Hugging Face model file (default: unused)
@ -966,15 +966,15 @@ To know the `id` of the adapter, use GET `/lora-adapters`

 ### Composite speculative decoding

-Use `--spec-stage` for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback.
+Use repeated `--spec-type SPEC[:k=v,...]` entries for explicit stage chains. The currently supported two-stage shape is self-spec first, then `mtp` or `draft` fallback.

 Example with `ngram-mod` plus MTP fallback:

 ```bash
 ./build/bin/llama-server \
  --model /models/target-mtp.gguf \
-  --spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
-  --spec-stage mtp:n_max=1,p_min=0.0
+  --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
+  --spec-type mtp:n_max=1,p_min=0.0
 ```

 Example with `ngram-mod` plus draft-model fallback:
@ -983,14 +983,13 @@ Example with `ngram-mod` plus draft-model fallback:
 ./build/bin/llama-server \
  --model /models/target.gguf \
  --model-draft /models/draft.gguf \
-  --spec-stage ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
-  --spec-stage draft:n_max=4,p_min=0.0
+  --spec-type ngram-mod:n_max=64,n_min=2,ngram_size_n=8 \
+  --spec-type draft:n_max=4,p_min=0.0
 ```

 Notes:

- Use `--spec-type` when you want a single self-spec stage only.
- `--spec-type` cannot be combined with `--spec-stage`.
+- Use `--spec-type` for both single-stage and two-stage startup configuration.
 - Explicit stage chains currently support at most two stages.

 ### Change system prompt on runtime
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@ -166,7 +166,8 @@ static void server_reject_dead_speculative_request_overrides(const json & data)
        json_value_ptr(data, "speculative.ngram_size_m") != nullptr ||
        json_value_ptr(data, "speculative.ngram_min_hits") != nullptr ||
        json_value_ptr(data, "speculative.suffix_min_match_len") != nullptr ||
-        json_value_ptr(data, "speculative.suffix_max_depth") != nullptr) {
+        json_value_ptr(data, "speculative.suffix_max_depth") != nullptr ||
+        json_value_ptr(data, "speculative.suffix_corpus") != nullptr) {
        throw std::runtime_error("Error: structural speculative overrides are startup-only; per-request overrides only support speculative.n_max, speculative.n_min, speculative.p_min, and speculative.stages");
    }
 }
@ -284,7 +285,6 @@ bool server_context::load_model(const gpt_params& params_) {
        });

        params_base.has_mtp = false;
-
        server_remove_speculative_stage(params_base.speculative, COMMON_SPECULATIVE_TYPE_MTP);

        if (!server_speculative_needs_draft_model(params_base.speculative)) {
@ -1251,6 +1251,10 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
    // speculative decoding parameters
    try {
        slot.params.speculative = defaults.speculative;
+        const bool has_flat_n_max = json_value_ptr(data, "speculative.n_max") != nullptr;
+        const bool has_flat_n_min = json_value_ptr(data, "speculative.n_min") != nullptr;
+        const bool has_flat_p_min = json_value_ptr(data, "speculative.p_min") != nullptr;
+
        slot.params.speculative.n_max = json_value(data, "speculative.n_max", params_base.speculative.n_max);
        slot.params.speculative.n_min = json_value(data, "speculative.n_min", params_base.speculative.n_min);
        slot.params.speculative.p_min = json_value(data, "speculative.p_min", params_base.speculative.p_min);
@ -1258,6 +1262,20 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
        server_reject_dead_speculative_request_overrides(data);

        const json stages = json_value(data, "speculative.stages", json());
+        if (stages.is_null() && !slot.params.speculative.stages.empty()) {
+            for (auto & stage : slot.params.speculative.stages) {
+                if (has_flat_n_max) {
+                    stage.n_max = -1;
+                }
+                if (has_flat_n_min) {
+                    stage.n_min = -1;
+                }
+                if (has_flat_p_min) {
+                    stage.p_min = -1.0f;
+                }
+            }
+        }
+
        if (!stages.is_null()) {
            if (!stages.is_array()) {
                throw std::runtime_error("Error: speculative.stages must be an array");
@ -1296,11 +1314,11 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)

        if (slot.can_speculate() &&
            llama_model_has_recurrent(model) &&
-            slot.params.speculative.n_max > params_base.speculative.n_max) {
+            slot.params.speculative.get_max_stage_n_max() > params_base.speculative.get_max_stage_n_max()) {
            send_error(task,
-                    "Error: speculative.n_max=" + std::to_string(slot.params.speculative.n_max) +
-                    " exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.n_max) +
-                    "; restart the server with a higher --draft-max to reserve checkpoint capacity",
+                "Error: speculative n_max=" + std::to_string(slot.params.speculative.get_max_stage_n_max()) +
+                " exceeds the recurrent speculative startup limit of " + std::to_string(params_base.speculative.get_max_stage_n_max()) +
+                "; restart the server with a higher n_max inside the configured --spec-type stages to reserve checkpoint capacity",
                    ERROR_TYPE_INVALID_REQUEST);
            return false;
        }