#pragma once #include "llama.h" #include #include #include #include #include #include // A trie-based suffix tree for suffix-decoding speculative decoding. // // Stores all suffixes (up to max_depth) of the token history. // Used to find matching patterns in context and generate draft tokens // by following the most frequent continuation path. // // Reference: "Suffix Decoding" (Saxena et al., 2024) — arXiv:2411.04975 struct common_suffix_node { int64_t count = 0; std::unordered_map> children; }; class common_suffix_tree { public: explicit common_suffix_tree(int max_depth = 64); ~common_suffix_tree(); // Append tokens to the history and insert new suffixes into the trie. // Incremental: only processes suffixes that haven't been inserted yet. void extend(const llama_token * tokens, int n_tokens); void clear(); // Generate draft tokens by matching the context in the trie. // Tries multiple context lengths and returns the draft with the best score. std::vector speculate( const llama_token * context, int n_context, int max_spec_tokens, float min_token_prob = 0.1f, int min_match_count = 1, int min_match_len = 5) const; // Load an offline corpus to pre-warm the tree before any request. // Supported formats (.json or .bin) bool load_corpus( const std::string & path, std::function(const std::string &)> tokenize_fn = {}); int max_depth() const { return _max_depth; } int token_count() const { return (int)_tokens.size(); } private: int _max_depth; std::unique_ptr _root; std::vector _tokens; int _n_inserted = 0; void _insert_suffix(int start_pos); void _extend_suffix(int start_pos, int old_len, int new_len); };