mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
* server: spec checkpoints for recurrent models * fix: save/restore sampler state during speculative checkpoint When speculative decoding rejects draft tokens and restores the recurrent state checkpoint, the sampler (RNG, grammar, prev tokens) must also be restored to maintain consistency. Without this, the sampler state reflects the rejected draft tokens, leading to potential divergence. Uses common_sampler_clone() to snapshot the sampler before the speculative batch decode, and restores it on rejection. * server: snapshot recurrent state in tensor * reset ngram mod state for rejected tokens * server: refactor checkpoint state logic * speculative: fix sampler for checkpoints * recurrent model: implement recurrent kernel checkpoint * recurrent model: refactor api * spec: free rbudget before overwriting
780 lines
34 KiB
C++
780 lines
34 KiB
C++
//
|
|
// Copyright (C) 2023-2025 The llama.cpp authors
|
|
// Copyright (C) 2024-2025 Iwan Kawrakow
|
|
// MIT license
|
|
// SPDX-License-Identifier: MIT
|
|
//
|
|
|
|
// Various helper functions and utilities
|
|
|
|
#pragma once
|
|
|
|
#include "llama.h"
|
|
|
|
#include "sampling.h"
|
|
|
|
#define LOG_NO_FILE_LINE_FUNCTION
|
|
#include "log.h"
|
|
#include <set>
|
|
#include <cmath>
|
|
#include <string>
|
|
#include <sstream>
|
|
#include <string_view>
|
|
#include <vector>
|
|
#include <random>
|
|
#include <thread>
|
|
#include <unordered_map>
|
|
#include <tuple>
|
|
#include <map>
|
|
#include <sstream>
|
|
#include <variant>
|
|
|
|
#ifdef _WIN32
|
|
#define DIRECTORY_SEPARATOR '\\'
|
|
#else
|
|
#define DIRECTORY_SEPARATOR '/'
|
|
#endif // _WIN32
|
|
|
|
// Print "error: <msg>" to stderr and terminate the process with exit status 1.
// `msg` must be a string literal because it is pasted between two other literals.
#define die(msg)          do { fputs("error: " msg "\n", stderr); exit(1); } while (0)

// printf-style variant of die(). `fmt` must be a string literal, and at least one
// argument has to follow it since __VA_ARGS__ is expanded right after a comma.
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

// Write two lines to stderr: the build number with the commit, and the compiler with
// the build target. Each line is prefixed with the name of the calling function.
#define print_build_info() do {                                                                     \
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
} while(0)
|
|
|
|
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
|
|
|
// Scoped time measurement helper bound to an external accumulator.
// The constructor and destructor are only declared here; their definitions live elsewhere.
// NOTE(review): presumably the destructor adds the elapsed time to `t_acc` and `disable`
// turns the measurement off - confirm against the implementation.
struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();

    // value captured at construction; immutable afterwards (the name suggests microseconds)
    const int64_t t_start_us;

    // caller-owned accumulator; must outlive this object because it is held by reference
    int64_t & t_acc;
};
|
|
|
|
// Describes one LoRA adapter as requested by the user: where it is and how strongly to apply it.
// Plain aggregate; neither member has a default, so callers must set both.
struct llama_lora_adapter_info {
    std::string path;   // adapter file path
    float       scale;  // user defined scale
};
|
|
|
|
// A llama_lora_adapter_info (path + scale) together with a pointer to the adapter object.
// The pointer has no default value; it is indeterminate until assigned.
struct llama_lora_adapter_container : llama_lora_adapter_info {
    struct llama_lora_adapter * adapter;
};
|
|
|
|
using llama_tokens = std::vector<llama_token>;
|
|
|
|
// build info
|
|
extern int LLAMA_BUILD_NUMBER;
|
|
extern char const * LLAMA_COMMIT;
|
|
extern char const * LLAMA_COMPILER;
|
|
extern char const * LLAMA_BUILD_TARGET;
|
|
|
|
struct llama_control_vector_load_info;
|
|
|
|
//
|
|
// CPU utils
|
|
//
|
|
|
|
int32_t cpu_get_num_physical_cores();
|
|
int32_t cpu_get_num_math();
|
|
|
|
// Identifiers for the example programs / tools.
// Values are implicit and consecutive starting at 0, so the order below must not change.
enum llama_example {
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_MAIN,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,

    // number of entries above; keep last
    LLAMA_EXAMPLE_COUNT,
};
|
|
|
|
//
|
|
// CLI argument parsing
|
|
//
|
|
|
|
// Dimensionality reduction methods, used by cvector-generator.
enum dimre_method {
    DIMRE_METHOD_PCA,   // value 0
    DIMRE_METHOD_MEAN,  // value 1
};
|
|
|
|
// Reasoning format of the API response (not to be confused with the chat template's reasoning format).
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,
    // Extract thinking tag contents and return as `message.reasoning_content`,
    // or leave inline in <think> tags in stream mode.
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY,
    // Extract thinking tag contents and return as `message.reasoning_content`,
    // including in streaming deltas.
    COMMON_REASONING_FORMAT_DEEPSEEK,
};
|
|
|
|
// Web UI selection. A name is mapped to one of these values by common_webui_from_name().
enum common_webui {
    COMMON_WEBUI_NONE,      // value 0
    COMMON_WEBUI_AUTO,      // value 1
    COMMON_WEBUI_LLAMACPP,  // value 2
};
|
|
|
|
common_webui common_webui_from_name(const std::string& format);
|
|
|
|
// Markers that delimit a "thinking" section, plus a flag telling whether it is excluded.
// Defaults: exclusion enabled, delimiters "<think>" and "</think>".
struct thinking_tokens {
    bool        exclude = true;
    std::string begin   = "<think>";   // opening marker
    std::string end     = "</think>";  // closing marker
};
|
|
|
|
thinking_tokens thinking_tokens_from_string(const std::string& format);
|
|
|
|
// Available speculative decoding strategies.
// Values are implicit and consecutive starting at 0; COMMON_SPECULATIVE_TYPE_COUNT must stay last.
enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
    COMMON_SPECULATIVE_TYPE_MTP,           // MTP model
    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,   // self-speculative decoding with 3-level n-gram cache
    COMMON_SPECULATIVE_TYPE_SUFFIX,        // self-speculative suffix-decoding (arXiv:2411.04975)
    COMMON_SPECULATIVE_TYPE_COUNT          // number of types, unknown type
};
|
|
|
|
// The different ways a model can be referenced. Every field is empty by default.
struct common_params_model {
    std::string path;         // model local path
    std::string url;          // model url to download
    std::string hf_repo;      // HF repo
    std::string hf_file;      // HF file
    std::string docker_repo;  // Docker repo
};
|
|
|
|
struct common_ngram_mod;
|
|
|
|
struct common_params_speculative {
|
|
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
|
|
|
|
// Recurrent-model checkpoint strategy for speculative decoding.
|
|
int recurrent_ckpt_mode = LLAMA_SPEC_CKPT_AUTO;
|
|
|
|
std::string devices;
|
|
std::string params;
|
|
int32_t n_threads = -1;
|
|
int32_t n_threads_batch = -1;
|
|
|
|
int32_t n_max = 16; // number of tokens to draft during speculative decoding
|
|
int32_t n_min = 0; // minimum number of tokens to draft during speculative decoding
|
|
|
|
float p_split = 0.1f; // speculative decoding split probability
|
|
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
|
|
|
// ngram-based speculative decoding
|
|
|
|
uint16_t ngram_size_n = 12; // ngram size for lookup
|
|
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
|
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
|
|
|
std::shared_ptr<common_ngram_mod> ngram_mod;
|
|
|
|
// suffix-decoding specific
|
|
int32_t suffix_min_match_len = 5; // minimum context match length
|
|
int32_t suffix_max_depth = 64; // suffix tree maximum depth
|
|
std::string suffix_corpus; // path to corpus file for offline pre-warming (.json or .bin)
|
|
|
|
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
|
|
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
|
|
|
// draft-model speculative decoding
|
|
struct common_params_model mparams_dft;
|
|
|
|
llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
|
|
|
|
llama_context_params cparams_dft; // these are the parameters for the draft llama_context
|
|
|
|
int32_t n_ctx = 0; // draft context size
|
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
|
|
|
std::string model = ""; // draft model for speculative decoding
|
|
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
|
std::string cache_type_k = ""; // KV cache data type for K for the draft model
|
|
std::string cache_type_v = ""; // KV cache data type for V for the draft model
|
|
|
|
bool autotune = false; // automatically optimize speculative params for max tokens/sec
|
|
|
|
bool has_dft() const {
|
|
return !model.empty() || !params.empty();
|
|
//return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
|
|
}
|
|
|
|
};
|
|
|
|
struct gpt_params {
|
|
std::string devices;
|
|
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
|
|
|
int32_t n_threads = cpu_get_num_math();
|
|
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
|
int32_t n_predict = -1; // new tokens to predict
|
|
int32_t n_ctx = 0; // context size
|
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
|
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
|
int32_t n_parallel = 1; // number of parallel sequences to decode
|
|
int32_t n_sequences = 1; // number of sequences to decode
|
|
float p_split = 0.1f; // speculative decoding split probability
|
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
|
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
|
|
int32_t ncmoe = 0; // number of layers in which MoE tensors are left in VRAM
|
|
int32_t fit_margin = 0; // safety margin for auto-fit in MiB
|
|
bool fit = false; // automatically fit model (for now just using MoE tensor overrides)
|
|
int32_t worst_graph_tokens = 0; // number of tokens to use when reserving the worst graph
|
|
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
|
int32_t grp_attn_n = 1; // group-attention factor
|
|
int32_t grp_attn_w = 512; // group-attention width
|
|
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
|
float rope_freq_base = 0.0f; // RoPE base frequency
|
|
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
|
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
|
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
|
|
float yarn_beta_fast = -1.0f; // YaRN low correction dim
|
|
float yarn_beta_slow = -1.0f; // YaRN high correction dim
|
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
|
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
|
float ban_phrases_bias = -999.0f; // logit bias applied to ban phrases
|
|
int32_t max_extra_alloc_MiB = 256; // additional VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
|
|
int32_t nrep = 1; // number of repetitions used in sweep bench
|
|
|
|
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
|
void * cb_eval_user_data = nullptr;
|
|
|
|
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
|
|
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
|
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
|
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
|
|
|
// // sampling parameters
|
|
struct common_params_sampling sparams;
|
|
struct common_params_speculative speculative;
|
|
|
|
std::string model = ""; // model path
|
|
std::string model_alias = "unknown"; // model alias
|
|
std::string model_url = ""; // model url to download
|
|
std::string hf_token = ""; // HF token
|
|
std::string hf_repo = ""; // HF repo
|
|
std::string hf_file = ""; // HF file
|
|
std::string prompt = "";
|
|
std::string prompt_file = ""; // store the external prompt file name
|
|
bool prompt_is_binary = false; // don't fool around when the prompt contains binary data (as it is for multiple choice)
|
|
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
|
std::string input_prefix = ""; // string to prefix user inputs with
|
|
std::string input_suffix = ""; // string to suffix user inputs with
|
|
std::string logdir = ""; // directory in which to save YAML log files
|
|
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
|
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
|
std::string logits_file = ""; // file for saving *all* logits
|
|
std::string rpc_servers = ""; // comma separated list of RPC servers
|
|
|
|
std::string cuda_params = ""; // comma separated list of cuda parameters key=value1,key2=value2
|
|
|
|
std::vector<std::string> in_files; // all input files
|
|
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
|
std::vector<std::string> ban_phrases; // strings that are banned in generation
|
|
int32_t banned_n = 1; // number of tokens that are banned in the phrase
|
|
size_t n_buffer = 0; // number of token buffers for string ban
|
|
bool can_ban_phrases = true; // whether to ban strings
|
|
|
|
std::vector<std::vector<std::tuple<
|
|
uint32_t // lower codepoint
|
|
,uint32_t // upper codepoint
|
|
,std::string // unicode script name
|
|
,float // bias
|
|
>>> allow_ruless;
|
|
std::vector<std::string> allow_pieces; // each token to allowlist
|
|
std::vector<std::string> allow_kws; // keywords
|
|
size_t allow_kw_delay; // minimum n_decoded before first keyword is active
|
|
|
|
std::vector<llama_model_kv_override> kv_overrides;
|
|
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
|
std::vector<std::pair<int,int>> offload_policy;
|
|
|
|
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
|
|
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
|
|
|
|
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
|
|
|
|
int32_t verbosity = 0;
|
|
int32_t control_vector_layer_start = -1; // layer range for control vector
|
|
int32_t control_vector_layer_end = -1; // layer range for control vector
|
|
|
|
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
|
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
|
// (which is more convenient to use for plotting)
|
|
//
|
|
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
|
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
|
|
|
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
|
|
size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
|
|
|
|
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
|
|
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
|
|
|
|
bool kl_divergence = false; // compute KL divergence
|
|
|
|
bool usage = false; // print usage
|
|
bool use_color = false; // use color to distinguish generations and inputs
|
|
bool special = false; // enable special token output
|
|
bool interactive = false; // interactive mode
|
|
bool interactive_first = false; // wait for user input immediately
|
|
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
|
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
|
bool ctx_shift = true;
|
|
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
|
bool multiline_input = false; // reverse the usage of `\`
|
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
|
bool flash_attn = true; // flash attention
|
|
int mla_attn = 3; // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds
|
|
int attn_max_batch = 0; // Max batch size to use when computing attention (only applicable if flash_attn = false)
|
|
bool fused_moe_up_gate = true; // fused up*unary(gate) op for MoE models
|
|
bool fused_up_gate = true; // fused up*unary(gate) op
|
|
bool fused_mmad = true; // fused mul+multi_add op
|
|
bool grouped_expert_routing = false; // if to use grouped expert routing (BailingMoeV2 arch)
|
|
bool rope_cache = false; // if to use RoPE cache (for supported models)
|
|
bool graph_reuse = true; // if to reuse compute graphs
|
|
int min_experts = -1;
|
|
float thresh_experts = 0;
|
|
|
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
|
bool ignore_eos = false; // ignore generated EOS tokens
|
|
bool logits_all = false; // return logits for all tokens in the batch
|
|
bool use_mmap = true; // use mmap for faster loads
|
|
bool use_mlock = false; // use mlock to keep model in memory
|
|
bool verbose_prompt = false; // print prompt tokens before generation
|
|
bool display_prompt = true; // print prompt before generation
|
|
bool infill = false; // use infill mode
|
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
|
bool no_kv_offload = false; // disable KV offloading
|
|
bool warmup = true; // warmup run
|
|
bool batch_warmup = false; // batch warmup run
|
|
bool check_tensors = false; // validate tensor data
|
|
bool repack_tensors = false; // repack tensors if interleaved variant is available
|
|
bool use_thp = false; // use transparent huge pages (linux only)
|
|
bool validate_quants = false; // if true, check for NaNs while loading the model
|
|
bool only_active_exps = true; // if true, offload only active experts (relevant only for hybrid CPU/GPU)
|
|
bool merge_qkv = false; // if true, merge separate Q, K, V tensors into a single, contiguous tensor
|
|
bool merge_up_gate_exps= false; // if true, merge ffn_up_exps and ffn_gate_exps into a single, contiguous tensor
|
|
bool defer_experts = false; // if true, defer expert mmap residency to speed up model loading (Linux only)
|
|
bool k_cache_hadamard = false; // if true, use Hadamard transform for the K-cache (only makes sense with quantized cache)
|
|
bool v_cache_hadamard = false; // if true, use Hadamard transform for the V-cache (only makes sense with quantized cache, which requires FA)
|
|
bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling
|
|
//bool split_mode_f16 = true; // if true, intermediate results will be cast to f16 before copying to other GPUs to perform reduce ops
|
|
bool scheduler_async = false; // if true, in split mode graph the scheduler will use multiple threads to evaluate the graph
|
|
int fused_delta_net = 0; // use fused delta-net if number of tokens in the batch is less than this value
|
|
bool has_mtp = false; // enable MTP if supported by the model
|
|
|
|
std::string cache_type_k = "f16"; // KV cache data type for the K
|
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
|
|
|
std::string reduce_type = "f16";
|
|
|
|
std::string type_k_first = "f16";
|
|
std::string type_k_last = "f16";
|
|
std::string type_v_first = "f16";
|
|
std::string type_v_last = "f16";
|
|
int32_t n_k_first = -1;
|
|
int32_t n_k_last = -1;
|
|
int32_t n_v_first = -1;
|
|
int32_t n_v_last = -1;
|
|
|
|
// multimodal models (see examples/mtmd)
|
|
common_params_model mmproj;
|
|
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
|
bool no_mmproj = false; // explicitly disable multimodal model
|
|
std::vector<std::string> image; // path to image file(s)
|
|
int image_min_tokens = -1;
|
|
int image_max_tokens = -1;
|
|
std::string mtmd_kq_type = "f32";
|
|
|
|
// embedding
|
|
bool embedding = false; // get only sentence embedding
|
|
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
std::string embd_sep = "\n"; // separator of embendings
|
|
|
|
// server params
|
|
int32_t port = 8080; // server listens on this network port
|
|
int32_t timeout_read = 600; // http read timeout in seconds
|
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
|
int32_t n_threads_http = -1; // number of threads to process HTTP requests
|
|
bool send_done = false; // send done message as required for OAI compatibility
|
|
|
|
std::string hostname = "127.0.0.1";
|
|
std::string public_path = "";
|
|
|
|
// tool call and template
|
|
std::string chat_template = "";
|
|
bool use_jinja = false; // NOLINT
|
|
bool use_peg = false;
|
|
std::string system_prompt = "";
|
|
bool enable_chat_template = true;
|
|
bool force_pure_content_parser = false;
|
|
bool parallel_tool_calls = false;
|
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
|
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
|
|
int reasoning_budget = -1;
|
|
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
|
std::map<std::string, std::string> default_template_kwargs;
|
|
|
|
thinking_tokens think_tokens;
|
|
|
|
bool prefill_assistant = true;
|
|
bool dry_run = false;
|
|
|
|
std::vector<std::string> api_keys;
|
|
|
|
std::string ssl_file_key = "";
|
|
std::string ssl_file_cert = "";
|
|
|
|
|
|
|
|
// "advanced" endpoints are disabled by default for better security
|
|
common_webui webui = COMMON_WEBUI_AUTO;
|
|
bool endpoint_slots = true;
|
|
bool endpoint_props = false; // only control POST requests, not GET
|
|
bool endpoint_metrics = false;
|
|
|
|
bool log_json = false;
|
|
|
|
std::string slot_save_path;
|
|
std::string sql_save_file;
|
|
std::string sqlite_zstd_ext_file;
|
|
|
|
float slot_prompt_similarity = 0.1f;
|
|
|
|
bool do_checkpoint = false; // do checkpoint for recurrent models only
|
|
int32_t ctx_checkpoints_n = 32; // max number of context checkpoints per slot
|
|
int32_t ctx_checkpoints_interval = 512; // minimum number of tokens between each context checkpoints
|
|
int32_t ctx_checkpoints_tolerance = 5; // the number of tokens before the full prompt to create the checkpoint
|
|
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
|
int32_t cache_ram_n_min = 0; // min number of tokens required to save in the ram
|
|
float cache_ram_similarity = 0.5f; // similarity of tokens to cached tokens
|
|
|
|
// batched-bench params
|
|
bool is_pp_shared = false;
|
|
|
|
std::vector<int32_t> n_pp;
|
|
std::vector<int32_t> n_tg;
|
|
std::vector<int32_t> n_pl;
|
|
|
|
// retrieval params
|
|
std::vector<std::string> context_files; // context files to embed
|
|
|
|
int32_t chunk_size = 64; // chunk size for context embedding
|
|
|
|
std::string chunk_separator = "\n"; // chunk separator for context embedding
|
|
|
|
// passkey params
|
|
int32_t n_junk = 250; // number of times to repeat the junk text
|
|
int32_t i_pos = -1; // position of the passkey in the junk text
|
|
|
|
// imatrix params
|
|
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
|
|
std::string output_tensor_name = "output.weight"; // name of the output tensor
|
|
|
|
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
|
|
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
|
int32_t i_chunk = 0; // start processing from this chunk
|
|
|
|
bool process_output = false; // collect data for the output tensor
|
|
bool compute_ppl = true; // whether to compute perplexity
|
|
|
|
// cvector-generator params
|
|
int n_pca_batch = 100;
|
|
int n_pca_iterations = 1000;
|
|
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
|
std::string cvector_outfile = "control_vector.gguf";
|
|
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
|
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
|
|
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
|
|
|
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
|
|
|
bool sweep_bench_output_jsonl = false;
|
|
bool minilog = false;
|
|
};
|
|
|
|
|
|
std::pair<int, char**> parse_command_line(const std::string& commandLine);
|
|
void free_command_line(int argc, char** argv);
|
|
|
|
void gpt_params_handle_hf_token(gpt_params & params);
|
|
void gpt_params_parse_from_env(gpt_params & params);
|
|
void gpt_params_handle_model_default(gpt_params & params);
|
|
|
|
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
|
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
|
|
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
|
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
|
|
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
|
|
|
|
|
// Options for common_remote_get_content().
struct common_remote_params {
    std::vector<std::string> headers;  // request headers
    long timeout  = 0;                 // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
    long max_size = 0;                 // max size of the response ; unlimited if 0 ; max is 2GB
};
|
|
// get remote file content, returns <http_code, raw_response_body>
|
|
std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params& params);
|
|
|
|
//
|
|
// String utils
|
|
//
|
|
std::string string_join(const std::vector<std::string>& values, const std::string& separator);
|
|
std::string string_strip(const std::string & str);
|
|
std::string string_get_sortable_timestamp();
|
|
std::string string_lower(const std::string & str);
|
|
std::string string_repeat(const std::string & str, size_t n);
|
|
|
|
// Returns true when `str` begins with `prefix` (an empty prefix always matches).
// Stand-in until C++20's std::string::starts_with can be used.
static bool string_starts_with(const std::string& str,
                               const std::string& prefix) {
    // Compare the leading prefix.size() characters of `str` (fewer if `str` is shorter)
    // against the whole prefix; a shorter `str` can never compare equal.
    const int cmp = str.compare(0, prefix.size(), prefix);
    return cmp == 0;
}
|
|
|
|
std::vector<std::string> string_split(const std::string& str, const std::string& delimiter);
|
|
std::vector<std::string> string_split(const std::string& str, char delim);
|
|
|
|
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
|
// While we wait for C++20's std::string::ends_with...
|
|
bool string_ends_with(const std::string_view& str, const std::string_view& suffix);
|
|
size_t string_find_partial_stop(const std::string_view& str, const std::string_view& stop);
|
|
|
|
std::string regex_escape(const std::string& s);
|
|
|
|
// Splits `str` at every `delim` and converts each piece to T via operator>>.
//
// Pieces are obtained with std::getline, so a trailing delimiter does not produce
// an extra element and an empty `str` yields an empty vector.
// A piece that is empty or cannot be parsed yields a value-initialized T
// (e.g. 0 for arithmetic types).
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        // Value-initialize: for an empty token the extraction below fails before it
        // ever writes to `value`, which used to leave it indeterminate.
        T value{};
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

// Specialization for strings: plain splitting without any stream parsing.
//
// Unlike the generic version, every field is kept verbatim (including whitespace and
// empty fields), and the result always has exactly (number of separators + 1) elements,
// so an empty `input` yields a single empty string.
template<>
std::vector<std::string> string_split<std::string>(const std::string& input, char separator)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        // construct the field directly in the vector instead of copying a named temporary
        parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
        begin_pos = separator_pos + 1;
        separator_pos = input.find(separator, begin_pos);
    }
    // remainder after the last separator (possibly empty)
    parts.emplace_back(input.substr(begin_pos));
    return parts;
}
|
|
|
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
|
void string_process_escapes(std::string & input);
|
|
|
|
//
|
|
// Filesystem utils
|
|
//
|
|
|
|
bool fs_validate_filename(const std::string & filename);
|
|
bool fs_create_directory_with_parents(const std::string & path);
|
|
|
|
std::string fs_get_cache_directory();
|
|
std::string fs_get_cache_file(const std::string & filename);
|
|
|
|
|
|
//
|
|
// Model utils
|
|
//
|
|
|
|
struct llama_init_result {
|
|
struct llama_model * model = nullptr;
|
|
struct llama_context * context = nullptr;
|
|
std::vector<llama_lora_adapter_container> lora_adapters;
|
|
};
|
|
|
|
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
|
|
|
|
struct llama_model_params common_model_params_to_llama (const gpt_params & params);
|
|
struct llama_context_params common_context_params_to_llama(const gpt_params & params);
|
|
|
|
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
|
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
|
|
|
// clear LoRA adapters from context, then apply new list of adapters
|
|
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
|
|
|
|
// Batch utils
|
|
|
|
void common_batch_clear(struct llama_batch & batch);
|
|
|
|
void common_batch_add(
|
|
struct llama_batch & batch,
|
|
llama_token id,
|
|
llama_pos pos,
|
|
const std::vector<llama_seq_id> & seq_ids,
|
|
bool logits);
|
|
|
|
//
|
|
// Vocab utils
|
|
//
|
|
|
|
// tokenizes a string into a vector of tokens
|
|
// should work similar to Python's `tokenizer.encode`
|
|
std::vector<llama_token> common_tokenize(
|
|
const struct llama_context * ctx,
|
|
const std::string & text,
|
|
bool add_special,
|
|
bool parse_special = false);
|
|
|
|
std::vector<llama_token> common_tokenize(
|
|
const struct llama_model * model,
|
|
const std::string & text,
|
|
bool add_special,
|
|
bool parse_special = false);
|
|
|
|
std::vector<llama_token> common_tokenize(
|
|
const struct llama_vocab* vocab,
|
|
const std::string& text,
|
|
bool add_special,
|
|
bool parse_special = false);
|
|
|
|
std::vector<llama_token> llama_tokenize(
|
|
const struct llama_vocab * vocab,
|
|
const std::string & text,
|
|
bool add_special,
|
|
bool parse_special = false);
|
|
|
|
// tokenizes a token into a piece, optionally renders special/control tokens
|
|
// should work similar to Python's `tokenizer.id_to_piece`
|
|
std::string common_token_to_piece(
|
|
const struct llama_context * ctx,
|
|
llama_token token,
|
|
bool special = true);
|
|
|
|
std::string llama_token_to_piece(
|
|
const struct llama_model* model,
|
|
llama_token token,
|
|
bool special = true);
|
|
|
|
// detokenizes a vector of tokens into a string
|
|
// should work similar to Python's `tokenizer.decode`
|
|
// optionally renders special/control tokens
|
|
std::string common_detokenize(
|
|
const llama_context * ctx,
|
|
const std::vector<llama_token> & tokens,
|
|
bool special = true);
|
|
|
|
std::string common_detokenize(
|
|
const struct llama_vocab * vocab,
|
|
const std::vector<llama_token> & tokens,
|
|
bool special = true);
|
|
|
|
std::string common_token_to_piece(
|
|
const struct llama_vocab * vocab,
|
|
llama_token token,
|
|
bool special = true);
|
|
|
|
// Uses the value from the model metadata if possible, otherwise
|
|
// defaults to true when model type is SPM, otherwise false.
|
|
bool llama_should_add_bos_token(const llama_model * model);
|
|
|
|
|
|
|
|
|
|
//
|
|
// KV cache utils
|
|
//
|
|
|
|
// Dump the KV cache view with the number of sequences per cell.
|
|
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
|
|
|
// Dump the KV cache view showing individual sequences in each cell (long output).
|
|
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
|
|
|
//
|
|
// Embedding utils
|
|
//
|
|
|
|
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
|
|
|
|
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
|
|
|
//
|
|
// Control vector utils
|
|
//
|
|
|
|
// Control vector data: a flat float buffer plus the embedding size used to slice it.
// `n_embd` has no default value; it is indeterminate unless the creator sets it.
struct llama_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};
|
|
|
|
// One control vector to load: a file name and the strength to apply.
// `strength` has no default value, so callers must set it.
struct llama_control_vector_load_info {
    float strength;

    std::string fname;
};
|
|
|
|
// Load control vectors, scale each by strength, and add them together.
|
|
// On error, returns {-1, empty}
|
|
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
|
|
|
//
|
|
// Split utils
|
|
//
|
|
|
|
// Key names for split metadata. Declared `static`, so every translation unit that
// includes this header gets its own copy of these pointers.
static const char * const LLM_KV_SPLIT_NO            = "split.no";
static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
|
|
|
//
|
|
// YAML utils
|
|
//
|
|
|
|
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
|
|
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
|
|
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
|
|
|
|
void yaml_dump_non_result_info(
|
|
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
|
|
|
std::string string_format(const char* fmt, ...);
|
|
|
|
//
|
|
// Argparse utils
|
|
//
|
|
|
|
std::tuple<uint32_t, uint32_t, std::string, float> argparse_allowlist_unicode_rule(std::string argstr);
|