mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
common/chat, server: refactor, move all conversion functions to common, add tests (#20690) jinja : remove unused header (#22310) common : fix jinja warnings with clang 21 (#22313) Signed-off-by: Adrien Gallouët <angt@huggingface.co> chat: fix handling of space in reasoning markers (#22353) * chat: fix handling of space in reasoning markers common : re-arm reasoning budget after DONE on new <think> (#22323) common : determine generation prompt using longest common prefix (#22657) common/autoparser: fixes for newline handling / forced tool calls (#22654) * chat/autoparser: the fixes * Move optspace() to chat-peg-parser, comment out server tests invalidated due to content now allowed with forced tool calls. * Trim whitespace on apply instead common/chat : preserve media markers for typed-content templates (#22634) common : revert reasoning budget +inf logit bias (#22740) common : do not wrap raw strings in schema parser for tagged parsers (#22827) common : enable streaming JSON argument values (#23173) * common : remove atomic from json arguments * common : remove parsing logic on JSON arguments common : do not pass prompt tokens to reasoning budget sampler (#22488) reasoning-budget: clone should do a deep-copy (#23095) Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>
44 lines
1.9 KiB
C++
44 lines
1.9 KiB
C++
#pragma once
|
|
|
|
#include "llama.h"
|
|
|
|
#include <cstdint>
|
|
#include <vector>
|
|
|
|
// States of the reasoning budget sampler.
//
// Enumerators are listed in declaration order, which is NOT the transition
// order: the documented flow is IDLE -> COUNTING -> WAITING_UTF8 -> FORCING -> DONE.
// The implicit numeric values (0..4) are left untouched so existing users of
// this unscoped enum keep working.
enum common_reasoning_budget_state {
    REASONING_BUDGET_IDLE,          // waiting for start sequence
    REASONING_BUDGET_COUNTING,      // counting down tokens
    REASONING_BUDGET_FORCING,       // forcing budget message + end sequence
    REASONING_BUDGET_WAITING_UTF8,  // budget exhausted, waiting for UTF-8 completion
    REASONING_BUDGET_DONE,          // passthrough forever
};
|
|
|
|
// Creates a reasoning budget sampler that limits token generation inside a
|
|
// reasoning block (e.g. between <think> and </think>).
|
|
//
|
|
// State machine: IDLE -> COUNTING -> WAITING_UTF8 -> FORCING -> DONE
|
|
// IDLE: passthrough, watching for start_tokens sequence
|
|
// COUNTING: counting down remaining tokens, watching for natural end_tokens
|
|
// WAITING_UTF8: budget exhausted, allowing tokens to complete a UTF-8 sequence
|
|
// FORCING: forces forced_tokens token-by-token (all other logits -> -inf)
|
|
// DONE: passthrough forever
|
|
//
|
|
// Parameters:
|
|
// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
|
|
// start_tokens - token sequence that activates counting
|
|
// end_tokens - token sequence for natural deactivation
|
|
// forced_tokens - token sequence forced when budget expires
|
|
// budget - max tokens allowed in the reasoning block
|
|
// initial_state - initial state
|
|
//
|
|
|
|
struct common_reasoning_budget_ctx * common_reasoning_budget_init(
|
|
const struct llama_vocab * vocab,
|
|
const std::vector<llama_token> & start_tokens,
|
|
const std::vector<llama_token> & end_tokens,
|
|
const std::vector<llama_token> & forced_tokens,
|
|
int32_t budget,
|
|
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE);
|
|
|
|
// Queries a reasoning budget context for its state.
//
// Parameters:
//   smpl - context to inspect; presumably one obtained from
//          common_reasoning_budget_init - TODO confirm. Whether nullptr is
//          accepted is not visible in this header.
//
// Returns:
//   a common_reasoning_budget_state value.
common_reasoning_budget_state common_reasoning_budget_get_state(
        const struct common_reasoning_budget_ctx * smpl);
|