mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
server: fix double submits of infill (#1944)
Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
parent
71d5aa21f7
commit
2a1148384c
@ -24,7 +24,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
result->grammar = nullptr;
|
||||
result->rbudget = nullptr;
|
||||
|
||||
struct llama_grammar* grmr;
|
||||
struct llama_grammar* grmr = nullptr;
|
||||
const std::string & grammar_str = common_grammar_value(params.grammar);
|
||||
if (grammar_str.compare(0, 11, "%llguidance") == 0) {
|
||||
#ifdef LLAMA_USE_LLGUIDANCE
|
||||
|
||||
@ -1295,7 +1295,7 @@ void server_tokens::push_back(server_tokens& tokens) {
|
||||
// Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
|
||||
// We could also just check, but this will prevent silently dropping MTMD data.
|
||||
GGML_ASSERT(has_mtmd);
|
||||
for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) {
|
||||
for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); it++) {
|
||||
auto* chunk = tokens.map_idx_to_media[it->first].get();
|
||||
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
|
||||
map_idx_to_media[start_idx + it->first] = std::move(new_chunk);
|
||||
|
||||
@ -2661,7 +2661,7 @@ void server_context::apply_server_biases(server_slot& slot) {
|
||||
}
|
||||
}
|
||||
|
||||
void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs) {
|
||||
void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs) {
|
||||
server_task task;
|
||||
task.id = id_task;
|
||||
task.id_multi = id_multi;
|
||||
@ -2670,7 +2670,7 @@ void server_context::request_completion(int id_task, int id_multi, json data, bo
|
||||
task.infill = infill;
|
||||
task.embedding = embedding;
|
||||
task.type = SERVER_TASK_TYPE_COMPLETION;
|
||||
task.tokens = std::move(inputs);
|
||||
task.tokens = inputs.clone();
|
||||
// when a completion task's prompt array is not a singleton, we split it into multiple requests
|
||||
// otherwise, it's a single-prompt task, we actually queue it
|
||||
// if there are numbers in the prompt array, it will be treated as an array of tokens
|
||||
@ -2709,7 +2709,8 @@ void server_context::request_cancel(int id_task) {
|
||||
}
|
||||
|
||||
void server_context::split_multiprompt_task(int id_multi, server_task& multiprompt_task) {
|
||||
const int prompt_count = multiprompt_task.data.at("prompt").size();
|
||||
auto prompts = multiprompt_task.data.at("prompt");
|
||||
const int prompt_count = prompts.size();
|
||||
if (prompt_count <= 1) {
|
||||
send_error(multiprompt_task, "error while handling multiple prompts");
|
||||
return;
|
||||
@ -2727,11 +2728,11 @@ void server_context::split_multiprompt_task(int id_multi, server_task& multiprom
|
||||
// add subtasks
|
||||
for (int i = 0; i < prompt_count; i++) {
|
||||
json subtask_data = multiprompt_task.data;
|
||||
subtask_data["prompt"] = subtask_data.at("prompt")[i];
|
||||
subtask_data["prompt"] = prompts[i];
|
||||
|
||||
// subtasks inherit everything else (infill mode, embedding mode, etc.)
|
||||
request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding,
|
||||
std::move(multiprompt_task.tokens));
|
||||
multiprompt_task.tokens);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -350,7 +350,7 @@ struct server_context {
|
||||
|
||||
void apply_server_biases(server_slot& slot);
|
||||
|
||||
void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs);
|
||||
void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs);
|
||||
|
||||
void request_cancel(int id_task);
|
||||
|
||||
|
||||
@ -1358,10 +1358,11 @@ int main(int argc, char ** argv) {
|
||||
const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
|
||||
log_prompt(ctx_server.params_base, json::parse(req.body));
|
||||
json data = json::parse(req.body);
|
||||
const int id_task = ctx_server.queue_tasks.get_new_id();
|
||||
server_tokens token; // dummy tokens
|
||||
ctx_server.queue_results.add_waiting_task_id(id_task);
|
||||
ctx_server.request_completion(id_task, -1, data, true, false, std::move(token));
|
||||
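// Avoid double submits: do not queue the request here; presumably handle_completions_impl below submits the infill task itself.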
|
||||
//const int id_task = ctx_server.queue_tasks.get_new_id();
|
||||
//server_tokens token; // dummy tokens
|
||||
//ctx_server.queue_results.add_waiting_task_id(id_task);
|
||||
//ctx_server.request_completion(id_task, -1, data, true, false, token);
|
||||
std::vector<raw_buffer> files; // dummy
|
||||
handle_completions_impl(
|
||||
SERVER_TASK_TYPE_INFILL,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user