diff --git a/common/sampling.cpp b/common/sampling.cpp index 03504bee..5a7a9b69 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -24,7 +24,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co result->grammar = nullptr; result->rbudget = nullptr; - struct llama_grammar* grmr; + struct llama_grammar* grmr = nullptr; const std::string & grammar_str = common_grammar_value(params.grammar); if (grammar_str.compare(0, 11, "%llguidance") == 0) { #ifdef LLAMA_USE_LLGUIDANCE diff --git a/examples/server/server-common.cpp b/examples/server/server-common.cpp index 3286f230..865b1f13 100644 --- a/examples/server/server-common.cpp +++ b/examples/server/server-common.cpp @@ -1295,7 +1295,7 @@ void server_tokens::push_back(server_tokens& tokens) { // Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd. // We could also just check, but this will prevent silently dropping MTMD data. GGML_ASSERT(has_mtmd); - for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) { + for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); it++) { auto* chunk = tokens.map_idx_to_media[it->first].get(); mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); map_idx_to_media[start_idx + it->first] = std::move(new_chunk); diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index 4b1499f1..c895eda1 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -2661,7 +2661,7 @@ void server_context::apply_server_biases(server_slot& slot) { } } -void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs) { +void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs) { server_task task; task.id = id_task; task.id_multi = id_multi; @@ -2670,7 +2670,7 @@ void server_context::request_completion(int id_task, int id_multi, json data, bo task.infill = infill; task.embedding = embedding; task.type = SERVER_TASK_TYPE_COMPLETION; - task.tokens = std::move(inputs); + task.tokens = inputs.clone(); // when a completion task's prompt array is not a singleton, we split it into multiple requests // otherwise, it's a single-prompt task, we actually queue it // if there's numbers in the prompt array it will be treated as an array of tokens @@ -2709,7 +2709,8 @@ void server_context::request_cancel(int id_task) { } void server_context::split_multiprompt_task(int id_multi, server_task& multiprompt_task) { - const int prompt_count = multiprompt_task.data.at("prompt").size(); + auto prompts = multiprompt_task.data.at("prompt"); + const int prompt_count = prompts.size(); if (prompt_count <= 1) { send_error(multiprompt_task, "error while handling multiple prompts"); return; @@ -2727,11 +2728,11 @@ void server_context::split_multiprompt_task(int id_multi, server_task& multiprom // add subtasks for (int i = 0; i < prompt_count; i++) { json subtask_data = multiprompt_task.data; - subtask_data["prompt"] = subtask_data.at("prompt")[i]; + subtask_data["prompt"] = prompts[i]; // subtasks inherit everything else (infill mode, embedding mode, etc.) request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding, - std::move(multiprompt_task.tokens)); + multiprompt_task.tokens); } } diff --git a/examples/server/server-context.h b/examples/server/server-context.h index a33c2113..12b029a5 100644 --- a/examples/server/server-context.h +++ b/examples/server/server-context.h @@ -350,7 +350,7 @@ struct server_context { void apply_server_biases(server_slot& slot); - void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs); + void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs); void request_cancel(int id_task); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index df2557f8..1771e665 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1358,10 +1358,11 @@ int main(int argc, char ** argv) { const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { log_prompt(ctx_server.params_base, json::parse(req.body)); json data = json::parse(req.body); - const int id_task = ctx_server.queue_tasks.get_new_id(); - server_tokens token; // dummy tokens - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, data, true, false, std::move(token)); + //avoid double submits + //const int id_task = ctx_server.queue_tasks.get_new_id(); + //server_tokens token; // dummy tokens + //ctx_server.queue_results.add_waiting_task_id(id_task); + //ctx_server.request_completion(id_task, -1, data, true, false, token); std::vector files; // dummy handle_completions_impl( SERVER_TASK_TYPE_INFILL,