mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Compare commits
19 Commits
6c00e87ac8
...
d3e86a5431
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d3e86a5431 | ||
|
|
bdf5c081dc | ||
|
|
4553cd0059 | ||
|
|
d5507e33ae | ||
|
|
bf23a7599c | ||
|
|
7cacf28eec | ||
|
|
8686ea708b | ||
|
|
5a4fa17947 | ||
|
|
997b289d93 | ||
|
|
a7d35d51dc | ||
|
|
befbc0945b | ||
|
|
7ccf1d2095 | ||
|
|
2d3ecd5e19 | ||
|
|
9eaf86a7c7 | ||
|
|
69a8336d08 | ||
|
|
b2b4f66fa0 | ||
|
|
b47b90d0be | ||
|
|
64fceb70bc | ||
|
|
72440a19fc |
208
common/chat.cpp
208
common/chat.cpp
@ -1933,6 +1933,172 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
|
||||
return data;
|
||||
}
|
||||
|
||||
// Builds the chat parameters for the MiniMax-M3 chat template: the rendered prompt,
// a PEG parser for the model output (reasoning, content, tool calls or a JSON
// response), and - when tools or a response schema are in play - a grammar with
// its trigger.
//
// tmpl   - chat template that is applied to produce the prompt.
// inputs - generation parameters; this function reads tools, tool_choice,
//          json_schema, reasoning_format, enable_thinking, parallel_tool_calls
//          and generation_prompt from it.
// Returns the populated common_chat_params.
static common_chat_params common_chat_params_init_minimax_m3(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.thinking_start_tag = "<mm:think>";
|
||||
data.thinking_end_tag = "</mm:think>";
|
||||
|
||||
// NS is the marker string that prefixes every tool-call related tag below
// (tool_call wrapper, invoke close tag, and each argument open/close tag).
const std::string NS = "]<]minimax[>[";
|
||||
const std::string THINK_START = "<mm:think>";
|
||||
const std::string THINK_END = "</mm:think>";
|
||||
const std::string FC_START = NS + "<tool_call>";
|
||||
const std::string FC_END = NS + "</tool_call>";
|
||||
const std::string INVOKE_END = NS + "</invoke>";
|
||||
|
||||
data.preserved_tokens = {
|
||||
NS,
|
||||
"<tool_call>",
|
||||
"</tool_call>",
|
||||
THINK_START,
|
||||
THINK_END,
|
||||
};
|
||||
|
||||
// Feature flags derived from the request; they select which parser shape is built
// and whether a grammar is generated at all.
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
|
||||
auto end = p.end();
|
||||
|
||||
// Optional thinking block. With thinking enabled its body is tagged via p.reasoning();
// with thinking disabled the block is still matched but its body is not tagged.
// When reasoning extraction is off, no thinking block is matched here (eps).
auto reasoning = p.eps();
|
||||
if (extract_reasoning && inputs.enable_thinking) {
|
||||
reasoning = p.optional(p.optional(p.literal(THINK_START)) + p.reasoning(p.until(THINK_END)) + THINK_END);
|
||||
} else if (extract_reasoning) {
|
||||
reasoning = p.optional(p.optional(p.literal(THINK_START)) + p.until(THINK_END) + p.literal(THINK_END));
|
||||
}
|
||||
|
||||
// A response schema takes precedence over tools: the output must be a ```json
// fenced block whose payload is constrained by inputs.json_schema.
if (has_response_format) {
|
||||
auto response_format = p.rule("response-format",
|
||||
p.literal("```json") + p.space() +
|
||||
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)) +
|
||||
p.space() + p.literal("```"));
|
||||
return generation_prompt + reasoning + response_format + end;
|
||||
}
|
||||
|
||||
// No usable tools: everything after the reasoning block is plain content.
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return generation_prompt + reasoning + p.content(p.rest()) + end;
|
||||
}
|
||||
|
||||
// Build one alternative per declared function; tool_choice is the union of them.
auto tool_choice = p.choice();
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto params = function.contains("parameters") ? function.at("parameters") : json::object();
|
||||
const auto & props = params.contains("properties") ? params.at("properties") : json::object();
|
||||
|
||||
std::set<std::string> required;
|
||||
if (params.contains("required")) {
|
||||
params.at("required").get_to(required);
|
||||
}
|
||||
|
||||
auto schema_info = common_schema_info();
|
||||
schema_info.resolve_refs(params);
|
||||
|
||||
// Each property becomes NS<name>VALUE NS</name>. Properties whose schema resolves
// to a string are captured raw up to the closing tag; all others are parsed as
// JSON against the property's schema.
std::vector<common_peg_parser> required_parsers;
|
||||
std::vector<common_peg_parser> optional_parsers;
|
||||
for (const auto & [param_name, param_schema] : props.items()) {
|
||||
bool is_required = required.find(param_name) != required.end();
|
||||
bool is_string = schema_info.resolves_to_string(param_schema);
|
||||
|
||||
const std::string p_close = NS + "</" + param_name + ">";
|
||||
|
||||
auto arg = p.tool_arg(
|
||||
p.tool_arg_open(
|
||||
p.literal(NS + "<") +
|
||||
p.tool_arg_name(p.literal(param_name)) +
|
||||
p.literal(">")) +
|
||||
(is_string
|
||||
? p.tool_arg_string_value(p.until(p_close))
|
||||
: p.tool_arg_json_value(p.schema(p.json(),
|
||||
"tool-" + name + "-arg-" + param_name + "-schema",
|
||||
param_schema, false))) +
|
||||
p.tool_arg_close(p.literal(p_close)));
|
||||
|
||||
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
|
||||
if (is_required) {
|
||||
required_parsers.push_back(named_arg);
|
||||
} else {
|
||||
optional_parsers.push_back(named_arg);
|
||||
}
|
||||
}
|
||||
|
||||
// Required arguments are chained in the order they were collected above,
// separated by p.space().
common_peg_parser args_seq = p.eps();
|
||||
for (size_t i = 0; i < required_parsers.size(); i++) {
|
||||
if (i > 0) {
|
||||
args_seq = args_seq + p.space();
|
||||
}
|
||||
args_seq = args_seq + required_parsers[i];
|
||||
}
|
||||
|
||||
// Optional arguments follow the required ones: any of them, repeated with the
// bounds (0, -1) passed to p.repeat().
if (!optional_parsers.empty()) {
|
||||
common_peg_parser any_opt = p.choice();
|
||||
for (const auto & opt : optional_parsers) {
|
||||
any_opt |= opt;
|
||||
}
|
||||
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, -1);
|
||||
}
|
||||
|
||||
// A single call is NS<invoke name="NAME"> ARGS NS</invoke>.
auto func_parser = p.tool(
|
||||
p.tool_open(p.literal(NS + "<invoke name=\"") +
|
||||
p.tool_name(p.literal(name)) + p.literal("\">")) +
|
||||
p.space() + args_seq + p.space() +
|
||||
p.tool_close(p.literal(INVOKE_END)));
|
||||
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
// All invocations live inside one FC_START ... FC_END wrapper; with parallel tool
// calls enabled the wrapper may hold several invocations, otherwise exactly one.
common_peg_parser tool_calls = p.eps();
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call",
|
||||
p.literal(FC_START) + p.space() + tool_choice +
|
||||
p.zero_or_more(p.space() + tool_choice) + p.space() + p.literal(FC_END));
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call",
|
||||
p.literal(FC_START) + p.space() + tool_choice + p.space() + p.literal(FC_END));
|
||||
}
|
||||
|
||||
// Unless tool use is required, the model may answer without calling a tool.
if (!require_tools) {
|
||||
tool_calls = p.optional(tool_calls);
|
||||
}
|
||||
|
||||
// Any text preceding the tool-call wrapper is surfaced as regular content.
auto content_before_tools = p.content(p.until(FC_START));
|
||||
return generation_prompt + reasoning + p.space() + content_before_tools + tool_calls + end;
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
// The grammar is lazy only when neither a response schema nor a required tool call
// is in effect.
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
if (has_response_format) {
|
||||
auto schema = inputs.json_schema;
|
||||
builder.resolve_refs(schema);
|
||||
}
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
|
||||
// The tool-call opening marker is registered as the (word) grammar trigger.
data.grammar_triggers = {
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, FC_START },
|
||||
};
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
// Cohere2 MoE (a.k.a. "North Code") parser.
|
||||
//
|
||||
// The assistant turn is fully marker-wrapped:
|
||||
@ -1978,22 +2144,42 @@ static common_chat_params common_chat_params_init_cohere2moe(const common_chat_t
|
||||
};
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
// Surface reasoning as reasoning_content whenever the output format requests it
|
||||
// (Cohere2MoE has a non-empty THINK_START, so this matches the narrow condition
|
||||
// from the reasoning-delimiter work). Under --reasoning off the model may still
|
||||
// emit an (un)opened thinking block; keeping it in reasoning_content quarantines
|
||||
// it from the user-facing content rather than leaking it into the answer.
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
|
||||
auto end = p.optional(p.literal(TURN_END)) + p.end();
|
||||
// Cohere2MoE can emit a stray text terminator after an action envelope.
|
||||
auto end = p.optional(p.literal(TEXT_END)) + p.optional(p.literal(TURN_END)) + p.end();
|
||||
|
||||
auto thinking_body = [&]() {
|
||||
return p.until_one_of({ THINK_END, TEXT_START, ACTION_START });
|
||||
};
|
||||
|
||||
common_peg_parser reasoning = p.eps();
|
||||
if (extract_reasoning) {
|
||||
reasoning = p.optional(p.literal(THINK_START) +
|
||||
p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) +
|
||||
p.optional(p.literal(THINK_END)));
|
||||
auto opened = p.literal(THINK_START) +
|
||||
p.reasoning(thinking_body()) +
|
||||
p.optional(p.literal(THINK_END));
|
||||
auto unopened = p.reasoning(thinking_body()) + p.literal(THINK_END);
|
||||
reasoning = p.optional(p.choice({ opened, unopened }));
|
||||
} else if (inputs.enable_thinking) {
|
||||
auto opened = p.content(p.literal(THINK_START) +
|
||||
thinking_body() +
|
||||
p.optional(p.literal(THINK_END)));
|
||||
auto unopened = p.content(thinking_body() + p.literal(THINK_END));
|
||||
reasoning = p.optional(p.choice({ opened, unopened }));
|
||||
} else {
|
||||
reasoning = p.optional(p.content(p.literal(THINK_START) +
|
||||
p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) +
|
||||
p.optional(p.literal(THINK_END))));
|
||||
auto opened = p.literal(THINK_START) +
|
||||
p.content(thinking_body()) +
|
||||
p.optional(p.literal(THINK_END));
|
||||
auto unopened = p.content(thinking_body()) + p.literal(THINK_END);
|
||||
reasoning = p.optional(p.choice({ opened, unopened }));
|
||||
}
|
||||
|
||||
auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END));
|
||||
@ -2329,6 +2515,13 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
return common_chat_params_init_gigachat_v3(tmpl, params);
|
||||
}
|
||||
|
||||
if (src.find("]<]minimax[>[") != std::string::npos &&
|
||||
src.find("<tool_call>") != std::string::npos &&
|
||||
src.find("<invoke name=") != std::string::npos) {
|
||||
LOG_DBG("Using specialized template: MiniMax-M3\n");
|
||||
return common_chat_params_init_minimax_m3(tmpl, params);
|
||||
}
|
||||
|
||||
// DeepSeek V3.2 format detection: template defines dsml_token and uses it for tool calls.
|
||||
// The template source contains the token as a variable assignment, not as a literal in markup.
|
||||
if (src.find("dsml_token") != std::string::npos &&
|
||||
@ -2629,4 +2822,3 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
|
||||
GGML_ASSERT(chat_templates->template_default != nullptr);
|
||||
return chat_templates->template_default->caps.to_map();
|
||||
}
|
||||
|
||||
|
||||
@ -498,6 +498,18 @@ common_webui common_webui_from_name(const std::string& format) {
|
||||
}
|
||||
}
|
||||
|
||||
// Translates a checkpoint-eviction strategy name into its enum value.
// "fifo" and "variance" map to their dedicated values; "auto" and every
// unrecognized name yield COMMON_CHECKPOINT_EVICTION_AUTO.
common_checkpoint_eviction common_checkpoint_eviction_from_name(const std::string & format) {
    if (format == "fifo") {
        return COMMON_CHECKPOINT_EVICTION_FIFO;
    }
    if (format == "variance") {
        return COMMON_CHECKPOINT_EVICTION_VARIANCE;
    }
    // Covers the explicit "auto" spelling as well as any unknown name.
    return COMMON_CHECKPOINT_EVICTION_AUTO;
}
|
||||
|
||||
thinking_tokens thinking_tokens_from_string(const std::string& format) {
|
||||
thinking_tokens think_token;
|
||||
std::string token_string = string_strip(format);
|
||||
@ -2772,6 +2784,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.ctx_checkpoints_tolerance = std::stoi(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--ctx-checkpoints-eviction") {
|
||||
CHECK_ARG
|
||||
params.ctx_checkpoint_eviction= common_checkpoint_eviction_from_name(std::string(argv[i]));
|
||||
return true;
|
||||
}
|
||||
if (arg == "-cram" || arg == "--cache-ram") {
|
||||
CHECK_ARG
|
||||
params.cache_ram_mib = std::stoi(argv[i]);
|
||||
@ -2982,6 +2999,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", "--ctx-checkpoints N", "max number of context checkpoints to create per slot (default: %d)",params.ctx_checkpoints_n});
|
||||
options.push_back({ "*", "--ctx-checkpoints-interval N", "minimum number of tokens between each context checkpoint. (default: %d, <=0 disable)",params.ctx_checkpoints_interval});
|
||||
options.push_back({ "*", "--ctx-checkpoints-tolerance N", "the number of tokens before the full prompt to create the checkpoint. (default: %d, <=0 disable)",params.ctx_checkpoints_tolerance});
|
||||
options.push_back({ "*", "--ctx-checkpoints-eviction NAME", "Eviction strategy for checkpoint. Accepts fifo, variance and auto. Auto defaults to variance. Variance preserves coverage and maintains uniform interval. (default: variance)" });
|
||||
options.push_back({ "*", "-cram, --cache-ram N", "set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)",params.cache_ram_mib });
|
||||
options.push_back({ "*", "-crs, --cache-ram-similarity N", "max of similarity of prompt tokens to cache tokens that triggers prompt cache (default: %.2f).",params.cache_ram_similarity });
|
||||
options.push_back({ "*", "-cram-n-min --cache-ram-n-min N", "minimum number of the cached tokens that triggers prompt cache (default: %d).", params.cache_ram_n_min });
|
||||
|
||||
@ -127,8 +127,17 @@ enum common_webui {
|
||||
COMMON_WEBUI_LLAMACPP,
|
||||
};
|
||||
|
||||
// Eviction strategy for context checkpoints, selected on the command line with
// --ctx-checkpoints-eviction (parsed by common_checkpoint_eviction_from_name).
enum common_checkpoint_eviction {
|
||||
COMMON_CHECKPOINT_EVICTION_AUTO, // "auto"; per the CLI help text this defaults to the variance strategy
|
||||
COMMON_CHECKPOINT_EVICTION_FIFO, // "fifo"; presumably first-in-first-out eviction - confirm against the consumer
|
||||
COMMON_CHECKPOINT_EVICTION_VARIANCE // "variance"; per the CLI help text, preserves coverage and keeps intervals uniform
|
||||
};
|
||||
|
||||
common_webui common_webui_from_name(const std::string& format);
|
||||
|
||||
common_checkpoint_eviction common_checkpoint_eviction_from_name(const std::string & format);
|
||||
|
||||
|
||||
struct thinking_tokens {
|
||||
bool exclude = true;
|
||||
std::string begin = "<think>";
|
||||
@ -527,6 +536,7 @@ struct gpt_params {
|
||||
int32_t ctx_checkpoints_n = 32; // max number of context checkpoints per slot
|
||||
int32_t ctx_checkpoints_interval = 512; // minimum number of tokens between each context checkpoints
|
||||
int32_t ctx_checkpoints_tolerance = 5; // the number of tokens before the full prompt to create the checkpoint
|
||||
common_checkpoint_eviction ctx_checkpoint_eviction = COMMON_CHECKPOINT_EVICTION_VARIANCE;
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
int32_t cache_ram_n_min = 0; // min number of tokens required to save in the ram
|
||||
float cache_ram_similarity = 0.5f; // similarity of tokens to cached tokens
|
||||
|
||||
@ -596,11 +596,18 @@ value for_statement::execute_impl(context & ctx) {
|
||||
loop_obj->insert("length", mk_val<value_int>(filtered_items.size()));
|
||||
loop_obj->insert("previtem", i > 0 ? filtered_items[i - 1] : mk_val<value_undefined>("previtem"));
|
||||
loop_obj->insert("nextitem", i < filtered_items.size() - 1 ? filtered_items[i + 1] : mk_val<value_undefined>("nextitem"));
|
||||
scope.set_val("loop", loop_obj);
|
||||
scope_update_fns[i](scope);
|
||||
// Use a fresh scope for each iteration so that {% set %} variables
|
||||
// (including ones assigned only conditionally inside the body) do not
|
||||
// leak across iterations. This matches standard Jinja2 semantics, where
|
||||
// each loop iteration starts with a clean scope. State that must
|
||||
// accumulate across iterations has to use namespace(), whose mutations
|
||||
// are applied to the shared object referenced from the enclosing scope.
|
||||
context iter_scope(scope);
|
||||
iter_scope.set_val("loop", loop_obj);
|
||||
scope_update_fns[i](iter_scope);
|
||||
try {
|
||||
for (auto & stmt : body) {
|
||||
value val = stmt->execute(scope);
|
||||
value val = stmt->execute(iter_scope);
|
||||
result->push_back(val);
|
||||
}
|
||||
} catch (const continue_statement::signal &) {
|
||||
|
||||
@ -2477,6 +2477,24 @@ class DFlashDraftModel(Qwen3Model):
|
||||
|
||||
self.gguf_writer.add_uint32(f"{arch}.dflash.n_target_features", n_target_features)
|
||||
|
||||
# DFlash drafts may be trained with sliding-window attention (for long-context). When the
|
||||
# source config enables it, emit the window size + the per-layer SWA pattern so the runtime
|
||||
# activates the kq_mask_swa path. These drafts are typically all sliding-window except a
|
||||
# final full-attention (global) layer, so honor layer_types when present; fall back to
|
||||
# all-SWA only when it is absent. Absent/false use_sliding_window => dense draft (unchanged).
|
||||
use_sliding_window = self.hparams.get("use_sliding_window")
|
||||
sliding_window = self.hparams.get("sliding_window")
|
||||
if use_sliding_window and sliding_window:
|
||||
n_swa_layers = int(self.hparams.get("num_hidden_layers", self.block_count))
|
||||
layer_types = self.hparams.get("layer_types")
|
||||
if layer_types:
|
||||
swa_pattern = [str(t) == "sliding_attention" for t in layer_types]
|
||||
else:
|
||||
swa_pattern = [True] * n_swa_layers
|
||||
self.gguf_writer.add_sliding_window(int(sliding_window))
|
||||
self.gguf_writer.add_sliding_window_pattern(swa_pattern)
|
||||
logger.info("DFlashDraftModel: sliding_window=%d, SWA pattern=%s", int(sliding_window), swa_pattern)
|
||||
|
||||
logger.info(
|
||||
"DFlashDraftModel metadata: block_size=%s mask_token_id=%s target_layer_ids=%s n_target_features=%s",
|
||||
block_size,
|
||||
@ -5427,6 +5445,12 @@ class LagunaModel(Model):
|
||||
rope_params = hparams.get("rope_parameters", {})
|
||||
full_rope = rope_params.get("full_attention", rope_params)
|
||||
swa_rope = rope_params.get("sliding_attention", {})
|
||||
# Laguna can specify different rotary widths for full-attention and SWA layers.
|
||||
# M.1 uses the full-attention value from rope_parameters; XS.2 SWA omits the key
|
||||
# because those layers rotate the whole head.
|
||||
partial_rotary_factor = float(hparams.get("partial_rotary_factor", 1.0))
|
||||
partial_rotary_factor_full = float(full_rope.get("partial_rotary_factor", partial_rotary_factor))
|
||||
partial_rotary_factor_swa = float(swa_rope.get("partial_rotary_factor", 1.0))
|
||||
|
||||
self.gguf_writer.add_context_length(int(hparams["max_position_embeddings"]))
|
||||
self.gguf_writer.add_embedding_length(int(hparams["hidden_size"]))
|
||||
@ -5443,8 +5467,11 @@ class LagunaModel(Model):
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
self.gguf_writer.add_sliding_window(int(hparams["sliding_window"]))
|
||||
self.gguf_writer.add_rope_dimension_count(head_dim // 2)
|
||||
self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", head_dim)
|
||||
# GGUF's rope.dimension_count is the number of scalar Q/K dimensions
|
||||
# that ggml_rope_ext should rotate. It is not the number of RoPE pairs;
|
||||
# the frequency table uses dimension_count / 2 entries later.
|
||||
self.gguf_writer.add_rope_dimension_count(int(head_dim * partial_rotary_factor_full))
|
||||
self.gguf_writer.add_uint32(f"{arch}.rope.dimension_count_swa", int(head_dim * partial_rotary_factor_swa))
|
||||
self.gguf_writer.add_rope_freq_base(float(full_rope.get("rope_theta", 500000.0)))
|
||||
self.gguf_writer.add_float32(f"{arch}.rope.freq_base_swa", float(swa_rope.get("rope_theta", 10000.0)))
|
||||
if full_rope.get("rope_type") == "yarn":
|
||||
@ -5454,7 +5481,9 @@ class LagunaModel(Model):
|
||||
"original_max_position_embeddings",
|
||||
rope_params.get("original_max_position_embeddings", hparams["max_position_embeddings"]),
|
||||
)))
|
||||
self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("factor", 1.0)))
|
||||
# GGUF's YaRN ext_factor is the config's extrapolation_factor. The main
|
||||
# factor above is the context-extension scale and should not be mirrored here.
|
||||
self.gguf_writer.add_rope_scaling_yarn_ext_factor(float(full_rope.get("extrapolation_factor", 1.0)))
|
||||
self.gguf_writer.add_rope_scaling_yarn_attn_factor(float(full_rope.get("attention_factor", 1.0)))
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_fast(float(full_rope.get("beta_fast", 32.0)))
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_slow(float(full_rope.get("beta_slow", 1.0)))
|
||||
|
||||
343
docs/development/on-demand-tensor-reload.md
Normal file
343
docs/development/on-demand-tensor-reload.md
Normal file
@ -0,0 +1,343 @@
|
||||
# On-Demand Tensor Reload
|
||||
|
||||
## Overview
|
||||
|
||||
This patch introduces **selective tensor hot-swapping** for `ik_llama.cpp` models, now with full support for `graph`/`layer` split mode.
|
||||
It allows individual tensors (or groups of tensors) to be reloaded from their original on-disk GGUF files **without tearing down the process, the `llama_model`, or the `llama_context`**. Tensors may reside on any backend—GPU, CPU, or split across multiple GPUs—and the reload logic preserves that placement.
|
||||
|
||||
This is primarily intended for:
|
||||
|
||||
* Iterative experimentation and LoRA-like surgical updates.
|
||||
* Dynamic MoE (Mixture-of-Experts) expert swapping.
|
||||
* **Mixed-quantization perplexity benchmarks**, where the bulk of a model lives in one quant (e.g., Q4_X) on GPU while individual experts are hot-swapped one-by-one into a different quant (e.g., IQ1_KT) to measure isolated quality impact.
|
||||
|
||||
---
|
||||
|
||||
## Motivation
|
||||
|
||||
Standard `ik_llama.cpp` workflows require restarting the entire executable to pick up new weights. For large models distributed across multiple GPUs—or models that spill into CPU memory—this incurs significant downtime. This patch solves that by:
|
||||
|
||||
1. **Tracking provenance**: At load time, every tensor is mapped back to its source GGUF shard, byte offset, and modification time.
|
||||
2. **Detecting changes**: At runtime, it cheaply `stat()`s the source files to see if a tensor’s backing data has changed.
|
||||
3. **Surgical replacement**: Only the changed tensors are re-mapped/re-allocated. The rest of the model stays resident in GPU/CPU memory.
|
||||
4. **Graph safety**: Cached CUDA graphs are invalidated and the context’s cached compute graphs (`ctx->prev` / `ctx->prev_mtp`) are reset so that the next evaluation rebuilds the graph with the new buffer pointers, sizes, or types.
|
||||
|
||||
---
|
||||
|
||||
## High-Level Architecture
|
||||
|
||||
The patch adds a `reload_info` registry to `llama_model` (defined in `src/llama-reload-info.h`). The lifecycle has four phases:
|
||||
|
||||
### 1. Registration Phase (`llama_model_load`)
|
||||
During model loading, every weight that is successfully mapped gets an entry in `model.reload->tensor_reload_sources` **only when the environment variable `LLAMA_HOTSWAP_ENABLED` is set**:
|
||||
|
||||
```cpp
|
||||
struct tensor_reload_source {
|
||||
std::string path; // Absolute path to the GGUF shard
|
||||
size_t data_offset; // Byte offset of the tensor data in the file
|
||||
size_t nbytes; // Current byte size
|
||||
int64_t last_mtime; // Last modification time (seconds)
|
||||
int64_t last_mtime_ns; // Nanosecond precision on Linux
|
||||
|
||||
// Snapshots of the *original* loaded state so we can reattach later
|
||||
ggml_backend_buffer_t original_buffer;
|
||||
void * original_data;
|
||||
ggml_type original_type;
|
||||
int64_t original_ne[GGML_MAX_DIMS];
|
||||
size_t original_nb[GGML_MAX_DIMS];
|
||||
ggml_split_tensor_t * original_extra;
|
||||
std::vector<split_info> original_splits;
|
||||
std::vector<std::string> sibling_names; // MoE siblings
|
||||
reload_state state;
|
||||
};
|
||||
```
|
||||
|
||||
### 2. Snapshot Phase (`snapshot_all_reload_tensors`)
|
||||
The first time a reload is requested, an **eager snapshot** is taken of every registered tensor and its MoE siblings. This captures the original buffer handles, split descriptors, and strides. This snapshot is essential for:
|
||||
|
||||
* **Reattachment**: If a tensor was detached to a private buffer because it grew, but later shrinks back to its original size/type, it can be reattached to the original shared buffer, avoiding memory fragmentation.
|
||||
* **MoE consistency**: MoE layers often have three sibling tensors (`ffn_down_exps`, `ffn_up_exps`, `ffn_gate_exps`) that must share the same split topology across GPUs.
|
||||
|
||||
### 3. Detection Phase (`reload_changed_tensors`)
|
||||
When the user (or the server health-check loop) calls `llama_reload_changed_tensors()`:
|
||||
|
||||
1. It iterates over the registry and `stat()`s each source file.
|
||||
2. If `mtime` (or `mtime_ns`) differs, it re-parses the GGUF header (`gguf_find_tensor_meta`) to get the new `offset`, `nbytes`, `ggml_type`, and on-disk shape (`ne`).
|
||||
3. **Shape verification**: If the on-disk dimensions differ from the model tensor (`file_ne[i] != tensor->ne[i]`), the tensor is skipped entirely; the reload logic refuses to change logical shapes.
|
||||
4. It builds a **sorted job list**: tensors that are **returning to their original snapshot** are processed first. This maximizes the chance of freeing private buffers before allocating new ones, reducing memory pressure.
|
||||
|
||||
### 4. Reload Phase (`reload_tensor`)
|
||||
For each changed tensor, the patch performs a careful in-place update.
|
||||
|
||||
#### 0. Shape Verification
|
||||
Before any metadata or buffer changes, the code verifies that the on-disk `ne[0..3]` exactly match the current model tensor. If any dimension differs, the reload is aborted with a log message and the tensor is left untouched.
|
||||
|
||||
#### A. Returning Check
|
||||
The first decision is whether the tensor's new on-disk type matches its **original** snapshot type (`curr_type == src.original_type`).
|
||||
|
||||
* **Returning to original**: The tensor is reattached to its original shared buffer and original split descriptors. Any private buffer allocated during a previous reload is freed (only if the tensor's state is `DETACHED` or `FALLBACK_CPU`). State becomes `ON_ORIGINAL`.
|
||||
* **Changed**: Proceed to metadata update and buffer reallocation.
|
||||
|
||||
#### B. Metadata Update & Block-Size Alignment
|
||||
If the tensor’s `ggml_type` changed (e.g., Q4_X → IQ1_KT), the main tensor descriptor and all its split descriptors are updated with new `type` and `nb` values. The logical shape (`ne`) is guaranteed unchanged by the preceding shape verification. However, for fused/multi-GPU splits the per-device boundaries must be recalculated.
|
||||
|
||||
**Critical constraint for fused/multi-GPU splits:**
|
||||
Different quants use different block sizes:
|
||||
* **Q4_X / Q4_0**: block size **32**
|
||||
* **IQ1_KT**: block size **256**
|
||||
|
||||
When a tensor changes between these types, `apply_tensor_type_change()` re-rounds every GPU slice’s `ne[0]` to the nearest multiple of the new block size. If this redistribution is not propagated to all siblings in the same MoE layer, the CUDA split backend dispatches rows to the wrong devices and **matmul fails**.
|
||||
|
||||
#### C. Buffer Lifecycle
|
||||
The patch tracks each tensor with a `reload_state` enum (`UNINITIALIZED`, `ON_ORIGINAL`, `DETACHED`, `FALLBACK_CPU`). Buffers are only freed if the state is not `ON_ORIGINAL`, ensuring shared original buffers are never corrupted.
|
||||
|
||||
| Scenario | Action |
|
||||
|----------|--------|
|
||||
| Returning to original snapshot | **Reattach** to `original_buffer`, restore original splits, free old private buffer if any. |
|
||||
| Changed type/size while previously on original | **Detach** from the shared buffer to a newly allocated private buffer so the shared region isn’t corrupted for other tensors. |
|
||||
| Changed type/size while already detached | Free old private buffer, allocate new one. |
|
||||
| Allocation fails on target backend | **CPU fallback**: allocate on `ggml_backend_cpu_buffer_type()` and clear split metadata. State becomes `FALLBACK_CPU`. |
|
||||
|
||||
#### D. Split Tensor (Multi-GPU) Handling
|
||||
For split tensors, the patch:
|
||||
- Recomputes per-device bounds using the new block-size alignment.
|
||||
- Reallocates per-device split buffers if necessary.
|
||||
- **Resyncs MoE siblings**: If `ffn_down_exps` changes its split topology, `ffn_up_exps` and `ffn_gate_exps` in the same layer are forced to adopt identical per-device `ne[0]` distributions and strides. This is required by the CUDA split-backend contract.
|
||||
|
||||
#### E. Data Copy
|
||||
Finally, the tensor bytes are read from the updated file and copied into the (possibly new) backend buffer via `ggml_backend_tensor_set`.
|
||||
|
||||
---
|
||||
|
||||
## Hybrid CPU/GPU Inference
|
||||
|
||||
When running with `--split-mode layer --fit --gpu-layers 99` (or any configuration where the model does not fully fit in VRAM), some tensors naturally land in CPU memory. The hot-swap system fully supports this:
|
||||
|
||||
* **CPU tensors are reloadable**: The reload logic reads the new data from disk and copies it into the CPU backend buffer exactly as it would for CUDA buffers.
|
||||
* **Fallback allocator**: If a GPU buffer allocation fails during a reload (e.g., because an IQ1_KT expert is larger than the original Q4_X expert), the system automatically falls back to a CPU buffer for that tensor.
|
||||
|
||||
This allows you to keep, for example, 90 % of an MoE model on 13 GPUs while a few large expert tensors cycle through CPU RAM, or to benchmark quants that vary in size per-expert without worrying about exact VRAM fitting.
|
||||
|
||||
---
|
||||
|
||||
## API & Environment Variables
|
||||
|
||||
### Public C API
|
||||
```cpp
|
||||
// include/llama.h
|
||||
LLAMA_API bool llama_reload_changed_tensors(struct llama_context * ctx);
|
||||
```
|
||||
|
||||
Returns `true` if at least one tensor was reloaded. When this happens, the function also resets the context’s cached compute graphs (`ctx->prev` and `ctx->prev_mtp`) so that the next evaluation performs a full graph rebuild with the new tensor pointers.
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Purpose |
|
||||
|----------|---------|
|
||||
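| `LLAMA_HOTSWAP_ENABLED` | Enables the hot-swap loop in `perplexity` and the health-check hook in `server`. `perplexity` only tests whether the variable is set, so any value (including `0` or an empty string) enables it. |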
|
||||
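| `LLAMA_PERPLEXITY_PRE_RELOAD_SCRIPT` | Path to an executable script (run through `popen()`, so any shell command works) that is executed after each perplexity iteration, just before the reload check (e.g., to regenerate/re-quantize a tensor file). Its standard output is echoed to stderr with a `[pre-reload]` prefix. |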
|
||||
|
||||
---
|
||||
|
||||
## Integration Points
|
||||
|
||||
### `examples/perplexity/perplexity.cpp`
|
||||
When `LLAMA_HOTSWAP_ENABLED` is set, the tool runs in a loop:
|
||||
|
||||
1. Perform an initial `llama_reload_changed_tensors()` to apply any pending changes before the first evaluation.
|
||||
2. Compute perplexity (or Hellaswag, etc.).
|
||||
3. Print timings and write logs.
|
||||
4. Execute the optional pre-reload script.
|
||||
5. Call `llama_reload_changed_tensors(ctx)`. If no tensors changed, exit; otherwise repeat from step 2.
|
||||
|
||||
### `examples/server/server.cpp`
|
||||
On every health-check (`/health`) request, if `LLAMA_HOTSWAP_ENABLED` is set, the server calls `llama_reload_changed_tensors()`. This provides a convenient, external trigger: simply `touch` or overwrite a tensor’s source GGUF file and poll `/health` to apply the change.
|
||||
|
||||
---
|
||||
|
||||
## MoE Sibling Resync
|
||||
|
||||
MoE weights are often stored as three separate tensors that must be split identically across GPUs. The patch automatically detects these families by suffix:
|
||||
|
||||
- `.ffn_down_exps.weight`
|
||||
- `.ffn_up_exps.weight`
|
||||
- `.ffn_gate_exps.weight`
|
||||
|
||||
When one member of the family is reloaded and its per-device split dimensions change—especially when crossing quant types with different block sizes (Q4_X=32 vs IQ1_KT=256)—`resync_moe_sibling_splits()` is invoked. The logic follows these steps:
|
||||
|
||||
1. **Fast path**: If the reference tensor is returning to its original snapshot, the siblings are also reattached to their original snapshots via `reattach_split_tensor_to_shared()`—no data movement is required.
|
||||
2. **Phase A – Detach**: Siblings are detached from shared buffers (freeing only non-original buffers) and new main handles are allocated. Split tensors receive a dummy `data` pointer because the split backend uses `extra->splits`.
|
||||
3. **Phase B – Propagate dimensions**: The reference tensor’s per-device `ne[0]` distribution is copied to the siblings, and strides (`nb[]`) are recomputed using a temporary `ggml_context`. This step is mandatory because the valid split boundaries depend on the quantization block size.
|
||||
4. **Phase C – Allocate GPU splits**: New per-device GPU buffers are allocated for each sibling split.
|
||||
5. **Phase D – CPU fallback (if needed)**: If any GPU allocation fails, the **entire** sibling group is moved to CPU buffers to maintain consistency.
|
||||
6. **Phase E – Write back**: The original sibling data (which has not changed, only the layout) is written back into the new buffers via `ggml_backend_tensor_set`.
|
||||
|
||||
---
|
||||
|
||||
## Buffer Lifecycle Details
|
||||
|
||||
### Reattachment to Shared Buffers
|
||||
If a tensor was originally loaded in a large shared GGUF buffer alongside other tensors, and it was previously detached because it grew, the patch attempts to **reattach** it when it returns to its original size and type. This is done by restoring:
|
||||
|
||||
- `tensor->buffer = original_buffer`
|
||||
- `tensor->data = original_data`
|
||||
- `tensor->extra = original_extra` (restoring all split descriptors)
|
||||
|
||||
This prevents unbounded memory growth during iterative experiments where tensors oscillate between two states.
|
||||
|
||||
### State Machine
|
||||
Because `ggml` does not provide native reference counting on buffers, the patch uses a per-tensor state machine to avoid corrupting shared allocations:
|
||||
|
||||
* `ON_ORIGINAL`: The tensor still lives in its initial shared buffer. This buffer is **never** freed during reload.
|
||||
* `DETACHED`: The tensor was moved to a privately allocated buffer. This buffer **is** freed before the next reload.
|
||||
* `FALLBACK_CPU`: The tensor was moved to CPU memory after a GPU allocation failure.
|
||||
|
||||
Only buffers belonging to tensors in the `DETACHED` or `FALLBACK_CPU` states are released, ensuring that shared original buffers remain valid for all other tensors that still reference them.
|
||||
|
||||
---
|
||||
|
||||
## Limitations & Safety Notes
|
||||
|
||||
1. **File path stability**: The source file must remain at the same path. Renaming or removing shards will cause `stat()` or `open()` to fail.
|
||||
2. **No locking**: There is no file-locking protocol. The user must ensure the GGUF file is not being written to while `ik_llama.cpp` is reading it.
|
||||
3. **Graph rebuild cost**: While cheaper than a full process restart, rebuilding the CUDA graph (or CPU graph) incurs a one-time latency spike after a reload.
|
||||
4. **Platform specifics**: Nanosecond mtime checks use `st_mtim.tv_nsec` and are guarded by `#ifdef __linux__`.
|
||||
5. **Thread safety**: `llama_reload_changed_tensors` is **not** thread-safe with active inference. Ensure the context is idle before calling (the perplexity example naturally guarantees this; the server example only invokes it during the synchronous `/health` handler).
|
||||
|
||||
---
|
||||
|
||||
## Usage Example: Per-Expert Quantization Sweep (Q4_X ↔ IQ1_KT)
|
||||
|
||||
This example benchmarks a massive MoE model where the base weights are **Q4_X**. The tool iteratively replaces individual `ffn_down_exps.weight` tensors with **IQ1_KT** equivalents to measure the isolated perplexity impact of each expert's quantization level.
|
||||
|
||||
A sanity check is embedded in the source directory: one of the "IQ1_KT" shard files is actually the original **Q4_X** tensor. When the rotation reaches that slot, the reloaded tensor is byte-for-byte identical to the baseline, so the PPL must match exactly—confirming that the hot-swap machinery introduces no loss.
|
||||
|
||||
### 1. Helper script (`tensor-swap.sh`)
|
||||
Place the rotation script in your model directory (e.g., `/opt/THIREUS/Kimi-K2.6/Q4_X/`). It maintains `.bak` files so that each iteration restores the previous tensor before installing the next candidate.
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
TARGET_GLOB="*Q4_X*gguf"
|
||||
SOURCE_DIR="../smol-IQ1-KT-mist.bin"
|
||||
TENSOR_NAME_PATTERN="blk\.[0-9]+\.ffn_down_exps\.weight"
|
||||
|
||||
# ... (see full script in patch) ...
|
||||
```
|
||||
|
||||
The script scans for target files matching `*Q4_X*gguf` containing `blk.[N].ffn_down_exps.weight`, then pulls replacements from `../smol-IQ1-KT-mist.bin/` by matching the `SPECIAL_TENSOR-NNNN-of-XXXX.gguf` shard number.
|
||||
|
||||
### 2. Launch perplexity with hot-swap enabled
|
||||
|
||||
```bash
|
||||
ulimit -n 9999
|
||||
ulimit -l unlimited
|
||||
|
||||
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7,8,9,10,11,12"
|
||||
export LLAMA_HOTSWAP_ENABLED=1
|
||||
export LLAMA_PERPLEXITY_PRE_RELOAD_SCRIPT=./tensor-swap.sh
|
||||
export LLAMA_DEBUG=1
|
||||
|
||||
# --offload-policy -1,off \
|
||||
|
||||
GGML_CUDA_NO_PINNED=1 \
|
||||
/opt/ik_llama.cpp/ik_llama.cpp/build/bin/llama-perplexity \
|
||||
--chunks 8 \
|
||||
-f /opt/ik_llama.cpp/wiki.test.raw \
|
||||
--model /opt/THIREUS/Kimi-K2.6/Q4_X/Kimi-K2.6-THIREUS-Q4_X-SPECIAL_TENSOR-00001-of-01097.gguf \
|
||||
--alias THIREUS/Kimi-K2.6-Q4_X.bin \
|
||||
-b 512 -ub 512 \
|
||||
--ctx-size 512 \
|
||||
--fit \
|
||||
--fit-margin 4200 \
|
||||
--gpu-fit-margin 0,4400,12,4400 \
|
||||
--temp 0.0 --top-k 0 --top-p 1.0 \
|
||||
-ctk f16 \
|
||||
-ctv q8_0 \
|
||||
-amb 128 \
|
||||
-mea 128 \
|
||||
-wgt 1 \
|
||||
--mlock \
|
||||
--split-mode layer \
|
||||
--graph-reduce-type f16 \
|
||||
--threads $(grep ^cpu\\scores /proc/cpuinfo | uniq | awk '{print $4}' | xargs -I{} echo "{}-0" | bc) \
|
||||
-sas \
|
||||
--gpu-layers 99 \
|
||||
--no-offload-only-active-experts \
|
||||
--host 0.0.0.0 \
|
||||
--port 8080 \
|
||||
--log-enable \
|
||||
--logdir /var/log/ \
|
||||
--jinja \
|
||||
--special \
|
||||
--prompt-cache "$HOME/.cache/ik_llama.cpp/prompt-cache.bin" --prompt-cache-all \
|
||||
--slot-save-path "$HOME/.cache/ik_llama.cpp/slot.bin" \
|
||||
--lookup-cache-dynamic "$HOME/.cache/ik_llama.cpp/slot.bin" \
|
||||
--keep -1 \
|
||||
--slot-prompt-similarity 0.35 \
|
||||
--metrics \
|
||||
-cuda fusion=1
|
||||
```
|
||||
|
||||
### 3. What happens
|
||||
|
||||
1. The model loads with **Q4_X** weights distributed across 13 GPUs using layer splitting.
|
||||
2. The first pass computes the baseline perplexity over 8 chunks.
|
||||
3. `tensor-swap.sh` runs between iterations:
|
||||
* Restores the previously swapped tensor from `.bak` to its original Q4_X state.
|
||||
* Copies the next IQ1_KT expert shard into place.
|
||||
4. `llama_reload_changed_tensors()` detects the `mtime` changes, re-parses the GGUF headers, and reloads the affected `ffn_down_exps.weight` tensor(s).
|
||||
* The restored tensor **returns to its original Q4_X snapshot** and reattaches to its shared buffer.
|
||||
* The newly swapped tensor is loaded into a private buffer with the new IQ1_KT data.
|
||||
* Because Q4_X and IQ1_KT have different block sizes (32 vs 256), the split backend redistributes per-device boundaries and resyncs the MoE siblings (`ffn_up_exps` and `ffn_gate_exps`) to the same layout.
|
||||
5. The CUDA graphs are invalidated and the next perplexity iteration begins.
|
||||
6. When the rotation hits the sanity-check slot (where the source file is actually the original Q4_X tensor), the perplexity returns to the exact baseline value, confirming the reload is lossless.
|
||||
|
||||
### 4. Expected behavior
|
||||
|
||||
```text
|
||||
snapshot_all_reload_tensors: eager snapshot of all reload tensors + siblings
|
||||
perplexity: calculating perplexity over 8 chunks, n_ctx=512, batch_size=512, n_seq=1
|
||||
[1]1.0622,[2]1.2068,[3]1.2327,[4]1.1873,[5]1.1487,[6]1.1283,[7]1.1214,[8]1.1109,
|
||||
Final estimate: PPL = 1.1109
|
||||
|
||||
main: executing pre-reload script: ./tensor-swap.sh
|
||||
main: [pre-reload] Swapped index 0 (tensor #00918)
|
||||
reloaded tensor 'blk.1.ffn_down_exps.weight'
|
||||
|
||||
perplexity: calculating perplexity over 8 chunks ...
|
||||
Final estimate: PPL = 1.1105
|
||||
|
||||
main: executing pre-reload script: ./tensor-swap.sh
|
||||
main: [pre-reload] Restored index 0. Advancing to index 1.
|
||||
main: [pre-reload] Swapped index 1 (tensor #00921)
|
||||
reloaded tensor 'blk.1.ffn_down_exps.weight'
|
||||
reloaded tensor 'blk.2.ffn_down_exps.weight'
|
||||
|
||||
perplexity: calculating perplexity over 8 chunks ...
|
||||
Final estimate: PPL = 1.1080
|
||||
```
|
||||
|
||||
Notice that when the script restores a tensor to its original Q4_X shard, the reload reattaches it to the shared buffer with zero copy. When the sanity-check slot is reached, the PPL returns to the exact baseline, proving the mechanism is sound.
|
||||
|
||||
---
|
||||
|
||||
## Summary of Changed Files
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `examples/perplexity/perplexity.cpp` | Hot-swap loop + pre-reload script execution. |
|
||||
| `examples/server/server.cpp` | Trigger reload on `/health` when env var is set. |
|
||||
| `ggml/include/ggml-cuda.h` | Add `ggml_backend_cuda_invalidate_graphs()`. |
|
||||
| `ggml/include/ggml.h` | Conditional `GGML_MAX_SRC` override. |
|
||||
| `ggml/src/CMakeLists.txt` | Propagate `GGML_MAX_SRC` compile definition. |
|
||||
| `ggml/src/ggml-cuda.cu` | Implement graph invalidation; debug prints for split tensors. |
|
||||
| `ggml/src/ggml.c` | Debug print in `ggml_mul_mat_id` for shape mismatches. |
|
||||
| `include/llama.h` | Declare `llama_reload_changed_tensors()`. |
|
||||
| `src/llama-mmap.cpp/h` | Expose `llama_file::get_path()` so reload registry knows the source file path. |
|
||||
| `src/llama-model.h` | Add `std::unique_ptr<reload_info> reload` to `llama_model`. |
|
||||
| `src/llama-reload-info.h` | **New.** Defines `tensor_reload_source`, `reload_state`, and `reload_info` registry. |
|
||||
| `src/llama-reload.cpp` | **New.** Core implementation: GGUF header parser, snapshot, reload, MoE resync, buffer management, CPU fallback, shape verification. |
|
||||
| `src/llama.cpp` | Wire reload registry into `llama_model_load`; reset cached compute graphs (`ctx->prev` / `ctx->prev_mtp`) on reload; export C API. |
|
||||
| `src/CMakeLists.txt` | Propagate `GGML_MAX_SRC` compile definition. |
|
||||
88
docs/development/on-demand-tensor-reload.mmd
Normal file
88
docs/development/on-demand-tensor-reload.mmd
Normal file
@ -0,0 +1,88 @@
|
||||
graph TD
|
||||
START([Start]) --> ENV{LLAMA_HOTSWAP_ENABLED?}
|
||||
ENV -->|No| ENDD([End])
|
||||
ENV -->|Yes| LOAD[Registration Phase<br/>llama_model_load]
|
||||
|
||||
subgraph Load_Time [Load Time]
|
||||
LOAD --> REG[Populate model.reload->tensor_reload_sources<br/>path / offset / mtime / nbytes]
|
||||
end
|
||||
|
||||
REG --> CALL([User calls<br/>llama_reload_changed_tensors])
|
||||
|
||||
CALL --> SNAP{Snapshots<br/>done?}
|
||||
SNAP -->|No| EAGER[snapshot_all_reload_tensors<br/>Capture original_buffer / data / type / ne / nb<br/>Capture original_splits<br/>Discover MoE siblings via populate_moe_siblings]
|
||||
SNAP -->|Yes| DET
|
||||
|
||||
subgraph Detection [Detection Phase]
|
||||
DET[reload_changed_tensors] --> STAT[For each registered tensor:<br/>stat source file]
|
||||
STAT --> CHG{mtime / mtime_ns<br/>changed?}
|
||||
CHG -->|No| SKIP[Skip]
|
||||
CHG -->|Yes| META[gguf_find_tensor_meta<br/>Parse GGUF header only<br/>Get new offset / type / size / ne]
|
||||
META --> DIM{"model ne[i] == file ne[i]?"}
|
||||
DIM -->|No| SKIP2[Skip: dimension mismatch]
|
||||
DIM -->|Yes| JOB[Add to job list<br/>Mark returning = <br/>new_type == original_type]
|
||||
end
|
||||
|
||||
JOB --> SORT[Sort jobs<br/>Returning to original FIRST]
|
||||
|
||||
subgraph Per_Tensor_Reload [Per-Tensor Reload Loop]
|
||||
SORT --> LOOP[For each job:<br/>reload_tensor name]
|
||||
|
||||
LOOP --> RET{Returning to<br/>original?}
|
||||
|
||||
RET -->|Yes| OG_SPLIT{Is split tensor?<br/>tensor->extra != nullptr}
|
||||
OG_SPLIT -->|Yes| REATT_SP[reattach_split_tensor_to_shared<br/>Restore original_buffer / data / extra<br/>Restore original_splits<br/>Free old private buffers ONLY]
|
||||
OG_SPLIT -->|No| REATT_NS[Restore original_buffer / data<br/>Restore original_type / ne / nb]
|
||||
REATT_SP --> ST_ORIG[Set state = ON_ORIGINAL]
|
||||
REATT_NS --> ST_ORIG
|
||||
ST_ORIG --> MT[Update file mtime]
|
||||
|
||||
RET -->|No| TCHG{Type changed<br/>from snapshot?}
|
||||
TCHG -->|Yes| APPLY["apply_tensor_type_change<br/>Update tensor->type / nb[]<br/>If split & blck_size>1:<br/>Re-round per-device ne[0] to block multiples"]
|
||||
TCHG -->|No| KEEP[Keep current metadata]
|
||||
APPLY --> READ[Read new bytes from disk<br/>into host_buf]
|
||||
KEEP --> READ
|
||||
READ --> IS_SPLIT{Is split tensor?}
|
||||
|
||||
IS_SPLIT -->|Yes| SPATH[Split Path:<br/>reload_tensor_split_path]
|
||||
SPATH --> F_SP[Free old main & split buffers<br/>ONLY if state != ON_ORIGINAL]
|
||||
F_SP --> A_SP[Allocate new main buffer<br/>alloc_buffer_fallback<br/>GPU preferred, CPU fallback]
|
||||
A_SP --> AL_SP[ggml_backend_tensor_alloc]
|
||||
AL_SP --> C_SP["ggml_backend_tensor_set<br/>host_buf -> device"]
|
||||
C_SP --> SIB{Has MoE siblings<br/>in this layer?}
|
||||
SIB -->|Yes| RESYNC[resync_moe_sibling_splits]
|
||||
SIB -->|No| ST_DET1[Set state = DETACHED]
|
||||
|
||||
subgraph MoE_Resync [MoE Sibling Resync]
|
||||
RESYNC --> RRET{Is reference<br/>returning to original?}
|
||||
RRET -->|Yes| R_SIB[reattach_split_tensor_to_shared<br/>for each sibling<br/>Zero-copy restore]
|
||||
RRET -->|No| PHA[Phase A: Detach siblings<br/>Free old non-original buffers<br/>Alloc new main handles<br/>data = 0x1 dummy]
|
||||
PHA --> PHB["Phase B: Propagate ref dimensions<br/>to siblings<br/>Recompute nb[] via temp ggml_context"]
|
||||
PHB --> PHC[Phase C: Alloc per-device<br/>GPU split buffers]
|
||||
PHC --> PHF{Any GPU alloc<br/>failed?}
|
||||
PHF -->|Yes| PHD[Phase D: Move ENTIRE layer to CPU<br/>Free GPU splits<br/>Alloc CPU buffer<br/>State = FALLBACK_CPU]
|
||||
PHF -->|No| PHE[Phase E: ggml_backend_tensor_set<br/>Write sibling data back]
|
||||
PHD --> PHE
|
||||
PHE --> ST_DET1
|
||||
R_SIB --> ST_DET1
|
||||
end
|
||||
|
||||
IS_SPLIT -->|No| NSPATH[Non-Split Path:<br/>reload_tensor_non_split_path]
|
||||
NSPATH --> F_NS[Free old buffer<br/>ONLY if state != ON_ORIGINAL]
|
||||
F_NS --> A_NS[Allocate new buffer<br/>alloc_buffer_fallback]
|
||||
A_NS --> AL_NS[ggml_backend_tensor_alloc]
|
||||
AL_NS --> C_NS["ggml_backend_tensor_set<br/>host_buf -> device"]
|
||||
C_NS --> ST_DET2[Set state = DETACHED]
|
||||
ST_DET2 --> MT
|
||||
ST_DET1 --> MT
|
||||
end
|
||||
|
||||
MT --> MORE{More jobs?}
|
||||
MORE -->|Yes| LOOP
|
||||
MORE -->|No| RELOADED{Any tensor<br/>actually reloaded?}
|
||||
|
||||
RELOADED -->|No| ENDD
|
||||
RELOADED -->|Yes| INV[ggml_backend_cuda_invalidate_graphs<br/>Clear cuda_graphs on ALL devices]
|
||||
INV --> CTX["Reset cached compute graphs<br/>ctx->prev.reset()<br/>ctx->prev_mtp.reset()"]
|
||||
CTX --> REUSE[can_reuse_graph sees no cached graph<br/>Forces full graph rebuild<br/>on next eval]
|
||||
REUSE --> ENDD
|
||||
@ -66,7 +66,7 @@ struct pca_model {
|
||||
pca_model(struct ggml_tensor * t_input) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||
backend = ggml_backend_cuda_init(0, nullptr); // init device 0
|
||||
backend = ggml_backend_cuda_init(0, nullptr, nullptr); // init device 0
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||
}
|
||||
|
||||
@ -27,9 +27,38 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||
return str;
|
||||
}
|
||||
|
||||
// Reads one element of a tensor buffer and returns it converted to float.
//   data - raw tensor bytes
//   type - element type; only F16, F32, I32, I16 and I8 are handled
//   i    - BYTE offset of the element inside `data` (not an element index)
// Any other type ends in GGML_ABORT("fatal error").
static float ggml_get_float_value(const uint8_t * data, ggml_type type, size_t i) {
|
||||
if (type == GGML_TYPE_F16) {
|
||||
return ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
||||
} else if (type == GGML_TYPE_F32) {
|
||||
return *(const float *) &data[i];
|
||||
} else if (type == GGML_TYPE_I32) {
|
||||
return (float) *(const int32_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I16) {
|
||||
return (float) *(const int16_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
return (float) *(const int8_t *) &data[i];
|
||||
} else {
|
||||
// unsupported element type (e.g. a quantized type)
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
// Compute the FULL tensor sum first (debug aid — was previously summing
|
||||
// only the displayed first/last elements, which made it useless for
|
||||
// comparing two binaries on the same tensor).
|
||||
double sum = 0.0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
sum += ggml_get_float_value(data, type, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
printf(" [\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
@ -50,22 +79,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
float v;
|
||||
if (type == GGML_TYPE_F16) {
|
||||
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
|
||||
} else if (type == GGML_TYPE_F32) {
|
||||
v = *(float *) &data[i];
|
||||
} else if (type == GGML_TYPE_I32) {
|
||||
v = (float) *(int32_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I16) {
|
||||
v = (float) *(int16_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
v = (float) *(int8_t *) &data[i];
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
printf("%12.4f", v);
|
||||
sum += v;
|
||||
printf("%12.4f", ggml_get_float_value(data, type, i));
|
||||
if (i0 < ne[0] - 1) printf(", ");
|
||||
}
|
||||
printf("],\n");
|
||||
@ -73,7 +87,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||
printf(" ],\n");
|
||||
}
|
||||
printf(" ]\n");
|
||||
printf(" sum = %f\n", sum);
|
||||
printf(" sum = %f\n", (float) sum);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -783,12 +783,20 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
// If in the future, we somehow accidentally try to reencode an already-encoded chunk,
|
||||
// chunk->tokens_image will have been cleared out to save memory
|
||||
GGML_ASSERT(!chunk->tokens_image->batch_f32.entries.empty()
|
||||
&& "mtmd_encode_chunk: image data already released (double encode?)");
|
||||
return mtmd_encode(ctx, chunk->tokens_image.get());
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
// If in the future, we somehow accidentally try to reencode an already-encoded chunk,
|
||||
// chunk->tokens_audio will have been cleared out to save memory
|
||||
GGML_ASSERT(!chunk->tokens_audio->batch_f32.entries.empty()
|
||||
&& "mtmd_encode_chunk: audio data already released (double encode?)");
|
||||
int n_mmproj_embd = ctx->n_embd_text;
|
||||
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
@ -1042,6 +1050,19 @@ void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
|
||||
}
|
||||
}
|
||||
|
||||
// Drops the raw f32 batch held by a chunk's image and/or audio tokens by
// overwriting `batch_f32` with an empty clip_image_f32_batch. The token
// objects themselves are left in place. A null `chunk` is a no-op.
void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk) {
|
||||
if (!chunk) {
|
||||
return;
|
||||
}
|
||||
|
||||
// a chunk may carry image tokens, audio tokens, or neither; handle each independently
if (chunk->tokens_image) {
|
||||
chunk->tokens_image->batch_f32 = clip_image_f32_batch{};
|
||||
}
|
||||
if (chunk->tokens_audio) {
|
||||
chunk->tokens_audio->batch_f32 = clip_image_f32_batch{};
|
||||
}
|
||||
}
|
||||
|
||||
// mtmd_image_tokens
|
||||
|
||||
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
||||
@ -1114,63 +1135,10 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
static json mtmd_clip_image_f32_to_json(const clip_image_f32 & clip) {
|
||||
json j;
|
||||
j["nx"] = clip.nx;
|
||||
j["ny"] = clip.ny;
|
||||
j["buf"] = clip.buf;
|
||||
return j;
|
||||
}
|
||||
|
||||
static clip_image_f32 * mtmd_clip_image_f32_from_json(const json & j) {
|
||||
clip_image_f32 * clip = new clip_image_f32;
|
||||
clip->nx = j["nx"];
|
||||
clip->ny = j["ny"];
|
||||
clip->buf = j["buf"].get<std::vector<float>>();
|
||||
return clip;
|
||||
}
|
||||
|
||||
static json mtmd_clip_image_f32_batch_to_json(const clip_image_f32_batch & batch, bool full = false) {
|
||||
json j;
|
||||
j["is_audio"] = batch.is_audio;
|
||||
j["grid_x"] = batch.grid_x;
|
||||
j["grid_y"] = batch.grid_y;
|
||||
|
||||
if (full) {
|
||||
std::vector<nlohmann::json> entries;
|
||||
for (auto & entry : batch.entries) {
|
||||
entries.push_back(mtmd_clip_image_f32_to_json(*entry));
|
||||
}
|
||||
j["entries"] = entries;
|
||||
}
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
static clip_image_f32_batch mtmd_clip_image_f32_batch_from_json(const json & j, bool full = false) {
|
||||
clip_image_f32_batch batch;
|
||||
if (j.contains("is_audio")) {
|
||||
batch.is_audio = j["is_audio"];
|
||||
batch.grid_x = j["grid_x"];
|
||||
batch.grid_y = j["grid_y"];
|
||||
if (full) {
|
||||
auto entries = j["entries"];
|
||||
if (entries.is_array()) {
|
||||
for (auto & entry : entries) {
|
||||
clip_image_f32 * clip = mtmd_clip_image_f32_from_json(entry);
|
||||
batch.entries.push_back(clip_image_f32_ptr(clip));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return batch;
|
||||
}
|
||||
|
||||
static mtmd_audio_tokens mtmd_audio_tokens_from_json(json & j) {
|
||||
return mtmd_audio_tokens{
|
||||
j.value<uint32_t>("n_tokens", 0),
|
||||
mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
|
||||
clip_image_f32_batch {},
|
||||
j.value("id","")
|
||||
};
|
||||
}
|
||||
@ -1180,7 +1148,7 @@ static mtmd_image_tokens mtmd_image_tokens_from_json(json & j) {
|
||||
j.value<uint32_t>("nx", 0),
|
||||
j.value<uint32_t>("ny", 0),
|
||||
j.value("use_mrope_pos",false),
|
||||
mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
|
||||
clip_image_f32_batch {},
|
||||
j.value("id","")
|
||||
};
|
||||
}
|
||||
@ -1190,7 +1158,6 @@ static json mtmd_audio_tokens_to_json(mtmd_audio_tokens * chunk) {
|
||||
if (chunk) {
|
||||
j["n_tokens"] = chunk->n_tokens;
|
||||
j["id"] = chunk->id;
|
||||
j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
@ -1201,7 +1168,6 @@ static json mtmd_image_tokens_to_json(mtmd_image_tokens * chunk) {
|
||||
j["nx"] = chunk->nx;
|
||||
j["ny"] = chunk->ny;
|
||||
j["use_mrope_pos"] = chunk->use_mrope_pos;
|
||||
j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
|
||||
j["id"] = chunk->id;
|
||||
}
|
||||
return j;
|
||||
|
||||
@ -170,6 +170,9 @@ MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd
|
||||
MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
|
||||
MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
|
||||
|
||||
// Free the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
|
||||
// Provided for the benefit of llama-server as a stopgap to fix memory issues
|
||||
MTMD_API void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk);
|
||||
|
||||
// mtmd_image_tokens
|
||||
//
|
||||
|
||||
@ -20,11 +20,15 @@
|
||||
#include <array>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <cstdlib>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
// Public C API for hot-swap (defined in src/llama.cpp)
|
||||
extern "C" bool llama_reload_changed_tensors(struct llama_context * ctx);
|
||||
|
||||
struct results_perplexity {
|
||||
std::vector<llama_token> tokens;
|
||||
double ppl_value;
|
||||
@ -2056,21 +2060,62 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
struct results_perplexity results;
|
||||
if (params.hellaswag) {
|
||||
hellaswag_score(ctx, params);
|
||||
} else if (params.winogrande) {
|
||||
winogrande_score(ctx, params);
|
||||
} else if (params.multiple_choice) {
|
||||
multiple_choice_score(ctx, params);
|
||||
} else if (params.kl_divergence) {
|
||||
kl_divergence(ctx, params);
|
||||
} else {
|
||||
results = perplexity(ctx, params, n_ctx);
|
||||
const char * hotswap_env = std::getenv("LLAMA_HOTSWAP_ENABLED");
|
||||
const char * pre_script = std::getenv("LLAMA_PERPLEXITY_PRE_RELOAD_SCRIPT");
|
||||
|
||||
if (hotswap_env) {
|
||||
llama_reload_changed_tensors(ctx);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, results);
|
||||
while (true) {
|
||||
struct results_perplexity results;
|
||||
if (params.hellaswag) {
|
||||
hellaswag_score(ctx, params);
|
||||
} else if (params.winogrande) {
|
||||
winogrande_score(ctx, params);
|
||||
} else if (params.multiple_choice) {
|
||||
multiple_choice_score(ctx, params);
|
||||
} else if (params.kl_divergence) {
|
||||
kl_divergence(ctx, params);
|
||||
} else {
|
||||
results = perplexity(ctx, params, n_ctx);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, results);
|
||||
|
||||
if (pre_script) {
|
||||
fprintf(stderr, "%s: executing pre-reload script: %s\n", __func__, pre_script);
|
||||
#ifdef _WIN32
|
||||
FILE * fp = _popen(pre_script, "r");
|
||||
#else
|
||||
FILE * fp = popen(pre_script, "r");
|
||||
#endif
|
||||
if (fp) {
|
||||
char buf[256];
|
||||
while (fgets(buf, sizeof(buf), fp)) {
|
||||
size_t len = strlen(buf);
|
||||
if (len > 0 && buf[len-1] == '\n') buf[len-1] = '\0';
|
||||
fprintf(stderr, "%s: [pre-reload] %s\n", __func__, buf);
|
||||
}
|
||||
#ifdef _WIN32
|
||||
_pclose(fp);
|
||||
#else
|
||||
pclose(fp);
|
||||
#endif
|
||||
} else {
|
||||
fprintf(stderr, "%s: failed to execute pre-reload script: %s\n", __func__, pre_script);
|
||||
}
|
||||
}
|
||||
|
||||
if (hotswap_env) {
|
||||
if (!llama_reload_changed_tensors(ctx)) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
@ -1253,6 +1253,13 @@ const mtmd::input_chunk_ptr& server_tokens::find_chunk(size_t idx) const {
|
||||
throw std::runtime_error("Chunk not found, or idx is not the first token of a chunk");
|
||||
}
|
||||
|
||||
// Releases the raw media data of the chunk registered at token index `idx`.
// Looks `idx` up in map_idx_to_media and, when a non-null chunk is stored
// there, forwards it to mtmd_input_chunk_free_raw_data(). An index with no
// entry (or a null entry) is silently ignored.
void server_tokens::free_raw_media_data(size_t idx) {
|
||||
auto it = map_idx_to_media.find(idx);
|
||||
if (it != map_idx_to_media.end() && it->second) {
|
||||
mtmd_input_chunk_free_raw_data(it->second.get());
|
||||
}
|
||||
}
|
||||
|
||||
void server_tokens::push_back(llama_token tok) {
|
||||
if (tok == LLAMA_TOKEN_NULL) {
|
||||
throw std::runtime_error("Invalid token");
|
||||
|
||||
@ -404,6 +404,10 @@ public:
|
||||
|
||||
const mtmd::input_chunk_ptr& find_chunk(size_t idx) const;
|
||||
|
||||
// Manual free for the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
|
||||
// This data will never be read again after it has been encoded on the first turn in which the media is received.
|
||||
void free_raw_media_data(size_t idx);
|
||||
|
||||
void push_back(llama_token tok);
|
||||
|
||||
// will create a copy of the chunk if it contains non-text data
|
||||
|
||||
@ -3594,6 +3594,51 @@ void server_context::apply_checkpoint(server_slot & slot) {
|
||||
}
|
||||
}
|
||||
|
||||
// Chooses which checkpoint in `ckpts` to evict so that the remaining
// checkpoints stay as evenly spaced (by pos_max) as possible.
//   slot  - not used by the body
//   ckpts - checkpoint list; positions are read in list order via pos_max
// Returns an iterator into `ckpts`:
//   - fewer than 3 checkpoints: begin()
//   - exactly 3 checkpoints:    the middle one
//   - otherwise: the interior checkpoint (never first or last) whose two
//     adjacent gaps have the smallest product; ties keep the earliest one.
static std::list<server_prompt_checkpoint>::iterator evict_checkpoint_by_variance(server_slot & slot, std::list<server_prompt_checkpoint> & ckpts) {
|
||||
auto it = ckpts.begin();
|
||||
if (ckpts.size() < 3) {
|
||||
return it;
|
||||
} else if (ckpts.size() == 3) {
|
||||
std::advance(it, 1);
|
||||
return it;
|
||||
}
|
||||
// collect the end position of every checkpoint, in list order
std::vector<int64_t> tokens;
|
||||
tokens.reserve(ckpts.size());
|
||||
for (const auto & ckpt : ckpts) {
|
||||
tokens.push_back(int64_t(ckpt.pos_max));
|
||||
}
|
||||
// Remove the checkpoint that makes the distribution most even after removal.
|
||||
// For each interior checkpoint, consider the variance of the gaps that would result
|
||||
// if it were removed. Pick the one with the lowest variance (most uniform spacing).
|
||||
size_t best_idx = 1;
|
||||
const size_t n = tokens.size();
|
||||
const size_t start = 1; // never remove the first
|
||||
const size_t end = n - 1; // never remove the last
|
||||
double max_pos = tokens[n - 1];
|
||||
// A nested loop that recomputes the gap variance for every candidate is unnecessary:
|
||||
// it suffices to find the interior checkpoint with the smallest product of its two adjacent gaps.
|
||||
// Why:
|
||||
// Let the gaps between consecutive checkpoints be x_1, x_2, ..., x_(n-1).
|
||||
// The first and last checkpoints are never removed, so the total span (and hence the mean gap) is the same for every candidate.
|
||||
// Removing checkpoint i merges gaps x_i and x_(i+1) into a single gap x_i + x_(i+1), so the sum of squared gaps becomes:
|
||||
// x_1^2 + ... + x_(n-1)^2 + 2*x_i*x_(i+1); the variance is that sum over the (fixed) gap count, minus mean^2.
|
||||
// Minimizing the variance is therefore the same as finding min { x_i*x_(i+1) }.
|
||||
// The product is divided by max_pos (the last checkpoint's position); this common
// factor does not change which candidate is smallest — presumably it only keeps the
// magnitude small (NOTE(review): confirm intent).
double diff = (tokens[start] - tokens[start - 1]);
|
||||
double diff2 = (tokens[start + 1] - tokens[start]);
|
||||
double best_variance = diff * (diff2 / max_pos);
|
||||
for (size_t i = start+1; i < end; i++) {
|
||||
diff = tokens[i] - tokens[i - 1];
|
||||
diff2 = tokens[i + 1] - tokens[i];
|
||||
double variance = diff * (diff2 / max_pos);
|
||||
// strict '<' keeps the earliest candidate on ties
|
||||
if (variance < best_variance) {
|
||||
best_variance = variance;
|
||||
best_idx = i;
|
||||
}
|
||||
}
|
||||
std::advance(it, best_idx);
|
||||
return it;
|
||||
}
|
||||
|
||||
bool server_context::create_checkpoint(server_slot & slot) {
|
||||
bool do_checkpoint = !slot.image_just_processed;
|
||||
int32_t pos_min = llama_kv_cache_seq_pos_min(slot.ctx, slot.id);
|
||||
@ -3609,12 +3654,15 @@ bool server_context::create_checkpoint(server_slot & slot) {
|
||||
const int64_t t_start = ggml_time_us();
|
||||
while (slot.server_cached_prompt.checkpoints.size() >= (size_t)params_base.ctx_checkpoints_n) {
|
||||
// make room for the new checkpoint, if needed
|
||||
const auto & cur = slot.server_cached_prompt.checkpoints.front();
|
||||
|
||||
auto it = slot.server_cached_prompt.checkpoints.begin();
|
||||
if (params_base.ctx_checkpoint_eviction == COMMON_CHECKPOINT_EVICTION_VARIANCE ||
|
||||
params_base.ctx_checkpoint_eviction == COMMON_CHECKPOINT_EVICTION_AUTO) {
|
||||
it = evict_checkpoint_by_variance(slot, slot.server_cached_prompt.checkpoints);
|
||||
}
|
||||
const auto & cur = *it;
|
||||
SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
cur.pos_min, cur.pos_max, cur.n_tokens, (float)cur.data.size() / 1024 / 1024);
|
||||
|
||||
slot.server_cached_prompt.checkpoints.erase(slot.server_cached_prompt.checkpoints.begin());
|
||||
slot.server_cached_prompt.checkpoints.erase(it);
|
||||
}
|
||||
|
||||
auto & cur = slot.server_cached_prompt.checkpoints.emplace_back();
|
||||
@ -3898,6 +3946,7 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
|
||||
|
||||
// add the image chunk to cache
|
||||
{
|
||||
slot.prompt_tokens.free_raw_media_data(slot.n_past_prompt);
|
||||
const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past_prompt);
|
||||
slot.cache_tokens.push_back(chunk.get()); // copy
|
||||
}
|
||||
|
||||
@ -763,6 +763,14 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
res.set_content(health.dump(), "application/json");
|
||||
|
||||
const char * hotswap_env = std::getenv("LLAMA_HOTSWAP_ENABLED");
|
||||
if (hotswap_env) {
|
||||
// WARNING: llama_reload_changed_tensors is NOT thread-safe with active inference.
|
||||
// Only enable this when you can guarantee the server is idle during health checks.
|
||||
llama_reload_changed_tensors(ctx_server.ctx);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case SERVER_STATE_LOADING_MODEL:
|
||||
|
||||
@ -21,7 +21,7 @@ extern "C" {
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params);
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, const void * params, const void * model);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
|
||||
@ -41,7 +41,9 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
|
||||
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||
|
||||
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_invalidate_graphs(const void * model);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -240,7 +240,10 @@
|
||||
// if you need to load more than 64 model shards.
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#endif
|
||||
#ifndef GGML_MAX_SRC
|
||||
// For the machines with 11+ GPUs use -DGGML_MAX_SRC=N
|
||||
#define GGML_MAX_SRC 12
|
||||
#endif
|
||||
#ifndef GGML_MAX_NAME
|
||||
#define GGML_MAX_NAME 64
|
||||
#endif
|
||||
|
||||
@ -6,6 +6,9 @@ add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
|
||||
if (GGML_MAX_CONTEXTS)
|
||||
add_compile_definitions(GGML_MAX_CONTEXTS=${GGML_MAX_CONTEXTS})
|
||||
endif()
|
||||
if (GGML_MAX_SRC)
|
||||
add_compile_definitions(GGML_MAX_SRC=${GGML_MAX_SRC})
|
||||
endif()
|
||||
|
||||
# enable libstdc++ assertions for debug builds
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
|
||||
@ -94,7 +94,7 @@ static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char
|
||||
ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
|
||||
void * ggml_cuda_log_user_data = NULL;
|
||||
|
||||
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||
GGML_CALL void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||
ggml_cuda_log_callback = log_callback;
|
||||
ggml_cuda_log_user_data = user_data;
|
||||
}
|
||||
@ -204,14 +204,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
|
||||
int64_t total_vram = 0;
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
||||
#else
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ (instead of CUBLAS): yes\n", __func__);
|
||||
#endif // GGML_CUDA_FORCE_MMQ
|
||||
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
|
||||
#else
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
|
||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS (Instead of MMQ): yes\n", __func__);
|
||||
#endif // GGML_CUDA_FORCE_CUBLAS
|
||||
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
@ -301,6 +297,25 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
return info;
|
||||
}
|
||||
|
||||
/* ---------- hot-swap: invalidate all cached CUDA graphs ---------- */
|
||||
extern "C" void ggml_backend_cuda_invalidate_graphs(const void * model) {
|
||||
auto & info = const_cast<ggml_cuda_device_info &>(ggml_cuda_info());
|
||||
if (auto it = info.all_ctx.find(model); it != info.all_ctx.end()) {
|
||||
for (auto ctx : it->second) {
|
||||
if (ctx) {
|
||||
ctx->cuda_graphs.clear();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "================================= %s: did not find entry for model at %p\n", __func__, model);
|
||||
}
|
||||
//for (int i = 0; i < info.device_count; ++i) {
|
||||
// if (info.all_ctx[i]) {
|
||||
// info.all_ctx[i]->cuda_graphs.clear();
|
||||
// }
|
||||
//}
|
||||
}
|
||||
|
||||
// #define DEBUG_CUDA_MALLOC
|
||||
|
||||
// buffer pool for cuda (legacy)
|
||||
@ -511,13 +526,14 @@ static std::condition_variable ggml_cuda_lock_cv;
|
||||
//static std::atomic<int> ggml_cuda_lock_counter;
|
||||
static int ggml_cuda_lock_counter = 0;
|
||||
|
||||
ggml_backend_cuda_context::ggml_backend_cuda_context(int device) :
|
||||
device(device), name(GGML_CUDA_NAME + std::to_string(device)) {
|
||||
ggml_backend_cuda_context::ggml_backend_cuda_context(int device, const void * model) :
|
||||
device(device), name(GGML_CUDA_NAME + std::to_string(device)), model(model) {
|
||||
auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
|
||||
if (info->all_ctx[device]) {
|
||||
auto & all_ctx = info->all_ctx[model];
|
||||
if (all_ctx[device]) {
|
||||
GGML_CUDA_LOG_WARN("%s: a context for device %d already exists?\n", __func__, device);
|
||||
} else{
|
||||
info->all_ctx[device] = this;
|
||||
all_ctx[device] = this;
|
||||
}
|
||||
}
|
||||
|
||||
@ -549,9 +565,12 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
|
||||
}
|
||||
}
|
||||
auto info = const_cast<ggml_cuda_device_info*>(&ggml_cuda_info());
|
||||
if (info->all_ctx[device] == this) {
|
||||
info->all_ctx[device] = nullptr;
|
||||
if (auto it = info->all_ctx.find(model); it != info->all_ctx.end() && it->second[device] == this) {
|
||||
it->second[device] = nullptr;
|
||||
}
|
||||
//if (info->all_ctx[device] == this) {
|
||||
// info->all_ctx[device] = nullptr;
|
||||
//}
|
||||
|
||||
}
|
||||
|
||||
@ -847,6 +866,9 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor([[maybe_unused]
|
||||
}
|
||||
//printf(" allocated %zu bytes for tensor %s of type %s, dim = %ld x %ld x %ld. padding: %zu\n", padded_size, split->name, ggml_type_name(split->type),
|
||||
// split->ne[0], split->ne[1], split->ne[2], padded_size - size);
|
||||
//printf("DEBUG init_tensor: dev=%d split_ne0=%ld type=%s ggml_nbytes=%zu padded=%zu data_ptr=%p\n",
|
||||
// i, (long)ne0, ggml_type_name(split->type), size, padded_size, (void*)buf);
|
||||
//fflush(stdout);
|
||||
split->data = buf;
|
||||
auto ctx = new ggml_backend_cuda_buffer_context(i, buf);
|
||||
auto buft = ggml_backend_cuda_buffer_type(i);
|
||||
@ -1054,6 +1076,12 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
|
||||
memcpy(dst + tt.row_meta_size*n_interleave, src + source_offset, n_interleave*(split_row_size - tt.row_meta_size));
|
||||
}
|
||||
}
|
||||
//printf("DEBUG set_tensor: dev=%d split_ne0=%ld nrows=%d split_row_size=%zu total=%zu "
|
||||
// "split_data=%p host_data=%p host_capacity=%zu source_offset=%zu\n",
|
||||
// i, (long)split->ne[0], nrows, split_row_size, nrows*split_row_size,
|
||||
// (void*)split->data, (void*)host_buffer.data(), host_buffer.size(),
|
||||
// (size_t)source_offset);
|
||||
//fflush(stdout);
|
||||
CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread));
|
||||
ne += split->ne[0];
|
||||
}
|
||||
@ -1144,7 +1172,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
|
||||
}
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0);
|
||||
fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
@ -1409,17 +1437,16 @@ static void * ggml_cuda_host_malloc(size_t size) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Whether to request the kernel to attempt to defragment memory to back the region with 2M hugepages.
|
||||
// Otherwise dependent on kernel settings:
|
||||
// * enabled="always": Hand over whatever 2M pages it has on hand and the rest will be 4k
|
||||
// * enabled="madvise": 4k pages
|
||||
// * enabled="never": 4k pages
|
||||
// Potluck on performance. If there's not much defragmentation to do, then you win. Otherwise come back in an hour.
|
||||
#if 0
|
||||
#ifdef MADV_HUGEPAGE
|
||||
madvise(ptr, size, MADV_HUGEPAGE);
|
||||
#endif
|
||||
#endif
|
||||
// Whether to request the kernel to attempt to defragment memory to back the region with 2M hugepages.
|
||||
// Otherwise dependent on kernel settings:
|
||||
// * enabled="always": Hand over whatever 2M pages it has on hand and the rest will be 4k
|
||||
// * enabled="madvise": 4k pages
|
||||
// * enabled="never": 4k pages
|
||||
// Potluck on performance. If there's not much defragmentation to do, then you win. Otherwise come back in an hour.
|
||||
// Defaults to disabled unless GGML_CUDA_HOST_MALLOC_THP is set.
|
||||
if (getenv("GGML_CUDA_HOST_MALLOC_THP") != nullptr) {
|
||||
madvise(ptr, size, MADV_HUGEPAGE);
|
||||
}
|
||||
|
||||
// prefault the whole region. If the kernel knows how to do this then let it do so.
|
||||
// Might be worth spawning threads to speed up this process on huge allocations.
|
||||
@ -1427,8 +1454,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
|
||||
#ifdef MADV_POPULATE_WRITE
|
||||
needs_manual_prefault = madvise(ptr, size, MADV_POPULATE_WRITE);
|
||||
#endif
|
||||
if (needs_manual_prefault)
|
||||
{
|
||||
if (needs_manual_prefault) {
|
||||
char * p = (char *) ptr;
|
||||
for (size_t off = 0; off < size; off += 4096) {
|
||||
p[off] = 0;
|
||||
@ -3208,7 +3234,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
if (dst->src[5]) {
|
||||
ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data,
|
||||
(float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
|
||||
dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
|
||||
dst_row.nb[1], dst_row.nb[2], dst->src[5]->nb[1], ids->nb[1], stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
@ -3554,7 +3580,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
ggml_cuda_op_mul_mat_q(ctx, src0_2, src1, dst, (const char *)src0_2->data, nullptr, src1_quantized.get(), (float *)dst->data,
|
||||
0, src0_1->ne[1], src1->ne[1], ne10_padded, stream);
|
||||
0, src0_2->ne[1], src1->ne[1], ne10_padded, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
} else {
|
||||
auto local_dst = *dst;
|
||||
@ -4234,23 +4260,8 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
|
||||
needs_f16_f32_copy = true;
|
||||
|
||||
} else {
|
||||
#ifdef GGML_USE_NCCL__
|
||||
auto & info = ggml_cuda_info();
|
||||
auto nbytes = ggml_nbytes(src);
|
||||
ncclGroupStart();
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
auto status1 = ncclSend(src->data, nbytes, ncclUint8, cuda_ctx_dst->device, info.nccl_coms[cuda_ctx_src->device],
|
||||
info.all_ctx[cuda_ctx_src->device]->stream());
|
||||
ggml_cuda_set_device(cuda_ctx_dst->device);
|
||||
auto status2 = ncclRecv(dst->data, nbytes, ncclUint8, cuda_ctx_src->device, info.nccl_coms[cuda_ctx_dst->device],
|
||||
info.all_ctx[cuda_ctx_dst->device]->stream());
|
||||
ncclGroupEnd();
|
||||
GGML_ASSERT(status1 == ncclSuccess && status2 == ncclSuccess);
|
||||
return true;
|
||||
#else
|
||||
ggml_cuda_set_device(cuda_ctx_src->device);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -5236,13 +5247,13 @@ static cuda_params ggml_cuda_parse_params(const char * params_string) {
|
||||
return params;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string) {
|
||||
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device, [[maybe_unused]] const void * param_string, const void * model) {
|
||||
if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
|
||||
GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
|
||||
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device, model);
|
||||
if (ctx == nullptr) {
|
||||
GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
|
||||
return nullptr;
|
||||
@ -5357,7 +5368,7 @@ GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
||||
|
||||
// backend registry
|
||||
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr);
|
||||
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data, nullptr, nullptr);
|
||||
return cuda_backend;
|
||||
|
||||
GGML_UNUSED(params);
|
||||
|
||||
@ -762,7 +762,8 @@ struct ggml_cuda_device_info {
|
||||
|
||||
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
||||
std::unordered_map<const void *, std::array<ggml_backend_cuda_context *, GGML_CUDA_MAX_DEVICES>> all_ctx;
|
||||
//ggml_backend_cuda_context * all_ctx[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
||||
#ifdef GGML_USE_NCCL
|
||||
ncclComm_t nccl_coms[GGML_CUDA_MAX_DEVICES];
|
||||
bool have_nccl;
|
||||
@ -864,10 +865,11 @@ struct ggml_backend_cuda_context {
|
||||
|
||||
#endif
|
||||
|
||||
const void * model;
|
||||
void * copy_buffer = nullptr;
|
||||
size_t copy_size = 0;
|
||||
|
||||
explicit ggml_backend_cuda_context(int device);
|
||||
explicit ggml_backend_cuda_context(int device, const void * model);
|
||||
|
||||
~ggml_backend_cuda_context();
|
||||
|
||||
|
||||
@ -229,7 +229,7 @@ static __device__ void quantize_f32_q6_0_block(const float * __restrict__ xi, bl
|
||||
|
||||
for (int j = 0; j < QK6_0/2; ++j) {
|
||||
const float x0 = xi[0 + j]*id;
|
||||
const float x1 = xi[QK4_0/2 + j]*id;
|
||||
const float x1 = xi[QK6_0/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
|
||||
const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));
|
||||
|
||||
@ -77,7 +77,7 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v
|
||||
const int ncols, int nrows, int64_t row_size) {
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
if (row >= nrows) return;
|
||||
|
||||
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
||||
const float d = *dptr * 31.75f * 1.05f;
|
||||
@ -121,7 +121,7 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v
|
||||
const int ncols, int nrows, int64_t row_size) {
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
if (row >= nrows) return;
|
||||
|
||||
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
||||
const float d = *dptr * 31.75f * 1.015f;
|
||||
@ -171,7 +171,7 @@ static __global__ void dequantize_mul_mat_vec_iq4_kt(const void * __restrict__ v
|
||||
constexpr int kNumGroups = 64;
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
if (row >= nrows) return;
|
||||
|
||||
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
||||
const float d = dptr[0] * 31.75f * 1.01f;
|
||||
@ -234,7 +234,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
if (row >= nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
@ -303,7 +303,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
if (row >= nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
@ -374,7 +374,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
if (row >= nrows) return;
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
@ -566,7 +566,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
if (row >= nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
@ -350,7 +350,7 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
if (softcap == 0.0f) {
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
|
||||
} else {
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, true>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -195,11 +195,11 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
// printf("gridDims = %u, %u, %u, ncols = %d, head = %d, blockIdx.x = %d, blockIdx.y = %d, bounds = %d, %d, ne11 = %d, nb11 = %d, blockIdx.y*Dk = %d\n", gridDim.x, gridDim.y, gridDim.z, ncols, head, blockIdx.x, blockIdx.y, KV_min_max[sequence*gridDim.x + blockIdx.x].x, KV_min_max[sequence*gridDim.x + blockIdx.x].y, ne11, nb11, blockIdx.y*Dk);
|
||||
//}
|
||||
K += (first_y + blockIdx.y*Dk) * nb11;
|
||||
V += (first_y + blockIdx.y*Dv) * nb21;
|
||||
V += (first_y + blockIdx.y*Dk) * nb21;
|
||||
maskh += (first_y + blockIdx.y*Dk);
|
||||
for (int k_VKQ_0 = first_y + blockIdx.y*Dk; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*Dk,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*Dk*nb11, V += gridDim.y*Dv*nb21, maskh += gridDim.y*Dk) {
|
||||
K += gridDim.y*Dk*nb11, V += gridDim.y*Dk*nb21, maskh += gridDim.y*Dk) {
|
||||
|
||||
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
||||
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
|
||||
@ -278,15 +278,23 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Accumulate V over all Dk scored KV positions of this block (not Dv). For asymmetric MLA
|
||||
// head sizes (Dk=576 K, Dv=512 V) the K/score loop above covers Dk KV rows, so the V loop and
|
||||
// the V pointer stride must also step Dk KV rows or K and V desync (= garbage decode on sm_60,
|
||||
// which uses this vec kernel for MLA -fa 1 batch=1). Dk==Dv leaves every other case unchanged.
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < Dv; k0 += 2) {
|
||||
if (FATTN_KQ_STRIDE % Dv != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||
for (int k0 = 0; k0 < Dk; k0 += 2) {
|
||||
if (FATTN_KQ_STRIDE % Dk != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
half2 V_k;
|
||||
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid);
|
||||
// For asymmetric Dk>Dv the V row is only Dv wide, so threads tid>=Dv have no V element
|
||||
// (their VKQ lane is discarded at output anyway). Read 0 to avoid stepping past the row.
|
||||
half2 V_k = make_half2(0.0f, 0.0f);
|
||||
if (tid < Dv) {
|
||||
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_k*KQ2[j*(Dk/2) + k0/2];
|
||||
|
||||
@ -189,11 +189,11 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
const int first_y = KV_min_max ? KV_min_max[sequence*gridDim.x + blockIdx.x].x : 0;
|
||||
|
||||
K += (first_y + blockIdx.y*Dk) * nb11;
|
||||
V += (first_y + blockIdx.y*Dv) * nb21;
|
||||
V += (first_y + blockIdx.y*Dk) * nb21;
|
||||
maskh += (first_y + blockIdx.y*Dk);
|
||||
for (int k_VKQ_0 = first_y + blockIdx.y*Dk; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*Dk,
|
||||
// Increment pointers after each loop:
|
||||
K += gridDim.y*Dk*nb11, V += gridDim.y*Dv*nb21, maskh += gridDim.y*Dk) {
|
||||
K += gridDim.y*Dk*nb11, V += gridDim.y*Dk*nb21, maskh += gridDim.y*Dk) {
|
||||
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
@ -266,13 +266,19 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Accumulate V over all Dk scored KV positions of this block (not Dv). For asymmetric MLA
|
||||
// head sizes (Dk=576 K, Dv=512 V) the K/score loop above covers Dk KV rows, so the V loop and
|
||||
// the V pointer stride must also step Dk KV rows or K and V desync (= garbage decode on sm_60,
|
||||
// which uses this vec kernel for MLA -fa 1 batch=1). Dk==Dv leaves every other case unchanged.
|
||||
#pragma unroll
|
||||
for (int k = 0; k < Dv; ++k) {
|
||||
if (FATTN_KQ_STRIDE % Dv != 0 && k_VKQ_0 + k >= ne11) {
|
||||
for (int k = 0; k < Dk; ++k) {
|
||||
if (FATTN_KQ_STRIDE % Dk != 0 && k_VKQ_0 + k >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const float V_ki = dequantize_1_v(V + k*nb21, tid);
|
||||
// For asymmetric Dk>Dv the V row is only Dv wide, so threads tid>=Dv have no V element
|
||||
// (their VKQ lane is discarded at output anyway). Read 0 to avoid stepping past the row.
|
||||
const float V_ki = tid < Dv ? dequantize_1_v(V + k*nb21, tid) : 0.0f;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_ki*KQ[j*Dk + k];
|
||||
|
||||
@ -19,7 +19,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M_R4> {
|
||||
};
|
||||
|
||||
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
|
||||
static __device__ void iqk_mul_mat_vec_q_kerne(
|
||||
static __device__ void iqk_mul_mat_vec_q_kernel(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy,
|
||||
const float * bias, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) {
|
||||
@ -244,7 +244,7 @@ static __global__ void iqk_mul_mat_vec_q(
|
||||
const char * cy = (const char *)vy + i2*nb12;
|
||||
char * cdst = (char *)dst + i2*nb2;
|
||||
const float * b = (const float *)(bias ? ids_data ? (const char *)bias + i02*bias_nb1 : bias : nullptr);
|
||||
iqk_mul_mat_vec_q_kerne<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
|
||||
iqk_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
|
||||
}
|
||||
|
||||
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
|
||||
|
||||
@ -1046,7 +1046,7 @@ void ggml_cuda_op_fused_rms_rms_add(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
else if (dst->src[0]->type == GGML_TYPE_BF16) {
|
||||
fused_rms_rms_add_f32_cuda(ncols, nrows, (float *)dst->data,
|
||||
(const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data,
|
||||
(const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[2]->data,
|
||||
(const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[3]->data,
|
||||
eps, ctx.stream());
|
||||
}
|
||||
else {
|
||||
|
||||
@ -94,6 +94,11 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
if (ncopy < 1) return;
|
||||
|
||||
auto & info = ggml_cuda_info();
|
||||
auto it = info.all_ctx.find(ctx.model);
|
||||
if (it == info.all_ctx.end()) {
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
auto & all_ctx = it->second;
|
||||
auto size = ggml_nbytes(dst);
|
||||
int isrc = 0;
|
||||
for (int ii = 0; ii < ncopy; ++ii) {
|
||||
@ -102,9 +107,9 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
isrc = (isrc + 1)%nhave;
|
||||
//printf("%s: copying from device %d to device %d: %p -> %p\n", __func__, j, i, dst->src[j]->data, dst->src[i]->data);
|
||||
ggml_cuda_set_device(j);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, info.all_ctx[i]->device, dst->src[j]->data, info.all_ctx[j]->device,
|
||||
size, info.all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, all_ctx[i]->device, dst->src[j]->data, all_ctx[j]->device,
|
||||
size, all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
|
||||
}
|
||||
isrc = 0;
|
||||
for (int ii = 0; ii < ncopy; ++ii) {
|
||||
@ -112,7 +117,7 @@ static void copy_missing_tensors(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
int j = idx[isrc];
|
||||
isrc = (isrc + 1)%nhave;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event, 0));
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
}
|
||||
@ -133,6 +138,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
}
|
||||
|
||||
auto & info = ggml_cuda_info();
|
||||
auto it = info.all_ctx.find(ctx.model);
|
||||
if (it == info.all_ctx.end()) {
|
||||
GGML_ABORT("Fatal error");
|
||||
}
|
||||
auto & all_ctx = it->second;
|
||||
#ifdef GGML_USE_NCCL
|
||||
// Somehow I'm not able to figure out how to use NCCL correctly.
|
||||
// It does not work at all if not all GPUs participate in the reduce op, and we
|
||||
@ -153,7 +163,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
ggml_cuda_set_device(i);
|
||||
auto status = ncclAllReduce(dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
dst->src[i] ? dst->src[i]->data : nullptr,
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], info.all_ctx[i]->stream());
|
||||
ggml_nelements(dst), data_type, ncclSum, info.nccl_coms[i], all_ctx[i]->stream());
|
||||
if (status != ncclSuccess) {
|
||||
fprintf(stderr, "%s: ncclAllReduce failed with status %d\n", __func__, (int)status);
|
||||
GGML_ABORT("Fatal error");
|
||||
@ -275,7 +285,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
auto size_per_device = nblocks_per_device * tt.type_size;
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
auto this_ctx = info.all_ctx[i];
|
||||
auto this_ctx = all_ctx[i];
|
||||
if (!this_ctx->copy_event || !this_ctx->compute_event || size_per_device > this_ctx->copy_size) {
|
||||
ggml_cuda_set_device(this_ctx->device);
|
||||
if (!this_ctx->copy_event) {
|
||||
@ -300,14 +310,14 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
|
||||
auto this_size = (this_nelem / tt.blck_size) * tt.type_size;
|
||||
ggml_cuda_set_device(info.all_ctx[peer]->device);
|
||||
ggml_cuda_set_device(all_ctx[peer]->device);
|
||||
if (stage > 0) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(info.all_ctx[i]->copy_buffer, info.all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
|
||||
this_size, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(all_ctx[i]->copy_buffer, all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
|
||||
this_size, all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
|
||||
ichunk = (ichunk + 1)%nhave;
|
||||
}
|
||||
ichunk = stage;
|
||||
@ -315,24 +325,24 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int i = idx[ii];
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
|
||||
ggml_cuda_set_device(info.all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
|
||||
ggml_cuda_set_device(all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
|
||||
int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const half *)info.all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const half *)all_ctx[i]->copy_buffer, (half *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
} else if (dst->type == GGML_TYPE_Q8_0) {
|
||||
k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const block_q8_0 *)info.all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
|
||||
k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const block_q8_0 *)all_ctx[i]->copy_buffer, (block_q8_0 *)dst->src[i]->data + ichunk*nelem_per_device/tt.blck_size);
|
||||
} else if (dst->type == GGML_TYPE_BF16) {
|
||||
k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(
|
||||
this_nelem, (const nv_bfloat16 *)info.all_ctx[i]->copy_buffer,
|
||||
k_add<nv_bfloat16, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(
|
||||
this_nelem, (const nv_bfloat16 *)all_ctx[i]->copy_buffer,
|
||||
(nv_bfloat16 *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
} else {
|
||||
k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const float *)info.all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
k_add<float, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(this_nelem,
|
||||
(const float *)all_ctx[i]->copy_buffer, (float *)dst->src[i]->data + ichunk*nelem_per_device);
|
||||
}
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->compute_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->compute_event, all_ctx[i]->stream()));
|
||||
ichunk = (ichunk + 1)%nhave;
|
||||
}
|
||||
}
|
||||
@ -343,21 +353,21 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
|
||||
auto this_size = (this_nelem / tt.blck_size) * tt.type_size;
|
||||
ggml_cuda_set_device(info.all_ctx[peer]->device);
|
||||
ggml_cuda_set_device(all_ctx[peer]->device);
|
||||
if (stage == 0) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[peer]->stream(), info.all_ctx[i]->compute_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[peer]->stream(), all_ctx[i]->compute_event, 0));
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, info.all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, info.all_ctx[peer]->device,
|
||||
this_size, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[peer]->copy_event, info.all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync((char *)dst->src[i]->data + ichunk*size_per_device, all_ctx[i]->device,
|
||||
(const char *)dst->src[peer]->data + ichunk*size_per_device, all_ctx[peer]->device,
|
||||
this_size, all_ctx[peer]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[peer]->copy_event, all_ctx[peer]->stream()));
|
||||
ichunk = (ichunk + 1)%nhave;
|
||||
}
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
int peer = idx[(ii+1)%nhave];
|
||||
ggml_cuda_set_device(info.all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
|
||||
ggml_cuda_set_device(all_ctx[i]->device);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[peer]->copy_event, 0));
|
||||
}
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
@ -372,8 +382,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
GGML_ASSERT(dst->src[i]->type == dst->type);
|
||||
GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
|
||||
ggml_cuda_set_device(i);
|
||||
if (!info.all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
if (!all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
}
|
||||
}
|
||||
auto nelem = ggml_nelements(dst);
|
||||
@ -386,20 +396,20 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
task.ptrs[0] = (char *)dst->src[i]->data;
|
||||
int j = idx[2*ii+1];
|
||||
ggml_cuda_set_device(j);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[j]->copy_event, info.all_ctx[j]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[j]->copy_event, all_ctx[j]->stream()));
|
||||
task.ptrs[1] = (char *)dst->src[j]->data;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
} else {
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[2*ii+0];
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[2*ii+1];
|
||||
@ -411,23 +421,23 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int j = idx[(2*ii+2)%nhave];
|
||||
task.ptrs[1] = (char *)dst->src[j]->data;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
} else {
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblocks, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[2*ii+1];
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
for (int ii = 0; ii < nhave/2; ++ii) {
|
||||
int i = idx[(2*ii+2)%nhave];
|
||||
ggml_cuda_set_device(i);
|
||||
int j = idx[2*ii+1];
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
if (ncopy > 0) {
|
||||
@ -442,10 +452,10 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
GGML_ASSERT(dst->src[i]->type == dst->type);
|
||||
GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
|
||||
ggml_cuda_set_device(i);
|
||||
if (!info.all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
if (!all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
}
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
//printf("Recorded events\n");
|
||||
auto nelem = ggml_nelements(dst);
|
||||
@ -465,37 +475,37 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int jj = 0; jj < nhave; ++jj) {
|
||||
if (jj == ii) continue;
|
||||
int j = idx[jj];
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
task.ptrs[k++] = (char *)dst->src[j]->data + ii*nelem_per_device*elem_size;
|
||||
}
|
||||
int nblock = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
switch (nhave) {
|
||||
case 2:
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 3:
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 4:
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<half, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
default:
|
||||
k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add<half, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
} else {
|
||||
switch (nhave) {
|
||||
case 2:
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 2><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 3:
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 3><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
case 4:
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add_T<float, CUDA_REDUCE_BLOCK_SIZE, 4><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
break;
|
||||
default:
|
||||
k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(task);
|
||||
k_reduce_add<float, CUDA_REDUCE_BLOCK_SIZE><<<nblock, CUDA_REDUCE_BLOCK_SIZE, 0, all_ctx[i]->stream()>>>(task);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -503,7 +513,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
//printf("Recorded events again\n");
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
@ -512,7 +522,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int jj = 0; jj < nhave; ++jj) {
|
||||
if (jj == ii) continue;
|
||||
int j = idx[jj];
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[j]->copy_event));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), all_ctx[j]->copy_event));
|
||||
}
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
@ -536,11 +546,11 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
GGML_ASSERT(ggml_are_same_shape(dst, dst->src[i]));
|
||||
if (i == ctx.device) continue;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, info.all_ctx[i]->stream()));
|
||||
if (!info.all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&info.all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(ptr, ctx.device, dst->src[i]->data, i, nbytes, all_ctx[i]->stream()));
|
||||
if (!all_ctx[i]->copy_event) {
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&all_ctx[i]->copy_event, cudaEventDisableTiming));
|
||||
}
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
ptr += nbytes;
|
||||
}
|
||||
auto nelem = ggml_nelements(dst);
|
||||
@ -550,7 +560,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
if (i == ctx.device) continue;
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
k_add<half, CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, ctx.stream()>>>(nelem, (const half *)ptr, (half *)dst->data);
|
||||
} else if (dst->type == GGML_TYPE_BF16) {
|
||||
@ -572,15 +582,15 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
|
||||
int i = idx[ii];
|
||||
if (i == ctx.device) continue;
|
||||
ggml_cuda_set_device(i);
|
||||
CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), ctx.copy_event, 0));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(info.all_ctx[i]->copy_event, info.all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(all_ctx[i]->stream(), ctx.copy_event, 0));
|
||||
CUDA_CHECK(cudaMemcpyPeerAsync(dst->src[i]->data, i, dst->data, ctx.device, nbytes, all_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(all_ctx[i]->copy_event, all_ctx[i]->stream()));
|
||||
}
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
for (int ii = 0; ii < nhave; ++ii) {
|
||||
int i = idx[ii];
|
||||
if (i == ctx.device) continue;
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), info.all_ctx[i]->copy_event, 0));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), all_ctx[i]->copy_event, 0));
|
||||
}
|
||||
if (ncopy > 0) {
|
||||
copy_missing_tensors(ctx, dst, nhave, ncopy, idx, copy_idx);
|
||||
|
||||
@ -7868,6 +7868,14 @@ struct ggml_tensor * ggml_mul_mat_id(
|
||||
GGML_ASSERT(b->ne[3] == 1); // b is 3d
|
||||
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
|
||||
GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
|
||||
//// can_mul_mat
|
||||
//if (as->ne[0] != b->ne[0]) {
|
||||
// fprintf(stderr, "MUL_MAT_ID_FAIL: as='%s' ne[0]=%ld type=%s | b='%s' ne[0]=%ld type=%s | ids->ne[1]=%ld b->ne[2]=%ld as->ne[1]=%ld as->ne[2]=%ld\n",
|
||||
// as->name, (long)as->ne[0], ggml_type_name(as->type),
|
||||
// b->name, (long)b->ne[0], ggml_type_name(b->type),
|
||||
// (long)ids->ne[1], (long)b->ne[2], (long)as->ne[1], (long)as->ne[2]);
|
||||
// fflush(stderr);
|
||||
//}
|
||||
GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
|
||||
GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
|
||||
|
||||
@ -11407,13 +11415,13 @@ static void ggml_compute_forward_dup_f16(
|
||||
|
||||
memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
|
||||
|
||||
if (++i10 == ne00) {
|
||||
if (++i10 == ne0) {
|
||||
i10 = 0;
|
||||
if (++i11 == ne01) {
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne02) {
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne03) {
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
@ -11711,13 +11719,13 @@ static void ggml_compute_forward_dup_bf16(
|
||||
|
||||
memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));
|
||||
|
||||
if (++i10 == ne00) {
|
||||
if (++i10 == ne0) {
|
||||
i10 = 0;
|
||||
if (++i11 == ne01) {
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne02) {
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne03) {
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
@ -13333,7 +13341,7 @@ static void ggml_compute_forward_add1_q_f32(
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
|
||||
void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb0 ));
|
||||
void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3 ));
|
||||
|
||||
assert(ne0 % 32 == 0);
|
||||
|
||||
@ -14393,7 +14401,7 @@ static void ggml_compute_forward_sum_rows_f32(
|
||||
for (int ir = first_row; ir < last_row; ++ir) {
|
||||
int i3 = ir / (ne01*ne02);
|
||||
int i2 = (ir - i3*ne01*ne02)/ne01;
|
||||
int i1 = ir - i3*ne01*ne0 - i2*ne01;
|
||||
int i1 = ir - i3*ne01*ne02 - i2*ne01;
|
||||
const float * src_row = (const float *)((const char *)src0->data + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
float * dst_row = ( float *)(( char *)dst->data + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
float row_sum = 0;
|
||||
@ -26624,7 +26632,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
case GGML_OP_ACC:
|
||||
{
|
||||
if (ggml_is_quantized(node->src[0]->type)) {
|
||||
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
|
||||
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
@ -26684,7 +26692,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_CAP_MAX:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
{
|
||||
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
||||
} break;
|
||||
|
||||
@ -1579,6 +1579,8 @@ LLAMA_API struct llama_grammar* llama_sampler_init_grammar_lazy_patterns(
|
||||
|
||||
LLAMA_API void llama_set_draft_input_hidden_state(struct llama_context * ctx, const float * hidden_state);
|
||||
|
||||
LLAMA_API bool llama_reload_changed_tensors(struct llama_context * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
0
llama-mmap.h
Normal file
0
llama-mmap.h
Normal file
171
models/templates/ByteDance-Seed-OSS.jinja
Normal file
171
models/templates/ByteDance-Seed-OSS.jinja
Normal file
@ -0,0 +1,171 @@
|
||||
{# ----------‑‑‑ special token variables ‑‑‑---------- #}
|
||||
{%- set bos_token = '<seed:bos>' -%}
|
||||
{%- set eos_token = '<seed:eos>' -%}
|
||||
{%- set pad_token = '<seed:pad>' -%}
|
||||
{%- set toolcall_begin_token = '<seed:tool_call>' -%}
|
||||
{%- set toolcall_end_token = '</seed:tool_call>' -%}
|
||||
{%- set think_begin_token = '<seed:think>' -%}
|
||||
{%- set think_end_token = '</seed:think>' -%}
|
||||
{%- set budget_begin_token = '<seed:cot_budget_reflect>'-%}
|
||||
{%- set budget_end_token = '</seed:cot_budget_reflect>'-%}
|
||||
{# -------------- reflection-interval lookup -------------- #}
|
||||
{%- if not thinking_budget is defined %}
|
||||
{%- set thinking_budget = -1 -%}
|
||||
{%- endif -%}
|
||||
{%- set budget_reflections_v05 = {
|
||||
0: 0,
|
||||
512: 128,
|
||||
1024: 256,
|
||||
2048: 512,
|
||||
4096: 512,
|
||||
8192: 1024,
|
||||
16384: 1024
|
||||
} -%}
|
||||
{# Find the first gear that is greater than or equal to the thinking_budget. #}
|
||||
{%- set ns = namespace(interval = None) -%}
|
||||
{%- for k, v in budget_reflections_v05 | dictsort -%}
|
||||
{%- if ns.interval is none and thinking_budget <= k -%}
|
||||
{%- set ns.interval = v -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{# If it exceeds the maximum gear, use the value of the last gear #}
|
||||
{%- if ns.interval is none -%}
|
||||
{%- set ns.interval = budget_reflections_v05[16384] -%}
|
||||
{%- endif -%}
|
||||
{# ---------- Preprocess the system message ---------- #}
|
||||
{%- if messages[0]["role"] == "system" %}
|
||||
{%- set system_message = messages[0]["content"] %}
|
||||
{%- set loop_messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{%- set loop_messages = messages %}
|
||||
{%- endif %}
|
||||
{# ---------- Ensure tools exist ---------- #}
|
||||
{%- if not tools is defined or tools is none %}
|
||||
{%- set tools = [] %}
|
||||
{%- endif %}
|
||||
{# tools2doc.jinja #}
|
||||
{%- macro py_type(t) -%}
|
||||
{%- if t == "string" -%}str
|
||||
{%- elif t in ("number", "integer") -%}int
|
||||
{%- elif t == "boolean" -%}bool
|
||||
{%- elif t == "array" -%}list
|
||||
{%- else -%}Any{%- endif -%}
|
||||
{%- endmacro -%}
|
||||
{# ---------- Output the system block ---------- #}
|
||||
{%- if system_message is defined %}
|
||||
{{ bos_token + "system\n" + system_message }}
|
||||
{%- else %}
|
||||
{%- if tools is iterable and tools | length > 0 %}
|
||||
{{ bos_token + "system\nYou are Doubao, a helpful AI assistant. You may call one or more functions to assist with the user query." }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if use_json_tooldef is defined and use_json_tooldef %}
|
||||
|
||||
{{"Tool List:\nYou are authorized to use the following tools (described in JSON Schema format). Before performing any task, you must decide how to call them based on the descriptions and parameters of these tools."}}
|
||||
{{ tools | tojson(ensure_ascii=False) }}
|
||||
{%- else %}
|
||||
{%- for item in tools if item.type == "function" %}
|
||||
|
||||
|
||||
Function:
|
||||
def {{ item.function.name }}(
|
||||
{%- for name, spec in item.function.parameters.properties.items() %}
|
||||
{{- name }}: {{ py_type(spec.type) }}{% if not loop.last %},{% endif %}
|
||||
{%- endfor %}):
|
||||
"""
|
||||
{{ item.function.description | trim }}
|
||||
|
||||
{# ---------- Args ---------- #}
|
||||
{%- if item.function.parameters.properties %}
|
||||
Args:
|
||||
{%- for name, spec in item.function.parameters.properties.items() %}
|
||||
|
||||
- {{ name }} ({{ py_type(spec.type) }})
|
||||
{%- if name in item.function.parameters.required %} [必填]{% else %} [选填]{% endif %}:
|
||||
{{- " " ~ (spec.description or "") }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
|
||||
{# ---------- Returns ---------- #}
|
||||
{%- if item.function.returns is defined
|
||||
and item.function.returns.properties is defined
|
||||
and item.function.returns.properties %}
|
||||
Returns:
|
||||
{%- for name, spec in item.function.returns.properties.items() %}
|
||||
|
||||
- {{ name }} ({{ py_type(spec.type) }}):
|
||||
{{- " " ~ (spec.description or "") }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
|
||||
"""
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{%- if tools is iterable and tools | length > 0 %}
|
||||
|
||||
{{"工具调用请遵循如下格式:\n<seed:tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>value_1</parameter>\n<parameter=example_parameter_2>This is the value for the second parameter\nthat can span\nmultiple lines</parameter>\n</function>\n</seed:tool_call>\n"}}
|
||||
{%- endif %}
|
||||
{# End the system block line #}
|
||||
{%- if system_message is defined or tools is iterable and tools | length > 0 %}
|
||||
{{ eos_token }}
|
||||
{%- endif %}
|
||||
{# ---------- Thinking Budget ---------- #}
|
||||
{%- if thinking_budget is defined %}
|
||||
{%- if thinking_budget == 0 %}
|
||||
{{ bos_token+"system" }}
|
||||
{{ "You are an intelligent assistant that can answer questions in one step without the need for reasoning and thinking, that is, your thinking budget is 0. Next, please skip the thinking process and directly start answering the user's questions." }}
|
||||
{{ eos_token }}
|
||||
{%- elif not thinking_budget == -1 %}
|
||||
{{ bos_token+"system" }}
|
||||
{{ "You are an intelligent assistant with reflective ability. In the process of thinking and reasoning, you need to strictly follow the thinking budget, which is "}}{{thinking_budget}}{{". That is, you need to complete your thinking within "}}{{thinking_budget}}{{" tokens and start answering the user's questions. You will reflect on your thinking process every "}}{{ns.interval}}{{" tokens, stating how many tokens have been used and how many are left."}}
|
||||
{{ eos_token }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{# ---------- List the historical messages one by one ---------- #}
|
||||
{%- for message in loop_messages %}
|
||||
{%- if message.role == "assistant"
|
||||
and message.tool_calls is defined
|
||||
and message.tool_calls is iterable
|
||||
and message.tool_calls | length > 0 %}
|
||||
{{ bos_token + message.role }}
|
||||
{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
|
||||
{{ "\n" + think_begin_token + message.reasoning_content | trim + think_end_token }}
|
||||
{%- endif %}
|
||||
{%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
|
||||
{{ "\n" + message.content | trim + "\n" }}
|
||||
{%- endif %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}{% set tool_call = tool_call.function %}{% endif %}
|
||||
{{ "\n" + toolcall_begin_token + "\n<function=" + tool_call.name + ">\n" }}
|
||||
{%- if tool_call.arguments is defined %}
|
||||
{%- for arg_name, arg_value in tool_call.arguments | items %}
|
||||
{{ "<parameter=" + arg_name + ">" }}
|
||||
{%- set arg_value = arg_value if arg_value is string else arg_value | string %}
|
||||
{{ arg_value+"</parameter>\n" }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{ "</function>\n" + toolcall_end_token }}
|
||||
{%- endfor %}
|
||||
{{ eos_token }}
|
||||
{%- elif message.role in ["user", "system"] %}
|
||||
{{ bos_token + message.role + "\n" + message.content + eos_token }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{{ bos_token + message.role }}
|
||||
{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
|
||||
{{ "\n" + think_begin_token + message.reasoning_content | trim + think_end_token }}
|
||||
{%- endif %}
|
||||
{%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
|
||||
{{ "\n" + message.content | trim + eos_token }}
|
||||
{%- endif %}
|
||||
{# Include the tool role #}
|
||||
{%- else %}
|
||||
{{ bos_token + message.role + "\n" + message.content + eos_token }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{# ---------- Control the model to start continuation ---------- #}
|
||||
{%- if add_generation_prompt %}
|
||||
{{ bos_token+"assistant\n" }}
|
||||
{%- if thinking_budget == 0 %}
|
||||
{{ think_begin_token + "\n" + budget_begin_token + "The current thinking budget is 0, so I will directly start answering the question." + budget_end_token + "\n" + think_end_token }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
@ -1,169 +0,0 @@
|
||||
{# MiniMax-M3 override.
|
||||
Keep MiniMax-M2's PEG-compatible tool-call wrapper, but use M3 thinking tags.
The four variables set below hold the literal marker strings; the rest of the
template refers to the markers only through these names. #}
|
||||
{%- set toolcall_begin_token = '<minimax:tool_call>' -%}
|
||||
{%- set toolcall_end_token = '</minimax:tool_call>' -%}
|
||||
{%- set think_begin_token = '<mm:think>' -%}
|
||||
{%- set think_end_token = '</mm:think>' -%}
|
||||
|
||||
{#- Tool Rendering Functions ============================================== -#}
|
||||
{#- render_tool_namespace: emits one <tool>...</tool> line per entry of tool_list, containing the JSON serialization of that entry's `function` field. namespace_name is accepted but not referenced in the body. -#}
{%- macro render_tool_namespace(namespace_name, tool_list) -%}
|
||||
{%- for tool in tool_list -%}
|
||||
<tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
|
||||
{% endfor -%}
|
||||
{%- endmacro -%}
|
||||
|
||||
{#- visible_text: emits the textual part of a message content value. A string is emitted as-is. A non-mapping iterable is walked and only items that are plain strings, or mappings with type == 'text' (their `text` field), contribute; the pieces are concatenated with no separator. Any other value is emitted unchanged. -#}
{%- macro visible_text(content) -%}
|
||||
{%- if content is string -%}
|
||||
{{ content }}
|
||||
{%- elif content is iterable and content is not mapping -%}
|
||||
{%- for item in content -%}
|
||||
{%- if item is mapping and item.type == 'text' -%}
|
||||
{{- item.text }}
|
||||
{%- elif item is string -%}
|
||||
{{- item }}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- else -%}
|
||||
{{- content }}
|
||||
{%- endif -%}
|
||||
{%- endmacro -%}
|
||||
|
||||
{#- System Message Construction ============================================ -#}
|
||||
{#- build_system_message: emits the body of the system turn. When system_message has content, its visible text is used; otherwise model_identity is emitted, defaulting to "You are a helpful assistant." when that variable is not defined. A "Current date: ..." and/or "Current location: ..." line is appended when system_message carries current_date / current_location. -#}
{%- macro build_system_message(system_message) -%}
|
||||
{%- if system_message and system_message.content -%}
|
||||
{{- visible_text(system_message.content) }}
|
||||
{%- else -%}
|
||||
{%- if model_identity is not defined -%}
|
||||
{%- set model_identity = "You are a helpful assistant." -%}
|
||||
{%- endif -%}
|
||||
{{- model_identity }}
|
||||
{%- endif -%}
|
||||
|
|
||||
{%- if system_message and system_message.current_date -%}
|
||||
{{- '\n' ~ 'Current date: ' + system_message.current_date }}
|
||||
{%- endif -%}
|
||||
{%- if system_message and system_message.current_location -%}
|
||||
{{- '\n' ~ 'Current location: ' + system_message.current_location }}
|
||||
{%- endif -%}
|
||||
{%- endmacro -%}
|
||||
|
||||
{#- Main Template Logic ===================================================== -#}
|
||||
{#- Split off a leading system message: system_message is messages[0] when its role is "system" (none otherwise), and conversation_messages is everything after it (or all messages when there is no leading system message). -#}
{%- set system_message = none -%}
|
||||
{%- set conversation_messages = messages -%}
|
||||
{%- if messages and messages[0].role == "system" -%}
|
||||
{%- set system_message = messages[0] -%}
|
||||
{%- set conversation_messages = messages[1:] -%}
|
||||
{%- endif -%}
|
||||
|
|
||||
{#- Get the last user message turn, for interleaved thinking: ns.last_user_index is the index within conversation_messages of the last message with role 'user', or -1 when there is none. The message loop only re-renders assistant reasoning for messages after this index. -#}
|
||||
{%- set ns = namespace(last_user_index=-1) %}
|
||||
{% for m in conversation_messages %}
|
||||
{%- if m.role == 'user' %}
|
||||
{% set ns.last_user_index = loop.index0 -%}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
|
||||
{#- Render system message: the opening markers and "system" role line, followed by the body produced by build_system_message -#}
|
||||
{{- ']~!b[' ~ ']~b]system' ~ '\n' }}
|
||||
{{- build_system_message(system_message) }}
|
||||
|
|
||||
{#- Render tools if available: a "# Tools" section that lists every tool inside <tools>...</tools>, followed by a literal example of the XML invoke format wrapped in the tool-call begin/end tokens. The unbraced lines between the two token expressions are emitted verbatim as template text. -#}
|
||||
{%- if tools -%}
|
||||
{{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
|
||||
{{- '\n' ~ '<tools>' ~ '\n' }}
|
||||
{{- render_tool_namespace("functions", tools) }}
|
||||
{{- '</tools>' ~ '\n\n' }}
|
||||
{{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\n' }}
|
||||
{{- '\n' ~ toolcall_begin_token }}
|
||||
<invoke name="tool-name">
|
||||
<parameter name="param-name1">param-value-1</parameter>
|
||||
<parameter name="param-name2">param-value-2</parameter>
|
||||
...
|
||||
</invoke>
|
||||
{{- '\n' ~ toolcall_end_token }}
|
||||
{%- endif -%}
|
||||
|
|
||||
{{- '[e~[\n' }}
|
||||
|
||||
{#- Render messages. last_tool_call.name remembers the name taken from the final tool call of the most recent assistant message (none when that message made no tool calls); the 'tool' branch below reads it. Messages whose role is not assistant, tool or user produce no output. -#}
|
||||
{%- set last_tool_call = namespace(name=none) -%}
|
||||
{%- for message in conversation_messages -%}
|
||||
{%- if message.role == 'assistant' -%}
|
||||
{{- ']~b]ai' ~ '\n' }}
|
||||
|
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- set content = visible_text(message.content) %}
|
||||
{#- Reasoning is taken from message.reasoning_content when that is a string; otherwise it is split out of the content around the M3 think tags, falling back to plain <think>...</think> tags. In both split cases `content` keeps only the text after the closing tag. -#}
{%- if message.reasoning_content is string %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- if think_end_token in content %}
|
||||
{%- set reasoning_content = content.split(think_end_token)[0].strip('\n').split(think_begin_token)[-1].strip('\n') %}
|
||||
{%- set content = content.split(think_end_token)[-1].strip('\n') %}
|
||||
{%- elif '</think>' in content %}
|
||||
{%- set reasoning_content = content.split('</think>')[0].strip('\n').split('<think>')[-1].strip('\n') %}
|
||||
{%- set content = content.split('</think>')[-1].strip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{#- Reasoning is only emitted for assistant messages that come after the last user message. -#}
{%- if reasoning_content and loop.index0 > ns.last_user_index -%}
|
||||
{{- think_begin_token ~ '\n' ~ reasoning_content ~ '\n' ~ think_end_token ~ '\n\n' }}
|
||||
{%- endif -%}
|
||||
{%- if content -%}
|
||||
{{- content }}
|
||||
{%- endif -%}
|
||||
|
|
||||
{%- if message.tool_calls -%}
|
||||
{{- '\n' ~ toolcall_begin_token ~ '\n' }}
|
||||
{%- for tool_call in message.tool_calls -%}
|
||||
{%- if tool_call.function %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{{- '<invoke name="' ~ tool_call.name ~ '">' }}
|
||||
{% set _args = tool_call.arguments %}
|
||||
{#- NOTE(review): .items() assumes tool_call.arguments is a mapping; confirm callers never pass it as a JSON-encoded string. Non-string values are JSON-encoded, string values are emitted verbatim. -#}
{%- for k, v in _args.items() %}
|
||||
{{- '<parameter name="' ~ k ~ '">' }}
|
||||
{{- v | tojson(ensure_ascii=False) if v is not string else v }}
|
||||
{{- '</parameter>' }}
|
||||
{% endfor %}
|
||||
{{- '</invoke>' ~ '\n' }}
|
||||
{%- endfor -%}
|
||||
|
|
||||
{{- toolcall_end_token}}
|
||||
{#- NOTE(review): the loop above also accepts tool calls that have no `function` wrapper, but this line always reads `.function.name`; for an unwrapped call the stored name would not be the tool name. Confirm which shapes callers actually pass. -#}
{%- set last_tool_call.name = message.tool_calls[-1].function.name -%}
|
||||
{%- else -%}
|
||||
{%- set last_tool_call.name = none -%}
|
||||
{%- endif -%}
|
||||
{{- '[e~[' ~ '\n' }}
|
||||
|
|
||||
{%- elif message.role == 'tool' -%}
|
||||
{%- if last_tool_call.name is none -%}
|
||||
{{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
|
||||
{%- endif -%}
|
||||
{#- Consecutive tool messages share a single ']~b]tool' header (emitted for the first of the run) and a single '[e~[' terminator (emitted after the last of the run). -#}
{%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
|
||||
{{- ']~b]tool' }}
|
||||
{%- endif -%}
|
||||
{%- if message.content is string -%}
|
||||
{{- '\n' }}
|
||||
{{- message.content }}
|
||||
{#- NOTE(review): a closing tag named after the last tool call is emitted here, but this branch emits no matching opening tag before the content; confirm this is the intended wire format. -#}
{{- '</' ~ last_tool_call.name ~ '>' }}
|
||||
{%- else -%}
|
||||
{%- for tr in message.content -%}
|
||||
{{- '\n' }}
|
||||
{{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
|
||||
{{- '\n' }}
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
{%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
|
||||
{{- '[e~[\n' -}}
|
||||
{%- endif -%}
|
||||
|
|
||||
{%- elif message.role == 'user' -%}
|
||||
{{- ']~b]user' ~ '\n' }}
|
||||
{{- visible_text(message.content) }}
|
||||
{{- '[e~[' ~ '\n' }}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{#- Generation prompt: when add_generation_prompt is set, open an assistant turn and start it inside a thinking block (think_begin_token followed by a newline) -#}
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- ']~b]ai' ~ '\n' ~ think_begin_token ~ '\n' }}
|
||||
{%- endif -%}
|
||||
@ -53,6 +53,8 @@ add_library(llama
|
||||
llama-build-context.cpp
|
||||
llama-model.h
|
||||
llama-model.cpp
|
||||
llama-reload-info.h
|
||||
llama-reload.cpp
|
||||
llama-quantize.cpp
|
||||
llama-arch.h
|
||||
llama-arch.cpp
|
||||
@ -141,3 +143,6 @@ endif()
|
||||
# When GGML_MAX_CONTEXTS is set to a true value, pass it through to the compiler
# as a preprocessor definition of the same name.
if (GGML_MAX_CONTEXTS)
|
||||
add_compile_definitions(GGML_MAX_CONTEXTS=${GGML_MAX_CONTEXTS})
|
||||
endif()
|
||||
# Same pass-through for GGML_MAX_SRC.
if (GGML_MAX_SRC)
|
||||
add_compile_definitions(GGML_MAX_SRC=${GGML_MAX_SRC})
|
||||
endif()
|
||||
|
||||
@ -4,108 +4,6 @@
|
||||
|
||||
#include <cmath>
|
||||
|
||||
// Builds a graph that, for every layer, takes the per-layer K and V context
// caches (lctx.dflash.kv.{k,v}_ctx_cache), re-orders them along dim 2 via
// build_ordered_cache_view, swaps dims 1 and 2, makes the result contiguous and
// copies it into the per-layer workspace tensors (lctx.dflash.kv.{k,v}_ctx_workspace).
// Returns the graph; nothing is executed here.
ggml_cgraph * llm_build_context::build_dflash_kv_workspace() {
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
||||
// Number of context rows: the explicit visible_cross_ctx when positive,
// otherwise n_ctx minus the dflash block size, but never less than 1.
const int64_t ctx_len = lctx.dflash.visible_cross_ctx > 0
|
||||
? (int64_t) lctx.dflash.visible_cross_ctx
|
||||
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
||||
// Filled-row count clamped to [0, ctx_len].
const int32_t cache_rows = std::clamp(lctx.dflash.kv.cache_view_n_filled, 0, (int32_t) ctx_len);
|
||||
// Write position wrapped into [0, ctx_len); the double modulo keeps the result
// non-negative even if the stored position is negative. ctx_len is at least 1
// by construction above, so the ": 0" arm is only a defensive fallback.
const int32_t cache_write_pos = ctx_len > 0
|
||||
? ((lctx.dflash.kv.cache_view_write_pos % (int32_t) ctx_len) + (int32_t) ctx_len) % (int32_t) ctx_len
|
||||
: 0;
|
||||
|
|
||||
GGML_ASSERT(n_embd_head_k == n_embd_head_v);
|
||||
GGML_ASSERT(lctx.ensure_dflash_kv_cache_tensors((int32_t) ctx_len));
|
||||
GGML_ASSERT((int32_t) lctx.dflash.kv.k_ctx_workspace.size() == n_layer);
|
||||
GGML_ASSERT((int32_t) lctx.dflash.kv.v_ctx_workspace.size() == n_layer);
|
||||
|
|
||||
// Graph capacity: the model's node estimate plus 16 extra nodes per layer for
// the views/concat/permute/cont/cpy nodes created below.
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes((int) std::max<int64_t>(1, ctx_len)) + 16 * n_layer, false);
|
||||
|
|
||||
// Returns `cache` re-ordered along dim 2:
//  - view not valid or no filled rows: the cache unchanged;
//  - partially filled: slice [cache_rows, ctx_len) followed by slice [0, cache_rows),
//    i.e. the unfilled rows first and the filled rows last;
//  - full with write position 0: the cache unchanged;
//  - full otherwise: slice [write_pos, ctx_len) followed by slice [0, write_pos).
// NOTE(review): presumably the cache is a ring buffer and the last case yields
// oldest-to-newest order — confirm against the code that writes the cache.
auto build_ordered_cache_view = [&](ggml_tensor * cache) -> ggml_tensor * {
|
||||
if (!lctx.dflash.kv.cache_view_valid || cache_rows <= 0) {
|
||||
return cache;
|
||||
}
|
||||
|
|
||||
if (cache_rows < ctx_len) {
|
||||
// NOTE(review): the name suggests the unfilled rows hold zeros; nothing
// in this function writes them — confirm they are zero-initialized.
ggml_tensor * zero_pad = ggml_view_3d(ctx0, cache,
|
||||
cache->ne[0],
|
||||
cache->ne[1],
|
||||
ctx_len - cache_rows,
|
||||
cache->nb[1],
|
||||
cache->nb[2],
|
||||
(size_t) cache_rows * cache->nb[2]);
|
||||
ggml_tensor * valid = ggml_view_3d(ctx0, cache,
|
||||
cache->ne[0],
|
||||
cache->ne[1],
|
||||
cache_rows,
|
||||
cache->nb[1],
|
||||
cache->nb[2],
|
||||
0);
|
||||
return ggml_concat(ctx0, zero_pad, valid, 2);
|
||||
}
|
||||
|
|
||||
if (cache_write_pos == 0) {
|
||||
return cache;
|
||||
}
|
||||
|
|
||||
// Rotate: rows from the write position to the end come first ...
ggml_tensor * tail = ggml_view_3d(ctx0, cache,
|
||||
cache->ne[0],
|
||||
cache->ne[1],
|
||||
ctx_len - cache_write_pos,
|
||||
cache->nb[1],
|
||||
cache->nb[2],
|
||||
(size_t) cache_write_pos * cache->nb[2]);
|
||||
// ... followed by the rows before the write position.
ggml_tensor * head = ggml_view_3d(ctx0, cache,
|
||||
cache->ne[0],
|
||||
cache->ne[1],
|
||||
cache_write_pos,
|
||||
cache->nb[1],
|
||||
cache->nb[2],
|
||||
0);
|
||||
return ggml_concat(ctx0, tail, head, 2);
|
||||
};
|
||||
|
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
GGML_ASSERT(il < (int32_t) lctx.dflash.kv.k_ctx_cache.size());
|
||||
GGML_ASSERT(il < (int32_t) lctx.dflash.kv.v_ctx_cache.size());
|
||||
|
|
||||
ggml_tensor * Kordered = build_ordered_cache_view(lctx.dflash.kv.k_ctx_cache[il]);
|
||||
ggml_tensor * Vordered = build_ordered_cache_view(lctx.dflash.kv.v_ctx_cache[il]);
|
||||
cb(Kordered, "dflash_workspace_k_ctx_view", il);
|
||||
cb(Vordered, "dflash_workspace_v_ctx_view", il);
|
||||
|
|
||||
// Swap dims 1 and 2 and materialize the result contiguously.
ggml_tensor * Kworkspace = ggml_cont(ctx0, ggml_permute(ctx0, Kordered, 0, 2, 1, 3));
|
||||
ggml_tensor * Vworkspace = ggml_cont(ctx0, ggml_permute(ctx0, Vordered, 0, 2, 1, 3));
|
||||
cb(Kworkspace, "dflash_workspace_k_perm_cont", il);
|
||||
cb(Vworkspace, "dflash_workspace_v_perm_cont", il);
|
||||
|
|
||||
// Destination: the first ctx_len entries along dim 1 of the layer's workspace tensor.
ggml_tensor * Kdst = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_workspace[il],
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->ne[0],
|
||||
ctx_len,
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->ne[2],
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->nb[1],
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->nb[2],
|
||||
0);
|
||||
ggml_tensor * Vdst = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_workspace[il],
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->ne[0],
|
||||
ctx_len,
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->ne[2],
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->nb[1],
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->nb[2],
|
||||
0);
|
||||
|
|
||||
// Copy into the workspace and register both copies as graph outputs.
ggml_tensor * Kstore = ggml_cpy(ctx0, Kworkspace, Kdst);
|
||||
ggml_tensor * Vstore = ggml_cpy(ctx0, Vworkspace, Vdst);
|
||||
cb(Kstore, "dflash_workspace_k_store", il);
|
||||
cb(Vstore, "dflash_workspace_v_store", il);
|
||||
ggml_build_forward_expand(gf, Kstore);
|
||||
ggml_build_forward_expand(gf, Vstore);
|
||||
}
|
||||
|
|
||||
return gf;
|
||||
}
|
||||
|
||||
ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
||||
@ -150,10 +48,14 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur_ctx, "dflash_kv_k_rope", il);
|
||||
Kcur_ctx = ggml_cont(ctx0, ggml_permute(ctx0, Kcur_ctx, 0, 2, 1, 3));
|
||||
cb(Kcur_ctx, "dflash_kv_k_physical", il);
|
||||
|
||||
ggml_tensor * Vcur_ctx = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, fused_target);
|
||||
cb(Vcur_ctx, "dflash_kv_v_proj", il);
|
||||
Vcur_ctx = ggml_reshape_3d(ctx0, Vcur_ctx, n_embd_head_v, n_head_kv, update_rows);
|
||||
Vcur_ctx = ggml_cont(ctx0, ggml_permute(ctx0, Vcur_ctx, 0, 2, 1, 3));
|
||||
cb(Vcur_ctx, "dflash_kv_v_physical", il);
|
||||
|
||||
const int32_t first_rows = std::min<int32_t>((int32_t) update_rows, (int32_t) ctx_len - write_pos);
|
||||
const int32_t second_rows = (int32_t) update_rows - first_rows;
|
||||
@ -163,8 +65,8 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
||||
? Kcur_ctx
|
||||
: ggml_view_3d(ctx0, Kcur_ctx,
|
||||
Kcur_ctx->ne[0],
|
||||
Kcur_ctx->ne[1],
|
||||
first_rows,
|
||||
Kcur_ctx->ne[2],
|
||||
Kcur_ctx->nb[1],
|
||||
Kcur_ctx->nb[2],
|
||||
0);
|
||||
@ -172,25 +74,25 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
||||
? Vcur_ctx
|
||||
: ggml_view_3d(ctx0, Vcur_ctx,
|
||||
Vcur_ctx->ne[0],
|
||||
Vcur_ctx->ne[1],
|
||||
first_rows,
|
||||
Vcur_ctx->ne[2],
|
||||
Vcur_ctx->nb[1],
|
||||
Vcur_ctx->nb[2],
|
||||
0);
|
||||
ggml_tensor * Kdst_first = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_cache[il],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->ne[0],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->ne[1],
|
||||
first_rows,
|
||||
lctx.dflash.kv.k_ctx_cache[il]->ne[2],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->nb[1],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->nb[2],
|
||||
(size_t) write_pos * lctx.dflash.kv.k_ctx_cache[il]->nb[2]);
|
||||
(size_t) write_pos * lctx.dflash.kv.k_ctx_cache[il]->nb[1]);
|
||||
ggml_tensor * Vdst_first = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_cache[il],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->ne[0],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->ne[1],
|
||||
first_rows,
|
||||
lctx.dflash.kv.v_ctx_cache[il]->ne[2],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->nb[1],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->nb[2],
|
||||
(size_t) write_pos * lctx.dflash.kv.v_ctx_cache[il]->nb[2]);
|
||||
(size_t) write_pos * lctx.dflash.kv.v_ctx_cache[il]->nb[1]);
|
||||
|
||||
ggml_tensor * Kstore_first = ggml_cpy(ctx0, Ksrc_first, Kdst_first);
|
||||
cb(Kstore_first, "dflash_kv_k_store", il);
|
||||
@ -204,29 +106,29 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
||||
if (second_rows > 0) {
|
||||
ggml_tensor * Ksrc_second = ggml_view_3d(ctx0, Kcur_ctx,
|
||||
Kcur_ctx->ne[0],
|
||||
Kcur_ctx->ne[1],
|
||||
second_rows,
|
||||
Kcur_ctx->ne[2],
|
||||
Kcur_ctx->nb[1],
|
||||
Kcur_ctx->nb[2],
|
||||
(size_t) first_rows * Kcur_ctx->nb[2]);
|
||||
(size_t) first_rows * Kcur_ctx->nb[1]);
|
||||
ggml_tensor * Vsrc_second = ggml_view_3d(ctx0, Vcur_ctx,
|
||||
Vcur_ctx->ne[0],
|
||||
Vcur_ctx->ne[1],
|
||||
second_rows,
|
||||
Vcur_ctx->ne[2],
|
||||
Vcur_ctx->nb[1],
|
||||
Vcur_ctx->nb[2],
|
||||
(size_t) first_rows * Vcur_ctx->nb[2]);
|
||||
(size_t) first_rows * Vcur_ctx->nb[1]);
|
||||
ggml_tensor * Kdst_second = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_cache[il],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->ne[0],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->ne[1],
|
||||
second_rows,
|
||||
lctx.dflash.kv.k_ctx_cache[il]->ne[2],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->nb[1],
|
||||
lctx.dflash.kv.k_ctx_cache[il]->nb[2],
|
||||
0);
|
||||
ggml_tensor * Vdst_second = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_cache[il],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->ne[0],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->ne[1],
|
||||
second_rows,
|
||||
lctx.dflash.kv.v_ctx_cache[il]->ne[2],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->nb[1],
|
||||
lctx.dflash.kv.v_ctx_cache[il]->nb[2],
|
||||
0);
|
||||
@ -251,16 +153,11 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
||||
const int64_t ctx_len = lctx.dflash.visible_cross_ctx > 0
|
||||
? (int64_t) lctx.dflash.visible_cross_ctx
|
||||
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
||||
const int32_t cache_write_pos = ctx_len > 0
|
||||
? ((lctx.dflash.kv.cache_view_write_pos % (int32_t) ctx_len) + (int32_t) ctx_len) % (int32_t) ctx_len
|
||||
: 0;
|
||||
const int64_t n_kv_total = GGML_PAD(ctx_len + n_tokens, flash_attn ? 256 : 32);
|
||||
const int64_t n_kv_pad = n_kv_total - (ctx_len + n_tokens);
|
||||
const int64_t n_kv_total = GGML_PAD(ctx_len + n_tokens, flash_attn ? 256 : 32);
|
||||
|
||||
GGML_ASSERT(n_embd_head_k == n_embd_head_v);
|
||||
GGML_ASSERT(n_target_features > 0);
|
||||
GGML_ASSERT(lctx.ensure_dflash_kv_cache_tensors((int32_t) ctx_len));
|
||||
GGML_ASSERT(cache_write_pos >= 0 && cache_write_pos < ctx_len);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes((int) std::max<int64_t>(n_tokens, ctx_len)) + 32 * n_layer, false);
|
||||
|
||||
@ -274,12 +171,34 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
||||
}();
|
||||
const ggml_type mask_type = flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
lctx.dflash.inputs.kq_mask = ggml_new_tensor_2d(ctx0, mask_type, n_kv_total, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||
lctx.dflash.kv.kq_mask_tensor = lctx.dflash.inputs.kq_mask;
|
||||
ggml_set_input(lctx.dflash.inputs.kq_mask);
|
||||
cb(lctx.dflash.inputs.kq_mask, "dflash_kq_mask", -1);
|
||||
// The full (non-SWA) mask is only consumed by non-SWA layers. For an all-SWA draft every layer
|
||||
// uses kq_mask_swa, leaving the full mask a dead graph node that the scheduler never backs with a
|
||||
// buffer (and the unconditional input-set then asserts buf!=NULL). So create each mask only when
|
||||
// some layer uses it: full mask iff any non-SWA layer; swa mask iff needs_swa_mask.
|
||||
const bool needs_full_mask = !needs_swa_mask || [&]() {
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
if (!hparams.swa_layers[il]) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}();
|
||||
|
||||
lctx.dflash.inputs.kq_mask = nullptr;
|
||||
lctx.dflash.kv.kq_mask_tensor = nullptr;
|
||||
ggml_tensor * dflash_kq_mask_full = nullptr;
|
||||
if (needs_full_mask) {
|
||||
lctx.dflash.inputs.kq_mask = ggml_new_tensor_2d(ctx0, mask_type, n_kv_total, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||
lctx.dflash.kv.kq_mask_tensor = lctx.dflash.inputs.kq_mask;
|
||||
ggml_set_input(lctx.dflash.inputs.kq_mask);
|
||||
cb(lctx.dflash.inputs.kq_mask, "dflash_kq_mask", -1);
|
||||
dflash_kq_mask_full = lctx.dflash.inputs.kq_mask;
|
||||
}
|
||||
|
||||
lctx.dflash.kv.draft_tail_rows_tensor = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
ggml_set_input(lctx.dflash.kv.draft_tail_rows_tensor);
|
||||
cb(lctx.dflash.kv.draft_tail_rows_tensor, "dflash_draft_tail_rows", -1);
|
||||
|
||||
ggml_tensor * dflash_kq_mask_full = lctx.dflash.inputs.kq_mask;
|
||||
ggml_tensor * dflash_kq_mask_swa = nullptr;
|
||||
lctx.dflash.inputs.kq_mask_swa = nullptr;
|
||||
lctx.dflash.kv.kq_mask_swa_tensor = nullptr;
|
||||
@ -326,43 +245,43 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
||||
Vcur_noise = ggml_reshape_3d(ctx0, Vcur_noise, n_embd_head_v, n_head_kv, n_tokens);
|
||||
cb(Vcur_noise, "Vcur_noise", il);
|
||||
|
||||
GGML_ASSERT(il < (int32_t) lctx.dflash.kv.k_ctx_workspace.size());
|
||||
GGML_ASSERT(il < (int32_t) lctx.dflash.kv.v_ctx_workspace.size());
|
||||
GGML_ASSERT(lctx.dflash.kv.k_ctx_workspace[il] != nullptr);
|
||||
GGML_ASSERT(lctx.dflash.kv.v_ctx_workspace[il] != nullptr);
|
||||
|
||||
ggml_tensor * Kcur_ctx = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_workspace[il],
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->ne[0],
|
||||
ctx_len,
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->ne[2],
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->nb[1],
|
||||
lctx.dflash.kv.k_ctx_workspace[il]->nb[2],
|
||||
0);
|
||||
ggml_tensor * Vcur_ctx = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_workspace[il],
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->ne[0],
|
||||
ctx_len,
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->ne[2],
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->nb[1],
|
||||
lctx.dflash.kv.v_ctx_workspace[il]->nb[2],
|
||||
0);
|
||||
cb(Kcur_ctx, "Kcur_ctx_workspace", il);
|
||||
cb(Vcur_ctx, "Vcur_ctx_workspace", il);
|
||||
GGML_ASSERT(il < (int32_t) lctx.dflash.kv.k_ctx_cache.size());
|
||||
GGML_ASSERT(il < (int32_t) lctx.dflash.kv.v_ctx_cache.size());
|
||||
GGML_ASSERT(lctx.dflash.kv.k_ctx_cache[il] != nullptr);
|
||||
GGML_ASSERT(lctx.dflash.kv.v_ctx_cache[il] != nullptr);
|
||||
GGML_ASSERT(lctx.dflash.kv.k_ctx_cache[il]->type == lctx.dflash.kv.v_ctx_cache[il]->type);
|
||||
GGML_ASSERT(lctx.dflash.kv.k_ctx_cache[il]->ne[1] >= n_kv_total);
|
||||
GGML_ASSERT(lctx.dflash.kv.v_ctx_cache[il]->ne[1] >= n_kv_total);
|
||||
|
||||
ggml_tensor * Kcur_draft = ggml_cont(ctx0, ggml_permute(ctx0, Kcur_noise, 0, 2, 1, 3));
|
||||
ggml_tensor * Vcur_draft = ggml_cont(ctx0, ggml_permute(ctx0, Vcur_noise, 0, 2, 1, 3));
|
||||
cb(Kcur_draft, "dflash_main_k_perm_cont", il);
|
||||
cb(Vcur_draft, "dflash_main_v_perm_cont", il);
|
||||
|
||||
ggml_tensor * Kcur = ggml_concat(ctx0, Kcur_ctx, Kcur_draft, 1);
|
||||
ggml_tensor * Vcur = ggml_concat(ctx0, Vcur_ctx, Vcur_draft, 1);
|
||||
cb(Kcur, "dflash_main_k_concat", il);
|
||||
cb(Vcur, "dflash_main_v_concat", il);
|
||||
ggml_tensor * Kcur = ggml_set_rows(ctx0, lctx.dflash.kv.k_ctx_cache[il], Kcur_draft, lctx.dflash.kv.draft_tail_rows_tensor);
|
||||
ggml_tensor * Vcur = ggml_set_rows(ctx0, lctx.dflash.kv.v_ctx_cache[il], Vcur_draft, lctx.dflash.kv.draft_tail_rows_tensor);
|
||||
cb(Kcur, "dflash_main_k_set_tail", il);
|
||||
cb(Vcur, "dflash_main_v_set_tail", il);
|
||||
|
||||
if (n_kv_pad > 0) {
|
||||
Kcur = ggml_pad(ctx0, Kcur, 0, (int) n_kv_pad, 0, 0);
|
||||
Vcur = ggml_pad(ctx0, Vcur, 0, (int) n_kv_pad, 0, 0);
|
||||
cb(Kcur, "dflash_main_k_pad", il);
|
||||
cb(Vcur, "dflash_main_v_pad", il);
|
||||
if (Kcur->ne[1] != n_kv_total) {
|
||||
Kcur = ggml_view_3d(ctx0, Kcur,
|
||||
Kcur->ne[0],
|
||||
n_kv_total,
|
||||
Kcur->ne[2],
|
||||
Kcur->nb[1],
|
||||
Kcur->nb[2],
|
||||
0);
|
||||
cb(Kcur, "dflash_main_k_active_view", il);
|
||||
}
|
||||
if (Vcur->ne[1] != n_kv_total) {
|
||||
Vcur = ggml_view_3d(ctx0, Vcur,
|
||||
Vcur->ne[0],
|
||||
n_kv_total,
|
||||
Vcur->ne[2],
|
||||
Vcur->nb[1],
|
||||
Vcur->nb[2],
|
||||
0);
|
||||
cb(Vcur, "dflash_main_v_active_view", il);
|
||||
}
|
||||
|
||||
if (Kcur->type == GGML_TYPE_F32) {
|
||||
|
||||
@ -591,10 +591,6 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
|
||||
|
||||
ggml_tensor * KQ_mask = nullptr;
|
||||
ggml_tensor * KQ_mask_swa = nullptr;
|
||||
ggml_tensor * frozen_k_swa = nullptr;
|
||||
ggml_tensor * frozen_v_swa = nullptr;
|
||||
ggml_tensor * frozen_k_full = nullptr;
|
||||
ggml_tensor * frozen_v_full = nullptr;
|
||||
{
|
||||
const int64_t n_mask_tokens = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
|
||||
lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32, target_n_kv, n_mask_tokens);
|
||||
@ -610,59 +606,248 @@ ggml_cgraph * llm_build_context::build_gemma4_mtp() {
|
||||
}
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH) {
|
||||
int n_device = model.splits.size();
|
||||
std::vector<ggml_tensor *> sa_inp(n_device, nullptr);
|
||||
std::vector<ggml_tensor *> sa_out(n_device, nullptr);
|
||||
std::vector<ggml_tensor *> ffn_inp(n_device, nullptr);
|
||||
std::vector<ggml_tensor *> ffn_out(n_device, nullptr);
|
||||
ggml_tensor * inpL = cur;
|
||||
|
||||
const bool is_sliding = hparams.swa_layers[il] ? true : false;
|
||||
const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
|
||||
const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
|
||||
const int n_swa = is_sliding ? target_hparams.n_swa : 0;
|
||||
const int n_embd_head = hparams.n_embd_head_k(il);
|
||||
const int n_head = hparams.n_head(il);
|
||||
ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
const bool is_sliding = hparams.swa_layers[il] ? true : false;
|
||||
const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
|
||||
const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
|
||||
const int n_swa = is_sliding ? target_hparams.n_swa : 0;
|
||||
const int n_embd_head = hparams.n_embd_head_k(il);
|
||||
ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
|
||||
ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
|
||||
|
||||
const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
|
||||
ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
|
||||
ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
|
||||
gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
|
||||
cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
|
||||
nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
|
||||
&frozen_k, &frozen_v);
|
||||
auto split_kl = (const ggml_split_tensor_t *)target_kv.k_l[target_il]->extra;
|
||||
auto split_vl = (const ggml_split_tensor_t *)target_kv.v_l[target_il]->extra;
|
||||
GGML_ASSERT(split_kl && split_vl);
|
||||
auto split_ql = (const ggml_split_tensor_t *)model.layers[il].wq->extra;
|
||||
auto split_ol = (const ggml_split_tensor_t *)model.layers[il].wo->extra;
|
||||
GGML_ASSERT(split_ql && split_ol);
|
||||
GGML_ASSERT(split_ql->n_device == n_device && split_kl->n_device == n_device && split_vl->n_device == n_device && split_ol->n_device == n_device);
|
||||
ggml_tensor * sa_last = nullptr;
|
||||
int nhave = 0;
|
||||
for (int id = 0; id < n_device; ++id) {
|
||||
GGML_ASSERT((split_kl->splits[id] && split_vl->splits[id] && split_ql->splits[id] && split_ol->splits[id]) ||
|
||||
!(split_kl->splits[id] || split_vl->splits[id] || split_ql->splits[id] || split_ol->splits[id]));
|
||||
if (!split_kl->splits[id]) {
|
||||
sa_inp[id] = sa_out[id] = nullptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cb(cur, "attn_out", il);
|
||||
int il_cb = 1000*(il + 1) + id;
|
||||
|
||||
ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
|
||||
model.layers[il].ffn_up, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate, nullptr, nullptr,
|
||||
model.layers[il].ffn_down, nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
|
||||
cb(ffn, "ffn_out", il);
|
||||
if (il == 0) {
|
||||
sa_inp[id] = inpL;
|
||||
} else {
|
||||
GGML_ASSERT(inpL->op == GGML_OP_REDUCE);
|
||||
cur = get_input_tensor_sm_graph(ctx0, inpL, id);
|
||||
GGML_ASSERT(model.layers[il-1].ffn_post_norm && model.layers[il-1].ffn_post_norm->extra);
|
||||
cur = do_split_norm(ctx0, cur, model.layers[il-1].ffn_post_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "ffn_normed", il_cb);
|
||||
auto add = ffn_inp[id];
|
||||
if (!add) {
|
||||
for (int j = 0; j < n_device; ++j) {
|
||||
if (ffn_inp[j]) {
|
||||
add = ffn_inp[j]; break;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(add);
|
||||
}
|
||||
sa_inp[id] = ggml_add(ctx0, cur, add);
|
||||
cb(sa_inp[id], "sa_inp", il_cb);
|
||||
if (model.layers[il-1].out_scale) {
|
||||
auto scale = (const ggml_split_tensor_t *)model.layers[il-1].out_scale->extra;
|
||||
sa_inp[id] = ggml_mul(ctx0, sa_inp[id], scale->splits[id]);
|
||||
cb(sa_inp[id], "sa_inp_scaled", il_cb);
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(model.layers[il].attn_norm && model.layers[il].attn_norm->extra);
|
||||
cur = do_split_norm(ctx0, sa_inp[id], model.layers[il].attn_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "sa_inp_normed", il_cb);
|
||||
auto Qcur = llm_build_lora_mm(lctx, ctx0, split_ql->splits[id], cur);
|
||||
cb(Qcur, "Qcur", il_cb);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, Qcur->ne[0]/n_embd_head, n_tokens);
|
||||
GGML_ASSERT(model.layers[il].attn_q_norm && model.layers[il].attn_q_norm->extra);
|
||||
Qcur = do_split_norm(ctx0, Qcur, model.layers[il].attn_q_norm, hparams, cb, id, il_cb, false);
|
||||
cb(Qcur, "Qcur_normed", il_cb);
|
||||
auto freq_factors = is_sliding ? nullptr : ((const ggml_split_tensor_t *)model.layers[il].rope_freqs->extra)->splits[id];
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_rope", il_cb);
|
||||
GGML_ASSERT(split_kl->splits[id]->ne[1] % target_kv.size == 0);
|
||||
int n_head_kv = split_kl->splits[id]->ne[1] / target_kv.size;
|
||||
auto q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
||||
auto k = ggml_view_3d(ctx0, split_kl->splits[id], n_embd_head, target_n_kv, n_head_kv,
|
||||
ggml_row_size(split_kl->splits[id]->type, n_embd_head)*n_head_kv,
|
||||
ggml_row_size(split_kl->splits[id]->type, n_embd_head), 0);
|
||||
auto v = ggml_view_3d(ctx0, split_vl->splits[id], n_embd_head, target_n_kv, n_head_kv,
|
||||
ggml_row_size(split_vl->splits[id]->type, n_embd_head)*n_head_kv,
|
||||
ggml_row_size(split_vl->splits[id]->type, n_embd_head), 0);
|
||||
cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask_l, hparams.f_attention_scale, 0.0f, 0.0f);
|
||||
cur->op_params[4] = n_swa;
|
||||
cb(cur, "fa", il_cb);
|
||||
cur = ggml_reshape_2d(ctx0, cur, split_ol->splits[id]->ne[0], ggml_nelements(cur)/split_ol->splits[id]->ne[0]);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, split_ol->splits[id], cur);
|
||||
cb(cur, "qkv", il_cb);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
sa_out[id] = cur;
|
||||
sa_last = cur;
|
||||
++nhave;
|
||||
}
|
||||
|
||||
cur = ffn;
|
||||
if (model.layers[il].out_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
|
||||
cb(cur, "out_scaled", il);
|
||||
auto last_ffn_inp = nhave > 1 ? ggml_reduce(ctx0, sa_out.data(), n_device, GGML_OP_ADD) : sa_last;
|
||||
ggml_build_forward_expand(gf, last_ffn_inp);
|
||||
cb(last_ffn_inp, "sa_reduce", il);
|
||||
|
||||
auto ffn_up = (const ggml_split_tensor_t *)model.layers[il].ffn_up->extra;
|
||||
auto ffn_gate = (const ggml_split_tensor_t *)model.layers[il].ffn_gate->extra;
|
||||
auto ffn_down = (const ggml_split_tensor_t *)model.layers[il].ffn_down->extra;
|
||||
GGML_ASSERT(ffn_up && ffn_gate && ffn_down);
|
||||
|
||||
for (int id = 0; id < n_device; ++id) {
|
||||
GGML_ASSERT((ffn_up->splits[id] && ffn_gate->splits[id] && ffn_down->splits[id]) ||
|
||||
(!ffn_up->splits[id] && !ffn_gate->splits[id] && !ffn_down->splits[id]));
|
||||
if (!ffn_up->splits[id]) {
|
||||
ffn_inp[id] = ffn_out[id] = nullptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
GGML_ASSERT(last_ffn_inp && (nhave == 1 || last_ffn_inp->op == GGML_OP_REDUCE));
|
||||
|
||||
int il_cb = 1000*(il + 1) + id;
|
||||
|
||||
cur = get_input_tensor_sm_graph(ctx0, last_ffn_inp, id);
|
||||
cur = do_split_norm(ctx0, cur, model.layers[il].attn_post_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "sa_post", il_cb);
|
||||
auto add = sa_inp[id];
|
||||
if (!add) {
|
||||
for (int j = 0; j < n_device; ++j) {
|
||||
if (sa_inp[j]) {
|
||||
add = sa_inp[j]; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
ffn_inp[id] = ggml_add(ctx0, cur, add);
|
||||
cb(ffn_inp[id], "ffn_inp", il_cb);
|
||||
cur = do_split_norm(ctx0, ffn_inp[id], model.layers[il].ffn_norm, hparams, cb, id, il_cb, false);
|
||||
cb(cur, "ffn_inp_normed", il_cb);
|
||||
cur = llm_build_ffn(ctx0, lctx, nullptr, cur,
|
||||
ffn_up->splits[id], nullptr, nullptr,
|
||||
ffn_gate->splits[id], nullptr, nullptr,
|
||||
ffn_down->splits[id], nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, false, false, nullptr, nullptr);
|
||||
cb(cur, "ffn", il_cb);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
ffn_out[id] = cur;
|
||||
|
||||
}
|
||||
|
||||
inpL = ggml_reduce(ctx0, ffn_out.data(), n_device, GGML_OP_ADD);
|
||||
cb(inpL, "ffn_reduce", il);
|
||||
ggml_build_forward_expand(gf, inpL);
|
||||
}
|
||||
|
||||
int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
|
||||
int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
|
||||
if (idx_out >= 0) idx = idx_out;
|
||||
cur = inpL->src[idx];
|
||||
if (!cur) {
|
||||
cur = inpL->view_src;
|
||||
}
|
||||
|
||||
auto post_norm = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].ffn_post_norm->extra;
|
||||
cur = llm_build_norm(ctx0, cur, hparams, post_norm->splits[idx], NULL, LLM_NORM_RMS, cb, -1);
|
||||
|
||||
cb(cur, "ffn_normed", hparams.n_layer-1);
|
||||
auto add = ffn_inp[idx];
|
||||
if (!add) {
|
||||
for (int j = 0; j < n_device; ++j) {
|
||||
if (ffn_inp[j]) {
|
||||
add = ffn_inp[j]; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, add);
|
||||
cb(cur, "ffn_out", hparams.n_layer-1);
|
||||
|
||||
if (model.layers[hparams.n_layer-1].out_scale) {
|
||||
auto scale = (const ggml_split_tensor_t *)model.layers[hparams.n_layer-1].out_scale->extra;
|
||||
cur = ggml_mul(ctx0, cur, scale->splits[idx]);
|
||||
cb(cur, "ffn_out_scaled", hparams.n_layer-1);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
ggml_tensor * frozen_k_swa = nullptr;
|
||||
ggml_tensor * frozen_v_swa = nullptr;
|
||||
ggml_tensor * frozen_k_full = nullptr;
|
||||
ggml_tensor * frozen_v_full = nullptr;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpL = cur;
|
||||
|
||||
const bool is_sliding = hparams.swa_layers[il] ? true : false;
|
||||
const float freq_base_l = is_sliding ? target_hparams.rope_freq_base_train_swa : target_cparams.rope_freq_base;
|
||||
const float freq_scale_l = is_sliding ? target_hparams.rope_freq_scale_train_swa : target_cparams.rope_freq_scale;
|
||||
const int n_rot_l = is_sliding ? target_hparams.n_rot_swa : target_hparams.n_rot;
|
||||
const int n_swa = is_sliding ? target_hparams.n_swa : 0;
|
||||
const int n_embd_head = hparams.n_embd_head_k(il);
|
||||
const int n_head = hparams.n_head(il);
|
||||
ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
|
||||
const int target_il = gemma4_mtp_target_kv_layer(hparams, target_hparams, il);
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
auto freq_factors = is_sliding ? nullptr : model.layers[il].rope_freqs;
|
||||
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
|
||||
ggml_tensor *& frozen_k = is_sliding ? frozen_k_swa : frozen_k_full;
|
||||
ggml_tensor *& frozen_v = is_sliding ? frozen_v_swa : frozen_v_full;
|
||||
gemma4_mtp_prepare_frozen_kv_views(ctx0, lctx, target_kv, il, target_il, target_n_kv, &frozen_k, &frozen_v, cb);
|
||||
cur = llm_build_kv(ctx0, lctx, target_kv, gf, model.layers[il].wo, model.layers[il].bo,
|
||||
nullptr, nullptr, Qcur, KQ_mask_l, n_tokens, target_kv_head, target_n_kv, hparams.f_attention_scale, cb, il, nullptr, n_swa, target_il,
|
||||
&frozen_k, &frozen_v);
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cb(cur, "attn_out", il);
|
||||
|
||||
ggml_tensor * ffn = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur,
|
||||
model.layers[il].ffn_up, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate, nullptr, nullptr,
|
||||
model.layers[il].ffn_down, nullptr, nullptr,
|
||||
nullptr,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il, gf, true, false, nullptr, model.layers[il].ffn_post_norm);
|
||||
cb(ffn, "ffn_out", il);
|
||||
|
||||
cur = ffn;
|
||||
if (model.layers[il].out_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].out_scale);
|
||||
cb(cur, "out_scaled", il);
|
||||
}
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
}
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, nullptr, LLM_NORM_RMS, cb, -1);
|
||||
|
||||
@ -9,13 +9,18 @@ ggml_cgraph * llm_build_context::build_laguna() {
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
ggml_tensor * inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr;
|
||||
ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
|
||||
// Laguna M.1 has only global-attention layers and leaves n_swa at zero; building
|
||||
// the SWA mask in that case trips the generic SWA precondition.
|
||||
ggml_tensor * KQ_mask_swa = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : nullptr;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const bool is_swa = hparams.swa_layers[il];
|
||||
const int n_swa_l = is_swa ? hparams.n_swa : 0;
|
||||
|
||||
auto KQ_mask_l = is_swa ? KQ_mask_swa : KQ_mask;
|
||||
// If a future Laguna GGUF marks SWA layers, it must also carry a real
|
||||
// sliding-window size so those layers get an SWA mask.
|
||||
GGML_ASSERT(KQ_mask_l != nullptr);
|
||||
auto rope_factors = is_swa ? nullptr : build_rope_factors(il);
|
||||
|
||||
auto cur = build_std_attention(gf, model.layers[il].attn_norm, inpL,
|
||||
|
||||
@ -2244,43 +2244,6 @@ struct ggml_cgraph * llm_build_context::llama_build_graph_dflash_kv_cache(llama_
|
||||
return result;
|
||||
}
|
||||
|
||||
// Builds and returns the graph produced by llm_build_context::build_dflash_kv_workspace()
// for the given context. A temporary llm_build_context is constructed around a batch whose
// n_tokens is 0, initialized, used to build the graph, and then freed before returning.
// The builder is handed &lctx.dflash.kv.workspace_compute_meta as its last constructor
// argument (presumably the buffer backing the graph metadata -- TODO confirm against the
// llm_build_context constructor).
struct ggml_cgraph * llm_build_context::llama_build_graph_dflash_kv_workspace(llama_context & lctx) {
|
||||
// Placeholder batch: only n_tokens is assigned here.
llama_batch dummy;
|
||||
dummy.n_tokens = 0;
|
||||
|
||||
// Tensor-naming callback. For il >= 0 the tensor is named "<name>-<il>", written directly
// into cur->name and truncated so that it always fits in GGML_MAX_NAME including the
// terminator. For il < 0 the name is set unchanged through ggml_set_name().
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
||||
if (il >= 0) {
|
||||
int j = 0;
|
||||
// Copy the base name, stopping at its NUL or after GGML_MAX_NAME - 1 characters.
for (; j < GGML_MAX_NAME - 1; ++j) {
|
||||
cur->name[j] = name[j];
|
||||
if (!name[j]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Append the "-<il>" suffix only if there is room for '-', at least one digit and
// the terminator; otherwise the (possibly truncated) base name is kept as is.
if (j < GGML_MAX_NAME - 3) {
|
||||
cur->name[j++] = '-';
|
||||
auto sil = std::to_string(il);
|
||||
// Digits that would not leave room for the terminator are dropped.
for (int k = 0; k < (int) sil.size() && j < GGML_MAX_NAME - 1; ++k) {
|
||||
cur->name[j++] = sil[k];
|
||||
}
|
||||
}
|
||||
// j is at most GGML_MAX_NAME - 1 here, so the terminator stays in bounds.
cur->name[j] = 0;
|
||||
} else {
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
};
|
||||
|
||||
// NOTE(review): the meaning of the positional arguments (false, false, 0, false) is defined
// by the llm_build_context constructor, which is not visible here -- confirm before changing.
struct llm_build_context llm(lctx, dummy, cb, false, false, 0, false, &lctx.dflash.kv.workspace_compute_meta);
|
||||
|
||||
llm.init();
|
||||
|
||||
struct ggml_cgraph * result = llm.build_dflash_kv_workspace();
|
||||
|
||||
// The builder is freed before returning; result is handed back to the caller as obtained.
llm.free();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_cgraph * llm_build_context::llama_build_graph(
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch,
|
||||
@ -2910,10 +2873,19 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
|
||||
cb(gate, "attn_gate", il_cb);
|
||||
int nh = split_wo->ne[0]/n_embd_head_v;
|
||||
auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, nh, n_tokens);
|
||||
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
|
||||
if (model.arch == LLM_ARCH_LAGUNA) {
|
||||
cur = ggml_mul(ctx0, attn_3d, gate_3d);
|
||||
// Laguna uses a softplus gate. XS.2 stores one gate per head,
|
||||
// while M.1 stores one gate per attention output element.
|
||||
if (gate->ne[0] == nh) {
|
||||
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
|
||||
cur = ggml_mul(ctx0, attn_3d, gate_3d);
|
||||
} else {
|
||||
GGML_ASSERT(gate->ne[0] == split_wo->ne[0]);
|
||||
cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
|
||||
cur = ggml_mul(ctx0, cur, gate);
|
||||
}
|
||||
} else {
|
||||
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, nh, n_tokens);
|
||||
cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
|
||||
}
|
||||
cb(attn_3d, "attn_gated_3d", il_cb);
|
||||
@ -3031,17 +3003,25 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
|
||||
nullptr, nullptr,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
|
||||
cb(cur, "wqkv", il);
|
||||
auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed); // [n_head_l, n_tokens]
|
||||
auto gate = llm_build_lora_mm(lctx, ctx0, wqkv_gate, input_normed);
|
||||
if (model.arch == LLM_ARCH_LAGUNA) {
|
||||
gate = ggml_softplus(ctx0, gate);
|
||||
}
|
||||
cb(gate, "attn_gate", il);
|
||||
int n_head_l = hparams.n_head(il);
|
||||
auto attn_3d = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head_l, n_tokens);
|
||||
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
|
||||
if (model.arch == LLM_ARCH_LAGUNA) {
|
||||
cur = ggml_mul(ctx0, attn_3d, gate_3d);
|
||||
// Laguna uses a softplus gate. XS.2 stores one gate per head,
|
||||
// while M.1 stores one gate per attention output element.
|
||||
if (gate->ne[0] == n_head_l) {
|
||||
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
|
||||
cur = ggml_mul(ctx0, attn_3d, gate_3d);
|
||||
} else {
|
||||
GGML_ASSERT(gate->ne[0] == n_embd_head_v * n_head_l);
|
||||
cur = ggml_mul(ctx0, cur, gate);
|
||||
}
|
||||
} else {
|
||||
auto gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens);
|
||||
cur = ggml_fused_mul_unary(ctx0, gate_3d, attn_3d, GGML_UNARY_OP_SIGMOID);
|
||||
}
|
||||
cb(cur, "attn_gated_3d", il);
|
||||
|
||||
@ -251,8 +251,6 @@ struct llm_build_context {
|
||||
|
||||
ggml_cgraph * build_dflash_kv_cache();
|
||||
|
||||
ggml_cgraph * build_dflash_kv_workspace();
|
||||
|
||||
ggml_cgraph * build_starcoder2();
|
||||
|
||||
ggml_cgraph * build_mamba();
|
||||
@ -474,8 +472,6 @@ llm_expert_gating_func_type gating_op,
|
||||
|
||||
static ggml_cgraph * llama_build_graph_dflash_kv_cache(llama_context & lctx);
|
||||
|
||||
static ggml_cgraph * llama_build_graph_dflash_kv_workspace(llama_context & lctx);
|
||||
|
||||
static ggml_cgraph * llama_build_graph(llama_context & lctx, const llama_batch & batch, bool worst_case, int n_outputs = 0);
|
||||
|
||||
ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * attn_norm, ggml_tensor * cur,
|
||||
|
||||
@ -301,10 +301,10 @@ struct llama_context {
|
||||
struct kv_runtime_state {
|
||||
std::vector<struct ggml_tensor *> k_ctx_cache;
|
||||
std::vector<struct ggml_tensor *> v_ctx_cache;
|
||||
std::vector<struct ggml_tensor *> k_ctx_workspace;
|
||||
std::vector<struct ggml_tensor *> v_ctx_workspace;
|
||||
struct ggml_context * cache_ctx = nullptr;
|
||||
std::vector<ggml_backend_buffer_t> cache_bufs;
|
||||
std::vector<llama_pos> cache_pos;
|
||||
std::vector<uint8_t> cache_slot_valid;
|
||||
int32_t cache_write_pos = 0;
|
||||
int32_t cache_n_filled = 0;
|
||||
int32_t cache_update_rows = 0;
|
||||
@ -314,28 +314,16 @@ struct llama_context {
|
||||
uint64_t cache_applied_window_version = 0;
|
||||
bool cache_valid = false;
|
||||
bool cache_view_valid = false;
|
||||
int32_t workspace_write_pos = 0;
|
||||
int32_t workspace_n_filled = 0;
|
||||
int32_t workspace_reserved_rows = 0;
|
||||
int32_t workspace_token_capacity = 0;
|
||||
int32_t workspace_n_kv_total = 0;
|
||||
uint64_t workspace_applied_window_version = 0;
|
||||
bool workspace_valid = false;
|
||||
bool workspace_sync_pending = false;
|
||||
std::vector<uint8_t> cache_compute_meta;
|
||||
std::vector<uint8_t> workspace_compute_meta;
|
||||
ggml_backend_sched_t cache_sched = nullptr;
|
||||
ggml_backend_sched_t workspace_sched = nullptr;
|
||||
ggml_cgraph * cache_graph = nullptr;
|
||||
ggml_cgraph * workspace_graph = nullptr;
|
||||
int32_t cache_graph_rows = 0;
|
||||
int32_t cache_graph_write_pos = 0;
|
||||
int32_t workspace_graph_rows = 0;
|
||||
int32_t workspace_graph_write_pos = 0;
|
||||
struct ggml_tensor * cache_input_target_features = nullptr;
|
||||
struct ggml_tensor * cache_input_pos_ctx = nullptr;
|
||||
struct ggml_tensor * kq_mask_tensor = nullptr;
|
||||
struct ggml_tensor * kq_mask_swa_tensor = nullptr;
|
||||
struct ggml_tensor * draft_tail_rows_tensor = nullptr;
|
||||
};
|
||||
|
||||
struct capture_state {
|
||||
@ -416,5 +404,4 @@ struct llama_context {
|
||||
void set_mtp_op_type(llama_mtp_op_type value);
|
||||
|
||||
int max_nodes(int n_tokens, int n_kv) const;
|
||||
|
||||
};
|
||||
|
||||
@ -15,15 +15,6 @@
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
// Synchronizes the DFlash K/V workspace scheduler if a synchronization is flagged as pending.
// Does nothing when lctx.dflash.kv.workspace_sync_pending is false or when no workspace
// scheduler exists; otherwise calls ggml_backend_sched_synchronize() on the scheduler and
// clears the pending flag.
void llama_sync_dflash_workspace_if_pending(struct llama_context & lctx) {
|
||||
// Nothing to wait for: no sync was requested, or there is no scheduler to synchronize.
if (!lctx.dflash.kv.workspace_sync_pending || lctx.dflash.kv.workspace_sched == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_backend_sched_synchronize(lctx.dflash.kv.workspace_sched);
|
||||
// Clear the flag so that later calls return early until it is set again.
lctx.dflash.kv.workspace_sync_pending = false;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_dflash_kv_cache_layer_buft(const llama_context & lctx, int32_t il) {
|
||||
if (il >= 0 && il < (int32_t) lctx.model.buft_layer.size() && lctx.model.buft_layer[il].buft != nullptr) {
|
||||
return lctx.model.buft_layer[il].buft;
|
||||
@ -64,8 +55,11 @@ static ggml_backend_t llama_backend_for_tensor(const llama_context & lctx, const
|
||||
|
||||
bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) {
|
||||
const int32_t target_cross_ctx = std::max<int32_t>(1, cross_ctx);
|
||||
const int32_t target_token_capacity = std::max<int32_t>(1, (int32_t) model.hparams.dflash_block_size);
|
||||
const int32_t target_workspace_n_kv_total = GGML_PAD(target_cross_ctx + target_token_capacity, cparams.flash_attn ? 256 : 32);
|
||||
const int32_t target_token_capacity = std::max<int32_t>(
|
||||
1,
|
||||
std::max<int32_t>((int32_t) model.hparams.dflash_block_size, (int32_t) cparams.n_ubatch));
|
||||
const int32_t target_cache_n_kv_total = GGML_PAD(target_cross_ctx + target_token_capacity, cparams.flash_attn ? 256 : 32);
|
||||
const ggml_type target_cache_type = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
const int32_t n_layer = model.hparams.n_layer;
|
||||
const int64_t n_embd_head_k = model.hparams.n_embd_head_k(0);
|
||||
const int64_t n_embd_head_v = model.hparams.n_embd_head_v(0);
|
||||
@ -73,13 +67,16 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) {
|
||||
|
||||
if (dflash.kv.cache_ctx != nullptr &&
|
||||
(int32_t) dflash.kv.k_ctx_cache.size() == n_layer &&
|
||||
(int32_t) dflash.kv.k_ctx_workspace.size() == n_layer) {
|
||||
(int32_t) dflash.kv.cache_pos.size() == target_cross_ctx &&
|
||||
(int32_t) dflash.kv.cache_slot_valid.size() == target_cross_ctx) {
|
||||
const bool cache_matches =
|
||||
(int32_t) dflash.kv.k_ctx_cache.front()->ne[2] == target_cross_ctx;
|
||||
const bool workspace_matches =
|
||||
(int32_t) dflash.kv.k_ctx_workspace.front()->ne[1] == target_workspace_n_kv_total;
|
||||
|
||||
if (cache_matches && workspace_matches) {
|
||||
dflash.kv.k_ctx_cache.front() != nullptr &&
|
||||
dflash.kv.v_ctx_cache.front() != nullptr &&
|
||||
dflash.kv.k_ctx_cache.front()->type == target_cache_type &&
|
||||
dflash.kv.v_ctx_cache.front()->type == target_cache_type &&
|
||||
(int32_t) dflash.kv.k_ctx_cache.front()->ne[1] == target_cache_n_kv_total &&
|
||||
(int32_t) dflash.kv.v_ctx_cache.front()->ne[1] == target_cache_n_kv_total;
|
||||
if (cache_matches) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -88,17 +85,9 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) {
|
||||
ggml_backend_sched_free(dflash.kv.cache_sched);
|
||||
dflash.kv.cache_sched = nullptr;
|
||||
}
|
||||
if (dflash.kv.workspace_sched != nullptr) {
|
||||
ggml_backend_sched_free(dflash.kv.workspace_sched);
|
||||
dflash.kv.workspace_sched = nullptr;
|
||||
}
|
||||
dflash.kv.cache_graph = nullptr;
|
||||
dflash.kv.workspace_graph = nullptr;
|
||||
dflash.kv.cache_graph_rows = 0;
|
||||
dflash.kv.cache_graph_write_pos = 0;
|
||||
dflash.kv.workspace_graph_rows = 0;
|
||||
dflash.kv.workspace_graph_write_pos = 0;
|
||||
dflash.kv.workspace_reserved_rows = 0;
|
||||
}
|
||||
|
||||
ggml_init_params params = {
|
||||
@ -115,22 +104,18 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) {
|
||||
|
||||
dflash.kv.k_ctx_cache.resize((size_t) n_layer);
|
||||
dflash.kv.v_ctx_cache.resize((size_t) n_layer);
|
||||
dflash.kv.k_ctx_workspace.clear();
|
||||
dflash.kv.v_ctx_workspace.clear();
|
||||
dflash.kv.k_ctx_workspace.resize((size_t) n_layer);
|
||||
dflash.kv.v_ctx_workspace.resize((size_t) n_layer);
|
||||
dflash.kv.cache_pos.assign((size_t) target_cross_ctx, 0);
|
||||
dflash.kv.cache_slot_valid.assign((size_t) target_cross_ctx, 0);
|
||||
dflash.kv.cache_bufs.clear();
|
||||
dflash.kv.cache_bufs.reserve((size_t) std::max(1, n_layer) * 4);
|
||||
dflash.kv.cache_bufs.reserve((size_t) std::max(1, n_layer) * 2);
|
||||
for (int32_t il = 0; il < n_layer; ++il) {
|
||||
ggml_backend_buffer_type_t layer_buft = llama_dflash_kv_cache_layer_buft(*this, il);
|
||||
ggml_tensor *& k_ctx_cache = dflash.kv.k_ctx_cache[il];
|
||||
ggml_tensor *& v_ctx_cache = dflash.kv.v_ctx_cache[il];
|
||||
ggml_tensor *& k_ctx_workspace = dflash.kv.k_ctx_workspace[il];
|
||||
ggml_tensor *& v_ctx_workspace = dflash.kv.v_ctx_workspace[il];
|
||||
|
||||
auto alloc_kv_input = [&](ggml_tensor *& tensor, const char * tensor_tag, const char * tensor_name,
|
||||
int64_t ne0, int64_t ne1, int64_t ne2) -> bool {
|
||||
tensor = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, ne0, ne1, ne2);
|
||||
ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) -> bool {
|
||||
tensor = ggml_new_tensor_3d(dflash.kv.cache_ctx, type, ne0, ne1, ne2);
|
||||
if (tensor == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to create %s for layer %d\n", __func__, tensor_tag, il);
|
||||
return false;
|
||||
@ -156,20 +141,14 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) {
|
||||
};
|
||||
|
||||
if (!alloc_kv_input(k_ctx_cache, "dflash_k_ctx_cache", "dflash_k_ctx_cache_%d",
|
||||
n_embd_head_k, n_head_kv, target_cross_ctx) ||
|
||||
target_cache_type, n_embd_head_k, target_cache_n_kv_total, n_head_kv) ||
|
||||
!alloc_kv_input(v_ctx_cache, "dflash_v_ctx_cache", "dflash_v_ctx_cache_%d",
|
||||
n_embd_head_v, n_head_kv, target_cross_ctx) ||
|
||||
!alloc_kv_input(k_ctx_workspace, "dflash_k_ctx_workspace", "dflash_k_ctx_workspace_%d",
|
||||
n_embd_head_k, target_workspace_n_kv_total, n_head_kv) ||
|
||||
!alloc_kv_input(v_ctx_workspace, "dflash_v_ctx_workspace", "dflash_v_ctx_workspace_%d",
|
||||
n_embd_head_v, target_workspace_n_kv_total, n_head_kv)) {
|
||||
target_cache_type, n_embd_head_v, target_cache_n_kv_total, n_head_kv)) {
|
||||
free_dflash_kv_cache_tensors();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
dflash.kv.workspace_token_capacity = target_token_capacity;
|
||||
dflash.kv.workspace_n_kv_total = target_workspace_n_kv_total;
|
||||
llama_reset_dflash_kv_cache_state(this);
|
||||
|
||||
return true;
|
||||
@ -183,8 +162,8 @@ void llama_context::free_dflash_kv_cache_tensors() {
|
||||
|
||||
release_vector(dflash.kv.k_ctx_cache);
|
||||
release_vector(dflash.kv.v_ctx_cache);
|
||||
release_vector(dflash.kv.k_ctx_workspace);
|
||||
release_vector(dflash.kv.v_ctx_workspace);
|
||||
release_vector(dflash.kv.cache_pos);
|
||||
release_vector(dflash.kv.cache_slot_valid);
|
||||
dflash.kv.cache_write_pos = 0;
|
||||
dflash.kv.cache_n_filled = 0;
|
||||
dflash.kv.cache_update_rows = 0;
|
||||
@ -194,30 +173,14 @@ void llama_context::free_dflash_kv_cache_tensors() {
|
||||
dflash.kv.cache_applied_window_version = 0;
|
||||
dflash.kv.cache_valid = false;
|
||||
dflash.kv.cache_view_valid = false;
|
||||
dflash.kv.workspace_write_pos = 0;
|
||||
dflash.kv.workspace_n_filled = 0;
|
||||
dflash.kv.workspace_reserved_rows = 0;
|
||||
dflash.kv.workspace_token_capacity = 0;
|
||||
dflash.kv.workspace_n_kv_total = 0;
|
||||
dflash.kv.workspace_applied_window_version = 0;
|
||||
dflash.kv.workspace_valid = false;
|
||||
dflash.kv.workspace_sync_pending = false;
|
||||
dflash.kv.cache_graph = nullptr;
|
||||
dflash.kv.workspace_graph = nullptr;
|
||||
dflash.kv.cache_graph_rows = 0;
|
||||
dflash.kv.cache_graph_write_pos = 0;
|
||||
dflash.kv.workspace_graph_rows = 0;
|
||||
dflash.kv.workspace_graph_write_pos = 0;
|
||||
dflash.kv.cache_input_target_features = nullptr;
|
||||
dflash.kv.cache_input_pos_ctx = nullptr;
|
||||
dflash.kv.kq_mask_tensor = nullptr;
|
||||
dflash.kv.kq_mask_swa_tensor = nullptr;
|
||||
|
||||
if (dflash.kv.workspace_sched != nullptr) {
|
||||
ggml_backend_sched_synchronize(dflash.kv.workspace_sched);
|
||||
ggml_backend_sched_free(dflash.kv.workspace_sched);
|
||||
dflash.kv.workspace_sched = nullptr;
|
||||
}
|
||||
dflash.kv.draft_tail_rows_tensor = nullptr;
|
||||
|
||||
for (ggml_backend_buffer_t buf : dflash.kv.cache_bufs) {
|
||||
if (buf != nullptr) {
|
||||
@ -226,7 +189,6 @@ void llama_context::free_dflash_kv_cache_tensors() {
|
||||
}
|
||||
release_vector(dflash.kv.cache_bufs);
|
||||
release_vector(dflash.kv.cache_compute_meta);
|
||||
release_vector(dflash.kv.workspace_compute_meta);
|
||||
if (dflash.kv.cache_ctx != nullptr) {
|
||||
ggml_free(dflash.kv.cache_ctx);
|
||||
dflash.kv.cache_ctx = nullptr;
|
||||
@ -365,7 +327,10 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
ggml_tensor * kq_mask = lctx.dflash.kv.kq_mask_tensor;
|
||||
ggml_tensor * kq_mask_swa = lctx.dflash.kv.kq_mask_swa_tensor;
|
||||
|
||||
if (kq_mask == nullptr) {
|
||||
// An all-SWA draft has no full mask; an all-full draft has no SWA mask. Both masks share the
|
||||
// same dimensions, so use whichever one is live to derive shape.
|
||||
ggml_tensor * mask_dims = kq_mask != nullptr ? kq_mask : kq_mask_swa;
|
||||
if (mask_dims == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: DFlash graph inputs are not initialized\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@ -388,13 +353,10 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
const int32_t n_rows = lctx.dflash.target.features_n_rows;
|
||||
const int32_t append_rows_available = lctx.dflash.target.append_features_n_rows;
|
||||
const int32_t width = (int32_t) lctx.model.hparams.dflash_n_target_features;
|
||||
const int32_t graph_cross_ctx = lctx.dflash.kv.k_ctx_cache.front() != nullptr
|
||||
? (int32_t) lctx.dflash.kv.k_ctx_cache.front()->ne[2]
|
||||
: 0;
|
||||
const int32_t n_mask_tokens = (int32_t) kq_mask->ne[1];
|
||||
const int32_t n_kv_total = (int32_t) kq_mask->ne[0];
|
||||
|
||||
llama_sync_dflash_workspace_if_pending(lctx);
|
||||
const int32_t graph_cross_ctx = (int32_t) lctx.dflash.kv.cache_pos.size();
|
||||
const int32_t n_mask_tokens = (int32_t) mask_dims->ne[1];
|
||||
const int32_t n_kv_total = (int32_t) mask_dims->ne[0];
|
||||
ggml_tensor * draft_tail_rows = lctx.dflash.kv.draft_tail_rows_tensor;
|
||||
|
||||
if (graph_cross_ctx != cross_ctx) {
|
||||
LLAMA_LOG_ERROR("%s: DFlash graph cross_ctx drift (graph=%d configured=%d)\n",
|
||||
@ -418,8 +380,10 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
__func__, n_kv_total, cross_ctx + (int32_t) n_tokens);
|
||||
return false;
|
||||
}
|
||||
|
||||
const int32_t left_pad = cross_ctx - n_rows;
|
||||
if (draft_tail_rows == nullptr || draft_tail_rows->type != GGML_TYPE_I32 || draft_tail_rows->ne[0] != (int64_t) n_tokens) {
|
||||
LLAMA_LOG_ERROR("%s: DFlash draft tail row input is not initialized for n_tokens=%u\n", __func__, n_tokens);
|
||||
return false;
|
||||
}
|
||||
|
||||
lctx.dflash.target.pos_ctx_data.resize((size_t) cross_ctx);
|
||||
std::fill(lctx.dflash.target.pos_ctx_data.begin(), lctx.dflash.target.pos_ctx_data.end(), 0);
|
||||
@ -437,7 +401,6 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
return false;
|
||||
}
|
||||
}
|
||||
std::copy(src_pos, src_pos + n_rows, lctx.dflash.target.pos_ctx_data.begin() + (ptrdiff_t) left_pad);
|
||||
|
||||
const llama_dflash_kv_cache_transition cache_plan = llama_plan_dflash_kv_cache_transition(
|
||||
cross_ctx,
|
||||
@ -520,6 +483,7 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
llama_reset_dflash_kv_cache_state(&lctx);
|
||||
}
|
||||
|
||||
const int32_t cache_write_start = lctx.dflash.kv.cache_write_pos;
|
||||
lctx.dflash.kv.cache_update_rows = update_rows;
|
||||
ggml_cgraph * gf_kv = nullptr;
|
||||
const bool can_reuse_kv_graph = lctx.dflash.kv.cache_graph != nullptr &&
|
||||
@ -558,6 +522,18 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
llama_graph_compute_sched(lctx, lctx.dflash.kv.cache_sched, gf_kv, lctx.cparams.n_threads);
|
||||
ggml_backend_sched_synchronize(lctx.dflash.kv.cache_sched);
|
||||
|
||||
if ((int32_t) lctx.dflash.kv.cache_pos.size() != cross_ctx) {
|
||||
lctx.dflash.kv.cache_pos.assign((size_t) cross_ctx, 0);
|
||||
}
|
||||
if ((int32_t) lctx.dflash.kv.cache_slot_valid.size() != cross_ctx) {
|
||||
lctx.dflash.kv.cache_slot_valid.assign((size_t) cross_ctx, 0);
|
||||
}
|
||||
for (int32_t i = 0; i < update_rows; ++i) {
|
||||
const int32_t slot = (cache_write_start + i) % cross_ctx;
|
||||
lctx.dflash.kv.cache_pos[(size_t) slot] = update_pos[i];
|
||||
lctx.dflash.kv.cache_slot_valid[(size_t) slot] = 1;
|
||||
}
|
||||
|
||||
lctx.dflash.kv.cache_n_filled = std::min(cross_ctx, lctx.dflash.kv.cache_n_filled + update_rows);
|
||||
lctx.dflash.kv.cache_write_pos = (lctx.dflash.kv.cache_write_pos + update_rows) % cross_ctx;
|
||||
lctx.dflash.kv.cache_applied_window_version = lctx.dflash.target.version;
|
||||
@ -567,101 +543,39 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
lctx.dflash.kv.cache_view_valid = true;
|
||||
}
|
||||
|
||||
if (lctx.dflash.kv.cache_view_valid &&
|
||||
!lctx.dflash.kv.k_ctx_workspace.empty() && !lctx.dflash.kv.v_ctx_workspace.empty()) {
|
||||
const bool need_workspace_refresh = !lctx.dflash.kv.workspace_valid ||
|
||||
lctx.dflash.kv.workspace_n_filled != lctx.dflash.kv.cache_view_n_filled ||
|
||||
lctx.dflash.kv.workspace_write_pos != lctx.dflash.kv.cache_view_write_pos ||
|
||||
lctx.dflash.kv.workspace_applied_window_version != lctx.dflash.kv.cache_applied_window_version;
|
||||
if ((int32_t) lctx.dflash.kv.cache_pos.size() != cross_ctx ||
|
||||
(int32_t) lctx.dflash.kv.cache_slot_valid.size() != cross_ctx) {
|
||||
LLAMA_LOG_ERROR("%s: DFlash physical cache slot map is not initialized\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (need_workspace_refresh) {
|
||||
const size_t max_nodes = lctx.model.max_nodes((int) std::max<int32_t>(1, cross_ctx)) + 16 * lctx.model.hparams.n_layer;
|
||||
const size_t meta_size = ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false);
|
||||
if (lctx.dflash.kv.workspace_compute_meta.size() != meta_size) {
|
||||
lctx.dflash.kv.workspace_compute_meta.resize(meta_size);
|
||||
}
|
||||
|
||||
ggml_cgraph * gf_workspace = nullptr;
|
||||
const bool can_reuse_workspace_graph = lctx.dflash.kv.workspace_graph != nullptr &&
|
||||
lctx.dflash.kv.workspace_graph_rows == lctx.dflash.kv.cache_view_n_filled &&
|
||||
lctx.dflash.kv.workspace_graph_write_pos == lctx.dflash.kv.cache_view_write_pos;
|
||||
|
||||
if (can_reuse_workspace_graph) {
|
||||
gf_workspace = lctx.dflash.kv.workspace_graph;
|
||||
} else {
|
||||
gf_workspace = llm_build_context::llama_build_graph_dflash_kv_workspace(lctx);
|
||||
if (gf_workspace == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build DFlash K/V workspace graph\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
backend_buft.reserve(lctx.backends.size());
|
||||
for (auto * backend : lctx.backends) {
|
||||
if (ggml_backend_is_cpu(backend)) {
|
||||
backend_buft.push_back(llama_default_buffer_type_cpu(true));
|
||||
} else {
|
||||
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
||||
}
|
||||
}
|
||||
|
||||
if (lctx.dflash.kv.workspace_sched == nullptr) {
|
||||
lctx.dflash.kv.workspace_sched = ggml_backend_sched_new(lctx.backends.data(), backend_buft.data(), lctx.backends.size(), max_nodes, false);
|
||||
}
|
||||
|
||||
if (lctx.dflash.kv.workspace_reserved_rows != cross_ctx) {
|
||||
const bool saved_view_valid = lctx.dflash.kv.cache_view_valid;
|
||||
const int32_t saved_view_rows = lctx.dflash.kv.cache_view_n_filled;
|
||||
const int32_t saved_view_write_pos = lctx.dflash.kv.cache_view_write_pos;
|
||||
|
||||
lctx.dflash.kv.cache_view_valid = true;
|
||||
lctx.dflash.kv.cache_view_n_filled = cross_ctx;
|
||||
lctx.dflash.kv.cache_view_write_pos = cross_ctx > 1 ? 1 : 0;
|
||||
|
||||
ggml_cgraph * gf_workspace_reserve = llm_build_context::llama_build_graph_dflash_kv_workspace(lctx);
|
||||
|
||||
lctx.dflash.kv.cache_view_valid = saved_view_valid;
|
||||
lctx.dflash.kv.cache_view_n_filled = saved_view_rows;
|
||||
lctx.dflash.kv.cache_view_write_pos = saved_view_write_pos;
|
||||
|
||||
const bool reserved = lctx.dflash.kv.workspace_sched != nullptr &&
|
||||
gf_workspace_reserve != nullptr &&
|
||||
ggml_backend_sched_reserve(lctx.dflash.kv.workspace_sched, gf_workspace_reserve);
|
||||
if (!reserved) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize DFlash K/V workspace scheduler\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
lctx.dflash.kv.workspace_reserved_rows = cross_ctx;
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(lctx.dflash.kv.workspace_sched);
|
||||
ggml_backend_sched_alloc_graph(lctx.dflash.kv.workspace_sched, gf_workspace);
|
||||
|
||||
lctx.dflash.kv.workspace_graph = gf_workspace;
|
||||
lctx.dflash.kv.workspace_graph_rows = lctx.dflash.kv.cache_view_n_filled;
|
||||
lctx.dflash.kv.workspace_graph_write_pos = lctx.dflash.kv.cache_view_write_pos;
|
||||
}
|
||||
|
||||
llama_graph_compute_sched(lctx, lctx.dflash.kv.workspace_sched, gf_workspace, lctx.cparams.n_threads);
|
||||
lctx.dflash.kv.workspace_sync_pending = true;
|
||||
|
||||
lctx.dflash.kv.workspace_n_filled = lctx.dflash.kv.cache_view_n_filled;
|
||||
lctx.dflash.kv.workspace_write_pos = lctx.dflash.kv.cache_view_write_pos;
|
||||
lctx.dflash.kv.workspace_applied_window_version = lctx.dflash.kv.cache_applied_window_version;
|
||||
lctx.dflash.kv.workspace_valid = true;
|
||||
for (int32_t i = 0; i < cross_ctx; ++i) {
|
||||
if (lctx.dflash.kv.cache_slot_valid[(size_t) i]) {
|
||||
lctx.dflash.target.pos_ctx_data[(size_t) i] = lctx.dflash.kv.cache_pos[(size_t) i];
|
||||
}
|
||||
}
|
||||
|
||||
const int32_t full_visible_first = left_pad;
|
||||
const int32_t full_visible_last = cross_ctx + (int32_t) n_tokens - 1;
|
||||
std::vector<int32_t> draft_tail_rows_data((size_t) n_tokens);
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
draft_tail_rows_data[(size_t) i] = cross_ctx + (int32_t) i;
|
||||
}
|
||||
ggml_backend_tensor_set(draft_tail_rows, draft_tail_rows_data.data(), 0, ggml_nbytes(draft_tail_rows));
|
||||
|
||||
const size_t mask_elems = (size_t) n_kv_total * (size_t) n_mask_tokens;
|
||||
if (kq_mask->type == GGML_TYPE_F16) {
|
||||
if (kq_mask == nullptr) {
|
||||
// all-SWA draft: the full mask was not created (no non-SWA layer consumes it); only the
|
||||
// SWA mask below is populated.
|
||||
} else if (kq_mask->type == GGML_TYPE_F16) {
|
||||
const ggml_fp16_t h_inf = ggml_fp32_to_fp16(-INFINITY);
|
||||
const ggml_fp16_t h_zero = ggml_fp32_to_fp16(0.0f);
|
||||
std::vector<ggml_fp16_t> mask_f16(mask_elems, h_inf);
|
||||
std::vector<ggml_fp16_t> row_f16((size_t) n_kv_total, h_inf);
|
||||
std::fill(row_f16.begin() + full_visible_first, row_f16.begin() + full_visible_last + 1, h_zero);
|
||||
for (int32_t i = 0; i < cross_ctx; ++i) {
|
||||
if (lctx.dflash.kv.cache_slot_valid[(size_t) i]) {
|
||||
row_f16[(size_t) i] = h_zero;
|
||||
}
|
||||
}
|
||||
std::fill(row_f16.begin() + cross_ctx, row_f16.begin() + cross_ctx + n_tokens, h_zero);
|
||||
for (uint32_t j = 0; j < n_tokens; ++j) {
|
||||
std::memcpy(mask_f16.data() + (size_t) j * (size_t) n_kv_total, row_f16.data(), (size_t) n_kv_total * sizeof(ggml_fp16_t));
|
||||
}
|
||||
@ -669,7 +583,12 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
} else {
|
||||
lctx.dflash.target.kq_mask_data.assign(mask_elems, -INFINITY);
|
||||
std::vector<float> row_f32((size_t) n_kv_total, -INFINITY);
|
||||
std::fill(row_f32.begin() + full_visible_first, row_f32.begin() + full_visible_last + 1, 0.0f);
|
||||
for (int32_t i = 0; i < cross_ctx; ++i) {
|
||||
if (lctx.dflash.kv.cache_slot_valid[(size_t) i]) {
|
||||
row_f32[(size_t) i] = 0.0f;
|
||||
}
|
||||
}
|
||||
std::fill(row_f32.begin() + cross_ctx, row_f32.begin() + cross_ctx + n_tokens, 0.0f);
|
||||
for (uint32_t j = 0; j < n_tokens; ++j) {
|
||||
std::memcpy(lctx.dflash.target.kq_mask_data.data() + (size_t) j * (size_t) n_kv_total, row_f32.data(), (size_t) n_kv_total * sizeof(float));
|
||||
}
|
||||
@ -688,7 +607,10 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
ggml_fp16_t * row = mask_swa_f16.data() + (size_t) j * (size_t) n_kv_total;
|
||||
const int32_t q_pos = draft_pos_base + (int32_t) j;
|
||||
|
||||
for (int32_t k = left_pad; k < cross_ctx; ++k) {
|
||||
for (int32_t k = 0; k < cross_ctx; ++k) {
|
||||
if (!lctx.dflash.kv.cache_slot_valid[(size_t) k]) {
|
||||
continue;
|
||||
}
|
||||
const int32_t k_pos = (int32_t) lctx.dflash.target.pos_ctx_data[(size_t) k];
|
||||
if (q_pos - k_pos < swa_window) {
|
||||
row[k] = h_zero;
|
||||
@ -697,7 +619,10 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
|
||||
for (int32_t k = cross_ctx; k < cross_ctx + (int32_t) n_tokens; ++k) {
|
||||
const int32_t block_k = k - cross_ctx;
|
||||
if (block_k <= (int32_t) j) {
|
||||
// intra-block draft tokens are contiguous from draft_pos_base, so the
|
||||
// SWA distance is (j - block_k); apply the same window bound as the
|
||||
// cross-context section above (causal AND within n_swa).
|
||||
if (block_k <= (int32_t) j && ((int32_t) j - block_k) < swa_window) {
|
||||
row[k] = h_zero;
|
||||
}
|
||||
}
|
||||
@ -709,7 +634,10 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
float * row = lctx.dflash.target.kq_mask_swa_data.data() + (size_t) j * (size_t) n_kv_total;
|
||||
const int32_t q_pos = draft_pos_base + (int32_t) j;
|
||||
|
||||
for (int32_t k = left_pad; k < cross_ctx; ++k) {
|
||||
for (int32_t k = 0; k < cross_ctx; ++k) {
|
||||
if (!lctx.dflash.kv.cache_slot_valid[(size_t) k]) {
|
||||
continue;
|
||||
}
|
||||
const int32_t k_pos = (int32_t) lctx.dflash.target.pos_ctx_data[(size_t) k];
|
||||
if (q_pos - k_pos < swa_window) {
|
||||
row[k] = 0.0f;
|
||||
@ -718,7 +646,10 @@ bool llama_prepare_dflash_graph_inputs(
|
||||
|
||||
for (int32_t k = cross_ctx; k < cross_ctx + (int32_t) n_tokens; ++k) {
|
||||
const int32_t block_k = k - cross_ctx;
|
||||
if (block_k <= (int32_t) j) {
|
||||
// intra-block draft tokens are contiguous from draft_pos_base, so the
|
||||
// SWA distance is (j - block_k); apply the same window bound as the
|
||||
// cross-context section above (causal AND within n_swa).
|
||||
if (block_k <= (int32_t) j && ((int32_t) j - block_k) < swa_window) {
|
||||
row[k] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,4 +5,3 @@
|
||||
struct llama_context;
|
||||
|
||||
bool llama_prepare_dflash_graph_inputs(llama_context & lctx, uint32_t n_tokens);
|
||||
void llama_sync_dflash_workspace_if_pending(llama_context & lctx);
|
||||
|
||||
@ -903,6 +903,11 @@ void llm_load_hparams(
|
||||
ml.get_key(LLM_KV_DFLASH_MASK_TOKEN_ID, hparams.dflash_mask_token_id, false);
|
||||
ml.get_key(LLM_KV_DFLASH_N_TARGET_FEATURES, hparams.dflash_n_target_features, false);
|
||||
load_dflash_target_layer_ids(ml, LLM_KV(model.arch)(LLM_KV_DFLASH_TARGET_LAYER_IDS), hparams, false);
|
||||
// DFlash drafts may be trained with sliding-window attention (for long-context).
|
||||
// Read the window + per-layer pattern so the SWA mask path activates; absent keys
|
||||
// leave n_swa=0 / swa_layers all-zero (dense behavior, unchanged).
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer, false);
|
||||
validate_dflash_hparams(hparams, model.arch);
|
||||
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer;
|
||||
@ -1538,11 +1543,22 @@ void llm_load_hparams(
|
||||
}
|
||||
}
|
||||
|
||||
// GGUF stores the Poolside partial-rotary setting; the graph RoPE
|
||||
// argument for full-attention Laguna layers follows the upstream
|
||||
// Laguna loader and uses half of that count. SWA layers remain
|
||||
// full-head rotary via n_rot_swa.
|
||||
hparams.n_rot /= 2;
|
||||
const bool found_rope_dim = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
const bool found_rope_dim_swa = ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
|
||||
|
||||
// Laguna GGUFs store the number of scalar Q/K dimensions that ggml_rope_ext
|
||||
// rotates. Correct files carry those values explicitly. Some early public
|
||||
// XS.2 GGUFs omitted both keys, so fall back to the HF XS.2 layout only for
|
||||
// missing metadata: full-attention layers rotate half the head, SWA layers
|
||||
// rotate the full head. Explicit but wrong halved metadata still needs repair.
|
||||
if (hparams.n_swa > 0) {
|
||||
if (!found_rope_dim) {
|
||||
hparams.n_rot = hparams.n_embd_head_k_full / 2;
|
||||
}
|
||||
if (!found_rope_dim_swa) {
|
||||
hparams.n_rot_swa = hparams.n_embd_head_k_swa;
|
||||
}
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
|
||||
@ -1553,12 +1569,6 @@ void llm_load_hparams(
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? hparams.n_rot_swa : hparams.n_rot;
|
||||
}
|
||||
} else {
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
if (!hparams.swa_layers[i]) {
|
||||
hparams.rope_dim_per_layer[i] /= 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
||||
@ -233,3 +233,5 @@ struct llama_split_tensor {
|
||||
|
||||
void llama_decode_reset();
|
||||
void llama_decode_stop();
|
||||
|
||||
std::vector<llama_model *> & llama_all_loaded_models();
|
||||
|
||||
@ -194,6 +194,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
ggml_context * ctx_output;
|
||||
ggml_context * ctx_output_split;
|
||||
|
||||
llama_model * tgt_model = nullptr;
|
||||
|
||||
ggml_backend_buffer_type_t default_cpu_buft;
|
||||
bool has_buft_overrides = false;
|
||||
|
||||
@ -221,6 +223,21 @@ struct create_tensors_helper : public create_tensors_helper_interface {
|
||||
|
||||
create_tensors_helper::create_tensors_helper(llama_model_loader & _ml, llama_model & _model) : ml(_ml), model(_model) {
|
||||
|
||||
if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
for (auto model : all_models) {
|
||||
if (model->arch == LLM_ARCH_GEMMA4) {
|
||||
tgt_model = model;
|
||||
}
|
||||
}
|
||||
if (tgt_model) {
|
||||
LLAMA_LOG_INFO("==================== Found target model for Gemma4-Assistant. split mode graph: %d\n", model.split_mode == LLAMA_SPLIT_MODE_GRAPH);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("==================== Did not find target model for Gemma4-Assistant\n");
|
||||
model.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_layer = model.hparams.n_layer;
|
||||
buft_layer_count[model.buft_input.buft]++;
|
||||
buft_layer_count[model.buft_input.buft_matrix]++;
|
||||
@ -1198,8 +1215,18 @@ bool create_tensors_helper::create_step35_tensors(const LLM_TN & tn) {
|
||||
//layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
//layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
|
||||
// head-wise attention gate (Step35 self_attn.g_proj)
|
||||
layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
const std::string attn_gate_name = tn(LLM_TENSOR_ATTN_GATE, "weight", i);
|
||||
int64_t n_attn_gate = n_head_l;
|
||||
if (model.arch == LLM_ARCH_LAGUNA) {
|
||||
// Step35-style models normally use a head-wise attention gate. Laguna
|
||||
// XS.2 keeps that layout, but M.1 gates every attention output element,
|
||||
// so infer the width from GGUF metadata instead of baking in a model size.
|
||||
const ggml_tensor * meta = ml.get_tensor_meta(attn_gate_name.c_str());
|
||||
if (meta && meta->ne[1] == n_embd_head_v * n_head_l) {
|
||||
n_attn_gate = n_embd_head_v * n_head_l;
|
||||
}
|
||||
}
|
||||
layer.wqkv_gate = create_tensor(ctx_split, attn_gate_name, {n_embd, n_attn_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
// dense MLP (leading dense blocks)
|
||||
layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
@ -2232,7 +2259,7 @@ bool create_tensors_helper::create_gemma4_mtp_tensors(const LLM_TN & tn) {
|
||||
const int64_t n_ff_cur = hparams.n_ff(i);
|
||||
|
||||
if (!hparams.swa_layers[i]) {
|
||||
layer.rope_freqs = create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
|
||||
layer.rope_freqs = create_tensor(ctx_split, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_rot/2 },
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED | rope_flag);
|
||||
rope_flag = llama_model_loader::TENSOR_DUPLICATED;
|
||||
}
|
||||
@ -2778,11 +2805,15 @@ bool create_tensors_helper::create_glm_dsa_tensors(const LLM_TN & tn) {
|
||||
layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, flags);
|
||||
|
||||
// DSA indexer
|
||||
layer.indexer_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags);
|
||||
layer.indexer_k_norm_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags);
|
||||
layer.indexer_proj = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags);
|
||||
layer.indexer_attn_k = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags);
|
||||
layer.indexer_attn_q_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
|
||||
// GLM-5.2 only ships the DSA indexer on a subset of layers; the rest omit it.
|
||||
// The DSA indexer runtime is not implemented (graph is plain MLA), so these
|
||||
// tensors are loaded-but-unused. Mark them optional so layers without an
|
||||
// indexer load as nullptr (ported from ggml-org/llama.cpp#24770).
|
||||
layer.indexer_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_k_norm_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_proj = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_attn_k = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_attn_q_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(norm_ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
|
||||
|
||||
@ -4526,7 +4557,7 @@ bool create_tensors_helper::create_tensors() {
|
||||
|
||||
{
|
||||
const bool unsupported =
|
||||
(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
|
||||
//(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ||
|
||||
(model.arch == LLM_ARCH_GEMMA4 && model.tok_embd_per_layer);
|
||||
if (unsupported && (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)) {
|
||||
LLAMA_LOG_WARN("\n=========================================================\n");
|
||||
@ -4569,6 +4600,12 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
|
||||
gpu_split_count.resize(model.splits.size(), 0.0f);
|
||||
}
|
||||
auto is_gemma4_model = [this] () {
|
||||
return model.arch == LLM_ARCH_GEMMA4 || model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
|
||||
};
|
||||
auto is_gemma4_assistant = [this] () {
|
||||
return model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
|
||||
};
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// For now only run MTP into the per-layer
|
||||
if (model.mtp && hparams.nextn_predict_layers > 0 &&
|
||||
@ -4606,7 +4643,7 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (layer.attn_norm) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, mirror, mem_used);
|
||||
}
|
||||
if (model.arch == LLM_ARCH_GEMMA4 && layer.attn_post_norm) {
|
||||
if (is_gemma4_model() && layer.attn_post_norm) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.attn_post_norm, layer.split_attn_post_norm, mirror, mem_used);
|
||||
}
|
||||
if (layer.rope_freqs) {
|
||||
@ -4616,6 +4653,48 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (hparams.is_recurrent(il)) {
|
||||
split_recurrent_tensors(hparams, layer, cur_splits, mem_used, ctx_split, il); //, model.arch == LLM_ARCH_QWEN3NEXT ? 0 : 1);
|
||||
}
|
||||
else if (is_gemma4_assistant()) {
|
||||
GGML_ASSERT(layer.wo && layer.wq);
|
||||
GGML_ASSERT(tgt_model);
|
||||
int n_embd_head = hparams.n_embd_head_k(il);
|
||||
int n_head = hparams.n_head(il);
|
||||
bool is_sliding = hparams.swa_layers[il] != 0;
|
||||
int target_n_kv_layer = tgt_model->hparams.n_layer_kv_from_start > 0
|
||||
? std::min<int>((int) tgt_model->hparams.n_layer, tgt_model->hparams.n_layer_kv_from_start)
|
||||
: (int) tgt_model->hparams.n_layer;
|
||||
int target_il = target_n_kv_layer - 1;
|
||||
for (; target_il >= 0; --target_il) {
|
||||
if ((tgt_model->hparams.swa_layers[target_il] != 0) == is_sliding) break;
|
||||
}
|
||||
GGML_ASSERT(target_il >= 0 && "Gemma4 MTP could not find a matching target KV layer");
|
||||
int n_head_tgt = tgt_model->hparams.n_head(target_il);
|
||||
GGML_ASSERT(tgt_model->hparams.n_embd_head_k(target_il) == n_embd_head);
|
||||
auto & target_layer = tgt_model->layers[target_il];
|
||||
auto split_wq = (const ggml_split_tensor_t *)target_layer.wq->extra;
|
||||
auto split_wo = (const ggml_split_tensor_t *)target_layer.wo->extra;
|
||||
GGML_ASSERT(split_wq && split_wo);
|
||||
std::vector<int> q_split(split_wq->n_device, 0);
|
||||
std::vector<int> o_split(split_wo->n_device, 0);
|
||||
for (int id = 0; id < split_wq->n_device; ++id) {
|
||||
if (split_wq->splits[id]) {
|
||||
int nh = split_wq->splits[id]->ne[1] / n_embd_head;
|
||||
GGML_ASSERT((nh*n_head) % n_head_tgt == 0);
|
||||
q_split[id] = ((nh*n_head)/n_head_tgt)*n_embd_head;
|
||||
}
|
||||
}
|
||||
for (int id = 0; id < split_wo->n_device; ++id) {
|
||||
if (split_wo->splits[id]) {
|
||||
int64_t no = split_wo->splits[id]->ne[0] * layer.wo->ne[0];
|
||||
GGML_ASSERT(no % target_layer.wo->ne[0] == 0);
|
||||
o_split[id] = no / target_layer.wo->ne[0];
|
||||
}
|
||||
}
|
||||
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, q_split, mem_used);
|
||||
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, o_split, mem_used);
|
||||
if (layer.attn_q_norm) {
|
||||
prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, o_split, mem_used);
|
||||
}
|
||||
}
|
||||
else if (layer.wo && layer.wq && layer.wk && (layer.wv || model.arch == LLM_ARCH_GEMMA4)) {
|
||||
auto granularity_kq = hparams.n_embd_head_k(il) * gqa_ratio;
|
||||
int wq_ne1 = layer.wq->ne[1];
|
||||
@ -4681,11 +4760,15 @@ bool create_tensors_helper::create_tensors() {
|
||||
}
|
||||
if (layer.wqkv_gate) {
|
||||
auto wqkv_gate_split = split_kq;
|
||||
LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
|
||||
for (auto & s : wqkv_gate_split) {
|
||||
s /= hparams.n_embd_head_k(il);
|
||||
LLAMA_LOG_DEBUG(" %d", s);
|
||||
if (model.arch == LLM_ARCH_LAGUNA && layer.wqkv_gate->ne[1] == layer.wo->ne[0]) {
|
||||
// Full-width Laguna M.1 gates follow the value/output partition.
|
||||
// Head-wise gates still follow the K/Q partition collapsed by head size.
|
||||
wqkv_gate_split = split_vo;
|
||||
} else {
|
||||
for (auto & s : wqkv_gate_split) s /= hparams.n_embd_head_k(il);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("=================== wqkv_gate_split:");
|
||||
for ([[maybe_unused]] auto s : wqkv_gate_split) LLAMA_LOG_DEBUG(" %d", s);
|
||||
LLAMA_LOG_DEBUG("\n");
|
||||
prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used);
|
||||
}
|
||||
|
||||
@ -76,7 +76,7 @@ struct llama_file::impl {
|
||||
return ret;
|
||||
}
|
||||
|
||||
impl(const char * fname, const char * mode) {
|
||||
impl(const char * fname, const char * mode) : path(fname) {
|
||||
fp = ggml_fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
@ -155,13 +155,15 @@ struct llama_file::impl {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
std::string path;
|
||||
|
||||
~impl() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
#else
|
||||
impl(const char * fname, const char * mode) {
|
||||
impl(const char * fname, const char * mode) : path(fname) {
|
||||
fp = ggml_fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
@ -231,6 +233,7 @@ struct llama_file::impl {
|
||||
void write_u32(uint32_t val) const {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
std::string path;
|
||||
|
||||
~impl() {
|
||||
if (fp) {
|
||||
@ -681,3 +684,5 @@ const bool llama_mlock::SUPPORTED = false;
|
||||
size_t llama_path_max() {
|
||||
return PATH_MAX;
|
||||
}
|
||||
|
||||
const std::string & llama_file::get_path() const { return pimpl->path; }
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
struct llama_file;
|
||||
struct llama_mmap;
|
||||
@ -28,6 +29,7 @@ struct llama_file {
|
||||
|
||||
void write_raw(const void * ptr, size_t len) const;
|
||||
void write_u32(uint32_t val) const;
|
||||
const std::string & get_path() const;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
|
||||
@ -1053,6 +1053,7 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
|
||||
// Returns false if cancelled by progress_callback
|
||||
bool llama_model_loader::load_all_data(
|
||||
struct ggml_context * ctx,
|
||||
[[maybe_unused]] llama_model * model,
|
||||
llama_buf_map & bufs_mmap,
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
@ -1083,7 +1084,7 @@ bool llama_model_loader::load_all_data(
|
||||
for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
|
||||
auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
|
||||
if (buffer_type == cuda_buffer_type) {
|
||||
cuda_backend = ggml_backend_cuda_init(i, nullptr);
|
||||
cuda_backend = ggml_backend_cuda_init(i, nullptr, model);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -184,6 +184,7 @@ struct llama_model_loader {
|
||||
// Returns false if cancelled by progress_callback
|
||||
bool load_all_data(
|
||||
struct ggml_context * ctx,
|
||||
struct llama_model * model,
|
||||
llama_buf_map & bufs_mmap,
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
|
||||
@ -12,6 +12,8 @@
|
||||
#include <unordered_map>
|
||||
#include <set>
|
||||
|
||||
#include "llama-reload-info.h"
|
||||
|
||||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
@ -552,6 +554,8 @@ struct llama_model {
|
||||
|
||||
std::vector<float> splits;
|
||||
ggml_backend_buffer_type_t split_buft = nullptr;
|
||||
|
||||
std::unique_ptr<reload_info> reload;
|
||||
};
|
||||
|
||||
struct llama_lora_weight {
|
||||
|
||||
60
src/llama-reload-info.h
Normal file
60
src/llama-reload-info.h
Normal file
@ -0,0 +1,60 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <atomic>
|
||||
#include <sys/stat.h>
|
||||
#include <fstream>
|
||||
|
||||
struct llama_model;
|
||||
struct llama_model_loader;
|
||||
|
||||
struct tensor_reload_source {
|
||||
std::string path;
|
||||
size_t data_offset = 0;
|
||||
size_t nbytes = 0;
|
||||
int64_t last_mtime = 0;
|
||||
int64_t last_mtime_ns = 0;
|
||||
|
||||
ggml_backend_buffer_t original_buffer = nullptr;
|
||||
void * original_data = nullptr;
|
||||
ggml_type original_type = GGML_TYPE_COUNT;
|
||||
size_t original_nbytes = 0;
|
||||
int64_t original_ne[GGML_MAX_DIMS];
|
||||
size_t original_nb[GGML_MAX_DIMS];
|
||||
|
||||
struct split_info {
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
size_t nb[GGML_MAX_DIMS];
|
||||
void * data;
|
||||
ggml_backend_buffer_t buffer;
|
||||
struct ggml_tensor * tensor = nullptr;
|
||||
};
|
||||
std::vector<split_info> original_splits;
|
||||
|
||||
std::vector<std::string> sibling_names;
|
||||
ggml_split_tensor_t * original_extra = nullptr;
|
||||
|
||||
enum class reload_state {
|
||||
UNINITIALIZED,
|
||||
ON_ORIGINAL,
|
||||
DETACHED,
|
||||
FALLBACK_CPU
|
||||
};
|
||||
reload_state state = reload_state::UNINITIALIZED;
|
||||
};
|
||||
|
||||
struct reload_info {
|
||||
std::unordered_map<std::string, tensor_reload_source> tensor_reload_sources;
|
||||
std::atomic<bool> reload_snapshots_done{false};
|
||||
|
||||
reload_info(const llama_model_loader & ml);
|
||||
|
||||
bool reload_tensor(const char * name, llama_model & model);
|
||||
bool reload_changed_tensors(llama_model & model);
|
||||
void snapshot_all_reload_tensors(llama_model & model);
|
||||
};
|
||||
947
src/llama-reload.cpp
Normal file
947
src/llama-reload.cpp
Normal file
@ -0,0 +1,947 @@
|
||||
#include "llama-reload-info.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-model-loader.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Debug helpers
|
||||
// ------------------------------------------------------------------
|
||||
static void log_tensor_state(const char * ctx, struct ggml_tensor * t) {
|
||||
#ifndef NDEBUG
|
||||
if (!t) {
|
||||
LLAMA_LOG_DEBUG("%s: tensor=NULL\n", ctx);
|
||||
return;
|
||||
}
|
||||
const char * buft_name = "null";
|
||||
if (t->buffer) {
|
||||
auto buft = ggml_backend_buffer_get_type(t->buffer);
|
||||
if (buft) buft_name = ggml_backend_buft_name(buft);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: tensor='%s' type=%s ne={%ld,%ld,%ld,%ld} nb={%zu,%zu,%zu,%zu} "
|
||||
"buffer=%p data=%p extra=%p buft=%s\n",
|
||||
ctx, t->name, ggml_type_name(t->type),
|
||||
(long)t->ne[0], (long)t->ne[1], (long)t->ne[2], (long)t->ne[3],
|
||||
t->nb[0], t->nb[1], t->nb[2], t->nb[3],
|
||||
(void*)t->buffer, t->data, (void*)t->extra, buft_name);
|
||||
#else
|
||||
(void)ctx;
|
||||
(void)t;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void log_split_state(const char * ctx, struct ggml_tensor * t) {
|
||||
#ifndef NDEBUG
|
||||
if (!t || !t->extra) {
|
||||
LLAMA_LOG_DEBUG("%s: no splits (extra=%p)\n", ctx, (void*)(t ? t->extra : nullptr));
|
||||
return;
|
||||
}
|
||||
auto extra = (ggml_split_tensor_t *)t->extra;
|
||||
LLAMA_LOG_DEBUG("%s: tensor='%s' n_device=%d split_dim=%d\n",
|
||||
ctx, t->name, extra->n_device, extra->split_dim);
|
||||
for (int i = 0; i < extra->n_device; ++i) {
|
||||
if (!extra->splits[i]) {
|
||||
LLAMA_LOG_DEBUG("%s: split[%d]=NULL\n", ctx, i);
|
||||
continue;
|
||||
}
|
||||
const char * split_buft_name = "null";
|
||||
if (extra->splits[i]->buffer) {
|
||||
auto buft = ggml_backend_buffer_get_type(extra->splits[i]->buffer);
|
||||
if (buft) split_buft_name = ggml_backend_buft_name(buft);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: split[%d] type=%s ne={%ld,%ld,%ld,%ld} nb={%zu,%zu,%zu,%zu} "
|
||||
"buffer=%p data=%p buft=%s\n",
|
||||
ctx, i, ggml_type_name(extra->splits[i]->type),
|
||||
(long)extra->splits[i]->ne[0], (long)extra->splits[i]->ne[1],
|
||||
(long)extra->splits[i]->ne[2], (long)extra->splits[i]->ne[3],
|
||||
extra->splits[i]->nb[0], extra->splits[i]->nb[1],
|
||||
extra->splits[i]->nb[2], extra->splits[i]->nb[3],
|
||||
(void*)extra->splits[i]->buffer, extra->splits[i]->data, split_buft_name);
|
||||
}
|
||||
#else
|
||||
(void)ctx;
|
||||
(void)t;
|
||||
#endif
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// GGUF header parser (reuses llama.cpp / ggml GGUF loader)
|
||||
// ------------------------------------------------------------------
|
||||
static bool gguf_find_tensor_meta(const char * path, const char * target_name,
|
||||
size_t & out_offset, size_t & out_nbytes,
|
||||
ggml_type & out_type,
|
||||
int64_t out_ne[GGML_MAX_DIMS])
|
||||
{
|
||||
struct ggml_context * ctx = nullptr;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
struct gguf_context * gguf = gguf_init_from_file(path, params);
|
||||
if (!gguf) {
|
||||
return false;
|
||||
}
|
||||
const int idx = gguf_find_tensor(gguf, target_name);
|
||||
if (idx < 0) {
|
||||
ggml_free(ctx);
|
||||
gguf_free(gguf);
|
||||
return false;
|
||||
}
|
||||
struct ggml_tensor * tensor = ggml_get_tensor(ctx, target_name);
|
||||
if (!tensor) {
|
||||
ggml_free(ctx);
|
||||
gguf_free(gguf);
|
||||
return false;
|
||||
}
|
||||
|
||||
out_offset = gguf_get_data_offset(gguf) + gguf_get_tensor_offset(gguf, idx);
|
||||
out_nbytes = ggml_nbytes(tensor);
|
||||
out_type = tensor->type;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
out_ne[i] = tensor->ne[i];
|
||||
}
|
||||
|
||||
ggml_free(ctx);
|
||||
gguf_free(gguf);
|
||||
return true;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Buffer census helper
|
||||
// ------------------------------------------------------------------
|
||||
static size_t count_buffer_users(
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & tensors_by_name,
|
||||
ggml_backend_buffer_t buf)
|
||||
{
|
||||
if (!buf) return 0;
|
||||
size_t n = 0;
|
||||
for (auto & p : tensors_by_name) {
|
||||
if (p.second->buffer == buf) ++n;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
// True when `buf` is recorded in the model's reload info as the original
// buffer of some tensor, or as the buffer of one of its original splits.
// False for a null buffer or a model without reload info.
static bool is_original_snapshot_buffer(llama_model & model, ggml_backend_buffer_t buf) {
    if (buf == nullptr || !model.reload) {
        return false;
    }

    for (const auto & entry : model.reload->tensor_reload_sources) {
        const auto & source = entry.second;
        if (source.original_buffer == buf) {
            return true;
        }

        const bool held_by_split = std::any_of(
            source.original_splits.begin(), source.original_splits.end(),
            [buf](const tensor_reload_source::split_info & split) { return split.buffer == buf; });
        if (held_by_split) {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Final size estimator
|
||||
// ------------------------------------------------------------------
|
||||
static size_t llama_model_compute_final_nbytes(struct ggml_tensor * tensor, ggml_type new_type) {
|
||||
if (new_type == tensor->type) {
|
||||
return ggml_nbytes(tensor);
|
||||
}
|
||||
return ggml_row_size(new_type, tensor->ne[0]) * ggml_nrows(tensor);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Fallback allocator
|
||||
// ------------------------------------------------------------------
|
||||
// Allocates `size` bytes from buffer type `buft`; if that fails and `buft`
// is not already the CPU buffer type, retries once on the CPU buffer type.
// Returns the buffer, or nullptr when every attempt failed. The caller must
// inspect the returned buffer's type to learn whether the fallback was used.
static ggml_backend_buffer_t alloc_buffer_fallback(ggml_backend_buffer_type_t buft, size_t size) {
    if (ggml_backend_buffer_t primary = ggml_backend_buft_alloc_buffer(buft, size)) {
        LLAMA_LOG_DEBUG("%s: allocated %zu bytes on backend '%s'\n",
                        __func__, size, ggml_backend_buft_name(buft));
        return primary;
    }

    ggml_backend_buffer_type_t host_buft = ggml_backend_cpu_buffer_type();
    if (buft == host_buft) {
        // The request was already for CPU memory: nothing left to fall back to.
        LLAMA_LOG_WARN("%s: CPU alloc failed (%zu bytes)\n", __func__, size);
        return nullptr;
    }

    LLAMA_LOG_WARN("%s: backend alloc failed (%zu bytes on '%s'), trying CPU fallback\n",
                   __func__, size, ggml_backend_buft_name(buft));

    ggml_backend_buffer_t host_buf = ggml_backend_buft_alloc_buffer(host_buft, size);
    if (host_buf) {
        LLAMA_LOG_DEBUG("%s: allocated %zu bytes on CPU fallback\n", __func__, size);
        return host_buf;
    }

    LLAMA_LOG_WARN("%s: CPU fallback alloc failed (%zu bytes)\n", __func__, size);
    return nullptr;
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// MoE sibling resync
|
||||
// ------------------------------------------------------------------
|
||||
// MoE layers have three weight tensors per block: gate, up, down.
|
||||
// The CUDA split backend distributes each tensor across GPUs by splitting
|
||||
// one dimension (usually dim 0 or 1). Split boundaries must be multiples
|
||||
// of the quantization block size (e.g. 256 for IQ1_KT). If the reference
|
||||
// tensor changes quantization type, its block size changes, which changes
|
||||
// the valid split boundaries. ALL siblings in the same layer MUST adopt
|
||||
// the SAME per-device split dimensions, otherwise the backend dispatches
|
||||
// rows to the wrong devices and corrupts inference.
|
||||
//
|
||||
// When the reference tensor is back on its original snapshot, siblings
|
||||
// can simply be reattached to their original snapshots too -- no data
|
||||
// movement or allocation is required.
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Sibling name registration
|
||||
// ------------------------------------------------------------------
|
||||
static void populate_moe_siblings(const char * name, tensor_reload_source & src) {
|
||||
LLAMA_LOG_DEBUG("%s: name='%s'\n", __func__, name);
|
||||
|
||||
static const char * suffixes[] = {
|
||||
".ffn_down_exps.weight",
|
||||
".ffn_up_exps.weight",
|
||||
".ffn_gate_exps.weight",
|
||||
};
|
||||
std::string n(name);
|
||||
for (const char * sfx : suffixes) {
|
||||
size_t pos = n.find(sfx);
|
||||
if (pos == std::string::npos) continue;
|
||||
std::string base = n.substr(0, pos);
|
||||
for (const char * other : suffixes) {
|
||||
if (strcmp(other, sfx) != 0) {
|
||||
src.sibling_names.push_back(base + other);
|
||||
LLAMA_LOG_DEBUG("%s: registered sibling '%s' for '%s'\n",
|
||||
__func__, (base + other).c_str(), name);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: '%s' no MoE suffix matched\n", __func__, name);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Snapshot helper
|
||||
// ------------------------------------------------------------------
|
||||
static void snapshot_tensor_source(struct ggml_tensor * tensor,
|
||||
tensor_reload_source & src)
|
||||
{
|
||||
if (!tensor || src.original_buffer != nullptr) return;
|
||||
|
||||
src.original_buffer = tensor->buffer;
|
||||
src.original_data = tensor->data;
|
||||
src.original_nbytes = ggml_nbytes(tensor);
|
||||
src.original_type = tensor->type;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
src.original_ne[i] = tensor->ne[i];
|
||||
src.original_nb[i] = tensor->nb[i];
|
||||
}
|
||||
auto extra = (ggml_split_tensor_t *)tensor->extra;
|
||||
if (extra) {
|
||||
src.original_extra = extra;
|
||||
src.original_splits.clear();
|
||||
for (int i = 0; i < extra->n_device; ++i) {
|
||||
tensor_reload_source::split_info si;
|
||||
if (extra->splits[i]) {
|
||||
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
||||
si.ne[j] = extra->splits[i]->ne[j];
|
||||
si.nb[j] = extra->splits[i]->nb[j];
|
||||
}
|
||||
si.data = extra->splits[i]->data;
|
||||
si.buffer = extra->splits[i]->buffer;
|
||||
si.tensor = extra->splits[i];
|
||||
}
|
||||
src.original_splits.push_back(si);
|
||||
}
|
||||
}
|
||||
populate_moe_siblings(ggml_get_name(tensor), src);
|
||||
src.state = tensor_reload_source::reload_state::ON_ORIGINAL;
|
||||
log_tensor_state("snapshot_tensor_source", tensor);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Constructor
|
||||
// ------------------------------------------------------------------
|
||||
// Builds the reload table from the loader's weights: one entry per weight
// that has a tensor, a valid file index, and a file that can be stat()'ed.
// Each entry records the file path, data offset, byte size and the file's
// modification time at construction.
reload_info::reload_info(const llama_model_loader & ml) {
    const int n_files = (int)ml.files.size();

    for (const auto & weight : ml.weights) {
        const bool usable = weight.tensor && weight.idx < n_files;
        if (!usable) continue;

        const auto & file_path = ml.files[weight.idx]->get_path();

        struct stat file_stat;
        if (stat(file_path.c_str(), &file_stat) != 0) continue;

        tensor_reload_source entry;
        entry.path        = file_path;
        entry.data_offset = weight.offs;
        entry.nbytes      = ggml_nbytes(weight.tensor);
        entry.last_mtime  = file_stat.st_mtime;
#ifdef __linux__
        // Sub-second part of the modification time.
        entry.last_mtime_ns = file_stat.st_mtim.tv_nsec;
#endif
        tensor_reload_sources[ggml_get_name(weight.tensor)] = std::move(entry);
    }
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Eager snapshot
|
||||
// ------------------------------------------------------------------
|
||||
// Takes a snapshot of every tensor in the reload table that exists in the
// model, then of any registered sibling that still has no snapshot.
// Runs at most once: `reload_snapshots_done` is flipped with exchange(), so
// only the first caller gets past the guard.
void reload_info::snapshot_all_reload_tensors(llama_model & model) {
    const bool already_done = this->reload_snapshots_done.exchange(true);
    if (already_done) return;

    LLAMA_LOG_INFO("%s: eager snapshot of all reload tensors + siblings\n", __func__);

    // Linear lookup of a model tensor by name; nullptr when absent.
    const auto lookup = [&model](const std::string & wanted) -> struct ggml_tensor * {
        for (auto & named : model.tensors_by_name) {
            if (named.first == wanted) return named.second;
        }
        return nullptr;
    };

    // Pass 1: every table entry that has a matching model tensor.
    for (auto & entry : tensor_reload_sources) {
        struct ggml_tensor * tensor = lookup(entry.first);
        if (tensor) {
            snapshot_tensor_source(tensor, entry.second);
        }
    }

    // Pass 2: siblings that are in the table but were not snapshotted above.
    for (auto & entry : tensor_reload_sources) {
        for (const auto & sibling_name : entry.second.sibling_names) {
            auto sibling_entry = this->tensor_reload_sources.find(sibling_name);
            if (sibling_entry == this->tensor_reload_sources.end()) continue;
            if (sibling_entry->second.original_buffer != nullptr) continue;

            struct ggml_tensor * sibling = lookup(sibling_name);
            if (sibling) {
                snapshot_tensor_source(sibling, sibling_entry->second);
            }
        }
    }
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Re-attachment helper
|
||||
// ------------------------------------------------------------------
|
||||
// Points the tensor called `name` (and its per-device splits, when the
// snapshot recorded split metadata) back at the buffers, data pointers, type,
// shape and strides captured in its snapshot.
//
// A buffer currently attached to the tensor or to a split is freed only when
// the table entry is not marked ON_ORIGINAL. Returns false when the table has
// no entry for `name`, the entry has no snapshot, or the model has no tensor
// with that name; otherwise marks the entry ON_ORIGINAL and returns true.
static bool reattach_split_tensor_to_shared(llama_model & model, const char * name) {
    using reload_state = tensor_reload_source::reload_state;

    auto & sources = model.reload->tensor_reload_sources;
    auto found = sources.find(name);
    if (found == sources.end()) return false;

    tensor_reload_source & src = found->second;
    if (!src.original_buffer) return false;

    struct ggml_tensor * tensor = nullptr;
    for (auto & named : model.tensors_by_name) {
        if (named.first == name) { tensor = named.second; break; }
    }
    if (!tensor) return false;

    // Already attached to the snapshot buffer: only the state needs updating.
    if (tensor->buffer == src.original_buffer) {
        log_tensor_state("reattach_split_tensor_to_shared", tensor);
        src.state = reload_state::ON_ORIGINAL;
        return true;
    }

    // `src.state` is not modified until the end, so evaluate the test once.
    const bool may_free = (src.state != reload_state::ON_ORIGINAL);

    if (tensor->buffer && may_free) {
        ggml_backend_buffer_free(tensor->buffer);
    }

    tensor->buffer = src.original_buffer;
    tensor->data   = src.original_data;
    tensor->type   = src.original_type;
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        tensor->ne[d] = src.original_ne[d];
        tensor->nb[d] = src.original_nb[d];
    }

    if (src.original_extra) {
        tensor->extra = src.original_extra;
        auto extra = (ggml_split_tensor_t *)tensor->extra;

        for (int i = 0; i < extra->n_device && i < (int)src.original_splits.size(); ++i) {
            auto & saved = src.original_splits[i];

            auto * part = extra->splits[i];
            if (!part) {
                // Restore a split pointer that was cleared, if one was saved.
                if (!saved.tensor) continue;
                part = saved.tensor;
                extra->splits[i] = part;
            }

            if (part->buffer && part->buffer != saved.buffer && may_free) {
                ggml_backend_buffer_free(part->buffer);
            }

            part->data   = saved.data;
            part->buffer = saved.buffer;
            part->type   = src.original_type;
            for (int d = 0; d < GGML_MAX_DIMS; ++d) {
                part->ne[d] = saved.ne[d];
                part->nb[d] = saved.nb[d];
            }
        }
    }

    src.state = reload_state::ON_ORIGINAL;
    return true;
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// MoE sibling resync
|
||||
// ------------------------------------------------------------------
|
||||
// Makes the MoE expert tensors that share a layer with `ref_tensor` use the
// same per-device split sizes as the reference tensor.
//
//   ref_name    name of the reference tensor; must contain one of the three
//               expert suffixes, otherwise the function returns immediately.
//   ref_tensor  the reference tensor; must carry split metadata in `extra`.
//   ctx_tmp     unused.
//
// If the reference tensor sits on its snapshot buffer, the siblings are simply
// re-attached to their own snapshots. Otherwise each sibling whose split
// layout differs is read back to host memory, re-allocated with the reference
// tensor's split sizes, and written again.
static void resync_moe_sibling_splits(
        llama_model & model,
        struct ggml_context * /*ctx_tmp*/,
        struct ggml_tensor * ref_tensor,
        const char * ref_name)
{
    std::string name_str(ref_name);
    std::string layer_prefix;
    std::vector<std::string> suffixes;

    // Work out which expert tensor the reference is; the other two are siblings.
    if (name_str.find(".ffn_down_exps.weight") != std::string::npos) {
        layer_prefix = name_str.substr(0, name_str.find(".ffn_down_exps.weight"));
        suffixes = {".ffn_up_exps.weight", ".ffn_gate_exps.weight"};
    } else if (name_str.find(".ffn_up_exps.weight") != std::string::npos) {
        layer_prefix = name_str.substr(0, name_str.find(".ffn_up_exps.weight"));
        suffixes = {".ffn_down_exps.weight", ".ffn_gate_exps.weight"};
    } else if (name_str.find(".ffn_gate_exps.weight") != std::string::npos) {
        layer_prefix = name_str.substr(0, name_str.find(".ffn_gate_exps.weight"));
        suffixes = {".ffn_up_exps.weight", ".ffn_down_exps.weight"};
    } else {
        return;
    }

    auto ref_extra = (ggml_split_tensor_t *)ref_tensor->extra;
    if (!ref_extra) return;

    auto & sources = model.reload->tensor_reload_sources;

    // Reference is on its snapshot buffer: siblings only need re-attaching.
    auto it_ref_src = sources.find(ref_name);
    if (it_ref_src != sources.end() && ref_tensor->buffer == it_ref_src->second.original_buffer) {
        for (const auto & suffix : suffixes) {
            reattach_split_tensor_to_shared(model, (layer_prefix + suffix).c_str());
        }
        return;
    }

    struct sibling_job {
        std::string name;
        struct ggml_tensor * tensor;
        ggml_split_tensor_t * extra;
        std::vector<char> host_buf;   // host copy of the sibling's data
        bool needs_resync = false;    // cleared when an allocation for this job fails
    };
    std::vector<sibling_job> jobs;

    // A negative split_dim is treated as dimension 0 throughout.
    const int ref_dim = ref_extra->split_dim < 0 ? 0 : ref_extra->split_dim;

    for (const auto & suffix : suffixes) {
        std::string sib_name = layer_prefix + suffix;
        struct ggml_tensor * sib = nullptr;
        for (auto & p : model.tensors_by_name) {
            if (p.first == sib_name) { sib = p.second; break; }
        }
        if (!sib || !sib->extra || sib == ref_tensor) continue;

        auto sib_extra = (ggml_split_tensor_t *)sib->extra;
        if (sib_extra->n_device != ref_extra->n_device) continue;

        const int sib_dim = sib_extra->split_dim < 0 ? 0 : sib_extra->split_dim;

        // A resync is needed when a device has a split on one side only, or
        // when the split sizes along the split dimension differ.
        bool need = false;
        for (int i = 0; i < ref_extra->n_device; ++i) {
            bool rh = ref_extra->splits[i] != nullptr;
            bool sh = sib_extra->splits[i] != nullptr;
            if (rh != sh) { need = true; break; }
            if (rh && sh && sib_extra->splits[i]->ne[sib_dim] != ref_extra->splits[i]->ne[ref_dim]) {
                need = true; break;
            }
        }
        if (!need) continue;

        // Read the sibling's data to host memory before its buffers are touched.
        size_t nbytes = ggml_nbytes(sib);
        std::vector<char> buf(nbytes);
        ggml_backend_tensor_get(sib, buf.data(), 0, nbytes);
        jobs.push_back({sib_name, sib, sib_extra, std::move(buf), true});
    }

    if (jobs.empty()) return;
    log_split_state("resync_moe_sibling_splits", ref_tensor);

    // Phase A: Detach / free old buffers, allocate new main handles
    for (auto & job : jobs) {
        auto sib = job.tensor;

        // The buffer type must be read before the old buffer is released.
        ggml_backend_buffer_type_t buft = sib->buffer
            ? ggml_backend_buffer_get_type(sib->buffer)
            : ggml_backend_cpu_buffer_type();

        auto it = sources.find(job.name);
        bool was_orig = (it != sources.end() && it->second.state == tensor_reload_source::reload_state::ON_ORIGINAL);

        if (sib->buffer) {
            // Snapshot buffers are kept so the tensor can return to them later.
            if (!was_orig) ggml_backend_buffer_free(sib->buffer);
            sib->buffer = nullptr;
            sib->data   = nullptr;
        }

        size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, sib);
        ggml_backend_buffer_t new_buf = alloc_buffer_fallback(buft, alloc_size);
        if (!new_buf) {
            job.needs_resync = false;
            continue;
        }
        sib->buffer = new_buf;
        sib->data   = (void*)0x1; // dummy; split backend uses extra->splits

        if (it != sources.end()) {
            it->second.state = tensor_reload_source::reload_state::DETACHED;
        }
    }

    // Phase B: Propagate dimensions & recompute strides
    for (auto & job : jobs) {
        if (!job.needs_resync) continue;
        auto sib = job.tensor;
        auto sib_extra = job.extra;
        const int sib_dim = sib_extra->split_dim < 0 ? 0 : sib_extra->split_dim;

        for (int i = 0; i < ref_extra->n_device; ++i) {
            if (!ref_extra->splits[i]) {
                if (sib_extra->splits[i]) sib_extra->splits[i] = nullptr;
                continue;
            }
            if (!sib_extra->splits[i]) continue;
            sib_extra->splits[i]->ne[sib_dim] = ref_extra->splits[i]->ne[ref_dim];
        }

        // Highest dimension whose extent is not 1. Starts at 1 rather than 0
        // because ggml_new_tensor() does not accept a zero-dimensional shape.
        int n_dims = 1;
        for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
            if (sib->ne[i] != 1) { n_dims = i + 1; break; }
        }

        // Metadata-only scratch context, used to let ggml compute the strides.
        size_t ctx_size = ggml_tensor_overhead() * (sib_extra->n_device + 4);
        if (ctx_size < 16384) ctx_size = 16384;
        struct ggml_init_params p = { ctx_size, NULL, true };
        struct ggml_context * ctx = ggml_init(p);
        if (ctx) {
            for (int i = 0; i < sib_extra->n_device; ++i) {
                if (!sib_extra->splits[i]) continue;
                auto tmp = ggml_new_tensor(ctx, sib->type, n_dims, sib_extra->splits[i]->ne);
                if (tmp) {
                    for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                        sib_extra->splits[i]->nb[j] = tmp->nb[j];
                    }
                }
            }
            ggml_free(ctx);
        }
    }

    // Phase C: Allocate GPU split buffers
    bool gpu_failed = false;
#ifdef GGML_USE_CUDA
    for (auto & job : jobs) {
        if (!job.needs_resync) continue;
        auto sib_extra = job.extra;

        for (int i = 0; i < sib_extra->n_device; ++i) {
            if (!sib_extra->splits[i]) continue;
            size_t need = ggml_nbytes(sib_extra->splits[i]);
            auto buft = ggml_backend_cuda_buffer_type(i);
            auto b = ggml_backend_buft_alloc_buffer(buft, need);
            if (!b) { gpu_failed = true; break; }
            sib_extra->splits[i]->buffer = b;
            sib_extra->splits[i]->data   = ggml_backend_buffer_get_base(b);
        }
        if (gpu_failed) break;
    }
#else
    // Without CUDA support, force CPU fallback for any resync jobs
    for (auto & job : jobs) {
        if (job.needs_resync) { gpu_failed = true; break; }
    }
#endif

    // Phase D: If any GPU alloc failed, move entire layer to CPU
    if (gpu_failed) {
        for (auto & job : jobs) {
            if (!job.needs_resync) continue;
            auto sib = job.tensor;
            auto sib_extra = job.extra;

            auto it = sources.find(job.name);
            const bool known = (it != sources.end());

            // Release split buffers, except those that belong to the snapshot.
            for (int i = 0; i < sib_extra->n_device; ++i) {
                if (sib_extra->splits[i] && sib_extra->splits[i]->buffer) {
                    bool is_orig = false;
                    if (known && i < (int)it->second.original_splits.size()) {
                        is_orig = (sib_extra->splits[i]->buffer == it->second.original_splits[i].buffer);
                    }
                    if (!is_orig) ggml_backend_buffer_free(sib_extra->splits[i]->buffer);
                    sib_extra->splits[i]->buffer = nullptr;
                    sib_extra->splits[i]->data   = nullptr;
                }
            }

            if (sib->buffer) {
                bool is_orig = (known && it->second.state == tensor_reload_source::reload_state::ON_ORIGINAL);
                if (!is_orig) ggml_backend_buffer_free(sib->buffer);
                sib->buffer = nullptr;
                sib->data   = nullptr;
            }

            size_t need = ggml_nbytes(sib);
            auto cpu = alloc_buffer_fallback(ggml_backend_cpu_buffer_type(), need);
            if (!cpu) {
                // No buffer could be attached: the tensor must not be written
                // to in Phase E.
                job.needs_resync = false;
                continue;
            }
            sib->buffer = cpu;
            sib->data   = ggml_backend_buffer_get_base(cpu);
            if (known) it->second.state = tensor_reload_source::reload_state::FALLBACK_CPU;
        }
    }

    // Phase E: Write data back
    for (auto & job : jobs) {
        // Skip jobs whose allocation failed; their tensor has no buffer to
        // receive the data.
        if (!job.needs_resync || !job.tensor->buffer) continue;
        ggml_backend_tensor_set(job.tensor, job.host_buf.data(), 0, job.host_buf.size());
    }
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// reload_tensor_split_path
|
||||
// ------------------------------------------------------------------
|
||||
// Reload path for tensors that carry split metadata (`tensor->extra`).
//
// returning_to_original == true:
//   detaches the tensor from `old_buf`, re-attaches it and every registered
//   sibling to their snapshots, and returns whether the tensor itself could
//   be re-attached.
// returning_to_original == false:
//   allocates a fresh buffer of the same buffer type as `old_buf` (CPU when
//   there was none), uploads `host_buf` into it, resyncs the MoE siblings and
//   marks the entry DETACHED. Returns false when the allocation fails.
//
// In both cases `old_buf` is freed only when the entry is not ON_ORIGINAL.
// `curr_type` is unused.
static bool reload_tensor_split_path(
        llama_model & model,
        struct ggml_tensor * tensor,
        tensor_reload_source & src,
        const std::vector<char> & host_buf,
        ggml_type curr_type,
        bool returning_to_original,
        ggml_backend_buffer_t old_buf)
{
    using reload_state = tensor_reload_source::reload_state;

    (void)curr_type;
    const char * name = ggml_get_name(tensor);

    // `src.state` is not modified before the buffer is released below.
    const bool release_old = old_buf && src.state != reload_state::ON_ORIGINAL;

    if (returning_to_original) {
        if (release_old) {
            ggml_backend_buffer_free(old_buf);
        }
        tensor->buffer = nullptr;
        tensor->data   = nullptr;

        const bool reattached = reattach_split_tensor_to_shared(model, name);
        if (!reattached) return false;

        for (const auto & sibling_name : src.sibling_names) {
            reattach_split_tensor_to_shared(model, sibling_name.c_str());
        }
        return true;
    }

    // The buffer type has to be queried while the old buffer is still alive.
    ggml_backend_buffer_type_t buft;
    if (old_buf) {
        buft = ggml_backend_buffer_get_type(old_buf);
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    if (release_old) {
        ggml_backend_buffer_free(old_buf);
    }
    tensor->buffer = nullptr;
    tensor->data   = nullptr;

    const size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
    ggml_backend_buffer_t fresh = alloc_buffer_fallback(buft, alloc_size);
    if (!fresh) return false;

    ggml_backend_tensor_alloc(fresh, tensor, ggml_backend_buffer_get_base(fresh));
    //ggml_backend_buffer_init_tensor(tensor->buffer, tensor);

    ggml_backend_tensor_set(tensor, host_buf.data(), 0, host_buf.size());
    log_tensor_state("reload_tensor_split_path", tensor);

    if (tensor->extra) {
        resync_moe_sibling_splits(model, nullptr, tensor, name);
    }

    src.state = reload_state::DETACHED;
    return true;
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// reload_tensor_non_split_path
|
||||
// ------------------------------------------------------------------
|
||||
// Reload path for tensors without split metadata.
//
// returning_to_original == true:
//   restores buffer, data pointer, type, shape and strides from the snapshot
//   and marks the entry ON_ORIGINAL.
// returning_to_original == false:
//   allocates a fresh buffer of the same buffer type as `old_buf` (CPU when
//   there was none), uploads `host_buf` and marks the entry DETACHED.
//   Returns false when the allocation fails.
//
// `old_buf` is freed only when the entry is not ON_ORIGINAL. `model` and
// `curr_type` are unused.
static bool reload_tensor_non_split_path(
        llama_model & model,
        struct ggml_tensor * tensor,
        tensor_reload_source & src,
        const std::vector<char> & host_buf,
        ggml_type curr_type,
        bool returning_to_original,
        ggml_backend_buffer_t old_buf)
{
    using reload_state = tensor_reload_source::reload_state;

    (void)model;
    (void)curr_type;
#ifndef NDEBUG
    const char * name = ggml_get_name(tensor);
#endif

    // `src.state` is not modified before the buffer is released below.
    const bool release_old = old_buf && src.state != reload_state::ON_ORIGINAL;

    if (returning_to_original) {
        if (release_old) {
            ggml_backend_buffer_free(old_buf);
        }
        tensor->buffer = src.original_buffer;
        tensor->data   = src.original_data;
        tensor->type   = src.original_type;
        for (int d = 0; d < GGML_MAX_DIMS; ++d) {
            tensor->ne[d] = src.original_ne[d];
            tensor->nb[d] = src.original_nb[d];
        }
        src.state = reload_state::ON_ORIGINAL;
        return true;
    }

    // The buffer type has to be queried while the old buffer is still alive.
    ggml_backend_buffer_type_t buft;
    if (old_buf) {
        buft = ggml_backend_buffer_get_type(old_buf);
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    if (release_old) {
        ggml_backend_buffer_free(old_buf);
    }
#ifndef NDEBUG
    else if (old_buf) {
        LLAMA_LOG_DEBUG("detaching from original snapshot buffer %p for '%s'\n", (void*)old_buf, name);
    }
#endif
    tensor->buffer = nullptr;
    tensor->data   = nullptr;

    const size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
    ggml_backend_buffer_t fresh = alloc_buffer_fallback(buft, alloc_size);
    if (!fresh) return false;

    ggml_backend_tensor_alloc(fresh, tensor, ggml_backend_buffer_get_base(fresh));
    ggml_backend_tensor_set(tensor, host_buf.data(), 0, host_buf.size());

    src.state = reload_state::DETACHED;
    return true;
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// apply_tensor_type_change
|
||||
// ------------------------------------------------------------------
|
||||
// Switches `tensor` (and its per-device splits, if any) to `curr_type` and
// recomputes the strides to match. No data is moved or allocated; only type,
// `nb`, and, for tensors split along dimension 0, the splits' `ne[0]` are
// updated.
//
// For a block-quantised type and split_dim == 0, the cumulative split
// boundaries are rounded up to a multiple of the block size, and a split that
// ends up empty is removed.
//
// Returns false when the scratch ggml context or tensor cannot be created.
// The `model` and `src` parameters are unused.
static bool apply_tensor_type_change(
        llama_model & /*model*/,
        struct ggml_tensor * tensor,
        tensor_reload_source & /*src*/,
        ggml_type curr_type)
{
#ifndef NDEBUG
    const char * name = ggml_get_name(tensor);
    (void)name;
#endif
    tensor->type = curr_type;

    // Highest dimension whose extent is not 1. Starts at 1 rather than 0
    // because ggml_new_tensor() does not accept a zero-dimensional shape
    // (the case of a tensor whose extents are all 1).
    int n_dims = 1;
    for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
        if (tensor->ne[i] != 1) { n_dims = i + 1; break; }
    }

    // Metadata-only scratch context: one tensor for the whole shape plus one
    // per device split, used to let ggml compute the strides.
    size_t ctx_size = ggml_tensor_overhead() * (1 + (tensor->extra ? ((ggml_split_tensor_t*)tensor->extra)->n_device : 0))
                    + ggml_graph_overhead_custom(1, false);
    struct ggml_init_params p = { ctx_size, NULL, true };
    struct ggml_context * ctx = ggml_init(p);
    if (!ctx) return false;

    auto tmp = ggml_new_tensor(ctx, curr_type, n_dims, tensor->ne);
    if (!tmp) { ggml_free(ctx); return false; }
    for (int i = 0; i < GGML_MAX_DIMS; ++i) tensor->nb[i] = tmp->nb[i];

    if (tensor->extra) {
        auto extra = (ggml_split_tensor_t *)tensor->extra;
        auto tt = ggml_internal_get_type_traits(curr_type);
        const int n = extra->n_device;

        // `n > 0` guards the bounds[n - 1] access below.
        if (tt.blck_size > 1 && extra->split_dim == 0 && n > 0) {
            int64_t bs = tt.blck_size;

            // bounds[i] = cumulative end of split i along dimension 0.
            std::vector<int64_t> bounds(n, 0);
            int64_t acc = 0;
            for (int i = 0; i < n; ++i) {
                if (extra->splits[i]) acc += extra->splits[i]->ne[0];
                bounds[i] = acc;
            }
            // Round every interior boundary up to a block multiple.
            for (int i = 0; i < n - 1; ++i) {
                if (bounds[i] > 0) {
                    bounds[i] = ((bounds[i] + bs - 1) / bs) * bs;
                }
            }
            // The final boundary is always the full extent.
            bounds[n - 1] = tensor->ne[0];
            // Keep the boundaries non-decreasing.
            for (int i = 1; i < n; ++i) {
                if (bounds[i] < bounds[i - 1]) bounds[i] = bounds[i - 1];
            }
            int64_t prev = 0;
            for (int i = 0; i < n; ++i) {
                if (extra->splits[i]) {
                    int64_t ne0 = bounds[i] - prev;
                    if (ne0 <= 0) {
                        extra->splits[i] = nullptr;
                    } else {
                        extra->splits[i]->ne[0] = ne0;
                    }
                }
                prev = bounds[i];
            }
        }

        // Retype every remaining split and refresh its strides.
        for (int i = 0; i < n; ++i) {
            auto split = extra->splits[i];
            if (!split) continue;
            split->type = curr_type;
            auto t = ggml_new_tensor(ctx, curr_type, n_dims, split->ne);
            if (t) {
                for (int j = 0; j < GGML_MAX_DIMS; ++j) split->nb[j] = t->nb[j];
            }
        }

        // Sanity check: the split extents must add up to the full extent
        // along the dimension the tensor is split on. Summing ne[0]
        // unconditionally cannot hold for a tensor split along another
        // dimension across several devices. A negative split_dim is checked
        // along dimension 0, as before.
        const int check_dim = extra->split_dim < 0 ? 0 : extra->split_dim;
        int64_t sum = 0;
        for (int i = 0; i < n; ++i) {
            if (extra->splits[i]) sum += extra->splits[i]->ne[check_dim];
        }
        GGML_ASSERT(sum == tensor->ne[check_dim]);
    }

    ggml_free(ctx);
    return true;
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// reload_tensor
|
||||
// ------------------------------------------------------------------
|
||||
bool reload_info::reload_tensor(const char * name, llama_model & model) {
|
||||
auto it = tensor_reload_sources.find(name);
|
||||
if (it == tensor_reload_sources.end()) return false;
|
||||
auto & src = it->second;
|
||||
|
||||
struct stat st;
|
||||
if (stat(src.path.c_str(), &st) != 0) return false;
|
||||
|
||||
bool changed = (st.st_mtime != src.last_mtime);
|
||||
#ifdef __linux__
|
||||
changed = changed || (st.st_mtim.tv_nsec != src.last_mtime_ns);
|
||||
#endif
|
||||
if (!changed) return false;
|
||||
|
||||
size_t off = 0, file_nbytes = 0;
|
||||
ggml_type curr_type = GGML_TYPE_COUNT;
|
||||
int64_t file_ne[GGML_MAX_DIMS];
|
||||
if (!gguf_find_tensor_meta(src.path.c_str(), name, off, file_nbytes, curr_type, file_ne)) return false;
|
||||
|
||||
std::ifstream file(src.path, std::ios::binary);
|
||||
if (!file) return false;
|
||||
file.seekg((std::streamoff)off);
|
||||
if (!file) return false;
|
||||
|
||||
struct ggml_tensor * tensor = nullptr;
|
||||
for (auto & p : model.tensors_by_name) {
|
||||
if (p.first == name) { tensor = p.second; break; }
|
||||
}
|
||||
if (!tensor || !src.original_buffer) return false;
|
||||
|
||||
// Refuse to swap if the on-disk shape differs from the model tensor
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
if (tensor->ne[i] != file_ne[i]) {
|
||||
LLAMA_LOG_INFO("reload_tensor: dimension mismatch for '%s': model ne[%d]=%ld, file ne[%d]=%ld — refusing swap\n",
|
||||
name, i, (long)tensor->ne[i], i, (long)file_ne[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t old_buf = tensor->buffer;
|
||||
bool returning = (curr_type == src.original_type);
|
||||
|
||||
std::vector<char> host_buf;
|
||||
if (!returning) {
|
||||
if (curr_type != tensor->type) {
|
||||
if (!apply_tensor_type_change(model, tensor, src, curr_type)) return false;
|
||||
}
|
||||
size_t need = ggml_nbytes(tensor);
|
||||
if (file_nbytes < need) return false;
|
||||
host_buf.resize(need);
|
||||
file.read(host_buf.data(), (std::streamsize)need);
|
||||
if (!file || (size_t)file.gcount() != need) return false;
|
||||
}
|
||||
|
||||
bool ok = false;
|
||||
if (tensor->extra) {
|
||||
ok = reload_tensor_split_path(model, tensor, src, host_buf, curr_type, returning, old_buf);
|
||||
} else {
|
||||
ok = reload_tensor_non_split_path(model, tensor, src, host_buf, curr_type, returning, old_buf);
|
||||
}
|
||||
|
||||
if (ok) {
|
||||
src.last_mtime = st.st_mtime;
|
||||
#ifdef __linux__
|
||||
src.last_mtime_ns = st.st_mtim.tv_nsec;
|
||||
#endif
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// reload_changed_tensors
|
||||
// ------------------------------------------------------------------
|
||||
// Scans the reload table for tensors whose backing file changed, and reloads
// them. Tensors whose on-disk type equals their snapshot type are placed
// ahead of the others. Returns true when at least one tensor was reloaded;
// in CUDA builds the CUDA graphs are invalidated in that case.
bool reload_info::reload_changed_tensors(llama_model & model) {
    snapshot_all_reload_tensors(model);

    struct job { const char * name; bool returning; };
    std::vector<job> jobs;

    for (auto & entry : tensor_reload_sources) {
        const std::string & tensor_name = entry.first;
        auto & src = entry.second;

        struct stat st;
        if (stat(src.path.c_str(), &st) != 0) continue;

        bool changed = (st.st_mtime != src.last_mtime);
#ifdef __linux__
        changed = changed || (st.st_mtim.tv_nsec != src.last_mtime_ns);
#endif
        if (!changed) continue;

        // Only the type and shape are used here; offset and size are re-read
        // by reload_tensor().
        size_t off = 0, nbytes = 0;
        ggml_type file_type = GGML_TYPE_COUNT;
        int64_t file_ne[GGML_MAX_DIMS];
        if (!gguf_find_tensor_meta(src.path.c_str(), tensor_name.c_str(), off, nbytes, file_type, file_ne)) continue;

        struct ggml_tensor * tensor = nullptr;
        for (auto & named : model.tensors_by_name) {
            if (named.first == tensor_name) { tensor = named.second; break; }
        }
        if (!tensor) continue;

        // Index of the first dimension that disagrees, or -1 when all match.
        int bad_dim = -1;
        for (int d = 0; d < GGML_MAX_DIMS && bad_dim < 0; ++d) {
            if (tensor->ne[d] != file_ne[d]) bad_dim = d;
        }
        if (bad_dim >= 0) {
            LLAMA_LOG_INFO("reload_changed_tensors: dimension mismatch for '%s': model ne[%d]=%ld, file ne[%d]=%ld — skipping\n",
                           tensor_name.c_str(), bad_dim, (long)tensor->ne[bad_dim], bad_dim, (long)file_ne[bad_dim]);
            continue;
        }

        // The name pointer refers to the map key, which outlives `jobs`.
        jobs.push_back({tensor_name.c_str(), file_type == src.original_type});
    }

    // Jobs that return to the original type sort before the rest.
    std::sort(jobs.begin(), jobs.end(), [](const job & lhs, const job & rhs) {
        return lhs.returning && !rhs.returning;
    });

    bool any_reloaded = false;
    for (auto & j : jobs) {
        if (!reload_tensor(j.name, model)) continue;
        any_reloaded = true;
        LLAMA_LOG_INFO("reloaded tensor '%s'\n", j.name);
    }

#ifdef GGML_USE_CUDA
    if (any_reloaded) {
        ggml_backend_cuda_invalidate_graphs(&model);
    }
#endif
    return any_reloaded;
}
|
||||
|
||||
@ -21,11 +21,8 @@ void llama_reset_dflash_kv_cache_state(struct llama_context * ctx) {
|
||||
ctx->dflash.kv.cache_applied_window_version = 0;
|
||||
ctx->dflash.kv.cache_valid = false;
|
||||
ctx->dflash.kv.cache_view_valid = false;
|
||||
ctx->dflash.kv.workspace_write_pos = 0;
|
||||
ctx->dflash.kv.workspace_n_filled = 0;
|
||||
ctx->dflash.kv.workspace_applied_window_version = 0;
|
||||
ctx->dflash.kv.workspace_valid = false;
|
||||
ctx->dflash.kv.workspace_sync_pending = false;
|
||||
std::fill(ctx->dflash.kv.cache_pos.begin(), ctx->dflash.kv.cache_pos.end(), 0);
|
||||
std::fill(ctx->dflash.kv.cache_slot_valid.begin(), ctx->dflash.kv.cache_slot_valid.end(), 0);
|
||||
|
||||
for (ggml_backend_buffer_t buf : ctx->dflash.kv.cache_bufs) {
|
||||
if (buf != nullptr) {
|
||||
|
||||
101
src/llama.cpp
101
src/llama.cpp
@ -682,6 +682,11 @@ bool llama_context::update_cache_copies() {
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::vector<llama_context *> & llama_all_contexts() {
|
||||
static std::vector<llama_context *> contexts;
|
||||
return contexts;
|
||||
}
|
||||
|
||||
llama_context::llama_context(const llama_model & model)
|
||||
: model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
|
||||
const auto & hparams = model.hparams;
|
||||
@ -690,6 +695,7 @@ llama_context::llama_context(const llama_model & model)
|
||||
} else {
|
||||
cache_copies.resize(2*hparams.n_layer);
|
||||
}
|
||||
llama_all_contexts().push_back(this);
|
||||
}
|
||||
|
||||
void llama_context::set_mtp_op_type(llama_mtp_op_type value) {
|
||||
@ -710,6 +716,14 @@ llama_context::~llama_context() {
|
||||
}
|
||||
|
||||
ggml_backend_buffer_free(buf_output);
|
||||
|
||||
auto & all_contexts = llama_all_contexts();
|
||||
for (auto it = all_contexts.begin(); it != all_contexts.end(); ++it) {
|
||||
if (*it == this) {
|
||||
all_contexts.erase(it);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int llama_context::max_nodes(int n_tokens, int n_kv) const {
|
||||
@ -3093,6 +3107,8 @@ static bool is_model_split_supported(const llama_model & model) {
|
||||
LLM_ARCH_QWEN35,
|
||||
LLM_ARCH_QWEN35MOE,
|
||||
LLM_ARCH_GEMMA4,
|
||||
LLM_ARCH_GEMMA4_MTP,
|
||||
LLM_ARCH_GEMMA4_ASSISTANT,
|
||||
LLM_ARCH_DEEPSEEK2,
|
||||
LLM_ARCH_GLM_DSA,
|
||||
LLM_ARCH_MISTRAL4,
|
||||
@ -3174,7 +3190,9 @@ static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_
|
||||
}
|
||||
}
|
||||
if (name == "mtp_pre_proj.weight" || name == "mtp_post_proj.weight" ||
|
||||
name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") {
|
||||
name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight" ||
|
||||
name == "nextn.post_projection.weight" || name == "nextn.pre_projection.weight" ||
|
||||
name == "rope_freqs.weight") {
|
||||
continue;
|
||||
}
|
||||
if (name == "dflash_fc.weight" || name == "dflash_hidden_norm.weight") {
|
||||
@ -3369,11 +3387,21 @@ static bool llm_load_tensors(
|
||||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
if (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
llama_model * tgt_model = nullptr;
|
||||
for (auto model : all_models) {
|
||||
if (model->arch == LLM_ARCH_GEMMA4) {
|
||||
tgt_model = model;
|
||||
}
|
||||
}
|
||||
if (tgt_model) {
|
||||
split_mode = tgt_model->split_mode;
|
||||
}
|
||||
}
|
||||
|
||||
if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) {
|
||||
const bool unsupported_gemma_split =
|
||||
model.arch == LLM_ARCH_GEMMA4_MTP ||
|
||||
model.arch == LLM_ARCH_GEMMA4_ASSISTANT ||
|
||||
(model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);
|
||||
const bool unsupported_gemma_split = model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0;
|
||||
|
||||
if (unsupported_gemma_split) {
|
||||
LLAMA_LOG_WARN("\n=========================================================\n");
|
||||
@ -3399,6 +3427,25 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) &&
|
||||
(model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT)) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
bool has_target_gemma = false;
|
||||
for (auto model : all_models) {
|
||||
if (model->arch == LLM_ARCH_GEMMA4) {
|
||||
has_target_gemma = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!has_target_gemma) {
|
||||
LLAMA_LOG_WARN("\n=======================================================\n");
|
||||
LLAMA_LOG_WARN("Split mode 'graph' requested for Gemma4-assistant model\n");
|
||||
LLAMA_LOG_WARN("but no loaded Gemma4 model found.\n");
|
||||
LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
|
||||
LLAMA_LOG_WARN("=======================================================\n\n");
|
||||
split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
}
|
||||
}
|
||||
|
||||
if (iqk_has_fancy_simd()) {
|
||||
LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is defined\n");
|
||||
@ -3969,7 +4016,7 @@ static bool llm_load_tensors(
|
||||
for (auto & it : ctx_bufs) {
|
||||
ggml_context * ctx = it.first;
|
||||
auto & bufs = it.second;
|
||||
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
if (!ml.load_all_data(ctx, &model, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -4158,6 +4205,11 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
)) {
|
||||
return -2;
|
||||
}
|
||||
|
||||
// ---- populate reload registry ONLY when hot-swap is requested ----
|
||||
if (std::getenv("LLAMA_HOTSWAP_ENABLED") != nullptr) {
|
||||
model.reload = std::make_unique<reload_info>(ml);
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
|
||||
return -1;
|
||||
@ -5488,9 +5540,6 @@ static int llama_decode_internal(
|
||||
#if IK_PRINT_TIMING
|
||||
tim1 = ggml_time_us();
|
||||
#endif
|
||||
if (lctx.dflash.kv.workspace_sync_pending) {
|
||||
llama_sync_dflash_workspace_if_pending(lctx);
|
||||
}
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
#if IK_PRINT_TIMING
|
||||
llama_synchronize(&lctx);
|
||||
@ -6718,10 +6767,19 @@ struct llama_model * llama_model_load_from_file(
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
llama_all_loaded_models().push_back(model);
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
void llama_free_model(struct llama_model * model) {
|
||||
auto & all_models = llama_all_loaded_models();
|
||||
for (auto it = all_models.begin(); it != all_models.end(); ++it) {
|
||||
if (*it == model) {
|
||||
all_models.erase(it);
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete model;
|
||||
}
|
||||
|
||||
@ -7090,7 +7148,7 @@ struct llama_context * llama_init_from_model(
|
||||
// main_gpu is a local index into model->devices throughout the codebase
|
||||
// (auto-fit assigns device_count-1, MTP clamps to [0, device_count), buffer-type
|
||||
// setup wraps with model.devices[main_gpu]). Translate to a raw device id here.
|
||||
const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
|
||||
[[maybe_unused]] const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
|
||||
? model->devices[model->main_gpu]
|
||||
: model->main_gpu;
|
||||
#if defined(GGML_USE_METAL)
|
||||
@ -7106,7 +7164,7 @@ struct llama_context * llama_init_from_model(
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params);
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params, ctx);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, main_gpu_id);
|
||||
llama_free(ctx);
|
||||
@ -7125,7 +7183,7 @@ struct llama_context * llama_init_from_model(
|
||||
params = new_params.data();
|
||||
}
|
||||
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(device, params);
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(device, params, ctx);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
||||
llama_free(ctx);
|
||||
@ -11081,3 +11139,22 @@ void llama_set_mtp_target_context(struct llama_context * ctx, struct llama_conte
|
||||
size_t llama_fill_from_utf8(void* utf8, void* cpts, void* scripts) {
|
||||
return unicode_fill_from_utf8((std::string*)utf8, (std::vector<uint32_t>*)cpts, (std::vector<std::string>*)scripts);
|
||||
}
|
||||
|
||||
|
||||
// Asks the model's reload helper to reload tensors that changed and, on success,
// drops the context's cached graphs.
//
// Returns false when `ctx` is null, when the model carries no reload helper
// (`model.reload` is empty), or when the helper itself reports false.
// Returns true only after the helper succeeded and the caches were reset.
bool llama_reload_changed_tensors(struct llama_context * ctx) {
    if (ctx == nullptr) {
        return false;
    }

    // The context exposes its model as const; the reload helper needs a mutable one.
    auto & mdl = const_cast<llama_model &>(ctx->model);
    if (!mdl.reload) {
        return false;
    }

    if (!mdl.reload->reload_changed_tensors(mdl)) {
        return false;
    }

    // Cached compute graphs must be rebuilt so they pick up the new tensor pointers/sizes.
    ctx->prev.reset();
    ctx->prev_mtp.reset();
    return true;
}
|
||||
|
||||
// Accessor for the single list of model pointers shared by all callers.
// The list is a function-local static, so it is created on first use and the
// same instance is returned (by reference) on every call.
std::vector<llama_model *> & llama_all_loaded_models() {
    static std::vector<llama_model *> loaded_models;
    return loaded_models;
}
|
||||
|
||||
@ -62,6 +62,9 @@ static void test_nemotron_tool_format(testing & t);
|
||||
static void test_cohere_reasoning_detection(testing & t);
|
||||
static void test_cohere_analysis(testing & t);
|
||||
|
||||
// End-to-end Cohere2MoE (North Code) dedicated PEG parser coverage.
|
||||
static void test_cohere2moe_parser(testing & t);
|
||||
|
||||
// SmolLM3 template analysis tests
|
||||
static void test_smollm3_analysis(testing & t);
|
||||
|
||||
@ -98,6 +101,7 @@ int main(int argc, char * argv[]) {
|
||||
t.test("segments", test_marker_separation);
|
||||
t.test("seed_oss_diffs", test_seed_oss_tool_analysis);
|
||||
t.test("cohere", test_cohere_analysis);
|
||||
t.test("cohere2moe_parser", test_cohere2moe_parser);
|
||||
t.test("nemotron", test_nemotron_analysis);
|
||||
t.test("smollm3", test_smollm3_analysis);
|
||||
t.test("standard_json_tools", test_standard_json_tools_formats);
|
||||
@ -1967,3 +1971,157 @@ static void test_tagged_args_with_embedded_quotes(testing & t) {
|
||||
}
|
||||
}
|
||||
|
||||
// End-to-end coverage for the dedicated Cohere2MoE (North Code) parser:
|
||||
// template apply -> PEG parse -> assert message. Exercises the reasoning-mode
|
||||
// matrix, including the unopened-thinking-under---reasoning-off case (#1968
|
||||
// follow-up). Routing rule: reasoning surfaces to reasoning_content whenever the
|
||||
// output format != NONE (DEEPSEEK), and folds into content under NONE.
|
||||
static void test_cohere2moe_parser(testing & t) {
|
||||
std::ifstream fin("models/templates/Cohere2MoE.jinja", std::ios::binary);
|
||||
std::ostringstream buf; buf << fin.rdbuf();
|
||||
std::string src = buf.str();
|
||||
t.assert_true("Cohere2MoE template loaded", src.length() > 0);
|
||||
if (src.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
common_chat_templates_ptr tmpls(common_chat_templates_init(/* model = */ nullptr, src));
|
||||
|
||||
common_chat_tool special_function{
|
||||
/* .name = */ "special_function",
|
||||
/* .description = */ "I'm special",
|
||||
/* .parameters = */ R"({"type":"object","properties":{"arg1":{"type":"integer"}},"required":["arg1"]})",
|
||||
};
|
||||
common_chat_tool python{
|
||||
/* .name = */ "python",
|
||||
/* .description = */ "Run Python code",
|
||||
/* .parameters = */ R"({"type":"object","properties":{"code":{"type":"string"}},"required":["code"]})",
|
||||
};
|
||||
|
||||
common_chat_msg user;
|
||||
user.role = "user";
|
||||
user.content = "Hey";
|
||||
|
||||
const std::string act =
|
||||
"<|START_ACTION|>[\n"
|
||||
" {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
|
||||
"]<|END_ACTION|>";
|
||||
const std::string act_numeric_id =
|
||||
"<|START_ACTION|>[\n"
|
||||
" {\"tool_call_id\": 0, \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
|
||||
"]<|END_ACTION|>";
|
||||
const std::string text_resp = "<|START_TEXT|>Hello, world!<|END_TEXT|>";
|
||||
|
||||
struct cohere_case {
|
||||
const char * name;
|
||||
std::string input;
|
||||
common_reasoning_format reasoning_format;
|
||||
bool enable_thinking;
|
||||
common_chat_tool_choice tool_choice;
|
||||
std::string exp_content;
|
||||
std::string exp_reasoning;
|
||||
size_t exp_tool_calls;
|
||||
};
|
||||
|
||||
const std::vector<cohere_case> cases = {
|
||||
// #1968 follow-up fix: orphaned thinking (no <|START_THINKING|>) under --reasoning off.
|
||||
{ "unopened/DEEPSEEK -> reasoning_content", "I'm\nthinking<|END_THINKING|>" + act,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "I'm\nthinking", 1 },
|
||||
{ "unopened/NONE -> content", "I'm\nthinking<|END_THINKING|>" + act,
|
||||
COMMON_REASONING_FORMAT_NONE, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "I'm\nthinking", "", 1 },
|
||||
{ "unopened/DEEPSEEK/required -> reasoning_content", "I'm\nthinking<|END_THINKING|>" + act,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_REQUIRED, "", "I'm\nthinking", 1 },
|
||||
{ "unopened text/DEEPSEEK -> reasoning_content + content", "I'm\nthinking<|END_THINKING|>" + text_resp,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "Hello, world!", "I'm\nthinking", 0 },
|
||||
{ "tool-choice-none text/DEEPSEEK -> reasoning_content + content", "I'm\nthinking<|END_THINKING|>" + text_resp,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_NONE, "Hello, world!", "I'm\nthinking", 0 },
|
||||
// Regression: reasoning enabled still routes thinking to reasoning_content.
|
||||
{ "thinking-on/DEEPSEEK -> reasoning_content", "I'm\nthinking<|END_THINKING|>" + act,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, true, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "I'm\nthinking", 1 },
|
||||
{ "thinking-on/NONE -> tagged content", "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + act,
|
||||
COMMON_REASONING_FORMAT_NONE, true, COMMON_CHAT_TOOL_CHOICE_AUTO,
|
||||
"<|START_THINKING|>I'm\nthinking<|END_THINKING|>", "", 1 },
|
||||
// Regression: existing #1968 shapes still parse to clean native tool calls.
|
||||
{ "bare-end/DEEPSEEK -> clean call", "<|END_THINKING|>" + act,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
|
||||
{ "bare-end/trailing-end-text/DEEPSEEK -> clean call", "<|END_THINKING|>" + act + "<|END_TEXT|>",
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
|
||||
{ "bare-end/numeric-id/DEEPSEEK -> clean call", "<|END_THINKING|>" + act_numeric_id,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
|
||||
{ "empty-block/DEEPSEEK -> clean call", "<|START_THINKING|><|END_THINKING|>" + act,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
|
||||
{ "no-thinking/DEEPSEEK -> clean call", act,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
|
||||
};
|
||||
|
||||
for (const auto & c : cases) {
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = { user };
|
||||
inputs.tools = { special_function };
|
||||
inputs.tool_choice = c.tool_choice;
|
||||
inputs.reasoning_format = c.reasoning_format;
|
||||
inputs.enable_thinking = c.enable_thinking;
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
auto pos = params.generation_prompt.rfind("<|START_THINKING|>");
|
||||
|
||||
common_peg_arena arena;
|
||||
arena.load(params.parser);
|
||||
|
||||
common_chat_parser_params pp(params);
|
||||
if (pos != std::string::npos) {
|
||||
pp.generation_prompt = params.generation_prompt.substr(0, pos);
|
||||
}
|
||||
|
||||
auto msg = common_chat_peg_parse(arena, c.input, /* is_partial = */ false, pp);
|
||||
|
||||
t.assert_equal(std::string(c.name) + " : content", c.exp_content, msg.content);
|
||||
t.assert_equal(std::string(c.name) + " : reasoning", c.exp_reasoning, msg.reasoning_content);
|
||||
t.assert_equal(std::string(c.name) + " : tool calls", c.exp_tool_calls, msg.tool_calls.size());
|
||||
if (c.exp_tool_calls == 1 && msg.tool_calls.size() == 1) {
|
||||
t.assert_equal(std::string(c.name) + " : tool name", std::string("special_function"), msg.tool_calls[0].name);
|
||||
t.assert_equal(std::string(c.name) + " : tool id", std::string("0"), msg.tool_calls[0].id);
|
||||
}
|
||||
}
|
||||
|
||||
const std::string parallel_act =
|
||||
"<|START_ACTION|>[\n"
|
||||
" {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}},\n"
|
||||
" {\"tool_call_id\": \"1\", \"tool_name\": \"python\", \"parameters\": {\"code\": \"print('hey')\"}}\n"
|
||||
"]<|END_ACTION|>";
|
||||
|
||||
common_chat_templates_inputs parallel_inputs;
|
||||
parallel_inputs.messages = { user };
|
||||
parallel_inputs.tools = { special_function, python };
|
||||
parallel_inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
parallel_inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
parallel_inputs.enable_thinking = false;
|
||||
parallel_inputs.parallel_tool_calls = true;
|
||||
|
||||
auto parallel_params = common_chat_templates_apply(tmpls.get(), parallel_inputs);
|
||||
auto parallel_pos = parallel_params.generation_prompt.rfind("<|START_THINKING|>");
|
||||
|
||||
common_peg_arena parallel_arena;
|
||||
parallel_arena.load(parallel_params.parser);
|
||||
|
||||
common_chat_parser_params parallel_pp(parallel_params);
|
||||
if (parallel_pos != std::string::npos) {
|
||||
parallel_pp.generation_prompt = parallel_params.generation_prompt.substr(0, parallel_pos);
|
||||
}
|
||||
|
||||
auto parallel_msg = common_chat_peg_parse(
|
||||
parallel_arena,
|
||||
"I'm\nthinking<|END_THINKING|>" + parallel_act,
|
||||
/* is_partial = */ false,
|
||||
parallel_pp);
|
||||
|
||||
t.assert_equal("parallel : content", std::string(""), parallel_msg.content);
|
||||
t.assert_equal("parallel : reasoning", std::string("I'm\nthinking"), parallel_msg.reasoning_content);
|
||||
t.assert_equal("parallel : tool calls", 2u, parallel_msg.tool_calls.size());
|
||||
if (parallel_msg.tool_calls.size() == 2) {
|
||||
t.assert_equal("parallel : tool 0 name", std::string("special_function"), parallel_msg.tool_calls[0].name);
|
||||
t.assert_equal("parallel : tool 0 id", std::string("0"), parallel_msg.tool_calls[0].id);
|
||||
t.assert_equal("parallel : tool 1 name", std::string("python"), parallel_msg.tool_calls[1].name);
|
||||
t.assert_equal("parallel : tool 1 id", std::string("1"), parallel_msg.tool_calls[1].id);
|
||||
}
|
||||
}
|
||||
|
||||
@ -23,6 +23,7 @@
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
static int main_automated_tests(void);
|
||||
static common_chat_msg simple_msg(const std::string & role, const std::string & content);
|
||||
|
||||
static void run_multiple(const std::string& dir_path, bool stop_on_first_failure, const json& input, bool use_common = false);
|
||||
static void run_single(const std::string& contents, json input, bool use_common = false, const std::string & output_path = "");
|
||||
@ -225,7 +226,6 @@ static std::string normalize_newlines(const std::string & s) {
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static std::string format_using_common(
|
||||
const std::string & template_str,
|
||||
const std::string & bos_token,
|
||||
@ -243,6 +243,87 @@ static std::string format_using_common(
|
||||
return output;
|
||||
}
|
||||
|
||||
static void test_minimax_m3_native_tool_parser(void) {
|
||||
const std::string template_str = R"(
|
||||
{%- set ns_token = ']<]minimax[>[' -%}
|
||||
{%- set toolcall_begin_token = ns_token ~ '<tool_call>' -%}
|
||||
{%- set toolcall_end_token = ns_token ~ '</tool_call>' -%}
|
||||
{%- for message in messages -%}
|
||||
{{- message.role ~ ': ' ~ message.content ~ '\n' -}}
|
||||
{%- endfor -%}
|
||||
{%- if tools -%}
|
||||
{{- toolcall_begin_token ~ ns_token ~ '<invoke name="example">' ~ ns_token ~ '</invoke>' ~ toolcall_end_token -}}
|
||||
{%- endif -%}
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- '<mm:think>' -}}
|
||||
{%- endif -%}
|
||||
)";
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.use_jinja = true;
|
||||
inputs.messages = { simple_msg("user", "Call a tool") };
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
inputs.enable_thinking = true;
|
||||
inputs.parallel_tool_calls = true;
|
||||
inputs.tools = {
|
||||
common_chat_tool{
|
||||
/* .name = */ "special_function",
|
||||
/* .description = */ "Test function",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"arg1": { "type": "integer" }
|
||||
},
|
||||
"required": ["arg1"]
|
||||
})",
|
||||
},
|
||||
common_chat_tool{
|
||||
/* .name = */ "python",
|
||||
/* .description = */ "Run Python",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"code": { "type": "string" }
|
||||
},
|
||||
"required": ["code"]
|
||||
})",
|
||||
},
|
||||
};
|
||||
|
||||
common_chat_templates_ptr tmpls(common_chat_templates_init(/* model= */ nullptr, template_str));
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
|
||||
assert(params.format == COMMON_CHAT_FORMAT_PEG_NATIVE);
|
||||
assert(!params.parser.empty());
|
||||
|
||||
common_peg_arena arena;
|
||||
arena.load(params.parser);
|
||||
|
||||
common_chat_parser_params parser_params(params);
|
||||
parser_params.parser = arena;
|
||||
|
||||
const std::string output =
|
||||
"Calling both</mm:think>\n"
|
||||
"]<]minimax[>[<tool_call>\n"
|
||||
"]<]minimax[>[<invoke name=\"special_function\">"
|
||||
"]<]minimax[>[<arg1>1]<]minimax[>[</arg1>"
|
||||
"]<]minimax[>[</invoke>\n"
|
||||
"]<]minimax[>[<invoke name=\"python\">"
|
||||
"]<]minimax[>[<code>print('hey')]<]minimax[>[</code>"
|
||||
"]<]minimax[>[</invoke>\n"
|
||||
"]<]minimax[>[</tool_call>";
|
||||
|
||||
auto msg = common_chat_parse(output, /* is_partial = */ false, parser_params);
|
||||
assert(msg.reasoning_content == "Calling both");
|
||||
assert(msg.content.empty());
|
||||
assert(msg.tool_calls.size() == 2);
|
||||
assert(msg.tool_calls[0].name == "special_function");
|
||||
assert(json::parse(msg.tool_calls[0].arguments) == json::parse(R"({"arg1": 1})"));
|
||||
assert(msg.tool_calls[1].name == "python");
|
||||
assert(json::parse(msg.tool_calls[1].arguments) == json::parse("{\"code\": \"print('hey')\"}"));
|
||||
}
|
||||
|
||||
|
||||
// skip libcommon, use direct jinja engine
|
||||
static jinja::value_string format_using_direct_engine(
|
||||
@ -336,6 +417,8 @@ static common_chat_msg simple_msg(const std::string & role, const std::string &
|
||||
int main_automated_tests(void) {
|
||||
// jinja::enable_debug(true);
|
||||
|
||||
test_minimax_m3_native_tool_parser();
|
||||
|
||||
std::vector<llama_chat_message> conversation {
|
||||
{"system", "You are a helpful assistant"},
|
||||
{"user", "Hello"},
|
||||
|
||||
@ -360,6 +360,18 @@ static void test_loops(testing & t) {
|
||||
json::object(),
|
||||
"012"
|
||||
);
|
||||
|
||||
// {% set %} of a non-loop variable inside a loop body must not leak across
|
||||
// iterations. When a variable is only assigned on some iterations (here via
|
||||
// a conditional branch), the engine must still treat it as undefined on the
|
||||
// iterations where the branch is not taken, rather than reusing the value
|
||||
// from a previous iteration. This matches standard Jinja2 semantics, where
|
||||
// each loop iteration starts with a fresh scope.
|
||||
test_template(t, "conditional set does not leak across iterations",
|
||||
"{%- for m in msgs %}{% if m.x %}{% set r = m.x %}{% endif %}[{{ r|default('-') }}]{% endfor %}",
|
||||
{{"msgs", json::array({json{{"x", "a"}}, json::object(), json{{"x", "c"}}})}},
|
||||
"[a][-][c]"
|
||||
);
|
||||
}
|
||||
|
||||
static void test_expressions(testing & t) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user