From bb02f74c612064947e51d23269a1cf810b67c9a7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 24 Jan 2026 17:58:45 +0100 Subject: [PATCH 01/22] chat: fix language input for translategemma (#19052) * chat: fix language input for translategemma * Update common/chat.cpp Co-authored-by: Aldehir Rojas --------- Co-authored-by: Aldehir Rojas --- common/chat.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index aba26e97a1..eeb38ad06a 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -2659,6 +2659,10 @@ static common_chat_params common_chat_params_init_translate_gemma(const common_c templates_params inputs_new = inputs; json & messages = inputs_new.messages; + // default to chat_template_kwargs, or en-GB if not specified + std::string default_src_lang = inputs.extra_context.value("source_lang_code", "en-GB"); + std::string default_tgt_lang = inputs.extra_context.value("target_lang_code", "en-GB"); + GGML_ASSERT(messages.is_array()); for (auto & message : messages) { if (message.contains("role") && message["role"].get() != "user") { @@ -2670,8 +2674,10 @@ static common_chat_params common_chat_params_init_translate_gemma(const common_c if (message.contains("content") && !message["content"].is_array()) { auto content_str = message["content"].get(); // default to en-GB if not specified (to make common_chat_format_example works) - auto src_lang = message.contains("source_lang_code") ? message["source_lang_code"].get() : "en-GB"; - auto tgt_lang = message.contains("target_lang_code") ? message["target_lang_code"].get() : "en-GB"; + auto src_lang = message.contains("source_lang_code") + ? message["source_lang_code"].get() : default_src_lang; + auto tgt_lang = message.contains("target_lang_code") + ? message["target_lang_code"].get() : default_tgt_lang; message["content"] = json::array({ json{ {"type", "text"}, From 4e5b83b226919c146fbfa754641a6772bcb2d722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 Jan 2026 21:57:51 +0100 Subject: [PATCH 02/22] GGUF: check that tensor size is representable (#19072) --- ggml/src/gguf.cpp | 8 ++++++++ tests/test-gguf.cpp | 19 +++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index bfab5c4d60..ed0d7f2cae 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -585,6 +585,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par break; } + // check that the size of the tensor in bytes is representable + if (ok && uint64_t(ggml_nelements(&info.t)/ggml_blck_size(info.t.type)) > SIZE_MAX/ggml_type_size(info.t.type)) { + GGML_LOG_ERROR("%s: tensor '%s' with shape (%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") has a size in bytes > %zu\n", + __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], SIZE_MAX); + ok = false; + break; + } + // calculate byte offsets given the tensor shape and type info.t.nb[0] = type_size; info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size); diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 3f0c312e2f..84b7f3bc49 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -1,9 +1,11 @@ #include "ggml.h" #include "ggml-backend.h" #include "../ggml/src/ggml-impl.h" +#include "gguf.h" #include #include +#include #include #include #include @@ -34,6 +36,7 @@ enum handcrafted_file_type { HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors, HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors, HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors, + HANDCRAFTED_TENSORS_NBYTES_TOO_BIG = 45 + offset_has_tensors, HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors, HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors, HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors, @@ -69,6 +72,7 @@ static std::string handcrafted_file_type_name(const enum handcrafted_file_type h case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS"; case HANDCRAFTED_TENSORS_BAD_SHAPE: return "TENSORS_BAD_SHAPE"; case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG"; + case HANDCRAFTED_TENSORS_NBYTES_TOO_BIG: return "TENSORS_NBYTES_TOO_BIG"; case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE"; case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET"; case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME"; @@ -326,7 +330,7 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft uint64_t offset = 0; for (int i = 0; i < int(tensor_configs.size()); ++i) { - const ggml_type type = tensor_configs[i].first; + const ggml_type type = hft == HANDCRAFTED_TENSORS_NBYTES_TOO_BIG ? GGML_TYPE_I64 : tensor_configs[i].first; const std::array shape = tensor_configs[i].second; std::string name = "my_tensor"; @@ -343,7 +347,7 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft } helper_write(file, name.data(), name.length()); - uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1; + uint32_t n_dims = (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG || hft == HANDCRAFTED_TENSORS_NBYTES_TOO_BIG) ? 2 : 1; for (int i = GGML_MAX_DIMS-1; i >= 1; --i) { if (shape[i] != 1) { n_dims = i + 1; @@ -358,13 +362,19 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft } if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) { + const int64_t bad_dim = -1; for (uint32_t j = 0; j < n_dims; ++j) { - const int64_t bad_dim = -1; helper_write(file, bad_dim); } } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){ + const int64_t big_dim = 4*int64_t(INT32_MAX); + for (uint32_t j = 0; j < n_dims; ++j) { + helper_write(file, big_dim); + } + } else if (hft == HANDCRAFTED_TENSORS_NBYTES_TOO_BIG){ + const size_t big_ne = SIZE_MAX/ggml_type_size(type); + const int64_t big_dim = GGML_PAD(int64_t(1.01f*std::pow(big_ne, 1.0f/n_dims)) + 1, ggml_blck_size(type)); for (uint32_t j = 0; j < n_dims; ++j) { - const int64_t big_dim = 4*int64_t(INT32_MAX); helper_write(file, big_dim); } } else { @@ -682,6 +692,7 @@ static std::pair test_handcrafted_file(const unsigned int seed) { HANDCRAFTED_TENSORS_BAD_N_DIMS, HANDCRAFTED_TENSORS_BAD_SHAPE, HANDCRAFTED_TENSORS_NE_TOO_BIG, + HANDCRAFTED_TENSORS_NBYTES_TOO_BIG, HANDCRAFTED_TENSORS_BAD_TYPE, HANDCRAFTED_TENSORS_BAD_OFFSET, HANDCRAFTED_TENSORS_DUPLICATE_NAME, From e9fd8dcab45d6cd147874e32565923bdfd0efbdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 24 Jan 2026 22:13:08 +0100 Subject: [PATCH 03/22] llama-fit-params: keep explicit --ctx-size 0 (#19070) --- common/arg.cpp | 4 ++++ include/llama.h | 1 + src/llama.cpp | 8 ++++++-- tools/fit-params/fit-params.cpp | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 163c9b71b0..98477e8117 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1231,6 +1231,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), [](common_params & params, int value) { params.n_ctx = value; + if (value == 0) { + // disable context reduction in llama_params_fit if the user explicitly requests the full context size: + params.fit_params_min_ctx = UINT32_MAX; + } } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(common_arg( diff --git a/include/llama.h b/include/llama.h index 280745713e..1507107f1a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -489,6 +489,7 @@ extern "C" { // - returns true if the parameters could be successfully modified to fit device memory // - this function is NOT thread safe because it modifies the global llama logger state // - only parameters that have the same value as in llama_default_model_params are modified + // with the exception of the context size which is modified if and only if equal to 0 LLAMA_API enum llama_params_fit_status llama_params_fit( const char * path_model, struct llama_model_params * mparams, diff --git a/src/llama.cpp b/src/llama.cpp index f1096d960e..11b75fcff9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -311,8 +311,12 @@ static void llama_params_fit_impl( __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); } } else { - LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", - __func__, hp_nct, n_ctx_min); + if (n_ctx_min == UINT32_MAX) { + LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct); + } else { + LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", + __func__, hp_nct, n_ctx_min); + } } } else { LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp index f9d9cb34c7..0176be06e7 100644 --- a/tools/fit-params/fit-params.cpp +++ b/tools/fit-params/fit-params.cpp @@ -36,7 +36,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__); common_log_flush(common_log_main()); - printf("-c %" PRIu32 " -ngl %" PRIu32, cparams.n_ctx, mparams.n_gpu_layers); + printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers); size_t nd = llama_max_devices(); while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) { From 9981c3013018b1e955f2e44dbe514032496d3f6e Mon Sep 17 00:00:00 2001 From: Bartowski <3266127+bartowski1182@users.noreply.github.com> Date: Sat, 24 Jan 2026 20:36:47 -0500 Subject: [PATCH 04/22] convert : fix conversion for inheriting models that were bypassing modify_tensors (#19064) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add undo_permute = False where needed * Replace super().modify_tensors with ModelBase * Add one more ModelBase.modify_tensors * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8cc4963fb2..d8bc95fa6a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2736,7 +2736,7 @@ class AfmoeModel(LlamaModel): data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - yield from super().modify_tensors(data_torch, merged_name, bid) + yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) return else: @@ -2745,7 +2745,7 @@ class AfmoeModel(LlamaModel): if name.endswith(".expert_bias"): name = name.replace(".expert_bias", ".expert_bias.bias") - yield from super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) @ModelBase.register( @@ -8918,7 +8918,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel): return Mamba2Model.modify_tensors(self, data_torch, name, bid) elif bid in self._attn_layers: return GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - yield from super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) def set_gguf_parameters(self): """This method merges params from both parents and some that are @@ -9050,33 +9050,33 @@ class NemotronHModel(GraniteHybridModel): if self.is_moe and bid is not None: if name.endswith("mixer.gate.e_score_correction_bias"): new_name = name.replace("e_score_correction_bias", "e_score_correction.bias") - yield from super().modify_tensors(data_torch, new_name, bid) + yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) return if name.endswith("mixer.dt_bias"): new_name = name.replace("dt_bias", "dt.bias") - yield from super().modify_tensors(data_torch, new_name, bid) + yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) return if name.endswith("mixer.conv1d.weight"): squeezed_data = data_torch.squeeze() - yield from super().modify_tensors(squeezed_data, name, bid) + yield from ModelBase.modify_tensors(self, squeezed_data, name, bid) return if name.endswith("mixer.A_log"): transformed_data = -torch.exp(data_torch) reshaped_data = transformed_data.squeeze().reshape(-1, 1) - yield from super().modify_tensors(reshaped_data, name, bid) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) return if name.endswith("mixer.D"): reshaped_data = data_torch.squeeze().reshape(-1, 1) - yield from super().modify_tensors(reshaped_data, name, bid) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) return if name.endswith("mixer.norm.weight"): reshaped_data = data_torch.reshape(self.n_group, -1) - yield from super().modify_tensors(reshaped_data, name, bid) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) return if name.find("mixer.experts") != -1: @@ -9101,7 +9101,7 @@ class NemotronHModel(GraniteHybridModel): data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - yield from super().modify_tensors(data_torch, merged_name, bid) + yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) return else: return @@ -10731,7 +10731,7 @@ class CogVLMModel(LlamaModel): if name.startswith("model.vision."): return - yield from super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) @ModelBase.register("JanusForConditionalGeneration") From 16639ba2178fc1ccbae912d678d8b515dc4dc322 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sun, 25 Jan 2026 07:31:42 +0100 Subject: [PATCH 05/22] common : use two decimal places for float arg help messages (#19048) * common : use two decimal places for float arg help messages This commit updates the help messages for various command-line arguments in arg.cpp to display floating-point default values with two decimal places instead of one. The motivation for this changes is that currently only having one decimal place means that values generated using --help or llama-gen-docs will not display the correct values. For example, currently the value of top-p in tools/server/README.md is `0.9`, but the default value is actually '0.95'. And running llama-gen-docs does not update this value as it uses the output from the help message, which shows only one decimal place, so the values look like they are unchanged. * docs : run llama-gen-docs to update docs --- common/arg.cpp | 42 +++++++++++++++---------------- tools/cli/README.md | 48 +++++++++++++++++------------------ tools/completion/README.md | 44 ++++++++++++++++---------------- tools/server/README.md | 51 +++++++++++++++++++------------------- 4 files changed, 94 insertions(+), 91 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 98477e8117..04fd375d56 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1577,7 +1577,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", - string_format("temperature (default: %.1f)", (double)params.sampling.temp), + string_format("temperature (default: %.2f)", (double)params.sampling.temp), [](common_params & params, const std::string & value) { params.sampling.temp = std::stof(value); params.sampling.temp = std::max(params.sampling.temp, 0.0f); @@ -1594,7 +1594,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam().set_env("LLAMA_ARG_TOP_K")); add_opt(common_arg( {"--top-p"}, "N", - string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), + string_format("top-p sampling (default: %.2f, 1.0 = disabled)", (double)params.sampling.top_p), [](common_params & params, const std::string & value) { params.sampling.top_p = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P; @@ -1602,7 +1602,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", - string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), + string_format("min-p sampling (default: %.2f, 0.0 = disabled)", (double)params.sampling.min_p), [](common_params & params, const std::string & value) { params.sampling.min_p = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P; @@ -1610,14 +1610,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--top-nsigma"}, "N", - string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma), + string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma), [](common_params & params, const std::string & value) { params.sampling.top_n_sigma = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", - string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), + string_format("xtc probability (default: %.2f, 0.0 = disabled)", (double)params.sampling.xtc_probability), [](common_params & params, const std::string & value) { params.sampling.xtc_probability = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY; @@ -1625,7 +1625,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", - string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), + string_format("xtc threshold (default: %.2f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), [](common_params & params, const std::string & value) { params.sampling.xtc_threshold = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD; @@ -1633,7 +1633,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", - string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), + string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p), [](common_params & params, const std::string & value) { params.sampling.typ_p = std::stof(value); } @@ -1652,7 +1652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", - string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), + string_format("penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), [](common_params & params, const std::string & value) { params.sampling.penalty_repeat = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT; @@ -1660,21 +1660,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", - string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), + string_format("repeat alpha presence penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_present), [](common_params & params, const std::string & value) { params.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", - string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), + string_format("repeat alpha frequency penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_freq), [](common_params & params, const std::string & value) { params.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", - string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), + string_format("set DRY sampling multiplier (default: %.2f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), [](common_params & params, const std::string & value) { params.sampling.dry_multiplier = std::stof(value); } @@ -1755,14 +1755,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", - string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), + string_format("dynamic temperature range (default: %.2f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), [](common_params & params, const std::string & value) { params.sampling.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", - string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), + string_format("dynamic temperature exponent (default: %.2f)", (double)params.sampling.dynatemp_exponent), [](common_params & params, const std::string & value) { params.sampling.dynatemp_exponent = std::stof(value); } @@ -1778,7 +1778,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", - string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), + string_format("Mirostat learning rate, parameter eta (default: %.2f)", (double)params.sampling.mirostat_eta), [](common_params & params, const std::string & value) { params.sampling.mirostat_eta = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA; @@ -1786,7 +1786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", - string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), + string_format("Mirostat target entropy, parameter tau (default: %.2f)", (double)params.sampling.mirostat_tau), [](common_params & params, const std::string & value) { params.sampling.mirostat_tau = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU; @@ -1920,28 +1920,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", - string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + string_format("YaRN: extrapolation mix factor (default: %.2f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", - string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.2f)", (double)params.yarn_attn_factor), [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", - string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + string_format("YaRN: high correction dim or alpha (default: %.2f)", (double)params.yarn_beta_slow), [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", - string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + string_format("YaRN: low correction dim or beta (default: %.2f)", (double)params.yarn_beta_fast), [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } @@ -3335,14 +3335,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", - string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), + string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.p_split), [](common_params & params, const std::string & value) { params.speculative.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", - string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), + string_format("minimum speculative decoding probability (greedy) (default: %.2f)", (double)params.speculative.p_min), [](common_params & params, const std::string & value) { params.speculative.p_min = std::stof(value); } diff --git a/tools/cli/README.md b/tools/cli/README.md index 3b6f0708ed..4a15cbad9d 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -45,10 +45,10 @@ | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | -| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | | `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | @@ -109,30 +109,30 @@ | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.8) | +| `--temp N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | -| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | -| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | -| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) | -| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | -| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | -| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | -| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | -| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | -| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | -| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | | `--dry-base N` | set DRY sampling base value (default: 1.75) | | `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | | `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | -| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | -| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | | `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | -| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | -| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | | `--grammar-file FNAME` | file to read grammar from | @@ -173,12 +173,12 @@ | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_DRAFT_P_MIN) | | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | | `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | diff --git a/tools/completion/README.md b/tools/completion/README.md index a16be3f684..3ca3e68454 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -128,10 +128,10 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | -| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | | `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | @@ -192,28 +192,30 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.8) | +| `--temp N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | -| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | -| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | -| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | -| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | -| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | -| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | -| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | -| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | | `--dry-base N` | set DRY sampling base value (default: 1.75) | | `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | | `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | -| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | -| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | | `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | -| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | -| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | | `--grammar-file FNAME` | file to read grammar from | @@ -251,8 +253,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/server/README.md b/tools/server/README.md index f113f9cb75..d132830171 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -63,10 +63,10 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | -| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | | `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | @@ -126,30 +126,30 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.8) | +| `--temp N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | -| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | -| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | -| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) | -| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | -| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | -| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | -| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | -| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | -| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | -| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | | `--dry-base N` | set DRY sampling base value (default: 1.75) | | `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | | `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | -| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | -| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | | `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | -| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | -| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | | `--grammar-file FNAME` | file to read grammar from | @@ -199,7 +199,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | -| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | +| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)
(env: LLAMA_ARG_CACHE_PROMPT) | +| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | | `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | @@ -212,8 +213,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | @@ -222,7 +223,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_DRAFT_P_MIN) | | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | | `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | From 24bc23830313ce13080ae393309cef080f2141c2 Mon Sep 17 00:00:00 2001 From: Jakkala Mahesh <155058658+MaheshJakkala@users.noreply.github.com> Date: Sun, 25 Jan 2026 12:40:52 +0530 Subject: [PATCH 06/22] llama: fix integer type consistency in split helpers (#18894) * llama: fix integer type consistency in split helpers * llama: apply minor style fixes * llama: remove trailing whitespace --- include/llama.h | 4 ++-- src/llama.cpp | 58 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1507107f1a..c3360ae57c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1476,12 +1476,12 @@ extern "C" { /// @details Build a split GGUF final path for this chunk. /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" // Returns the split_path length. - LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); + LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count); /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" // Returns the split_prefix length. - LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); + LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count); // Print system information LLAMA_API const char * llama_print_system_info(void); diff --git a/src/llama.cpp b/src/llama.cpp index 11b75fcff9..6da90d6f1f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1095,25 +1095,55 @@ int32_t llama_chat_apply_template( // model split // -int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { +int32_t llama_split_path( + char * split_path, + size_t maxlen, + const char * path_prefix, + int32_t split_no, + int32_t split_count) { + static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; - if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { - return strlen(split_path); + + const int written = snprintf( + split_path, + maxlen, + SPLIT_PATH_FORMAT, + path_prefix, + split_no + 1, + split_count + ); + + if (written < 0 || (size_t) written >= maxlen) { + return 0; } - return 0; + + return (int32_t) written; } -int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) { - std::string str_split_path(split_path); - char postfix[32]; - snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); - std::string str_postfix(postfix); +int32_t llama_split_prefix( + char * split_prefix, + size_t maxlen, + const char * split_path, + int32_t split_no, + int32_t split_count) { - // check if split_prefix ends with postfix - int size_prefix = str_split_path.size() - str_postfix.size(); - if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { - snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); - return size_prefix; + const std::string str_split_path(split_path); + + char postfix[32]; + snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count); + + const std::string str_postfix(postfix); + if (str_split_path.size() <= str_postfix.size()) { + return 0; + } + + const size_t size_prefix = str_split_path.size() - str_postfix.size(); + + if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) { + const size_t copy_len = std::min(size_prefix + 1, maxlen); + snprintf(split_prefix, copy_len, "%s", split_path); + + return (int32_t) size_prefix; } return 0; From 1243f93a2de868e16a9e52af55b7ab930110c04e Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Sun, 25 Jan 2026 15:11:19 +0800 Subject: [PATCH 07/22] readme: update RWKV7 model links (#19061) Signed-off-by: Molly Sophia --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 91a8f25d1c..0783e43e5c 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) +- [x] [RWKV-7](https://huggingface.co/collections/shoumenchougou/rwkv7-gxx-gguf) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1) - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) From 080b161995218bb40bfc03a6446c9e2b4c7e81e0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Jan 2026 09:12:50 +0200 Subject: [PATCH 08/22] completion : fix prompt cache for recurrent models (#19045) --- src/llama-context.cpp | 1 + tools/completion/completion.cpp | 88 ++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a35cf5a94b..fb817e1385 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2559,6 +2559,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { } } + // [TAG_CONTEXT_STATE_LOGITS] // write logits { LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index a9eda119d7..f368a2f4c6 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -342,44 +342,51 @@ int main(int argc, char ** argv) { return 1; } - // debug message about similarity of saved session, if applicable - size_t n_matching_session_tokens = 0; - if (!session_tokens.empty()) { - for (llama_token id : session_tokens) { - if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { - break; + bool session_do_save = false; + + { + size_t n_match = 0; + + if (!session_tokens.empty()) { + for (llama_token id : session_tokens) { + if (n_match >= embd_inp.size() || id != embd_inp[n_match]) { + break; + } + n_match++; + } + if (params.prompt.empty() && n_match == embd_inp.size()) { + LOG_INF("%s: using full prompt from session file\n", __func__); + } else if (n_match >= embd_inp.size()) { + LOG_INF("%s: session file has exact match for prompt!\n", __func__); + } else if (n_match < (embd_inp.size() / 2)) { + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_match, embd_inp.size()); + } else { + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_match, embd_inp.size()); + } + + if (session_tokens.size() == n_match) { + // [TAG_CONTEXT_STATE_LOGITS] + // in this case, we are going to reuse the logits from the session + // if we ever decide to remove the logits from the session, we need to handle this somehow + // ref: https://github.com/ggml-org/llama.cpp/pull/18862#issuecomment-3756330941 + } + + // remove any "future" tokens that we might have inherited from the previous session + if (session_tokens.size() > n_match) { + if (!llama_memory_seq_rm(mem, -1, n_match, -1)) { + LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__); + llama_memory_clear(mem, true); + session_tokens.clear(); + n_match = 0; + } else { + session_tokens.resize(n_match); + } } - n_matching_session_tokens++; - } - if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_INF("%s: using full prompt from session file\n", __func__); - } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_INF("%s: session file has exact match for prompt!\n", __func__); - } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); - } else { - LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); } - // remove any "future" tokens that we might have inherited from the previous session - if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) { - LOG_INF("%s: unable to resuse common prefix\n", __func__); - n_matching_session_tokens = 0; - llama_memory_seq_rm(mem, -1, -1, -1); - } - } - - LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", - embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); - - // if we will use the cache for the full prompt without reaching the end of the cache, force - // reevaluation of the last token to recalculate the cached logits - if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); - - session_tokens.resize(embd_inp.size() - 1); + session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro; } // number of tokens to keep when resetting context @@ -521,10 +528,9 @@ int main(int argc, char ** argv) { is_interacting = params.interactive_first; } - bool is_antiprompt = false; - bool input_echo = true; - bool display = true; - bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); + bool is_antiprompt = false; + bool input_echo = true; + bool display = true; int n_past = 0; int n_remain = params.n_predict; @@ -700,8 +706,8 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // optionally save the session on first sample (for faster prompt loading next time) - if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { - need_to_save_session = false; + if (session_do_save) { + session_do_save = false; llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); LOG_DBG("saved session to %s\n", path_session.c_str()); From 70d860824a72c3daaea678b74d5051fdd46d9c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 25 Jan 2026 13:05:05 +0100 Subject: [PATCH 09/22] convert : fix Gemma3N, GraniteMoe and Ernie4.5Moe (#19084) * fix Gemma3N and Ernie4.5Moe * fix GraniteMoe --- convert_hf_to_gguf.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d8bc95fa6a..b56a99c5fa 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3799,7 +3799,7 @@ class Ernie4_5MoeModel(Ernie4_5Model): merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" yield from super().modify_tensors(data_torch, merged_name, bid) else: - yield from super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -6153,7 +6153,7 @@ class Gemma3nVisionAudioModel(ConformerAudioModel): if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] - yield from super().modify_tensors(data_torch, new_name, bid) + yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") @@ -6253,7 +6253,7 @@ class Gemma3NModel(Gemma3Model): # Continue with normal processing name = name.replace("language_model.", "") - yield from super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) return if "altup_unembed_projections" in name: @@ -6270,7 +6270,7 @@ class Gemma3NModel(Gemma3Model): raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_unembd) if out is not None: - yield from super().modify_tensors(out, "model.altup_unembed_projections.weight", bid) + yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid) return else: return @@ -6287,7 +6287,7 @@ class Gemma3NModel(Gemma3Model): raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_proj) if out is not None: - yield from super().modify_tensors(out, "model.altup_projections.weight", bid) + yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid) return else: return @@ -8803,8 +8803,8 @@ class GraniteMoeModel(GraniteModel): ffn_dim = self.hparams["intermediate_size"] assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" gate, up = data_torch.split(ffn_dim, dim=-2) - yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) - yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) + yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) has_experts = bool(self.hparams.get('num_local_experts')) @@ -8813,15 +8813,15 @@ class GraniteMoeModel(GraniteModel): assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size" gate, up = data_torch.split(ffn_dim, dim=-2) if has_experts: - yield from super().modify_tensors(gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) - yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) + yield from ModelBase.modify_tensors(self, gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) return - yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) - yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) return if not has_experts and name.endswith("shared_mlp.output_linear.weight"): - yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) + yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) return yield from super().modify_tensors(data_torch, name, bid) From d9c6ce46f747189cd6238ca7699253613f77c016 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Jan 2026 15:48:56 +0200 Subject: [PATCH 10/22] kv-cache : support V-less cache (#19067) * kv-cache : support V-less cache * cuda : better check for V_is_K_view * cuda : improve V_is_K_view check * graph : add comments * hparams : refactor --- ggml/src/ggml-cuda/fattn-common.cuh | 2 +- ggml/src/ggml-cuda/fattn.cu | 2 +- src/llama-context.cpp | 8 +- src/llama-graph.cpp | 117 ++++++++++++++++++++++++++-- src/llama-graph.h | 48 ++++++++++++ src/llama-hparams.cpp | 19 ++++- src/llama-hparams.h | 14 +++- src/llama-kv-cache.cpp | 36 +++++++-- src/llama-model-saver.cpp | 4 +- src/llama-model.cpp | 30 ++++--- src/models/deepseek2.cpp | 19 ++--- 11 files changed, 246 insertions(+), 53 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 40c7725784..13c5b0a459 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -782,7 +782,7 @@ void launch_fattn( const ggml_tensor * K = dst->src[1]; const ggml_tensor * V = dst->src[2]; - const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data; + const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src); const ggml_tensor * mask = dst->src[3]; const ggml_tensor * sinks = dst->src[4]; diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index ba2b96bc32..a5e6624181 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -247,7 +247,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const } } - const bool V_is_K_view = V->op == GGML_OP_VIEW && V->src[0] == K && V->data == K->data; + const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src); const int cc = ggml_cuda_info().devices[device].cc; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index fb817e1385..72211db17b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -793,7 +793,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } - const uint32_t n_embd_out = model.hparams.get_n_embd_out(); + const uint32_t n_embd_out = model.hparams.n_embd_out(); return embd + j*n_embd_out; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); @@ -1279,7 +1279,7 @@ int llama_context::encode(const llama_batch & batch_inp) { { // extract token embeddings GGML_ASSERT(embd != nullptr); - const uint32_t n_embd_out = hparams.get_n_embd_out(); + const uint32_t n_embd_out = hparams.n_embd_out(); GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size); ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float)); @@ -1688,7 +1688,7 @@ int llama_context::decode(const llama_batch & batch_inp) { { // extract token embeddings GGML_ASSERT(embd != nullptr); - const uint32_t n_embd_out = hparams.get_n_embd_out(); + const uint32_t n_embd_out = hparams.n_embd_out(); float * embd_out = embd + n_outputs_prev*n_embd_out; if (n_outputs) { @@ -1821,7 +1821,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); - const auto n_embd_out = hparams.get_n_embd_out(); + const auto n_embd_out = hparams.n_embd_out(); bool has_logits = true; bool has_embd = cparams.embeddings; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f9ed87cce1..16d42c4ae3 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -407,6 +407,27 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) { + mctx->set_input_k_idxs(self_k_idxs, ubatch); + + mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); +} + +bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + + res &= self_kq_mask->ne[0] == mctx->get_n_kv(); + res &= self_kq_mask->ne[1] == params.ubatch.n_tokens; + + return res; +} + void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); @@ -1596,11 +1617,6 @@ ggml_tensor * llm_graph_context::build_attn_mha( v = ggml_transpose(ctx0, v); } - // TODO: update llama_kv_cache to not store V cache in the MLA case and automatically return a view of K - if (v_mla) { - v = ggml_view_4d(ctx0, k, v->ne[0], v->ne[1], v->ne[2], v->ne[3], k->nb[1], k->nb[2], k->nb[3], 0); - } - // this can happen when KV cache is not used (e.g. an embedding model with non-causal attn) if (k->type == GGML_TYPE_F32) { k = ggml_cast(ctx0, k, GGML_TYPE_F16); @@ -1823,9 +1839,11 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * v_cur, ggml_tensor * kq_b, ggml_tensor * sinks, - ggml_tensor * v_mla, + ggml_tensor * v_mla, // TODO: remove float kq_scale, int il) const { + GGML_ASSERT(v_mla == nullptr); + // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced // expand k later to enable rope fusion which directly writes into k-v cache @@ -1868,6 +1886,93 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } +static std::unique_ptr build_attn_inp_k_impl( + ggml_context * ctx0, + const llama_ubatch & ubatch, + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache_context * mctx_cur) { + + auto inp = std::make_unique(hparams, cparams, mctx_cur); + + { + GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA"); + + const auto n_kv = mctx_cur->get_n_kv(); + const auto n_tokens = ubatch.n_tokens; + const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq; + + inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch); + + inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); + ggml_set_input(inp->self_kq_mask); + + inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; + } + + return inp; +} + +llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const { + const auto * mctx_cur = static_cast(mctx); + + auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur); + + return (llm_graph_input_attn_k *) res->add_input(std::move(inp)); +} + +ggml_tensor * llm_graph_context::build_attn( + llm_graph_input_attn_k * inp, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + ggml_tensor * sinks, + ggml_tensor * v_mla, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + // expand k later to enable rope fusion which directly writes into k-v cache + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, v_cur); + ggml_build_forward_expand(gf, k_cur); + + const auto * mctx_cur = inp->mctx; + + // store to KV cache + { + const auto & k_idxs = inp->get_k_idxs(); + + ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il)); + } + + const auto & kq_mask = inp->get_kq_mask(); + + ggml_tensor * q = q_cur; + ggml_tensor * k = mctx_cur->get_k(ctx0, il); + ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0); + + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il); + cb(cur, "kqv_out", il); + + if (wo) { + cur = build_lora_mm(wo, cur); + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { + // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_kv_iswa * inp, ggml_tensor * wo, diff --git a/src/llama-graph.h b/src/llama-graph.h index 242a046d56..4090d8116c 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -317,6 +317,39 @@ public: const llama_kv_cache_context * mctx; }; +// V-less input for the KV cache +// ref: https://github.com/ggml-org/llama.cpp/pull/19067 +class llm_graph_input_attn_k : public llm_graph_input_i { +public: + llm_graph_input_attn_k( + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache_context * mctx) : + hparams(hparams), + cparams(cparams), + mctx(mctx) { + } + ~llm_graph_input_attn_k() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * get_k_idxs() const { return self_k_idxs; } + + ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; } + + ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] + + ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + + const llama_hparams hparams; + const llama_cparams cparams; + + const llama_kv_cache_context * mctx; +}; + class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { public: llm_graph_input_attn_kv_iswa( @@ -833,6 +866,21 @@ struct llm_graph_context { ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * kq_b, ggml_tensor * sinks, // [n_head_q] + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove + float kq_scale, + int il) const; + + llm_graph_input_attn_k * build_attn_inp_k() const; + + ggml_tensor * build_attn( + llm_graph_input_attn_k * inp, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] + ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] + ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] + ggml_tensor * kq_b, + ggml_tensor * sinks, // [n_head_q] ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, int il) const; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 5f1df995f3..392f9160ce 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -72,8 +72,8 @@ uint32_t llama_hparams::n_embd_inp() const { return n_embd_inp; } -uint32_t llama_hparams::get_n_embd_out() const { - return n_embd_out > 0 ? n_embd_out : n_embd; +uint32_t llama_hparams::n_embd_out() const { + return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd; } uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const { @@ -175,6 +175,21 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } +bool llama_hparams::is_mla() const { + assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) || + (n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0)); + + return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0; +} + +uint32_t llama_hparams::n_embd_head_k_mla() const { + return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k; +} + +uint32_t llama_hparams::n_embd_head_v_mla() const { + return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v; +} + bool llama_hparams::has_kv(uint32_t il) const { if (n_layer_kv_from_start >= 0) { if (il < (uint32_t) n_layer_kv_from_start) { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 2bf8665520..caed0ec1b7 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -53,8 +53,8 @@ struct llama_hparams { uint32_t n_rel_attn_bkts = 0; // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - uint32_t n_embd_head_k_mla = 0; - uint32_t n_embd_head_v_mla = 0; + uint32_t n_embd_head_k_mla_impl = 0; + uint32_t n_embd_head_v_mla_impl = 0; // for WavTokenizer struct llama_hparams_posnet posnet; @@ -164,7 +164,7 @@ struct llama_hparams { uint32_t n_cls_out = 1; // output embedding dimension (0 = use n_embd) - uint32_t n_embd_out = 0; + uint32_t n_embd_out_impl = 0; // llama4 smallthinker uint32_t n_moe_layer_step = 0; @@ -239,7 +239,7 @@ struct llama_hparams { uint32_t n_embd_inp() const; // dimension of output embeddings - uint32_t get_n_embd_out() const; + uint32_t n_embd_out() const; // dimension of key embeddings across all k-v heads uint32_t n_embd_k_gqa(uint32_t il = 0) const; @@ -269,6 +269,12 @@ struct llama_hparams { bool is_swa(uint32_t il) const; + // note: currently only support if either all or none of the layers are MLA + bool is_mla() const; + + uint32_t n_embd_head_k_mla() const; + uint32_t n_embd_head_v_mla() const; + bool has_kv(uint32_t il) const; // number of layers for which has_kv() returns true diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index a7327c4987..f3c9b49f30 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -97,6 +97,8 @@ llama_kv_cache::llama_kv_cache( __func__, hparams.n_embd_v_gqa_max()); } + const bool is_mla = hparams.is_mla(); + for (uint32_t il = 0; il < hparams.n_layer; il++) { if (!hparams.has_kv(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); @@ -130,18 +132,21 @@ llama_kv_cache::llama_kv_cache( throw std::runtime_error("failed to create ggml context for kv cache"); } - ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream); - ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream); + const bool has_k = true; + const bool has_v = !is_mla; - ggml_format_name(k, "cache_k_l%d", il); - ggml_format_name(v, "cache_v_l%d", il); + ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr; + ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr; + + has_k && ggml_format_name(k, "cache_k_l%d", il); + has_v && ggml_format_name(v, "cache_v_l%d", il); std::vector k_stream; std::vector v_stream; for (uint32_t s = 0; s < n_stream; ++s) { - k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2])); - v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2])); + k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr); + v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr); } map_layer_ids[il] = layers.size(); @@ -647,7 +652,10 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_co const auto & layer = layers[il]; ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]); - ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]); + + if (layer.v_stream[ssrc]) { + ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]); + } } } } @@ -1516,7 +1524,7 @@ size_t llama_kv_cache::size_v_bytes() const { size_t size_v_bytes = 0; for (const auto & layer : layers) { - size_v_bytes += ggml_nbytes(layer.v); + size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0; } return size_v_bytes; @@ -1798,6 +1806,9 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); auto * v = layer.v_stream[cr.strm]; + if (!v) { + continue; + } // Write value type const int32_t v_type_i = (int32_t) v->type; @@ -1824,6 +1835,9 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); auto * v = layer.v_stream[cr.strm]; + if (!v) { + continue; + } // Write value type const int32_t v_type_i = (int32_t) v->type; @@ -2027,6 +2041,9 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); auto * v = layer.v_stream[strm]; + if (!v) { + continue; + } // Read type of value int32_t v_type_i_ref; @@ -2068,6 +2085,9 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); auto * v = layer.v_stream[strm]; + if (!v) { + continue; + } // Read type of value int32_t v_type_i_ref; diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index ae27c71ce2..36e353074e 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -146,8 +146,8 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens()); add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - if (hparams.n_embd_out > 0) { - add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out); + if (hparams.n_embd_out_impl > 0) { + add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl); } add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b58b35a426..32f49e7996 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -512,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false); + ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); @@ -1697,15 +1697,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_DEEPSEEK2: { // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B - bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); + const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); if (!is_lite) { ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); } ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false); - ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); @@ -4909,14 +4910,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B - const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); - - const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); + const bool is_mla = hparams.is_mla(); // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA - const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k; - const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); const int64_t n_embd_head_qk_rope = hparams.n_rot; const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; @@ -4941,13 +4939,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (!is_lite) { + if (q_lora_rank > 0) { layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0); } layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); - if (!is_lite) { + if (q_lora_rank > 0) { layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0); } else { @@ -6597,7 +6595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } // for LFM2-ColBert-350M - dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED); + dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED); } break; case LLM_ARCH_SMALLTHINKER: { @@ -7316,8 +7314,8 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); - LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla); - LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla); + LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla()); + LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla()); LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); @@ -8162,7 +8160,7 @@ int32_t llama_model_n_embd_inp(const llama_model * model) { } int32_t llama_model_n_embd_out(const llama_model * model) { - return model->hparams.get_n_embd_out(); + return model->hparams.n_embd_out(); } int32_t llama_model_n_layer(const llama_model * model) { diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index c404c1946d..297dca5136 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -2,14 +2,11 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B - bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); - - const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); + const bool is_mla = hparams.is_mla(); // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA - const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k; - const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v; + const int64_t n_embd_head_k = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v = hparams.n_embd_head_v_mla(); const int64_t n_embd_head_qk_rope = hparams.n_rot; const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope; @@ -43,7 +40,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr; + auto * inp_attn_k = is_mla ? build_attn_inp_k() : nullptr; ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -57,6 +55,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr // self_attention { ggml_tensor * q = NULL; + + const bool is_lite = model.layers[il].wq; + if (!is_lite) { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); @@ -145,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr } // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) - cur = build_attn(inp_attn, + cur = build_attn(inp_attn_k, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il); } else { @@ -182,7 +183,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr } // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) - cur = build_attn(inp_attn, + cur = build_attn(inp_attn_kv, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } From bcb43163aed6a8986cf3d66e90848c9c258d4936 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 25 Jan 2026 23:25:58 +0800 Subject: [PATCH 11/22] ggml-cpu: Use tiled FA for prompt-processing (#19012) * ggml-cpu: Use tiled FA for prompt-processing the FA performance is gimped on CPU on long contexts because it essentially uses a vector kernel. This PR adds a tiled FA for PP. Perf tuning for tile sizes done on a AMD EPYC single-socket 64-c machine. * fix out of bounds for mask * skip rows where there are all masks * skip tile if mask is inf * store mask in worksize * check inf tile earlier --- ggml/src/ggml-cpu/common.h | 8 + ggml/src/ggml-cpu/ggml-cpu.c | 9 +- ggml/src/ggml-cpu/ops.cpp | 290 ++++++++++++++++++++++++++++++++++- 3 files changed, 303 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/common.h b/ggml/src/ggml-cpu/common.h index 6adca5437f..1057b5bb15 100644 --- a/ggml/src/ggml-cpu/common.h +++ b/ggml/src/ggml-cpu/common.h @@ -6,6 +6,9 @@ #include "ggml-impl.h" #include "simd-mappings.h" +#define GGML_FA_TILE_Q 32 +#define GGML_FA_TILE_KV 16 + #ifdef __cplusplus #include @@ -84,4 +87,9 @@ static std::pair get_thread_range(const struct ggml_compute_pa return {ir0, ir1}; } +struct ggml_fa_tile_config { + static constexpr size_t Q = GGML_FA_TILE_Q; + static constexpr size_t KV = GGML_FA_TILE_KV; +}; + #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4c7a75e768..b1de2ae871 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -14,6 +14,7 @@ #include "vec.h" #include "ops.h" #include "ggml.h" +#include "common.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -2866,10 +2867,12 @@ struct ggml_cplan ggml_graph_plan( } break; case GGML_OP_FLASH_ATTN_EXT: { - const int64_t ne10 = node->src[1]->ne[0]; // DK - const int64_t ne20 = node->src[2]->ne[0]; // DV + const int64_t DK = node->src[1]->ne[0]; + const int64_t DV = node->src[2]->ne[0]; - cur = sizeof(float)*(1*ne10 + 2*ne20)*n_tasks; // 1x head size K + 2x head size V (per thread) + // Tiled flash attention scratch (tile sizes defined in common.h) + // Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding + cur = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks; } break; case GGML_OP_FLASH_ATTN_BACK: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 387e2fe42c..48c8964361 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8164,6 +8164,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( // online softmax / attention // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf + for (int64_t ic = 0; ic < nek1; ++ic) { const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f; if (mv == -INFINITY) { @@ -8271,6 +8272,280 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( } } +static void ggml_compute_forward_flash_attn_ext_tiled( + const ggml_compute_params * params, + ggml_tensor * dst, + int ir0, int ir1) { + const ggml_tensor * q = dst->src[0]; + const ggml_tensor * k = dst->src[1]; + const ggml_tensor * v = dst->src[2]; + const ggml_tensor * mask = dst->src[3]; + const ggml_tensor * sinks = dst->src[4]; + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int64_t DK = nek0; + const int64_t DV = nev0; + const int64_t N = neq1; + + GGML_ASSERT(ne0 == DV); + GGML_ASSERT(ne2 == N); + + // input tensor rows must be contiguous + GGML_ASSERT(nbq0 == ggml_type_size(q->type)); + GGML_ASSERT(nbk0 == ggml_type_size(k->type)); + GGML_ASSERT(nbv0 == ggml_type_size(v->type)); + + GGML_ASSERT(neq0 == DK); + GGML_ASSERT(nek0 == DK); + GGML_ASSERT(nev0 == DV); + + GGML_ASSERT(neq1 == N); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(k->type == v->type); + const ggml_type kv_type = k->type; + + const auto * kv_type_traits_cpu = ggml_get_type_traits_cpu(kv_type); + const ggml_from_float_t kv_from_float = kv_type_traits_cpu->from_float; + const ggml_vec_dot_t kv_vec_dot = kv_type_traits_cpu->vec_dot; + const size_t kv_type_size = ggml_type_size(kv_type); + + // broadcast factors + const int64_t rk2 = neq2/nek2; + const int64_t rk3 = neq3/nek3; + + const int64_t rv2 = neq2/nev2; + const int64_t rv3 = neq3/nev3; + + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + + const uint32_t n_head = neq2; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + int ith = params->ith; + + static constexpr int Q_TILE_SZ = ggml_fa_tile_config::Q; + static constexpr int KV_TILE_SZ = ggml_fa_tile_config::KV; + + GGML_ASSERT(nek1 % KV_TILE_SZ == 0 && "KV sequence length must be divisible by KV_TILE_SZ"); + + int ir = ir0; + while (ir < ir1) { + // q indices for the start of this tile + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + // Number of valid rows in this tile: + // - limited by tile size (Q_TILE_SZ) + // - limited by chunk boundary (ir1 - ir) + // - limited by head boundary (neq1 - iq1) to avoid crossing into next head + const int tile_rows = MIN(Q_TILE_SZ, MIN((int)(ir1 - ir), (int)(neq1 - iq1))); + GGML_ASSERT(tile_rows > 0); + + const uint32_t h = iq2; // head index + const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; + + float S[Q_TILE_SZ]; + float M[Q_TILE_SZ]; + + for (int i = 0 ; i < Q_TILE_SZ; ++i) { + S[i] = 0.; + M[i] = -INFINITY; + } + + // Per-thread scratch layout: + // Q_q: Q_TILE_SZ * DK (converted Q tile in KV type) + // KQ: Q_TILE_SZ * KV_TILE_SZ (attention scores in float) + // mask: Q_TILE_SZ * KV_TILE_SZ (mask in float) + // VKQ32: Q_TILE_SZ * DV (FP32 output accumulator) + // V32: KV_TILE_SZ * DV (F32 buffer for V tile - used for f166 conversion) + float * base = (float *) params->wdata + ith*(Q_TILE_SZ*DK + 2*Q_TILE_SZ*KV_TILE_SZ + Q_TILE_SZ*DV + KV_TILE_SZ*DV + CACHE_LINE_SIZE_F32); + + void * Q_q = base; + float * KQ = (float *)((char *)base + Q_TILE_SZ * DK * sizeof(float)); + float * mask32 = KQ + Q_TILE_SZ * KV_TILE_SZ; + float * VKQ32 = mask32 + Q_TILE_SZ * KV_TILE_SZ; + float * V32 = VKQ32 + Q_TILE_SZ * DV; // F32 buffer for V tile + + memset(VKQ32, 0, Q_TILE_SZ * DV * sizeof(float)); + memset(mask32, 0, Q_TILE_SZ * KV_TILE_SZ * sizeof(float)); + + // k indices + const int ik3 = iq3 / rk3; + const int ik2 = iq2 / rk2; + + // v indices + const int iv3 = iq3 / rv3; + const int iv2 = iq2 / rv2; + + for (int tq = 0; tq < tile_rows; tq++) { + const float * pq = (const float *) ((char *) q->data + ((iq1 + tq)*nbq1 + iq2*nbq2 + iq3*nbq3)); + kv_from_float(pq, (char *)Q_q + tq * DK * kv_type_size, DK); + } + // Zero-pad remaining rows + for (int tq = tile_rows; tq < Q_TILE_SZ; tq++) { + memset((char *)Q_q + tq * DK * kv_type_size, 0, DK * kv_type_size); + } + + for (int64_t ic = 0; ic < nek1; ic += KV_TILE_SZ) { + + // skip the tile entirely if all the masks are -inf + if (mask) { + bool can_skip = true; + for (int tq = 0; tq < tile_rows; tq++) { + const ggml_fp16_t * mp_row = (const ggml_fp16_t *)((const char *) mask->data + (iq1 + tq)*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]); + for (int tk = 0; tk < KV_TILE_SZ; tk++) { + mask32[tq * KV_TILE_SZ + tk] = slope * GGML_CPU_FP16_TO_FP32(mp_row[ic + tk]); + if (mask32[tq * KV_TILE_SZ + tk] != -INFINITY) { + can_skip = false; + } + } + } + + if (can_skip) { + continue; + } + } + + for (int tq = 0; tq < Q_TILE_SZ; tq++) { + const void * q_row = (const char *)Q_q + tq * DK * kv_type_size; + for (int tk = 0; tk < KV_TILE_SZ; tk++) { + const void * k_row = (const char *) k->data + ((ic + tk)*nbk1 + ik2*nbk2 + ik3*nbk3); + float s; + kv_vec_dot(DK, &s, 0, k_row, 0, q_row, 0, 1); + KQ[tq * KV_TILE_SZ + tk] = s * scale; + } + } + + if (logit_softcap != 0.0f) { + ggml_vec_tanh_f32(Q_TILE_SZ * KV_TILE_SZ, KQ, KQ); + ggml_vec_scale_f32(Q_TILE_SZ * KV_TILE_SZ, KQ, logit_softcap); + } + + if (mask) { + ggml_vec_add_f32(tile_rows * KV_TILE_SZ, KQ, KQ, mask32); + } + + bool skip[Q_TILE_SZ] = {}; + + for (int tq = 0; tq < Q_TILE_SZ; tq++) { + float * kq_row = KQ + tq * KV_TILE_SZ; + + float tile_max; + ggml_vec_max_f32(KV_TILE_SZ, &tile_max, kq_row); + + if (tile_max == -INFINITY) { + skip[tq] = true; + continue; + } + + const float Mold = M[tq]; + const float Mnew = fmaxf(Mold, tile_max); + + if (Mnew > Mold) { + const float ms = expf(Mold - Mnew); + ggml_vec_scale_f32(DV, VKQ32 + tq * DV, ms); + S[tq] *= ms; + } + M[tq] = Mnew; + + + S[tq] += ggml_vec_soft_max_f32(KV_TILE_SZ, kq_row, kq_row, Mnew); + } + + // Convert V tile to F32 first (if F16), then do MAD + // On x86, ggml_vec_mad_f16 internall converts F16<->F32 on every load/store, so pre-converting is faster. + // TODO: on ARM, native f16 should be faster + if (kv_type == GGML_TYPE_F16) { + for (int tk = 0; tk < KV_TILE_SZ; tk++) { + const ggml_fp16_t * v_row = (const ggml_fp16_t *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3)); + ggml_fp16_to_fp32_row(v_row, V32 + tk * DV, DV); + } + for (int tq = 0; tq < Q_TILE_SZ; tq++) { + if (skip[tq]) continue; + float * vkq_row = VKQ32 + tq * DV; + for (int tk = 0; tk < KV_TILE_SZ; tk++) { + const float p = KQ[tq * KV_TILE_SZ + tk]; + ggml_vec_mad_f32(DV, vkq_row, V32 + tk * DV, p); + } + } + } else { + for (int tq = 0; tq < Q_TILE_SZ; tq++) { + if (skip[tq]) continue; + float * vkq_row = VKQ32 + tq * DV; + for (int tk = 0; tk < KV_TILE_SZ; tk++) { + const float p = KQ[tq * KV_TILE_SZ + tk]; + const float * v_row = (const float *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3)); + ggml_vec_mad_f32(DV, vkq_row, v_row, p); + } + } + } + } + + // sinks (apply only to valid rows in the tile) + if (sinks) { + const float s = ((float *)((char *) sinks->data))[h]; + + for (int tq = 0; tq < tile_rows; tq++) { + float ms = 1.0f; + float vs = 1.0f; + + if (s > M[tq]) { + ms = expf(M[tq] - s); + ggml_vec_scale_f32(DV, VKQ32 + tq * DV, ms); + } else { + vs = expf(s - M[tq]); + } + + S[tq] = S[tq] * ms + vs; + } + } + + for (int tq = 0; tq < tile_rows; tq++) { + // V /= S + const float S_inv = S[tq] == 0.0f ? 0.0f : 1.0f / S[tq]; + ggml_vec_scale_f32(DV, VKQ32 + tq * DV, S_inv); + + // dst indices + const int i1 = iq1 + tq; + const int i2 = iq2; + const int i3 = iq3; + + // permute(0, 2, 1, 3) + memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32 + tq * DV, nb1); + } + + ir += tile_rows; + } +} + static void ggml_compute_forward_flash_attn_ext_f16( const ggml_compute_params * params, ggml_tensor * dst) { @@ -8343,6 +8618,15 @@ static void ggml_compute_forward_flash_attn_ext_f16( // The number of elements in each chunk const int64_t dr = (nr + nchunk - 1) / nchunk; + static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV; + static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q; + const bool kv_is_f32_or_f16 = (k->type == GGML_TYPE_F32 || k->type == GGML_TYPE_F16); + const bool use_tiled = (q->type == GGML_TYPE_F32 && + kv_is_f32_or_f16 && + k->type == v->type && + nek1 % KV_TILE_SZ == 0 && + neq1 >= Q_TILE_SZ); // Only use tiled for batch >= tile size + // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; @@ -8350,7 +8634,11 @@ static void ggml_compute_forward_flash_attn_ext_f16( const int64_t ir0 = dr * current_chunk; const int64_t ir1 = MIN(ir0 + dr, nr); - ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1); + if (use_tiled) { + ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1); + } else { + ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1); + } current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); } From 0bf56369384ff1af2731cd1168785e283a1ba266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 25 Jan 2026 18:03:34 +0100 Subject: [PATCH 12/22] convert : yield Gemma3N custom_map tensors directly (#19091) --- convert_hf_to_gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b56a99c5fa..6e6e618989 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6145,7 +6145,8 @@ class Gemma3nVisionAudioModel(ConformerAudioModel): if name.startswith("model.vision_tower.timm_model.blocks."): # Double-indexed block tensors through custom logic - new_name = self.custom_map(name) + yield (self.custom_map(name), data_torch) + return else: # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py new_name = self.map_tensor_name(name) From 0440bfd1605333726ea0fb7a836942660bf2f9a6 Mon Sep 17 00:00:00 2001 From: ccbinn Date: Mon, 26 Jan 2026 02:07:19 +0800 Subject: [PATCH 13/22] metal : fix recommendedMaxWorkingSetSize availability on legacy iOS/macOS (#19088) Co-authored-by: chenbin11 --- ggml/src/ggml-metal/ggml-metal-device.m | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index eb4e2c209c..7f9c384c34 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -785,8 +785,12 @@ ggml_metal_device_t ggml_metal_device_init(void) { dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32; dev->props.max_buffer_size = dev->mtl_device.maxBufferLength; - dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize; dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength; + if (@available(macOS 10.12, iOS 16.0, *)) { + dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize; + } else { + dev->props.max_working_set_size = dev->mtl_device.maxBufferLength; + } strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1); From 0c21677e43044d27f6f7a7f9f95c67f7c4b3fdb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 25 Jan 2026 21:19:47 +0100 Subject: [PATCH 14/22] CUDA: faster FA for GQA > 1 but not power of 2 (#19092) --- ggml/src/ggml-cuda/fattn-common.cuh | 22 +++--- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 30 +++++---- ggml/src/ggml-cuda/fattn.cu | 67 ++++++++++++++++--- ...ttn-mma-f16-instance-ncols1_1-ncols2_32.cu | 5 ++ ...ttn-mma-f16-instance-ncols1_2-ncols2_32.cu | 5 ++ .../template-instances/generate_cu_files.py | 6 +- 6 files changed, 99 insertions(+), 36 deletions(-) create mode 100644 ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu create mode 100644 ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 13c5b0a459..1f5f1b9206 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -643,9 +643,10 @@ static __global__ void flash_attn_stream_k_fixup( const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; + const int iter_z = (ne02 + (ncols2 - 1)) / ncols2; - const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; + const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; const bool did_not_have_any_data = kbc0 == kbc0_stop; const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; @@ -654,15 +655,15 @@ static __global__ void flash_attn_stream_k_fixup( return; } - const int sequence = kbc0 / (iter_k*iter_j*(ne02/ncols2)); - const int head = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); - const int jt = (kbc0 - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile. + const int sequence = kbc0 / (iter_k*iter_j*iter_z); + const int zt = (kbc0 - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); + const int jt = (kbc0 - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. - if (jt*ncols1 + j >= ne01) { + if (jt*ncols1 + j >= ne01 || zt*ncols2 + c >= ne02) { return; } - dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + head*(ncols2*D) + (j*ne02 + c)*D + tid; + dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt*(ncols2*D) + (j*ne02 + c)*D + tid; // Load the partial result that needs a fixup: float dst_val = 0.0f; @@ -681,7 +682,7 @@ static __global__ void flash_attn_stream_k_fixup( int bidx = bidx0 - 1; int kbc_stop = kbc0; while(true) { - const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc = int64_t(bidx)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; if (kbc == kbc_stop) { // Did not have any data. bidx--; kbc_stop = kbc; @@ -883,7 +884,8 @@ void launch_fattn( } const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1); - const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3]; + const int ntiles_z = ((Q->ne[2] + ncols2 - 1) / ncols2); + const int ntiles_total = ntiles_x * ntiles_z * Q->ne[3]; // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped. // Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or @@ -958,7 +960,7 @@ void launch_fattn( blocks_num.x = ntiles_x; blocks_num.y = parallel_blocks; - blocks_num.z = (Q->ne[2]/ncols2)*Q->ne[3]; + blocks_num.z = ntiles_z*Q->ne[3]; if (parallel_blocks > 1) { dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV)); diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 3e7d67b40d..9004d46904 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -940,6 +940,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int stride_V, const int stride_mask, const int jt, + const int zt, const int kb0_start, const int kb0_stop) { #if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) @@ -1022,7 +1023,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int j = jc / ncols2; const int c = jc % ncols2; - if (jt*ncols1 + j < int(ne01.z)) { + if ((ncols1 == 1 || jt*ncols1 + j < int(ne01.z)) && (ncols2 == 1 || zt*ncols2 + c < ne02)) { #pragma unroll for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); @@ -1408,7 +1409,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int j_dst = jc_dst / ncols2; const int c_dst = jc_dst % ncols2; - if (!is_fixup && jt*ncols1 + j_dst >= int(ne01.z)) { + if (!is_fixup && ((ncols1 > 1 && jt*ncols1 + j_dst >= int(ne01.z)) || (ncols2 > 1 && zt*ncols2 + c_dst >= ne02))) { continue; } @@ -1522,10 +1523,11 @@ static __global__ void flash_attn_ext_f16( const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1; + const int iter_z = (ne02 + (ncols2 - 1)) / ncols2; // kbc == k block continuous, current index in continuous ijk space. - int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; + const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined. // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup). @@ -1536,9 +1538,9 @@ static __global__ void flash_attn_ext_f16( int kb0_stop = min(iter_k, kb0_start + kbc_stop - kbc); while (kbc < kbc_stop && kb0_stop == iter_k) { - const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2)); - const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2 - const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. + const int sequence = kbc / (iter_k*iter_j*iter_z); + const int zt = (kbc - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); // head in units of ncols2 + const int jt = (kbc - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. const int head0 = zt * ncols2; @@ -1561,12 +1563,12 @@ static __global__ void flash_attn_ext_f16( constexpr bool needs_fixup = false; // CUDA block is working on an entire tile. flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop); + ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop); } else { constexpr bool needs_fixup = true; // CUDA block is missing the beginning of a tile. flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop); + ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop); } kbc += iter_k; @@ -1580,9 +1582,9 @@ static __global__ void flash_attn_ext_f16( return; } - const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2)); - const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2 - const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. + const int sequence = kbc / (iter_k*iter_j*iter_z); + const int zt = (kbc - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); // head in units of ncols2 + const int jt = (kbc - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. const int head0 = zt * ncols2; @@ -1605,7 +1607,7 @@ static __global__ void flash_attn_ext_f16( constexpr bool needs_fixup = false; flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop); + ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop); #else GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, max_bias, m0, m1, n_head_log2, logit_softcap, @@ -1739,3 +1741,5 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16); extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4); extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4); extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4); +extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 32); +extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 32); diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index a5e6624181..2f5dbd13a3 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -18,9 +18,11 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con } } - if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) { - ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); - return; + if constexpr (ncols2 <= 16) { + if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) { + ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); + return; + } } if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || amd_wmma_available(cc) || Q->ne[1] <= 32/ncols2) { @@ -33,6 +35,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con template static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; @@ -60,17 +63,38 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); const int gqa_ratio = Q->ne[2] / K->ne[2]; - if (use_gqa_opt && gqa_ratio % 8 == 0) { + // On Volta the GQA optimizations aren't as impactful vs. minimizing wasted compute: + if (cc == GGML_CUDA_CC_VOLTA) { + if (use_gqa_opt && gqa_ratio % 8 == 0) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); + return; + } + + if (use_gqa_opt && gqa_ratio % 4 == 0) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); + return; + } + + if (use_gqa_opt && gqa_ratio % 2 == 0) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); + return; + } + + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); + return; + } + + if (use_gqa_opt && gqa_ratio > 4) { ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - if (use_gqa_opt && gqa_ratio % 4 == 0) { + if (use_gqa_opt && gqa_ratio > 2) { ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } - if (use_gqa_opt && gqa_ratio % 2 == 0) { + if (use_gqa_opt && gqa_ratio > 1) { ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; } @@ -79,6 +103,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con } static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const ggml_tensor * KQV = dst; const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; @@ -121,8 +146,30 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); const int gqa_ratio = Q->ne[2] / K->ne[2]; - GGML_ASSERT(gqa_ratio % 4 == 0); - if (gqa_ratio % 16 == 0) { + if (gqa_ratio == 20) { // GLM 4.7 Flash + if (cc >= GGML_CUDA_CC_BLACKWELL) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst); + break; + } + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + if (Q->ne[1] <= 4) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst); + break; + } + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst); + break; + } + if (cc >= GGML_CUDA_CC_TURING) { + if (Q->ne[1] <= 4) { + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 32>(ctx, dst); + break; + } + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst); + break; + } + // Volta: + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst); + } else if (gqa_ratio % 16 == 0) { ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst); } else { ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst); @@ -234,7 +281,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const // The effective batch size for the kernel can be increased by gqa_ratio. // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded, - bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0; + bool gqa_opt_applies = gqa_ratio >= 2 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0; for (const ggml_tensor * t : {Q, K, V, mask}) { if (t == nullptr || ggml_is_quantized(t->type)) { continue; @@ -268,7 +315,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const if (V->ne[0] != 512) { return BEST_FATTN_KERNEL_NONE; } - if (!gqa_opt_applies || gqa_ratio % 4 != 0) { + if (!gqa_opt_applies) { return BEST_FATTN_KERNEL_NONE; } if (!V_is_K_view) { diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu new file mode 100644 index 0000000000..1f554d81e5 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 1, 32); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu new file mode 100644 index 0000000000..264751d65e --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-mma-f16.cuh" + +DECL_FATTN_MMA_F16_CASE(576, 512, 2, 32); diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py index 10be71ab57..e382df1ae2 100755 --- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py @@ -71,7 +71,7 @@ for type_k in TYPES_KV: f.write(SOURCE_FATTN_VEC.format(type_k=type_k, type_v=type_v)) for ncols in [8, 16, 32, 64]: - for ncols2 in [1, 2, 4, 8, 16]: + for ncols2 in [1, 2, 4, 8, 16, 32]: if ncols2 > ncols: continue ncols1 = ncols // ncols2 @@ -83,9 +83,9 @@ for ncols in [8, 16, 32, 64]: continue if head_size_kq == 72: continue - if head_size_kq != 576 and ncols2 == 16: + if head_size_kq != 576 and ncols2 in (16, 32): continue - if head_size_kq == 576 and ncols2 not in (4, 16): + if head_size_kq == 576 and ncols2 not in (4, 16, 32): continue head_size_v = head_size_kq if head_size_kq != 576 else 512 f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v)) From 56f3ebf38ea5757447e7ea167cda15e1eebf9601 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jan 2026 11:24:30 +0200 Subject: [PATCH 15/22] model : add correct type for GLM 4.7 Flash (#19106) --- src/llama-model.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 32f49e7996..cc784e1cb0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1737,6 +1737,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { switch (hparams.n_layer) { case 27: type = LLM_TYPE_16B; break; + case 47: type = LLM_TYPE_30B_A3B; break; case 60: type = LLM_TYPE_236B; break; case 61: type = LLM_TYPE_671B; break; default: type = LLM_TYPE_UNKNOWN; From 142cbe2ac68978e5dec3a2e19c1b64ef1c5740b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 26 Jan 2026 15:22:49 +0100 Subject: [PATCH 16/22] ci : use new 1vCPU runner for lightweight jobs (#19107) * use new 1vCPU runner for lightweight jobs * pyright is too heavy, look into ty some day use new pip-install input --- .github/workflows/check-vendor.yml | 2 +- .github/workflows/close-issue.yml | 2 +- .github/workflows/editorconfig.yml | 2 +- .github/workflows/gguf-publish.yml | 2 +- .github/workflows/labeler.yml | 2 +- .github/workflows/pre-tokenizer-hashes.yml | 2 +- .github/workflows/python-check-requirements.yml | 2 +- .github/workflows/python-lint.yml | 2 +- .github/workflows/python-type-check.yml | 4 +--- .github/workflows/update-ops-docs.yml | 2 +- .github/workflows/winget.yml | 2 +- 11 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/check-vendor.yml b/.github/workflows/check-vendor.yml index b9e8ac7658..1671ed7b8b 100644 --- a/.github/workflows/check-vendor.yml +++ b/.github/workflows/check-vendor.yml @@ -19,7 +19,7 @@ on: jobs: check-vendor: - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - name: Checkout diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index 8fb5310d0b..ec3df08b2d 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -10,7 +10,7 @@ permissions: jobs: close-issues: - runs-on: ubuntu-latest + runs-on: ubuntu-slim permissions: issues: write pull-requests: write diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index a5cd590017..702dc89f5b 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -20,7 +20,7 @@ concurrency: jobs: editorconfig: - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - uses: actions/checkout@v6 - uses: editorconfig-checker/action-editorconfig-checker@v2 diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml index 5bdab0f157..0e95766459 100644 --- a/.github/workflows/gguf-publish.yml +++ b/.github/workflows/gguf-publish.yml @@ -21,7 +21,7 @@ on: jobs: deploy: - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - uses: actions/checkout@v6 diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 42f00c0cd8..eab20c6881 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -7,7 +7,7 @@ jobs: permissions: contents: read pull-requests: write - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - uses: actions/checkout@v6 with: diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml index 8120df0e36..7126b62b69 100644 --- a/.github/workflows/pre-tokenizer-hashes.yml +++ b/.github/workflows/pre-tokenizer-hashes.yml @@ -12,7 +12,7 @@ on: jobs: pre-tokenizer-hashes: - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - name: Checkout repository diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 08cdcb9d01..1219b87459 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -20,7 +20,7 @@ concurrency: jobs: python-check-requirements: - runs-on: ubuntu-latest + runs-on: ubuntu-slim name: check-requirements steps: - name: Check out source repository diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 91dc4d78a4..8d1dd7a7d5 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -15,7 +15,7 @@ concurrency: jobs: flake8-lint: - runs-on: ubuntu-latest + runs-on: ubuntu-slim name: Lint steps: - name: Check out source repository diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index 54d5fab5ba..e801a9f42e 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -29,9 +29,7 @@ jobs: uses: actions/setup-python@v6 with: python-version: "3.11" - - name: Install Python dependencies - # TODO: use a venv - run: pip install -r requirements/requirements-all.txt + pip-install: -r requirements/requirements-all.txt - name: Type-check with Pyright uses: jakebailey/pyright-action@v2 with: diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml index 40447db4e4..2ab06eb981 100644 --- a/.github/workflows/update-ops-docs.yml +++ b/.github/workflows/update-ops-docs.yml @@ -14,7 +14,7 @@ on: jobs: update-ops-docs: - runs-on: ubuntu-latest + runs-on: ubuntu-slim steps: - name: Checkout repository diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml index 7506091647..e3981f30b9 100644 --- a/.github/workflows/winget.yml +++ b/.github/workflows/winget.yml @@ -8,7 +8,7 @@ on: jobs: update: name: Update Winget Package - runs-on: ubuntu-latest + runs-on: ubuntu-slim if: github.repository_owner == 'ggml-org' steps: From 8f80d1b254aef70a0959e314be368d05debe7294 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jan 2026 20:18:34 +0200 Subject: [PATCH 17/22] graph : fix nkvo offload with FA (#19105) --- src/llama-context.cpp | 7 ------- src/llama-graph.cpp | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 72211db17b..0b2b05c419 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2173,13 +2173,6 @@ llm_graph_cb llama_context::graph_get_cb() const { ggml_set_name(cur, name); } - if (!cparams.offload_kqv) { - if (strcmp(name, "kqv_merged_cont") == 0) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); - } - } - // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 16d42c4ae3..b3198b7e3a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1630,6 +1630,11 @@ ggml_tensor * llm_graph_context::build_attn_mha( hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); cb(cur, LLAMA_TENSOR_NAME_FATTN, il); + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu); + } + ggml_flash_attn_ext_add_sinks(cur, sinks); ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32); From b0311c16d2f650a8bd5af652549075b458bd713a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 26 Jan 2026 23:24:58 +0100 Subject: [PATCH 18/22] CUDA: fix padding of GQA to power of 2 in FA (#19115) --- ggml/src/ggml-cuda/fattn-common.cuh | 43 +++++++++-------- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 69 +++++++++++++++------------- tests/test-backend-ops.cpp | 4 +- 3 files changed, 64 insertions(+), 52 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 1f5f1b9206..3d7daccfdf 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -629,8 +629,8 @@ static __global__ void flash_attn_mask_to_KV_max( template // D == head size __launch_bounds__(D, 1) static __global__ void flash_attn_stream_k_fixup( - float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11, - const int nbatch_fa) { + float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, + const int ne11, const int ne12, const int nbatch_fa) { constexpr int ncols = ncols1*ncols2; const int bidx0 = blockIdx.x; @@ -641,12 +641,14 @@ static __global__ void flash_attn_stream_k_fixup( const float * dst_fixup_data = ((const float *) dst_fixup) + gridDim.x*(2*2*ncols); - const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; - const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; - const int iter_z = (ne02 + (ncols2 - 1)) / ncols2; + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; - const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; + const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; + const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; + const int iter_z_gqa = (gqa_ratio + (ncols2 - 1)) / ncols2; + + const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x; + const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x; const bool did_not_have_any_data = kbc0 == kbc0_stop; const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; @@ -655,15 +657,19 @@ static __global__ void flash_attn_stream_k_fixup( return; } - const int sequence = kbc0 / (iter_k*iter_j*iter_z); - const int zt = (kbc0 - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); - const int jt = (kbc0 - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. + // z_KV == K/V head index, zt_gqa = Q head start index per K/V head, jt = token position start index + const int sequence = kbc0 /(iter_k*iter_j*iter_z_gqa*ne12); + const int z_KV = (kbc0 - iter_k*iter_j*iter_z_gqa*ne12 * sequence)/(iter_k*iter_j*iter_z_gqa); + const int zt_gqa = (kbc0 - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV)/(iter_k*iter_j); + const int jt = (kbc0 - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV - iter_k*iter_j * zt_gqa) / iter_k; - if (jt*ncols1 + j >= ne01 || zt*ncols2 + c >= ne02) { + const int zt_Q = z_KV*gqa_ratio + zt_gqa*ncols2; // Global Q head start index. + + if (jt*ncols1 + j >= ne01 || zt_gqa*ncols2 + c >= gqa_ratio) { return; } - dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt*(ncols2*D) + (j*ne02 + c)*D + tid; + dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt_Q*D + (j*ne02 + c)*D + tid; // Load the partial result that needs a fixup: float dst_val = 0.0f; @@ -682,7 +688,7 @@ static __global__ void flash_attn_stream_k_fixup( int bidx = bidx0 - 1; int kbc_stop = kbc0; while(true) { - const int kbc = int64_t(bidx)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; + const int kbc = int64_t(bidx)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x; if (kbc == kbc_stop) { // Did not have any data. bidx--; kbc_stop = kbc; @@ -883,9 +889,10 @@ void launch_fattn( } } - const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1); - const int ntiles_z = ((Q->ne[2] + ncols2 - 1) / ncols2); - const int ntiles_total = ntiles_x * ntiles_z * Q->ne[3]; + const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1); + const int gqa_ratio = Q->ne[2] / K->ne[2]; + const int ntiles_z_gqa = ((gqa_ratio + ncols2 - 1) / ncols2); + const int ntiles_total = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3]; // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped. // Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or @@ -960,7 +967,7 @@ void launch_fattn( blocks_num.x = ntiles_x; blocks_num.y = parallel_blocks; - blocks_num.z = ntiles_z*Q->ne[3]; + blocks_num.z = ntiles_z_gqa*K->ne[2]*Q->ne[3]; if (parallel_blocks > 1) { dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV)); @@ -1014,7 +1021,7 @@ void launch_fattn( flash_attn_stream_k_fixup <<>> - ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1], nbatch_fa); + ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1], K->ne[2], nbatch_fa); } } else if (parallel_blocks > 1) { const dim3 block_dim_combine(DV, 1, 1); diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 9004d46904..0b8ef90794 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -933,6 +933,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const float logit_softcap, const uint3 ne01, const int ne02, + const int gqa_ratio, const int ne11, const int stride_Q1, const int stride_Q2, @@ -940,7 +941,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int stride_V, const int stride_mask, const int jt, - const int zt, + const int zt_gqa, const int kb0_start, const int kb0_stop) { #if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)) @@ -1023,7 +1024,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int j = jc / ncols2; const int c = jc % ncols2; - if ((ncols1 == 1 || jt*ncols1 + j < int(ne01.z)) && (ncols2 == 1 || zt*ncols2 + c < ne02)) { + if ((ncols1 == 1 || jt*ncols1 + j < int(ne01.z)) && (ncols2 == 1 || zt_gqa*ncols2 + c < gqa_ratio)) { #pragma unroll for (int k0 = k0_start; k0 < k0_stop; k0 += stride_k) { const int k = k0 + (stride_k == WARP_SIZE ? threadIdx.x : threadIdx.x % stride_k); @@ -1409,7 +1410,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( const int j_dst = jc_dst / ncols2; const int c_dst = jc_dst % ncols2; - if (!is_fixup && ((ncols1 > 1 && jt*ncols1 + j_dst >= int(ne01.z)) || (ncols2 > 1 && zt*ncols2 + c_dst >= ne02))) { + if (!is_fixup && ((ncols1 > 1 && jt*ncols1 + j_dst >= int(ne01.z)) || (ncols2 > 1 && zt_gqa*ncols2 + c_dst >= gqa_ratio))) { continue; } @@ -1448,7 +1449,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile( } #else GGML_UNUSED_VARS(Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dstk_fixup, - scale, slope, logit_softcap, ne01, ne02, + scale, slope, logit_softcap, ne01, ne02, gqa_ratio, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start, kb0_stop); NO_DEVICE_CODE; @@ -1521,13 +1522,13 @@ static __global__ void flash_attn_ext_f16( const int stride_V = V_is_K_view ? stride_K : nb21 / sizeof(half2); - const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; - const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1; - const int iter_z = (ne02 + (ncols2 - 1)) / ncols2; + const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; + const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1; + const int iter_z_gqa = (gqa_ratio + (ncols2 - 1)) / ncols2; // kbc == k block continuous, current index in continuous ijk space. - int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; - const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*iter_z*ne03) / gridDim.x; + int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x; + const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x; // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined. // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup). @@ -1538,22 +1539,24 @@ static __global__ void flash_attn_ext_f16( int kb0_stop = min(iter_k, kb0_start + kbc_stop - kbc); while (kbc < kbc_stop && kb0_stop == iter_k) { - const int sequence = kbc / (iter_k*iter_j*iter_z); - const int zt = (kbc - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); // head in units of ncols2 - const int jt = (kbc - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. + // z_KV == K/V head index, zt_gqa = Q head start index per K/V head, jt = token position start index + const int sequence = kbc /(iter_k*iter_j*iter_z_gqa*ne12); + const int z_KV = (kbc - iter_k*iter_j*iter_z_gqa*ne12 * sequence)/(iter_k*iter_j*iter_z_gqa); + const int zt_gqa = (kbc - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV)/(iter_k*iter_j); + const int jt = (kbc - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV - iter_k*iter_j * zt_gqa) / iter_k; - const int head0 = zt * ncols2; + const int zt_Q = z_KV*gqa_ratio + zt_gqa*ncols2; // Global Q head start index. - const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); - const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); + const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*zt_Q); + const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*z_KV); const half * mask_h = ncols2 == 1 && !mask ? nullptr : (const half *) (mask + nb33*(sequence % ne33)); - float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); + float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + zt_Q) * (DV/2); - const half2 * V_h2 = V_is_K_view ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); - const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr; + const half2 * V_h2 = V_is_K_view ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*z_KV); + const float * sinks_f = sinks ? (const float *) sinks + zt_Q : nullptr; - const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f; + const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, zt_Q, n_head_log2, m0, m1) : 1.0f; if (KV_max) { kb0_stop = min(kb0_stop, KV_max[sequence*iter_j + jt] / nbatch_fa); @@ -1563,12 +1566,12 @@ static __global__ void flash_attn_ext_f16( constexpr bool needs_fixup = false; // CUDA block is working on an entire tile. flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop); + ne01, ne02, gqa_ratio, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt_gqa, kb0_start, kb0_stop); } else { constexpr bool needs_fixup = true; // CUDA block is missing the beginning of a tile. flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop); + ne01, ne02, gqa_ratio, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt_gqa, kb0_start, kb0_stop); } kbc += iter_k; @@ -1582,22 +1585,24 @@ static __global__ void flash_attn_ext_f16( return; } - const int sequence = kbc / (iter_k*iter_j*iter_z); - const int zt = (kbc - iter_k*iter_j*iter_z*sequence) / (iter_k*iter_j); // head in units of ncols2 - const int jt = (kbc - iter_k*iter_j*iter_z*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile. + // z_KV == K/V head index, zt_gqa = Q head start index per K/V head, jt = token position start index. + const int sequence = kbc /(iter_k*iter_j*iter_z_gqa*ne12); + const int z_KV = (kbc - iter_k*iter_j*iter_z_gqa*ne12 * sequence)/(iter_k*iter_j*iter_z_gqa); + const int zt_gqa = (kbc - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV)/(iter_k*iter_j); + const int jt = (kbc - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV - iter_k*iter_j * zt_gqa) / iter_k; - const int head0 = zt * ncols2; + const int zt_Q = z_KV*gqa_ratio + zt_gqa*ncols2; // Global Q head start index. - const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); - const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); + const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*zt_Q); + const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*z_KV); const half * mask_h = ncols2 == 1 && !mask ? nullptr : (const half *) (mask + nb33*(sequence % ne33)); - float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); + float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + zt_Q) * (DV/2); - const half2 * V_h2 = V_is_K_view ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); - const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr; + const half2 * V_h2 = V_is_K_view ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*z_KV); + const float * sinks_f = sinks ? (const float *) sinks + zt_Q : nullptr; - const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f; + const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, zt_Q, n_head_log2, m0, m1) : 1.0f; if (KV_max) { kb0_stop = min(kb0_stop, KV_max[sequence*iter_j + jt] / nbatch_fa); @@ -1607,7 +1612,7 @@ static __global__ void flash_attn_ext_f16( constexpr bool needs_fixup = false; flash_attn_ext_f16_process_tile (Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap, - ne01, ne02, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt, kb0_start, kb0_stop); + ne01, ne02, gqa_ratio, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt_gqa, kb0_start, kb0_stop); #else GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, max_bias, m0, m1, n_head_log2, logit_softcap, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 146d05f53b..d4c1f525c6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8216,8 +8216,8 @@ static std::vector> make_test_cases_eval() { for (int nh : { 4, }) { for (int nr3 : { 1, 3, }) { if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes - for (int nr2 : { 1, 4, 16 }) { - if (nr2 == 16 && hsk != 128) continue; + for (int nr2 : { 1, 4, 12 }) { + if (nr2 == 12 && hsk != 128) continue; //for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) { for (int kv : { 113, 512, 1024, }) { if (nr2 != 1 && kv != 512) continue; From 94eeb5967c129365f50ca8462a7595ea319430d9 Mon Sep 17 00:00:00 2001 From: lhez Date: Mon, 26 Jan 2026 19:36:24 -0800 Subject: [PATCH 19/22] opencl: add flattened q6_K mv (#19054) * opencl: flatten `q6_K` and add `kernel_mul_mv_q6_K_f32_flat` * opencl: clean up * opencl: refactor q6_K mv - put loop body in `block_q_6_K_dot_y_flat` * opencl: tweak the workgroup size a bit * opencl: output 4 values per subgroup for `kernel_mul_mv_q6_K_f32_flat` * opencl: proper alignment for q6_K * opencl: boundary handling for flattened q6_K mv * opencl: rename q6_K mv kernel file * opencl: put flattened q6_K mv in its own file * opencl: use lower k in file name * opencl: use K in variable names --- ggml/src/ggml-opencl/CMakeLists.txt | 3 +- ggml/src/ggml-opencl/ggml-opencl.cpp | 260 +++++++++++++++++- ggml/src/ggml-opencl/kernels/cvt.cl | 70 +++++ .../{mul_mv_q6_k.cl => mul_mv_q6_k_f32.cl} | 0 .../kernels/mul_mv_q6_k_f32_flat.cl | 194 +++++++++++++ 5 files changed, 518 insertions(+), 9 deletions(-) rename ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl => mul_mv_q6_k_f32.cl} (100%) create mode 100644 ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 79039c30e1..0259474b6e 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -85,7 +85,8 @@ set(GGML_OPENCL_KERNELS mul_mv_q4_0_f32_8x_flat mul_mv_q4_0_f32_1d_8x_flat mul_mv_q4_0_f32_1d_16x_flat - mul_mv_q6_k + mul_mv_q6_k_f32 + mul_mv_q6_k_f32_flat mul_mv_q8_0_f32 mul_mv_q8_0_f32_flat mul_mv_mxfp4_f32 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 27b2761ef1..678e40965a 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -533,8 +533,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; cl_kernel kernel_convert_block_q4_0_noshuffle; cl_kernel kernel_restore_block_q4_0_noshuffle; + cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K; cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; cl_kernel kernel_mul_mv_q6_K_f32; + cl_kernel kernel_mul_mv_q6_K_f32_flat; cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat; cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat; cl_kernel kernel_solve_tri_f32; @@ -892,6 +894,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err)); GGML_LOG_CONT("."); } @@ -1114,14 +1118,14 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } - // mul_mv_q6_k + // mul_mv_q6_k_f32 { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { - #include "mul_mv_q6_k.cl.h" + #include "mul_mv_q6_k_f32.cl.h" }; #else - const std::string kernel_src = read_file("mul_mv_q6_k.cl"); + const std::string kernel_src = read_file("mul_mv_q6_k_f32.cl"); #endif backend_ctx->program_mul_mv_q6_K = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); @@ -1130,6 +1134,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mv_q6_k_f32_flat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_q6_k_f32_flat.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_q6_k_f32_flat.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q6_K_f32_flat", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // mul_mv_q8_0_f32 { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2919,6 +2940,50 @@ struct ggml_tensor_extra_cl_q8_0 { } }; +struct ggml_tensor_extra_cl_q6_K { + // Lower 4 bits of quantized weights. + cl_mem ql = nullptr; + // Upper 2 bits of quantized weights. + cl_mem qh = nullptr; + // Scales for each block. + cl_mem s = nullptr; + // Scales for each super block. + cl_mem d = nullptr; + + size_t size_ql = 0; + size_t size_qh = 0; + size_t size_s = 0; + size_t size_d = 0; + + ~ggml_tensor_extra_cl_q6_K() { + reset(); + } + + void reset() { + if (ql != nullptr) { + CL_CHECK(clReleaseMemObject(ql)); + ql = nullptr; + } + if (qh != nullptr) { + CL_CHECK(clReleaseMemObject(qh)); + qh = nullptr; + } + if (s != nullptr) { + CL_CHECK(clReleaseMemObject(s)); + s = nullptr; + } + if (d != nullptr) { + CL_CHECK(clReleaseMemObject(d)); + d = nullptr; + } + + size_ql = 0; + size_qh = 0; + size_s = 0; + size_d = 0; + } +}; + //------------------------------------------------------------------------------ // Backend API //------------------------------------------------------------------------------ @@ -3465,6 +3530,12 @@ struct ggml_backend_opencl_buffer_context { for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) { delete e; } + for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) { + delete e; + } + for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) { + delete e; + } } ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() { @@ -3527,6 +3598,21 @@ struct ggml_backend_opencl_buffer_context { return extra; } + ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() { + ggml_tensor_extra_cl_q6_K * extra; + if (temp_tensor_extras_q6_K.empty()) { + extra = new ggml_tensor_extra_cl_q6_K(); + } else { + extra = temp_tensor_extras_q6_K.back(); + temp_tensor_extras_q6_K.pop_back(); + } + + temp_tensor_extras_q6_K_in_use.push_back(extra); + + extra->reset(); + return extra; + } + void reset() { for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) { temp_tensor_extras.push_back(e); @@ -3547,6 +3633,11 @@ struct ggml_backend_opencl_buffer_context { temp_tensor_extras_q8_0.push_back(e); } temp_tensor_extras_q8_0_in_use.clear(); + + for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) { + temp_tensor_extras_q6_K.push_back(e); + } + temp_tensor_extras_q6_K_in_use.clear(); } // Pools for extras. Available extras are in `temp_tensor_extras`. Extras @@ -3562,6 +3653,8 @@ struct ggml_backend_opencl_buffer_context { std::vector temp_tensor_extras_mxfp4_in_use; std::vector temp_tensor_extras_q8_0; std::vector temp_tensor_extras_q8_0_in_use; + std::vector temp_tensor_extras_q6_K; + std::vector temp_tensor_extras_q6_K_in_use; // The buffer_context is initially created by ggml_backend_buft_alloc_buffer // before any tensor is initialized (at the beginning of alloc_tensor_range). @@ -4068,6 +4161,92 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, return; } + if (tensor->type == GGML_TYPE_Q6_K) { + ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; + GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); + + // Allocate the new extra and create aliases from the original. + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + ggml_tensor_extra_cl_q6_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q6_K(); + + size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; + size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4; + size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16; + size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) && + "Incorrect tensor size"); + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + CL_CHECK(clEnqueueWriteBuffer( + queue, data_device, CL_TRUE, 0, + ggml_nbytes(tensor), data, 0, NULL, NULL)); + + cl_buffer_region region; + + // Subbuffer for ql + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = size_ql; + extra->ql = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + auto previous_origin = region.origin; + + // Subbuffer for qh + region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment); + region.size = size_qh; + extra->qh = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + previous_origin = region.origin; + + // Subbuffer for scales + region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment); + region.size = size_s; + extra->s = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + previous_origin = region.origin; + + // Create subbuffer for d. + region.origin = align_to(previous_origin + size_s, backend_ctx->alignment); + region.size = size_d; + extra->d = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + previous_origin = region.origin; + + // Flatten the weights + cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + extra->size_ql = size_ql; + extra->size_qh = size_qh; + extra->size_s = size_s; + extra->size_d = size_d; + + tensor->extra = extra; + return; + } #endif // GGML_OPENCL_SOA_Q ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; @@ -4277,6 +4456,34 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; size_t local_work_size[] = {1, 1, 1}; + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } + if (tensor->type == GGML_TYPE_Q6_K) { + ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra; + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {1, 1, 1}; + cl_event evt; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); @@ -7765,6 +7972,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra; ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; + ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra; #endif const int ne00 = src0 ? src0->ne[0] : 0; @@ -8648,14 +8856,49 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: +#ifdef GGML_OPENCL_SOA_Q + kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat; + + if (backend_ctx->gpu_family == INTEL) { + nth0 = 16; + nth1 = 2; + ndst = 4; + } else if (backend_ctx->gpu_family == ADRENO) { + nth0 = 64; + nth1 = 2; + ndst = 4; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3)); +#else kernel = backend_ctx->kernel_mul_mv_q6_K_f32; if (backend_ctx->gpu_family == INTEL) { - nth0 = 2; - nth1 = 16; + nth0 = 16; + nth1 = 2; + ndst = 1; } else if (backend_ctx->gpu_family == ADRENO) { - nth0 = 2; - nth1 = 64; + nth0 = 64; + nth1 = 2; + ndst = 1; } else { GGML_ASSERT(false && "TODO: Unknown GPU"); } @@ -8675,6 +8918,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); +#endif // GGML_OPENCL_SOA_Q break; case GGML_TYPE_MXFP4: { #ifdef GGML_OPENCL_SOA_Q @@ -8777,7 +9021,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } else if (src0t == GGML_TYPE_Q5_K) { GGML_ASSERT(false && "not implemented"); } else if (src0t == GGML_TYPE_Q6_K) { - size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; + size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index 513a4d3e28..adf576a839 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -46,6 +46,16 @@ struct block_q4_0 uint8_t qs[QK4_0 / 2]; }; +//------------------------------------------------------------------------------ +// block_q6_K +//------------------------------------------------------------------------------ +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + half d; // super-block scale +}; + //------------------------------------------------------------------------------ // kernel_convert_block_q4_0 // Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA). @@ -263,3 +273,63 @@ kernel void kernel_restore_block_q8_0( b->qs[i] = q[i]; } } + +//------------------------------------------------------------------------------ +// kernel_convert_block_q6_K +// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA). +// This kernel does not deshuffle the bits. +// Each thread processes a super block. +//------------------------------------------------------------------------------ +kernel void kernel_convert_block_q6_K( + global struct block_q6_K * src0, + global uchar * dst_ql, + global uchar * dst_qh, + global char * dst_s, + global half * dst_d +) { + global struct block_q6_K * b = (global struct block_q6_K *) src0 + get_global_id(0); + global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0); + global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0); + global char * s = (global char *) dst_s + QK_K/16*get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + + *d = b->d; + + for (int i = 0; i < QK_K/2; ++i) { + ql[i] = b->ql[i]; + } + for (int i = 0; i < QK_K/4; ++i) { + qh[i] = b->qh[i]; + } + for (int i = 0; i < QK_K/16; ++i) { + s[i] = b->scales[i]; + } +} + +// Restore block_q6_K from flattened arrays. +// Each thread processes a super block. +kernel void kernel_restore_block_q6_K( + global uchar * dst_ql, + global uchar * dst_qh, + global char * dst_s, + global half * dst_d, + global struct block_q6_K * dst +) { + global struct block_q6_K * b = (global struct block_q6_K *) dst + get_global_id(0); + global uchar * ql = (global uchar *) dst_ql + QK_K/2*get_global_id(0); + global uchar * qh = (global uchar *) dst_qh + QK_K/4*get_global_id(0); + global char * s = (global char *) dst_s + QK_K/16*get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + + b->d = *d; + + for (int i = 0; i < QK_K/2; ++i) { + b->ql[i] = ql[i]; + } + for (int i = 0; i < QK_K/4; ++i) { + b->qh[i] = qh[i]; + } + for (int i = 0; i < QK_K/16; ++i) { + b->scales[i] = s[i]; + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl similarity index 100% rename from ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl rename to ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl new file mode 100644 index 0000000000..86fe09c6dd --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl @@ -0,0 +1,194 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +//------------------------------------------------------------------------------ +// kernel_mul_mv_q6_K_f32_flat +//------------------------------------------------------------------------------ +#define Q6_K_MASK1 0x03 +#define Q6_K_MASK2 0x0C +#define Q6_K_MASK3 0x30 +#define Q6_K_MASK4 0xC0 + +#define QK_K 256 + +inline float block_q_6_K_dot_y_flat( + global uchar * blk_ql, + global uchar * blk_qh, + global char * blk_scales, + global half * blk_d, + global float * yy, + int ib, + int ip, + int is, + int l0 +) { + int y_offset = 128*ip + l0; + int q_offset_l = 64*ip + l0; + int q_offset_h = 32*ip + l0; + + global uchar * q1 = blk_ql + ib*128 + q_offset_l; + global uchar * q2 = q1 + QK_K/8; + global uchar * qh = blk_qh + ib*64 + q_offset_h; + global char * sc = blk_scales + ib*16 + is; + + global float * y = yy + ib * QK_K + y_offset; + + float dall = blk_d[ib]; + + float sumf = 0; + float4 sums = {0.f, 0.f, 0.f, 0.f}; + + sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f); + sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f); + sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f); + sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f); + + sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f); + sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f); + sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f); + sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f); + + sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f); + sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f); + sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f); + sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f); + + sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f); + sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f); + sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f); + sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f); + + sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]); + + return sumf; +} + +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 4 +#define N_SIMDGROUP 2 +#define N_SIMDWIDTH 16 +#elif defined (ADRENO_GPU) +#define N_DST 4 +#define N_SIMDGROUP 2 +#define N_SIMDWIDTH 64 +#endif + +#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_q6_K_f32_flat( + global uchar * src0_ql, + global uchar * src0_qh, + global char * src0_s, + global half * src0_d, + global float * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + src1 = (global float*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + int nb = ne00/QK_K; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = get_group_id(2); + + int i12 = im%ne12; + int i13 = im/ne12; + + int first_row = (N_SIMDGROUP * r0 + get_sub_group_id()) * N_DST; + + ulong offset_src0 = first_row*nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset_src0_ql = offset_src0 * 128; + ulong offset_src0_qh = offset_src0 * 64; + ulong offset_src0_s = offset_src0 * 16; + ulong offset_src0_d = offset_src0; + + global uchar * blk_ql = (global uchar *) src0_ql + offset_src0_ql; + global uchar * blk_qh = (global uchar *) src0_qh + offset_src0_qh; + global char * blk_scales = (global char *) src0_s + offset_src0_s; + global half * blk_d = (global half *) src0_d + offset_src0_d; + global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1; + + int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0 + int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1 + int ip = tid/8; // first or second half of (super) block (0 or 1) + int il = tid%8; // each half has 8 parts, one per scale + int n = 4; // 4 scales at a time (and 4 sums) + int l0 = n*il; // offset into half-block, 0..28 + int is = 8*ip + l0/16; // 0, 1, 8, 9 + + float4 sumf = 0; + + for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) { + if (first_row + 0 < ne01) { + sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0); + } + if (first_row + 1 < ne01) { + sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0); + } + if (first_row + 2 < ne01) { + sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0); + } + if (first_row + 3 < ne01) { + sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0); + } + } + + float4 tot = (float4)( + sub_group_reduce_add(sumf.s0), + sub_group_reduce_add(sumf.s1), + sub_group_reduce_add(sumf.s2), + sub_group_reduce_add(sumf.s3) + ); + if (get_sub_group_local_id() == 0) { + if (first_row + 0 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; + } + if (first_row + 1 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; + } + if (first_row + 2 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; + } + if (first_row + 3 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + } + } +} From 7afdfc9b844ce38179fc4f0e4caa8b5c9a98db43 Mon Sep 17 00:00:00 2001 From: shalinib-ibm Date: Tue, 27 Jan 2026 09:22:34 +0530 Subject: [PATCH 20/22] ggml-cpu: Enable FP16 MMA kernels on PPC (#19060) --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 81 +++++++++++++++++++-------- 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 7dc36d4f8a..8f980c16b9 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -1797,10 +1797,27 @@ class tinyBLAS_Q0_AVX { } \ } \ +template +struct mma_instr; + +template<> +struct mma_instr { + static inline void outer_product(acc_t *acc, vec_t a, vec_t b) { + __builtin_mma_xvbf16ger2pp(acc, a, b); + } +}; + +template<> +struct mma_instr { + static inline void outer_product(acc_t *acc, vec_t a, vec_t b) { + __builtin_mma_xvf16ger2pp(acc, a, b); + } +}; + template -class tinyBLAS_BF16_PPC { +class tinyBLAS_HP16_PPC { public: - tinyBLAS_BF16_PPC(int64_t k, + tinyBLAS_HP16_PPC(int64_t k, const TA *A, int64_t lda, const TB *B, int64_t ldb, TC *C, int64_t ldc, @@ -2118,8 +2135,8 @@ class tinyBLAS_BF16_PPC { packNormal((A+(ii*lda)+l), lda, 4, 8, (uint8_t*)vec_A); packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B); for (int x = 0; x < 4; x++) { - __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); - __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]); + mma_instr::outer_product(&acc_0, vec_A[x], vec_B[x]); + mma_instr::outer_product(&acc_1, vec_A[x], vec_B[x+4]); } } SAVE_ACC(&acc_0, ii, jj); @@ -2135,8 +2152,8 @@ class tinyBLAS_BF16_PPC { packNormal((A+(ii*lda)+l), lda, 8, 8, (uint8_t*)vec_A); packNormal((B+(jj*ldb)+l), ldb, 8, 4, (uint8_t*)vec_B); for (int x = 0; x < 4; x++) { - __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); - __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x+4], vec_B[x]); + mma_instr::outer_product(&acc_0, vec_A[x], vec_B[x]); + mma_instr::outer_product(&acc_1, vec_A[x], vec_B[x+4]); } } SAVE_ACC(&acc_0, ii, jj); @@ -2155,10 +2172,10 @@ class tinyBLAS_BF16_PPC { packNormal(A+(ii*lda)+l, lda, 8, 8, (uint8_t*)vec_A); packNormal(B+(jj*ldb)+l, ldb, 8, 8, (uint8_t*)vec_B); for (int x = 0; x < 4; x++) { - __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); - __builtin_mma_xvbf16ger2pp(&acc_1, (vec_t)vec_A[x], (vec_t)vec_B[x+4]); - __builtin_mma_xvbf16ger2pp(&acc_2, (vec_t)vec_A[x+4], (vec_t)vec_B[x]); - __builtin_mma_xvbf16ger2pp(&acc_3, (vec_t)vec_A[x+4], (vec_t)vec_B[x+4]); + mma_instr::outer_product(&acc_0, vec_A[x], vec_B[x]); + mma_instr::outer_product(&acc_1, vec_A[x], vec_B[x+4]); + mma_instr::outer_product(&acc_2, vec_A[x+4], vec_B[x]); + mma_instr::outer_product(&acc_3, vec_A[x+4], vec_B[x+4]); } } @@ -2189,7 +2206,7 @@ class tinyBLAS_BF16_PPC { packNormal(A+(ii*lda)+l, lda, RM, 4, (uint8_t*)vec_A); packNormal(B+(jj*ldb)+l, ldb, RN, 4, (uint8_t*)vec_B); for (int x = 0; x<2; x++) { - __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); + mma_instr::outer_product(&acc_0, vec_A[x], vec_B[x]); } } __builtin_mma_disassemble_acc(vec_C, &acc_0); @@ -2224,8 +2241,8 @@ class tinyBLAS_BF16_PPC { packNormal(A+(ii*lda)+l, lda, RM, 8, (uint8_t*)vec_A); packNormal(B+(jj*ldb)+l, ldb, RN, 8, (uint8_t*)vec_B); for (int x = 0; x<4; x++) { - __builtin_mma_xvbf16ger2pp(&acc_0, vec_A[x], vec_B[x]); - __builtin_mma_xvbf16ger2pp(&acc_1, vec_A[x], vec_B[x+4]); + mma_instr::outer_product(&acc_0, vec_A[x], vec_B[x]); + mma_instr::outer_product(&acc_1, vec_A[x], vec_B[x+4]); } } __builtin_mma_disassemble_acc(vec_C, &acc_0); @@ -3418,16 +3435,19 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 return tb.matmul(m, n); } #elif defined(__MMA__) - if ((k % 8)) - return false; - if(Btype == GGML_TYPE_BF16) { - tinyBLAS_BF16_PPC tb{ k, - (const ggml_bf16_t *)A, lda, - (const ggml_bf16_t *)B, ldb, - (float *)C, ldc, - params->ith, params->nth}; - tb.matmul(m, n); - return true; + if (k % 8) { + return false; + } + + if (Btype == GGML_TYPE_BF16) { + tinyBLAS_HP16_PPC tb{ k, + (const ggml_bf16_t *)A, lda, + (const ggml_bf16_t *)B, ldb, + (float *)C, ldc, + params->ith, params->nth }; + + tb.matmul(m, n); + return true; } #elif defined(__riscv_zvfbfwma) #if LMUL == 1 @@ -3516,6 +3536,21 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 #endif return tb.matmul(m, n); } +#elif defined(__MMA__) + if (k % 8) { + return false; + } + + if (Btype == GGML_TYPE_F16) { + tinyBLAS_HP16_PPC tb{ k, + (const ggml_fp16_t *)A, lda, + (const ggml_fp16_t *)B, ldb, + (float *)C, ldc, + params->ith, params->nth }; + + tb.matmul(m, n); + return true; + } #endif return false; } From fc3cdf32ce5ea3017299d2afb947d3ba9844445a Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 27 Jan 2026 06:16:00 +0100 Subject: [PATCH 21/22] common : clarify HTTPS build options in error message (#19103) * common : clarify HTTPS build options in error message This commit updates the https error message to provide clearer instructions for users who encounter the "HTTPS is not supported" error. The motivation for this is that it might not be clear to users that only one of these options are needed to enable HTTPS support. The LLAMA_OPENSSL option is also added to the message to cover all possible build configurations. * clarify that OpenSSL is the default for HTTPS support --- common/http.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/http.h b/common/http.h index 7c683aafcf..e8ed56f952 100644 --- a/common/http.h +++ b/common/http.h @@ -60,10 +60,10 @@ static std::pair common_http_client(const std: #ifndef CPPHTTPLIB_OPENSSL_SUPPORT if (parts.scheme == "https") { throw std::runtime_error( - "HTTPS is not supported. Please rebuild with:\n" + "HTTPS is not supported. Please rebuild with one of:\n" " -DLLAMA_BUILD_BORINGSSL=ON\n" " -DLLAMA_BUILD_LIBRESSL=ON\n" - "or ensure dev files of an OpenSSL-compatible library are available when building." + " -DLLAMA_OPENSSL=ON (default, requires OpenSSL dev files installed)" ); } #endif From a83c73a18aaffba253ffd01e7cd3af41feaf8179 Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Tue, 27 Jan 2026 06:52:44 +0000 Subject: [PATCH 22/22] [CUDA] Reduce CPU-side stalls due to the CUDA command buffer being full (#19042) * [CUDA] Reduce CPU-side stalls due to the CUDA command buffer being full With pipeline parallelism, during prompt processing, the CPU-side CUDA command buffer gets full, stalling the CPU. Due to this, enough work doesn't get submitted to the GPU, causing bubbles in the GPU timeline. Fix this by setting the CUDA environment variable CUDA_SCALE_LAUNCH_QUEUES to 4x to increase the command buffer size. * Set the env variable in the CUDA backend registry allocation * Add link to PR in code comment * Remove warning logs and update documentation --- docs/build.md | 8 ++++++++ ggml/src/ggml-cuda/ggml-cuda.cu | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/docs/build.md b/docs/build.md index fce9361b2d..4983cfcfea 100644 --- a/docs/build.md +++ b/docs/build.md @@ -248,6 +248,14 @@ You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf ``` +#### CUDA_SCALE_LAUNCH_QUEUES + +The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/environment-variables.html#cuda-scale-launch-queues) controls the size of CUDA's command buffer, which determines how many GPU operations can be queued before the CPU must wait for the GPU to catch up. A larger buffer reduces CPU-side stalls and allows more work to be queued on a GPU. + +**Default behavior:** llama.cpp automatically sets `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs. + +See PR [#19042](https://github.com/ggml-org/llama.cpp/pull/19042) for performance benchmarks and technical details. + ### Unified Memory The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`. diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 99f0919a51..e9df0ea4a7 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4876,6 +4876,16 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { + // Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance + // PR: https://github.com/ggml-org/llama.cpp/pull/19042 + if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) { +#ifdef _WIN32 + _putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x"); +#else + setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set +#endif // _WIN32 + } + ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;