From 094f76ee86d188fc0caea0c083b71c5f6ae80470 Mon Sep 17 00:00:00 2001 From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> Date: Tue, 24 Mar 2026 07:49:40 +0100 Subject: [PATCH] Cleaner log for adjusted splits (#1494) * sweep-bench: add more skipped patterns to --minilog * cleaner log for adjusted splits * Add totalization for adjusted splits * Clean up semicolons * Addition for totalizer ^^ * Change accordingly to review * Forgotten leftover removed * 'total' instead of 'totalized' --- examples/sweep-bench/sweep-bench.cpp | 2 ++ src/llama-load-tensors.cpp | 29 ++++++++++++++++++++++++---- src/llama.cpp | 4 ++-- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/examples/sweep-bench/sweep-bench.cpp b/examples/sweep-bench/sweep-bench.cpp index 19a0af87..741f2230 100644 --- a/examples/sweep-bench/sweep-bench.cpp +++ b/examples/sweep-bench/sweep-bench.cpp @@ -37,6 +37,8 @@ static void llama_selective_log_callback(ggml_log_level level, const char * text "Layer ", "llm_load_tensors:", "==========================", + "merging up/gate in layer", + "repacking up/gate experts weight in layer", }; for (const char * pat : skip_patterns) { if (strstr(text, pat) != nullptr) { diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index bef7762b..a64df9b9 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -3961,6 +3961,10 @@ bool create_tensors_helper::create_tensors() { } } } + std::vector<float> gpu_split_count; + if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) { + gpu_split_count.resize(model.splits.size(), 0.0f); + } for (int il = 0; il < n_layer; ++il) { int gqa_ratio = hparams.n_head(il) / hparams.n_head_kv(il); if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) { @@ -3970,11 +3974,17 @@ bool create_tensors_helper::create_tensors() { if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size()) && il % adjust_step == 0) { cur_splits = model.splits; adjust_split(cur_splits, 
mem_used, model.max_gpu); - LLAMA_LOG_INFO("Adjusted split at layer %2d:", il); + LLAMA_LOG_INFO("Adjusted split at layer %2d: ", il); float last_split = 0; - for (auto & p : cur_splits) { - LLAMA_LOG_INFO(" %g", p - last_split); - last_split = p; + for (int i = 0; i < (int)cur_splits.size(); ++i) { + if (i > 0) { + LLAMA_LOG_INFO(" ; "); + } + LLAMA_LOG_INFO("GPU%d: %4g", i, cur_splits[i] - last_split); + if (i < int(gpu_split_count.size())) { + gpu_split_count[i] += cur_splits[i] - last_split; + } + last_split = cur_splits[i]; } LLAMA_LOG_INFO("\n"); } @@ -4243,6 +4253,17 @@ bool create_tensors_helper::create_tensors() { } } + if (!gpu_split_count.empty()) { + LLAMA_LOG_INFO("Adjusted splits (total) : "); + for (int i = 0; i < (int)gpu_split_count.size(); ++i) { + if (i > 0) { + LLAMA_LOG_INFO(" ; "); + } + LLAMA_LOG_INFO("GPU%d: %4g", i, gpu_split_count[i]); + } + LLAMA_LOG_INFO("\n"); + } + if (model.output) { if (auto it = split_tensors.find(model.output); it != split_tensors.end()) { if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) { diff --git a/src/llama.cpp b/src/llama.cpp index c6c8c89c..8e7c7803 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4888,7 +4888,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) { if (nbytes > aux_buffer_gate.size()) { aux_buffer_gate.resize(nbytes); } - printf("%s: repacking up/gate experts weight in layer %d\n", __func__, il); + LLAMA_LOG_INFO("%s: repacking up/gate experts weight in layer %d\n", __func__, il); ggml_backend_tensor_get(l.ffn_up_exps, aux_buffer_up.data(), 0, nbytes); ggml_backend_tensor_get(l.ffn_gate_exps, aux_buffer_gate.data(), 0, nbytes); if (aux_buffer_up_gate.size() < 2*nbytes) { @@ -4914,7 +4914,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) { if (nbytes > aux_buffer_gate.size()) { aux_buffer_gate.resize(nbytes); } - printf("%s: repacking up/gate experts bias in layer %d\n", __func__, il); + LLAMA_LOG_INFO("%s: repacking up/gate experts bias in layer 
%d\n", __func__, il); ggml_backend_tensor_get(l.ffn_up_exps_b, aux_buffer_up.data(), 0, nbytes); ggml_backend_tensor_get(l.ffn_gate_exps_b, aux_buffer_gate.data(), 0, nbytes); if (aux_buffer_up_gate.size() < 2*nbytes) {