From 094f76ee86d188fc0caea0c083b71c5f6ae80470 Mon Sep 17 00:00:00 2001
From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 24 Mar 2026 07:49:40 +0100
Subject: [PATCH] Cleaner log for adjusted splits (#1494)

* sweep-bench: add more skipped patterns to --minilog

* cleaner log for adjusted splits

* Add totalization for adjusted splits

* Clean up semicolons

* Addition for totalizer ^^

* Change according to review

* Forgotten leftover removed

* 'total' instead of 'totalized'
---
 examples/sweep-bench/sweep-bench.cpp |  2 ++
 src/llama-load-tensors.cpp           | 29 ++++++++++++++++++++++++----
 src/llama.cpp                        |  4 ++--
 3 files changed, 29 insertions(+), 6 deletions(-)
diff --git a/examples/sweep-bench/sweep-bench.cpp b/examples/sweep-bench/sweep-bench.cpp
index 19a0af87..741f2230 100644
--- a/examples/sweep-bench/sweep-bench.cpp
+++ b/examples/sweep-bench/sweep-bench.cpp
@@ -37,6 +37,8 @@ static void llama_selective_log_callback(ggml_log_level level, const char * text
         "Layer ",
         "llm_load_tensors:",
         "==========================",
+        "merging up/gate in layer",
+        "repacking up/gate experts weight in layer",
     };
     for (const char * pat : skip_patterns) {
         if (strstr(text, pat) != nullptr) {
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index bef7762b..a64df9b9 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -3961,6 +3961,10 @@ bool create_tensors_helper::create_tensors() {
                 }
             }
         }
+        std::vector<float> gpu_split_count;
+        if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
+            gpu_split_count.resize(model.splits.size(), 0.0f);
+        }
         for (int il = 0; il < n_layer; ++il) {
             int gqa_ratio = hparams.n_head(il) / hparams.n_head_kv(il);
             if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
@@ -3970,11 +3974,17 @@ bool create_tensors_helper::create_tensors() {
             if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size()) && il % adjust_step == 0) {
                 cur_splits = model.splits;
                 adjust_split(cur_splits, mem_used, model.max_gpu);
-                LLAMA_LOG_INFO("Adjusted split at layer %2d:", il);
+                LLAMA_LOG_INFO("Adjusted split at layer %2d:  ", il);
                 float last_split = 0;
-                for (auto & p : cur_splits) {
-                    LLAMA_LOG_INFO(" %g", p - last_split);
-                    last_split = p;
+                for (int i = 0; i < (int)cur_splits.size(); ++i) {
+                    if (i > 0) {
+                        LLAMA_LOG_INFO(" ; ");
+                    }
+                    LLAMA_LOG_INFO("GPU%d: %4g", i, cur_splits[i] - last_split);
+                    if (i < int(gpu_split_count.size())) {
+                        gpu_split_count[i] += cur_splits[i] - last_split;
+                    }
+                    last_split = cur_splits[i];
                 }
                 LLAMA_LOG_INFO("\n");
             }
@@ -4243,6 +4253,17 @@ bool create_tensors_helper::create_tensors() {
             }
         }
 
+        if (!gpu_split_count.empty()) {
+            LLAMA_LOG_INFO("Adjusted splits (total)   :  ");
+            for (int i = 0; i < (int)gpu_split_count.size(); ++i) {
+                if (i > 0) {
+                    LLAMA_LOG_INFO(" ; ");
+                }
+                LLAMA_LOG_INFO("GPU%d: %4g", i, gpu_split_count[i]);
+            }
+            LLAMA_LOG_INFO("\n");
+        }
+
         if (model.output) {
             if (auto it = split_tensors.find(model.output); it != split_tensors.end()) {
                 if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) {
diff --git a/src/llama.cpp b/src/llama.cpp
index c6c8c89c..8e7c7803 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4888,7 +4888,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
             if (nbytes > aux_buffer_gate.size()) {
                 aux_buffer_gate.resize(nbytes);
             }
-            printf("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
+            LLAMA_LOG_INFO("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
             ggml_backend_tensor_get(l.ffn_up_exps, aux_buffer_up.data(), 0, nbytes);
             ggml_backend_tensor_get(l.ffn_gate_exps, aux_buffer_gate.data(), 0, nbytes);
             if (aux_buffer_up_gate.size() < 2*nbytes) {
@@ -4914,7 +4914,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
                 if (nbytes > aux_buffer_gate.size()) {
                     aux_buffer_gate.resize(nbytes);
                 }
-                printf("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
+                LLAMA_LOG_INFO("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
                 ggml_backend_tensor_get(l.ffn_up_exps_b, aux_buffer_up.data(), 0, nbytes);
                 ggml_backend_tensor_get(l.ffn_gate_exps_b, aux_buffer_gate.data(), 0, nbytes);
                 if (aux_buffer_up_gate.size() < 2*nbytes) {