Cleaner log for adjusted splits (#1494)

* sweep-bench: add more skipped patterns to --minilog
* cleaner log for adjusted splits
* Add totalization for adjusted splits
* Clean up semicolons
* Addition for totalizer ^^
* Change according to review
* Forgotten leftover removed
* 'total' instead of 'totalized'
2026-06-28 04:30:15 -05:00 · 2026-03-24 07:49:40 +01:00 · 2026-03-24 07:49:40 +01:00 · 094f76ee86
commit 094f76ee86
parent cdf9142aa5
3 changed files with 29 additions and 6 deletions
--- a/examples/sweep-bench/sweep-bench.cpp
+++ b/examples/sweep-bench/sweep-bench.cpp
@ -37,6 +37,8 @@ static void llama_selective_log_callback(ggml_log_level level, const char * text
        "Layer ",
        "llm_load_tensors:",
        "==========================",
+        "merging up/gate in layer",
+        "repacking up/gate experts weight in layer",
    };
    for (const char * pat : skip_patterns) {
        if (strstr(text, pat) != nullptr) {
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@ -3961,6 +3961,10 @@ bool create_tensors_helper::create_tensors() {
                }
            }
        }
+        std::vector<float> gpu_split_count;
+        if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
+            gpu_split_count.resize(model.splits.size(), 0.0f);
+        }
        for (int il = 0; il < n_layer; ++il) {
            int gqa_ratio = hparams.n_head(il) / hparams.n_head_kv(il);
            if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
@ -3970,11 +3974,17 @@ bool create_tensors_helper::create_tensors() {
            if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size()) && il % adjust_step == 0) {
                cur_splits = model.splits;
                adjust_split(cur_splits, mem_used, model.max_gpu);
-                LLAMA_LOG_INFO("Adjusted split at layer %2d:", il);
+                LLAMA_LOG_INFO("Adjusted split at layer %2d:  ", il);
                float last_split = 0;
-                for (auto & p : cur_splits) {
-                    LLAMA_LOG_INFO(" %g", p - last_split);
-                    last_split = p;
+                for (int i = 0; i < (int)cur_splits.size(); ++i) {
+                    if (i > 0) {
+                        LLAMA_LOG_INFO(" ; ");
+                    }
+                    LLAMA_LOG_INFO("GPU%d: %4g", i, cur_splits[i] - last_split);
+                    if (i < int(gpu_split_count.size())) {
+                        gpu_split_count[i] += cur_splits[i] - last_split;
+                    }
+                    last_split = cur_splits[i];
                }
                LLAMA_LOG_INFO("\n");
            }
@ -4243,6 +4253,17 @@ bool create_tensors_helper::create_tensors() {
            }
        }

+        if (!gpu_split_count.empty()) {
+            LLAMA_LOG_INFO("Adjusted splits (total)   :  ");
+            for (int i = 0; i < (int)gpu_split_count.size(); ++i) {
+                if (i > 0) {
+                    LLAMA_LOG_INFO(" ; ");
+                }
+                LLAMA_LOG_INFO("GPU%d: %4g", i, gpu_split_count[i]);
+            }
+            LLAMA_LOG_INFO("\n");
+        }
+
        if (model.output) {
            if (auto it = split_tensors.find(model.output); it != split_tensors.end()) {
                if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) {
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -4888,7 +4888,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
            if (nbytes > aux_buffer_gate.size()) {
                aux_buffer_gate.resize(nbytes);
            }
-            printf("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
+            LLAMA_LOG_INFO("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
            ggml_backend_tensor_get(l.ffn_up_exps, aux_buffer_up.data(), 0, nbytes);
            ggml_backend_tensor_get(l.ffn_gate_exps, aux_buffer_gate.data(), 0, nbytes);
            if (aux_buffer_up_gate.size() < 2*nbytes) {
@ -4914,7 +4914,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
                if (nbytes > aux_buffer_gate.size()) {
                    aux_buffer_gate.resize(nbytes);
                }
-                printf("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
+                LLAMA_LOG_INFO("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
                ggml_backend_tensor_get(l.ffn_up_exps_b, aux_buffer_up.data(), 0, nbytes);
                ggml_backend_tensor_get(l.ffn_gate_exps_b, aux_buffer_gate.data(), 0, nbytes);
                if (aux_buffer_up_gate.size() < 2*nbytes) {