Cleaner log for adjusted splits (#1494)

* sweep-bench: add more skipped patterns to --minilog

* cleaner log for adjusted splits

* Add totalization for adjusted splits

* Clean up semicolons

* Addition for totalizer ^^

* Change according to review

* Forgotten leftover removed

* 'total' instead of 'totalized'
This commit is contained in:
Nexes the Elder 2026-03-24 07:49:40 +01:00 committed by GitHub
parent cdf9142aa5
commit 094f76ee86
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 29 additions and 6 deletions

View File

@@ -37,6 +37,8 @@ static void llama_selective_log_callback(ggml_log_level level, const char * text
"Layer ",
"llm_load_tensors:",
"==========================",
"merging up/gate in layer",
"repacking up/gate experts weight in layer",
};
for (const char * pat : skip_patterns) {
if (strstr(text, pat) != nullptr) {

View File

@@ -3961,6 +3961,10 @@ bool create_tensors_helper::create_tensors() {
}
}
}
std::vector<float> gpu_split_count;
if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
gpu_split_count.resize(model.splits.size(), 0.0f);
}
for (int il = 0; il < n_layer; ++il) {
int gqa_ratio = hparams.n_head(il) / hparams.n_head_kv(il);
if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
@@ -3970,11 +3974,17 @@ bool create_tensors_helper::create_tensors() {
if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size()) && il % adjust_step == 0) {
cur_splits = model.splits;
adjust_split(cur_splits, mem_used, model.max_gpu);
LLAMA_LOG_INFO("Adjusted split at layer %2d:", il);
LLAMA_LOG_INFO("Adjusted split at layer %2d: ", il);
float last_split = 0;
for (auto & p : cur_splits) {
LLAMA_LOG_INFO(" %g", p - last_split);
last_split = p;
for (int i = 0; i < (int)cur_splits.size(); ++i) {
if (i > 0) {
LLAMA_LOG_INFO(" ; ");
}
LLAMA_LOG_INFO("GPU%d: %4g", i, cur_splits[i] - last_split);
if (i < int(gpu_split_count.size())) {
gpu_split_count[i] += cur_splits[i] - last_split;
}
last_split = cur_splits[i];
}
LLAMA_LOG_INFO("\n");
}
@@ -4243,6 +4253,17 @@ bool create_tensors_helper::create_tensors() {
}
}
if (!gpu_split_count.empty()) {
LLAMA_LOG_INFO("Adjusted splits (total) : ");
for (int i = 0; i < (int)gpu_split_count.size(); ++i) {
if (i > 0) {
LLAMA_LOG_INFO(" ; ");
}
LLAMA_LOG_INFO("GPU%d: %4g", i, gpu_split_count[i]);
}
LLAMA_LOG_INFO("\n");
}
if (model.output) {
if (auto it = split_tensors.find(model.output); it != split_tensors.end()) {
if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) {

View File

@@ -4888,7 +4888,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
if (nbytes > aux_buffer_gate.size()) {
aux_buffer_gate.resize(nbytes);
}
printf("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
LLAMA_LOG_INFO("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
ggml_backend_tensor_get(l.ffn_up_exps, aux_buffer_up.data(), 0, nbytes);
ggml_backend_tensor_get(l.ffn_gate_exps, aux_buffer_gate.data(), 0, nbytes);
if (aux_buffer_up_gate.size() < 2*nbytes) {
@@ -4914,7 +4914,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
if (nbytes > aux_buffer_gate.size()) {
aux_buffer_gate.resize(nbytes);
}
printf("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
LLAMA_LOG_INFO("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
ggml_backend_tensor_get(l.ffn_up_exps_b, aux_buffer_up.data(), 0, nbytes);
ggml_backend_tensor_get(l.ffn_gate_exps_b, aux_buffer_gate.data(), 0, nbytes);
if (aux_buffer_up_gate.size() < 2*nbytes) {