mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Cleaner log for adjusted splits (#1494)
* sweep-bench: add more skipped patterns to --minilog
* cleaner log for adjusted splits
* Add totalization for adjusted splits
* Clean up semicolons
* Addition for totalizer ^^
* Change accordingly to review
* Forgotten leftover removed
* 'total' instead of 'totalized'
This commit is contained in:
parent
cdf9142aa5
commit
094f76ee86
@ -37,6 +37,8 @@ static void llama_selective_log_callback(ggml_log_level level, const char * text
|
||||
"Layer ",
|
||||
"llm_load_tensors:",
|
||||
"==========================",
|
||||
"merging up/gate in layer",
|
||||
"repacking up/gate experts weight in layer",
|
||||
};
|
||||
for (const char * pat : skip_patterns) {
|
||||
if (strstr(text, pat) != nullptr) {
|
||||
|
||||
@ -3961,6 +3961,10 @@ bool create_tensors_helper::create_tensors() {
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<float> gpu_split_count;
|
||||
if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size())) {
|
||||
gpu_split_count.resize(model.splits.size(), 0.0f);
|
||||
}
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
int gqa_ratio = hparams.n_head(il) / hparams.n_head_kv(il);
|
||||
if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
|
||||
@ -3970,11 +3974,17 @@ bool create_tensors_helper::create_tensors() {
|
||||
if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size()) && il % adjust_step == 0) {
|
||||
cur_splits = model.splits;
|
||||
adjust_split(cur_splits, mem_used, model.max_gpu);
|
||||
LLAMA_LOG_INFO("Adjusted split at layer %2d:", il);
|
||||
LLAMA_LOG_INFO("Adjusted split at layer %2d: ", il);
|
||||
float last_split = 0;
|
||||
for (auto & p : cur_splits) {
|
||||
LLAMA_LOG_INFO(" %g", p - last_split);
|
||||
last_split = p;
|
||||
for (int i = 0; i < (int)cur_splits.size(); ++i) {
|
||||
if (i > 0) {
|
||||
LLAMA_LOG_INFO(" ; ");
|
||||
}
|
||||
LLAMA_LOG_INFO("GPU%d: %4g", i, cur_splits[i] - last_split);
|
||||
if (i < int(gpu_split_count.size())) {
|
||||
gpu_split_count[i] += cur_splits[i] - last_split;
|
||||
}
|
||||
last_split = cur_splits[i];
|
||||
}
|
||||
LLAMA_LOG_INFO("\n");
|
||||
}
|
||||
@ -4243,6 +4253,17 @@ bool create_tensors_helper::create_tensors() {
|
||||
}
|
||||
}
|
||||
|
||||
if (!gpu_split_count.empty()) {
|
||||
LLAMA_LOG_INFO("Adjusted splits (total) : ");
|
||||
for (int i = 0; i < (int)gpu_split_count.size(); ++i) {
|
||||
if (i > 0) {
|
||||
LLAMA_LOG_INFO(" ; ");
|
||||
}
|
||||
LLAMA_LOG_INFO("GPU%d: %4g", i, gpu_split_count[i]);
|
||||
}
|
||||
LLAMA_LOG_INFO("\n");
|
||||
}
|
||||
|
||||
if (model.output) {
|
||||
if (auto it = split_tensors.find(model.output); it != split_tensors.end()) {
|
||||
if (ggml_backend_buft_is_host(model.buft_output.buft_matrix)) {
|
||||
|
||||
@ -4888,7 +4888,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
|
||||
if (nbytes > aux_buffer_gate.size()) {
|
||||
aux_buffer_gate.resize(nbytes);
|
||||
}
|
||||
printf("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
|
||||
LLAMA_LOG_INFO("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
|
||||
ggml_backend_tensor_get(l.ffn_up_exps, aux_buffer_up.data(), 0, nbytes);
|
||||
ggml_backend_tensor_get(l.ffn_gate_exps, aux_buffer_gate.data(), 0, nbytes);
|
||||
if (aux_buffer_up_gate.size() < 2*nbytes) {
|
||||
@ -4914,7 +4914,7 @@ static void llama_repack_up_gate_exps(llama_context & lctx) {
|
||||
if (nbytes > aux_buffer_gate.size()) {
|
||||
aux_buffer_gate.resize(nbytes);
|
||||
}
|
||||
printf("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
|
||||
LLAMA_LOG_INFO("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
|
||||
ggml_backend_tensor_get(l.ffn_up_exps_b, aux_buffer_up.data(), 0, nbytes);
|
||||
ggml_backend_tensor_get(l.ffn_gate_exps_b, aux_buffer_gate.data(), 0, nbytes);
|
||||
if (aux_buffer_up_gate.size() < 2*nbytes) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user