mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
parent
86f4f516e5
commit
b6bac1aedb
@ -2263,7 +2263,7 @@ static bool llm_load_tensors(
|
||||
size_t mem_margin = fit_margin > 0 ? size_t(fit_margin)*1024*1024 : k_default_mem_margin;
|
||||
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
||||
int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
||||
bool use_mmap_buffer = true;
|
||||
|
||||
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
||||
@ -2330,6 +2330,13 @@ static bool llm_load_tensors(
|
||||
for (int i = 0; i <= n_layer; ++i) {
|
||||
required_mem += layer_sizes[i];
|
||||
}
|
||||
bool has_experts = false;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
if (experts[il].down || experts[il].up || experts[il].gate) {
|
||||
has_experts = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
size_t available_mem = 0;
|
||||
for (int id = 0; id < device_count; ++id) {
|
||||
if (device_mem[id] > max_compute) {
|
||||
@ -2368,10 +2375,14 @@ static bool llm_load_tensors(
|
||||
if (loaded_sum + layer_sizes[il] <= split_size) {
|
||||
model.default_layer_device[il] = id;
|
||||
loaded_sum += layer_sizes[il];
|
||||
LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
|
||||
if (required_mem <= available_mem) {
|
||||
LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
|
||||
}
|
||||
} else {
|
||||
if (loaded_sum + layer_sizes[il] - split_size < split_size - loaded_sum) {
|
||||
LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
|
||||
if (required_mem <= available_mem) {
|
||||
LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
|
||||
}
|
||||
model.default_layer_device[il] = id;
|
||||
loaded_sum += layer_sizes[il++];
|
||||
}
|
||||
@ -2383,6 +2394,7 @@ static bool llm_load_tensors(
|
||||
}
|
||||
if (fit && required_mem > available_mem) {
|
||||
auto buft = ggml_backend_cpu_buffer_type();
|
||||
if (has_experts) {
|
||||
if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
|
||||
auto cur_mem = required_mem;
|
||||
int n_override = 0;
|
||||
@ -2500,6 +2512,28 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int id = model.devices.size() - 1;
|
||||
for (int il = n_layer; il >= 0; --il) {
|
||||
if (device_mem[id] >= layer_sizes[il]) {
|
||||
device_mem[id] -= layer_sizes[il];
|
||||
model.default_layer_device[il] = id;
|
||||
LLAMA_LOG_INFO("Setting layer %d to device %d\n", il, id);
|
||||
} else {
|
||||
--id;
|
||||
if (id >= 0) {
|
||||
++il;
|
||||
} else {
|
||||
i_gpu_start = il+1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int il = 0; il < i_gpu_start; ++il) {
|
||||
model.default_layer_device[il] = -1;
|
||||
model.buft_layer[il] = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// assign the repeating layers to the devices according to the splits
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user