ik_llama.cpp/src/llama-reload.cpp
Kawrakow d5507e33ae
Split mode graph for dense Gemma4 assistant (#2022)
* WIP: Split mode graph for Gemma4 assistant

Something is not right - acceptance drops to nearly zero.

* Per model CUDA contexts

Still not working!?

* This works

The issue was that I was not correctly calculating the number
of KV heads for the split KV cache.

* Compiler warnings

* It is better to use llama_context pointers as keys
2026-06-24 18:29:32 +02:00

948 lines
34 KiB
C++

#include "llama-reload-info.h"
#include "llama-model.h"
#include "llama-model-loader.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#include <map>
#include <fstream>
#include <algorithm>
#include <vector>
#include <cstdint>
#include <cstring>
// ------------------------------------------------------------------
// Debug helpers
// ------------------------------------------------------------------
// Writes a single debug line describing a tensor: name, type, extents,
// strides, buffer/data/extra pointers and the name of its buffer type.
// When NDEBUG is defined the function body is empty.
//   ctx - prefix printed at the start of the line
//   t   - tensor to describe; a null tensor is reported, not dereferenced
static void log_tensor_state(const char * ctx, struct ggml_tensor * t) {
#ifdef NDEBUG
    (void)ctx;
    (void)t;
#else
    if (t == nullptr) {
        LLAMA_LOG_DEBUG("%s: tensor=NULL\n", ctx);
        return;
    }

    // Resolve the buffer-type name, keeping "null" when there is no buffer
    // or the buffer reports no type.
    const char * buft_name = "null";
    if (t->buffer != nullptr) {
        auto type_of_buffer = ggml_backend_buffer_get_type(t->buffer);
        if (type_of_buffer) {
            buft_name = ggml_backend_buft_name(type_of_buffer);
        }
    }

    LLAMA_LOG_DEBUG("%s: tensor='%s' type=%s ne={%ld,%ld,%ld,%ld} nb={%zu,%zu,%zu,%zu} "
                    "buffer=%p data=%p extra=%p buft=%s\n",
                    ctx, t->name, ggml_type_name(t->type),
                    (long)t->ne[0], (long)t->ne[1], (long)t->ne[2], (long)t->ne[3],
                    t->nb[0], t->nb[1], t->nb[2], t->nb[3],
                    (void*)t->buffer, t->data, (void*)t->extra, buft_name);
#endif
}
// Debug dump of the per-device split layout attached to a tensor via
// `extra`. One header line is followed by one line per device slot.
// When NDEBUG is defined the function body is empty.
//   ctx - prefix printed at the start of every line
//   t   - tensor whose `extra` is interpreted as ggml_split_tensor_t
static void log_split_state(const char * ctx, struct ggml_tensor * t) {
#ifdef NDEBUG
    (void)ctx;
    (void)t;
#else
    if (t == nullptr || t->extra == nullptr) {
        LLAMA_LOG_DEBUG("%s: no splits (extra=%p)\n", ctx, (void*)(t ? t->extra : nullptr));
        return;
    }

    auto split_info = (ggml_split_tensor_t *)t->extra;
    LLAMA_LOG_DEBUG("%s: tensor='%s' n_device=%d split_dim=%d\n",
                    ctx, t->name, split_info->n_device, split_info->split_dim);

    for (int dev = 0; dev < split_info->n_device; ++dev) {
        struct ggml_tensor * part = split_info->splits[dev];
        if (part == nullptr) {
            LLAMA_LOG_DEBUG("%s:   split[%d]=NULL\n", ctx, dev);
            continue;
        }

        // Same "null" fallback as for the parent tensor.
        const char * split_buft_name = "null";
        if (part->buffer != nullptr) {
            auto type_of_buffer = ggml_backend_buffer_get_type(part->buffer);
            if (type_of_buffer) {
                split_buft_name = ggml_backend_buft_name(type_of_buffer);
            }
        }

        LLAMA_LOG_DEBUG("%s:   split[%d] type=%s ne={%ld,%ld,%ld,%ld} nb={%zu,%zu,%zu,%zu} "
                        "buffer=%p data=%p buft=%s\n",
                        ctx, dev, ggml_type_name(part->type),
                        (long)part->ne[0], (long)part->ne[1],
                        (long)part->ne[2], (long)part->ne[3],
                        part->nb[0], part->nb[1],
                        part->nb[2], part->nb[3],
                        (void*)part->buffer, part->data, split_buft_name);
    }
#endif
}
// ------------------------------------------------------------------
// GGUF header parser (reuses llama.cpp / ggml GGUF loader)
// ------------------------------------------------------------------
static bool gguf_find_tensor_meta(const char * path, const char * target_name,
size_t & out_offset, size_t & out_nbytes,
ggml_type & out_type,
int64_t out_ne[GGML_MAX_DIMS])
{
struct ggml_context * ctx = nullptr;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
struct gguf_context * gguf = gguf_init_from_file(path, params);
if (!gguf) {
return false;
}
const int idx = gguf_find_tensor(gguf, target_name);
if (idx < 0) {
ggml_free(ctx);
gguf_free(gguf);
return false;
}
struct ggml_tensor * tensor = ggml_get_tensor(ctx, target_name);
if (!tensor) {
ggml_free(ctx);
gguf_free(gguf);
return false;
}
out_offset = gguf_get_data_offset(gguf) + gguf_get_tensor_offset(gguf, idx);
out_nbytes = ggml_nbytes(tensor);
out_type = tensor->type;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
out_ne[i] = tensor->ne[i];
}
ggml_free(ctx);
gguf_free(gguf);
return true;
}
// ------------------------------------------------------------------
// Buffer census helper
// ------------------------------------------------------------------
static size_t count_buffer_users(
const std::vector<std::pair<std::string, struct ggml_tensor *>> & tensors_by_name,
ggml_backend_buffer_t buf)
{
if (!buf) return 0;
size_t n = 0;
for (auto & p : tensors_by_name) {
if (p.second->buffer == buf) ++n;
}
return n;
}
// Returns true when `buf` is recorded in any reload source of `model`, either
// as the source's original main buffer or as the buffer of one of its
// original splits. Returns false for a null buffer or when the model has no
// reload info.
static bool is_original_snapshot_buffer(llama_model & model, ggml_backend_buffer_t buf) {
    if (!buf || !model.reload) {
        return false;
    }
    for (const auto & entry : model.reload->tensor_reload_sources) {
        const auto & source = entry.second;
        if (source.original_buffer == buf) {
            return true;
        }
        for (const auto & split : source.original_splits) {
            if (split.buffer == buf) {
                return true;
            }
        }
    }
    return false;
}
// ------------------------------------------------------------------
// Final size estimator
// ------------------------------------------------------------------
// Byte size `tensor` would occupy if stored as `new_type`.
// When the type is unchanged this is simply ggml_nbytes(tensor); otherwise it
// is the row size of `new_type` for ne[0] elements times the number of rows.
static size_t llama_model_compute_final_nbytes(struct ggml_tensor * tensor, ggml_type new_type) {
    const bool same_type = (tensor->type == new_type);
    return same_type
        ? ggml_nbytes(tensor)
        : ggml_row_size(new_type, tensor->ne[0]) * ggml_nrows(tensor);
}
// ------------------------------------------------------------------
// Fallback allocator
// ------------------------------------------------------------------
// Allocates `size` bytes from buffer type `buft`. If that fails and `buft`
// is not already the CPU buffer type, a second attempt is made on the CPU
// buffer type. Returns the allocated buffer, or nullptr when every attempt
// failed. Each outcome is logged.
static ggml_backend_buffer_t alloc_buffer_fallback(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t primary = ggml_backend_buft_alloc_buffer(buft, size);
    if (primary != nullptr) {
        LLAMA_LOG_DEBUG("%s: allocated %zu bytes on backend '%s'\n",
                        __func__, size, ggml_backend_buft_name(buft));
        return primary;
    }

    ggml_backend_buffer_type_t host_buft = ggml_backend_cpu_buffer_type();
    if (host_buft == buft) {
        // The request already targeted the CPU: nothing left to try.
        LLAMA_LOG_WARN("%s: CPU alloc failed (%zu bytes)\n", __func__, size);
        return nullptr;
    }

    LLAMA_LOG_WARN("%s: backend alloc failed (%zu bytes on '%s'), trying CPU fallback\n",
                   __func__, size, ggml_backend_buft_name(buft));

    ggml_backend_buffer_t host = ggml_backend_buft_alloc_buffer(host_buft, size);
    if (host != nullptr) {
        LLAMA_LOG_DEBUG("%s: allocated %zu bytes on CPU fallback\n", __func__, size);
        return host;
    }

    LLAMA_LOG_WARN("%s: CPU fallback alloc failed (%zu bytes)\n", __func__, size);
    return nullptr;
}
// ------------------------------------------------------------------
// MoE sibling resync
// ------------------------------------------------------------------
// MoE layers have three weight tensors per block: gate, up, down.
// The CUDA split backend distributes each tensor across GPUs by splitting
// one dimension (usually dim 0 or 1). Split boundaries must be multiples
// of the quantization block size (e.g. 256 for IQ1_KT). If the reference
// tensor changes quantization type, its block size changes, which changes
// the valid split boundaries. ALL siblings in the same layer MUST adopt
// the SAME per-device split dimensions, otherwise the backend dispatches
// rows to the wrong devices and corrupts inference.
//
// When the reference tensor is back on its original snapshot, siblings
// can simply be reattached to their original snapshots too -- no data
// movement or allocation is required.
// ------------------------------------------------------------------
// ------------------------------------------------------------------
// Sibling name registration
// ------------------------------------------------------------------
// Registers the names of the other two MoE expert weight tensors that share
// a name prefix with `name`.
//
// The three known suffixes are tried in order (down, up, gate). For the first
// one that occurs in `name`, everything before the match is taken as the
// prefix, and prefix + each *other* suffix is appended to src.sibling_names.
// Nothing is appended when no suffix occurs in `name`.
static void populate_moe_siblings(const char * name, tensor_reload_source & src) {
    LLAMA_LOG_DEBUG("%s: name='%s'\n", __func__, name);

    static const char * suffixes[] = {
        ".ffn_down_exps.weight",
        ".ffn_up_exps.weight",
        ".ffn_gate_exps.weight",
    };
    const size_t n_suffixes = sizeof(suffixes) / sizeof(suffixes[0]);

    const std::string full_name(name);
    for (size_t own = 0; own < n_suffixes; ++own) {
        const size_t at = full_name.find(suffixes[own]);
        if (at == std::string::npos) {
            continue;
        }
        const std::string prefix = full_name.substr(0, at);
        for (size_t other = 0; other < n_suffixes; ++other) {
            if (other == own) {
                continue;
            }
            const std::string sibling = prefix + suffixes[other];
            src.sibling_names.push_back(sibling);
            LLAMA_LOG_DEBUG("%s: registered sibling '%s' for '%s'\n",
                            __func__, sibling.c_str(), name);
        }
        // Only the first matching suffix is considered.
        return;
    }

    LLAMA_LOG_DEBUG("%s: '%s' no MoE suffix matched\n", __func__, name);
}
// ------------------------------------------------------------------
// Snapshot helper
// ------------------------------------------------------------------
// Records the current placement of `tensor` in `src` so that it can later be
// re-attached to exactly this buffer and layout.
//
// Captured: buffer, data pointer, byte size, type, extents and strides. If
// the tensor carries split information in `extra`, the extra pointer and,
// per device slot, the split tensor pointer with its extents, strides, data
// and buffer are captured too. MoE sibling names are registered and the
// state is set to ON_ORIGINAL.
//
// Does nothing when `tensor` is null or when `src` already holds a snapshot
// (src.original_buffer is non-null).
static void snapshot_tensor_source(struct ggml_tensor * tensor,
                                   tensor_reload_source & src)
{
    if (!tensor || src.original_buffer != nullptr) return;

    src.original_buffer = tensor->buffer;
    src.original_data   = tensor->data;
    src.original_nbytes = ggml_nbytes(tensor);
    src.original_type   = tensor->type;
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        src.original_ne[i] = tensor->ne[i];
        src.original_nb[i] = tensor->nb[i];
    }

    auto extra = (ggml_split_tensor_t *)tensor->extra;
    if (extra) {
        src.original_extra = extra;
        src.original_splits.clear();
        for (int i = 0; i < extra->n_device; ++i) {
            // Value-initialize so that slots without a split hold null
            // pointers and zero extents. reattach_split_tensor_to_shared()
            // reads `tensor`, `buffer` and `data` from these entries, so they
            // must never be indeterminate.
            // NOTE(review): this is a no-op if split_info already has default
            // member initializers - its definition is not visible here.
            tensor_reload_source::split_info si{};
            if (extra->splits[i]) {
                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                    si.ne[j] = extra->splits[i]->ne[j];
                    si.nb[j] = extra->splits[i]->nb[j];
                }
                si.data   = extra->splits[i]->data;
                si.buffer = extra->splits[i]->buffer;
                si.tensor = extra->splits[i];
            }
            src.original_splits.push_back(si);
        }
    }

    populate_moe_siblings(ggml_get_name(tensor), src);
    src.state = tensor_reload_source::reload_state::ON_ORIGINAL;
    log_tensor_state("snapshot_tensor_source", tensor);
}
// ------------------------------------------------------------------
// Constructor
// ------------------------------------------------------------------
// Builds one reload source per loader weight, keyed by tensor name.
// Each source records the file path, the weight's offset (w.offs), the
// tensor's byte size and the file's modification time at construction
// (plus the nanosecond part on Linux). Weights without a tensor, with an
// out-of-range file index, or whose file cannot be stat()'ed are skipped.
reload_info::reload_info(const llama_model_loader & ml) {
    for (const auto & weight : ml.weights) {
        if (!weight.tensor) continue;
        if (weight.idx >= (int)ml.files.size()) continue;

        const auto & file_path = ml.files[weight.idx]->get_path();

        struct stat file_stat;
        if (stat(file_path.c_str(), &file_stat) != 0) continue;

        tensor_reload_source entry;
        entry.path        = file_path;
        entry.data_offset = weight.offs;
        entry.nbytes      = ggml_nbytes(weight.tensor);
        entry.last_mtime  = file_stat.st_mtime;
#ifdef __linux__
        entry.last_mtime_ns = file_stat.st_mtim.tv_nsec;
#endif
        tensor_reload_sources[ggml_get_name(weight.tensor)] = std::move(entry);
    }
}
// ------------------------------------------------------------------
// Eager snapshot
// ------------------------------------------------------------------
// Takes the one-time snapshot of every reload source that has a matching
// tensor in `model`, then makes a second pass over the registered sibling
// names and snapshots any sibling source that still has no original buffer.
// Only the first call does work; the atomic exchange on
// reload_snapshots_done turns later calls into no-ops.
void reload_info::snapshot_all_reload_tensors(llama_model & model) {
    if (this->reload_snapshots_done.exchange(true)) return;
    LLAMA_LOG_INFO("%s: eager snapshot of all reload tensors + siblings\n", __func__);

    // First tensor registered under `wanted`, or nullptr.
    const auto lookup = [&model](const std::string & wanted) -> ggml_tensor * {
        for (auto & named : model.tensors_by_name) {
            if (named.first == wanted) return named.second;
        }
        return nullptr;
    };

    // Pass 1: every source that has a tensor in the model.
    for (auto & entry : tensor_reload_sources) {
        struct ggml_tensor * tensor = lookup(entry.first);
        if (tensor != nullptr) {
            snapshot_tensor_source(tensor, entry.second);
        }
    }

    // Pass 2: siblings that are known sources but are still un-snapshotted.
    for (auto & entry : tensor_reload_sources) {
        for (const auto & sibling_name : entry.second.sibling_names) {
            auto sibling_it = this->tensor_reload_sources.find(sibling_name);
            if (sibling_it == this->tensor_reload_sources.end()) continue;
            if (sibling_it->second.original_buffer != nullptr) continue;

            struct ggml_tensor * sibling = lookup(sibling_name);
            if (sibling != nullptr) {
                snapshot_tensor_source(sibling, sibling_it->second);
            }
        }
    }
}
// ------------------------------------------------------------------
// Re-attachment helper
// ------------------------------------------------------------------
// Points the tensor called `name` back at the buffer, data pointer, type,
// extents and strides recorded in its snapshot, and restores the recorded
// per-device split layout when the snapshot has one.
//
// Buffers currently attached to the tensor (or to its splits, when they
// differ from the snapshot's) are freed only if the source is not in the
// ON_ORIGINAL state, i.e. only buffers that are not the snapshot's own.
//
// Returns false when there is no reload source, no snapshot, or no tensor
// with that name; true otherwise, with the state set to ON_ORIGINAL.
static bool reattach_split_tensor_to_shared(llama_model & model, const char * name) {
    using state_t = tensor_reload_source::reload_state;

    auto & sources = model.reload->tensor_reload_sources;
    auto found = sources.find(name);
    if (found == sources.end()) return false;

    auto & src = found->second;
    if (!src.original_buffer) return false;

    struct ggml_tensor * tensor = nullptr;
    for (auto & named : model.tensors_by_name) {
        if (named.first == name) {
            tensor = named.second;
            break;
        }
    }
    if (!tensor) return false;

    // Already attached to the snapshot buffer: just normalise the state.
    if (tensor->buffer == src.original_buffer) {
        log_tensor_state("reattach_split_tensor_to_shared", tensor);
        src.state = state_t::ON_ORIGINAL;
        return true;
    }

    // The state is not modified until the very end, so evaluate it once.
    const bool holds_private_buffers = (src.state != state_t::ON_ORIGINAL);

    if (tensor->buffer && holds_private_buffers) {
        ggml_backend_buffer_free(tensor->buffer);
    }

    tensor->buffer = src.original_buffer;
    tensor->data   = src.original_data;
    tensor->type   = src.original_type;
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        tensor->ne[d] = src.original_ne[d];
        tensor->nb[d] = src.original_nb[d];
    }

    if (src.original_extra) {
        tensor->extra = src.original_extra;
        auto extra = (ggml_split_tensor_t *)tensor->extra;
        const int n_saved = (int)src.original_splits.size();

        for (int dev = 0; dev < extra->n_device && dev < n_saved; ++dev) {
            auto & saved = src.original_splits[dev];

            // A slot that was cleared since the snapshot gets its tensor back.
            if (!extra->splits[dev] && saved.tensor) {
                extra->splits[dev] = saved.tensor;
            }

            struct ggml_tensor * part = extra->splits[dev];
            if (!part) continue;

            if (part->buffer && part->buffer != saved.buffer && holds_private_buffers) {
                ggml_backend_buffer_free(part->buffer);
            }
            part->data   = saved.data;
            part->buffer = saved.buffer;
            part->type   = src.original_type;
            for (int d = 0; d < GGML_MAX_DIMS; ++d) {
                part->ne[d] = saved.ne[d];
                part->nb[d] = saved.nb[d];
            }
        }
    }

    src.state = state_t::ON_ORIGINAL;
    return true;
}
// ------------------------------------------------------------------
// MoE sibling resync
// ------------------------------------------------------------------
// Brings the per-device split extents of the two sibling MoE expert tensors
// in line with those of `ref_tensor`.
//
//   model      - owner of tensors_by_name and of the reload sources
//   (unnamed)  - unused context parameter, kept for the call signature
//   ref_tensor - tensor whose split layout is the reference
//   ref_name   - its name; must contain one of the three expert suffixes,
//                otherwise the function returns immediately
//
// If the reference tensor is attached to its original snapshot buffer the
// siblings are simply re-attached to their snapshots. Otherwise each sibling
// whose split extents differ is read to host memory, given new buffers with
// the reference extents, and written back (phases A-E below).
static void resync_moe_sibling_splits(
        llama_model & model,
        struct ggml_context * /*ctx_tmp*/,
        struct ggml_tensor * ref_tensor,
        const char * ref_name)
{
    std::string name_str(ref_name);
    std::string layer_prefix;
    std::vector<std::string> suffixes;
    if (name_str.find(".ffn_down_exps.weight") != std::string::npos) {
        layer_prefix = name_str.substr(0, name_str.find(".ffn_down_exps.weight"));
        suffixes = {".ffn_up_exps.weight", ".ffn_gate_exps.weight"};
    } else if (name_str.find(".ffn_up_exps.weight") != std::string::npos) {
        layer_prefix = name_str.substr(0, name_str.find(".ffn_up_exps.weight"));
        suffixes = {".ffn_down_exps.weight", ".ffn_gate_exps.weight"};
    } else if (name_str.find(".ffn_gate_exps.weight") != std::string::npos) {
        layer_prefix = name_str.substr(0, name_str.find(".ffn_gate_exps.weight"));
        suffixes = {".ffn_up_exps.weight", ".ffn_down_exps.weight"};
    } else {
        return;
    }

    auto ref_extra = (ggml_split_tensor_t *)ref_tensor->extra;
    if (!ref_extra) return;

    // Reference is on its snapshot: siblings go back to theirs, nothing to move.
    auto it_ref_src = model.reload->tensor_reload_sources.find(ref_name);
    if (it_ref_src != model.reload->tensor_reload_sources.end() && ref_tensor->buffer == it_ref_src->second.original_buffer) {
        for (const auto & suffix : suffixes) {
            reattach_split_tensor_to_shared(model, (layer_prefix + suffix).c_str());
        }
        return;
    }

    struct sibling_job {
        std::string           name;
        struct ggml_tensor *  tensor;
        ggml_split_tensor_t * extra;
        std::vector<char>     host_buf;      // sibling contents, read before any buffer is touched
        bool                  needs_resync = false;
    };
    std::vector<sibling_job> jobs;

    // Collect the siblings whose split layout disagrees with the reference.
    for (const auto & suffix : suffixes) {
        std::string sib_name = layer_prefix + suffix;
        struct ggml_tensor * sib = nullptr;
        for (auto & p : model.tensors_by_name) {
            if (p.first == sib_name) { sib = p.second; break; }
        }
        if (!sib || !sib->extra || sib == ref_tensor) continue;
        auto sib_extra = (ggml_split_tensor_t *)sib->extra;
        if (sib_extra->n_device != ref_extra->n_device) continue;

        // A negative split_dim is treated as dimension 0 for the comparison.
        int sib_dim = sib_extra->split_dim < 0 ? 0 : sib_extra->split_dim;
        int ref_dim = ref_extra->split_dim < 0 ? 0 : ref_extra->split_dim;
        bool need = false;
        for (int i = 0; i < ref_extra->n_device; ++i) {
            bool rh = ref_extra->splits[i] != nullptr;
            bool sh = sib_extra->splits[i] != nullptr;
            if (rh != sh) { need = true; break; }
            if (rh && sh && sib_extra->splits[i]->ne[sib_dim] != ref_extra->splits[i]->ne[ref_dim]) {
                need = true; break;
            }
        }
        if (!need) continue;

        size_t nbytes = ggml_nbytes(sib);
        std::vector<char> buf(nbytes);
        ggml_backend_tensor_get(sib, buf.data(), 0, nbytes);
        jobs.push_back({sib_name, sib, sib_extra, std::move(buf), true});
    }
    if (jobs.empty()) return;

    log_split_state("resync_moe_sibling_splits", ref_tensor);

    // Phase A: Detach / free old buffers, allocate new main handles
    for (auto & job : jobs) {
        auto sib = job.tensor;
        ggml_backend_buffer_type_t buft = sib->buffer
            ? ggml_backend_buffer_get_type(sib->buffer)
            : ggml_backend_cpu_buffer_type();
        auto it = model.reload->tensor_reload_sources.find(job.name);
        bool was_orig = (it != model.reload->tensor_reload_sources.end() && it->second.state == tensor_reload_source::reload_state::ON_ORIGINAL);
        if (sib->buffer) {
            // Snapshot buffers are kept alive; only private ones are freed.
            if (!was_orig) ggml_backend_buffer_free(sib->buffer);
            sib->buffer = nullptr;
            sib->data   = nullptr;
        }
        size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, sib);
        ggml_backend_buffer_t new_buf = alloc_buffer_fallback(buft, alloc_size);
        if (!new_buf) {
            // No buffer: later phases skip this sibling.
            job.needs_resync = false;
            continue;
        }
        sib->buffer = new_buf;
        sib->data   = (void*)0x1; // dummy; split backend uses extra->splits
        if (it != model.reload->tensor_reload_sources.end()) {
            it->second.state = tensor_reload_source::reload_state::DETACHED;
        }
    }

    // Phase B: Propagate dimensions & recompute strides
    for (auto & job : jobs) {
        if (!job.needs_resync) continue;
        auto sib = job.tensor;
        auto sib_extra = job.extra;
        for (int i = 0; i < ref_extra->n_device; ++i) {
            if (!ref_extra->splits[i]) {
                if (sib_extra->splits[i]) sib_extra->splits[i] = nullptr;
                continue;
            }
            if (!sib_extra->splits[i]) continue;
            sib_extra->splits[i]->ne[sib_extra->split_dim < 0 ? 0 : sib_extra->split_dim] =
                ref_extra->splits[i]->ne[ref_extra->split_dim < 0 ? 0 : ref_extra->split_dim];
        }
        int n_dims = 0;
        for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
            if (sib->ne[i] != 1) { n_dims = i + 1; break; }
        }
        // Scratch metadata-only context: a throw-away tensor per split gives
        // the strides for the updated extents.
        size_t ctx_size = ggml_tensor_overhead() * (sib_extra->n_device + 4);
        if (ctx_size < 16384) ctx_size = 16384;
        struct ggml_init_params p = { ctx_size, NULL, true };
        struct ggml_context * ctx = ggml_init(p);
        if (ctx) {
            for (int i = 0; i < sib_extra->n_device; ++i) {
                if (!sib_extra->splits[i]) continue;
                auto tmp = ggml_new_tensor(ctx, sib->type, n_dims, sib_extra->splits[i]->ne);
                if (tmp) {
                    for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                        sib_extra->splits[i]->nb[j] = tmp->nb[j];
                    }
                }
            }
            ggml_free(ctx);
        }
    }

    // Phase C: Allocate GPU split buffers
    bool gpu_failed = false;
#ifdef GGML_USE_CUDA
    for (auto & job : jobs) {
        if (!job.needs_resync) continue;
        auto sib_extra = job.extra;
        for (int i = 0; i < sib_extra->n_device; ++i) {
            if (!sib_extra->splits[i]) continue;
            size_t need = ggml_nbytes(sib_extra->splits[i]);
            auto buft = ggml_backend_cuda_buffer_type(i);
            auto b = ggml_backend_buft_alloc_buffer(buft, need);
            if (!b) { gpu_failed = true; break; }
            sib_extra->splits[i]->buffer = b;
            sib_extra->splits[i]->data   = ggml_backend_buffer_get_base(b);
        }
        if (gpu_failed) break;
    }
#else
    // Without CUDA support, force CPU fallback for any resync jobs
    for (auto & job : jobs) {
        if (job.needs_resync) { gpu_failed = true; break; }
    }
#endif

    // Phase D: If any GPU alloc failed, move entire layer to CPU
    if (gpu_failed) {
        for (auto & job : jobs) {
            if (!job.needs_resync) continue;
            auto sib = job.tensor;
            auto sib_extra = job.extra;
            for (int i = 0; i < sib_extra->n_device; ++i) {
                if (sib_extra->splits[i] && sib_extra->splits[i]->buffer) {
                    auto it = model.reload->tensor_reload_sources.find(job.name);
                    bool is_orig = false;
                    if (it != model.reload->tensor_reload_sources.end() && i < (int)it->second.original_splits.size()) {
                        is_orig = (sib_extra->splits[i]->buffer == it->second.original_splits[i].buffer);
                    }
                    if (!is_orig) ggml_backend_buffer_free(sib_extra->splits[i]->buffer);
                    sib_extra->splits[i]->buffer = nullptr;
                    sib_extra->splits[i]->data   = nullptr;
                }
            }
            if (sib->buffer) {
                auto it = model.reload->tensor_reload_sources.find(job.name);
                bool is_orig = (it != model.reload->tensor_reload_sources.end() && it->second.state == tensor_reload_source::reload_state::ON_ORIGINAL);
                if (!is_orig) ggml_backend_buffer_free(sib->buffer);
                sib->buffer = nullptr;
                sib->data   = nullptr;
            }
            size_t need = ggml_nbytes(sib);
            auto cpu = alloc_buffer_fallback(ggml_backend_cpu_buffer_type(), need);
            if (cpu) {
                sib->buffer = cpu;
                sib->data   = ggml_backend_buffer_get_base(cpu);
                auto it = model.reload->tensor_reload_sources.find(job.name);
                if (it != model.reload->tensor_reload_sources.end()) it->second.state = tensor_reload_source::reload_state::FALLBACK_CPU;
            } else {
                // The sibling now has no buffer at all. Skip the write-back
                // in phase E, which would otherwise be issued on a tensor
                // without a buffer (same handling as a failure in phase A).
                job.needs_resync = false;
            }
        }
    }

    // Phase E: Write data back
    for (auto & job : jobs) {
        if (!job.needs_resync) continue;
        ggml_backend_tensor_set(job.tensor, job.host_buf.data(), 0, job.host_buf.size());
    }
}
// ------------------------------------------------------------------
// reload_tensor_split_path
// ------------------------------------------------------------------
// Reload path for tensors that carry split information in `extra`.
//
//   model                 - owner of the reload sources and tensor list
//   tensor                - tensor being reloaded
//   src                   - its reload source (state is updated here)
//   host_buf              - new tensor contents; unused when returning
//   curr_type             - unused
//   returning_to_original - true: re-attach to the snapshot instead of loading
//   old_buf               - buffer the tensor was attached to on entry
//
// Returning: the old buffer is freed unless the source is ON_ORIGINAL, then
// the tensor and all registered siblings are re-attached to their snapshots.
// Otherwise: a new buffer of the old buffer's type (CPU type if there was
// none) is allocated, the tensor is placed in it, `host_buf` is uploaded,
// sibling splits are resynchronised and the state becomes DETACHED.
// Returns false if re-attachment or allocation fails.
static bool reload_tensor_split_path(
        llama_model & model,
        struct ggml_tensor * tensor,
        tensor_reload_source & src,
        const std::vector<char> & host_buf,
        ggml_type curr_type,
        bool returning_to_original,
        ggml_backend_buffer_t old_buf)
{
    (void)curr_type;
    using state_t = tensor_reload_source::reload_state;

    const char * name = ggml_get_name(tensor);

    if (!returning_to_original) {
        // The buffer type must be read before the old buffer may be freed.
        ggml_backend_buffer_type_t buft;
        if (old_buf) {
            buft = ggml_backend_buffer_get_type(old_buf);
        } else {
            buft = ggml_backend_cpu_buffer_type();
        }

        if (old_buf && src.state != state_t::ON_ORIGINAL) {
            ggml_backend_buffer_free(old_buf);
        }
        tensor->buffer = nullptr;
        tensor->data   = nullptr;

        const size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
        ggml_backend_buffer_t fresh = alloc_buffer_fallback(buft, alloc_size);
        if (!fresh) {
            return false;
        }

        ggml_backend_tensor_alloc(fresh, tensor, ggml_backend_buffer_get_base(fresh));
        ggml_backend_tensor_set(tensor, host_buf.data(), 0, host_buf.size());
        log_tensor_state("reload_tensor_split_path", tensor);

        if (tensor->extra) {
            resync_moe_sibling_splits(model, nullptr, tensor, name);
        }
        src.state = state_t::DETACHED;
        return true;
    }

    // Returning to the snapshot.
    if (old_buf && src.state != state_t::ON_ORIGINAL) {
        ggml_backend_buffer_free(old_buf);
    }
    tensor->buffer = nullptr;
    tensor->data   = nullptr;

    if (!reattach_split_tensor_to_shared(model, name)) {
        return false;
    }
    for (const auto & sibling_name : src.sibling_names) {
        reattach_split_tensor_to_shared(model, sibling_name.c_str());
    }
    return true;
}
// ------------------------------------------------------------------
// reload_tensor_non_split_path
// ------------------------------------------------------------------
// Reload path for tensors without split information.
//
//   model, curr_type      - unused
//   tensor                - tensor being reloaded
//   src                   - its reload source (state is updated here)
//   host_buf              - new tensor contents; unused when returning
//   returning_to_original - true: restore the snapshot instead of loading
//   old_buf               - buffer the tensor was attached to on entry
//
// Returning: the old buffer is freed unless the source is ON_ORIGINAL, and
// the tensor gets its snapshot buffer, data pointer, type, extents and
// strides back (state ON_ORIGINAL).
// Otherwise: a new buffer of the old buffer's type (CPU type if there was
// none) is allocated, the tensor is placed in it and `host_buf` is uploaded
// (state DETACHED). Returns false if the allocation fails.
static bool reload_tensor_non_split_path(
        llama_model & model,
        struct ggml_tensor * tensor,
        tensor_reload_source & src,
        const std::vector<char> & host_buf,
        ggml_type curr_type,
        bool returning_to_original,
        ggml_backend_buffer_t old_buf)
{
    (void)model;
    (void)curr_type;
    using state_t = tensor_reload_source::reload_state;

    if (!returning_to_original) {
        // The buffer type must be read before the old buffer may be freed.
        ggml_backend_buffer_type_t buft;
        if (old_buf) {
            buft = ggml_backend_buffer_get_type(old_buf);
        } else {
            buft = ggml_backend_cpu_buffer_type();
        }

        if (old_buf && src.state != state_t::ON_ORIGINAL) {
            ggml_backend_buffer_free(old_buf);
        }
#ifndef NDEBUG
        else if (old_buf) {
            // Snapshot buffers stay alive; the tensor merely stops using one.
            LLAMA_LOG_DEBUG("detaching from original snapshot buffer %p for '%s'\n", (void*)old_buf, ggml_get_name(tensor));
        }
#endif
        tensor->buffer = nullptr;
        tensor->data   = nullptr;

        const size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
        ggml_backend_buffer_t fresh = alloc_buffer_fallback(buft, alloc_size);
        if (!fresh) {
            return false;
        }

        ggml_backend_tensor_alloc(fresh, tensor, ggml_backend_buffer_get_base(fresh));
        ggml_backend_tensor_set(tensor, host_buf.data(), 0, host_buf.size());
        src.state = state_t::DETACHED;
        return true;
    }

    // Returning to the snapshot.
    if (old_buf && src.state != state_t::ON_ORIGINAL) {
        ggml_backend_buffer_free(old_buf);
    }
    tensor->buffer = src.original_buffer;
    tensor->data   = src.original_data;
    tensor->type   = src.original_type;
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        tensor->ne[d] = src.original_ne[d];
        tensor->nb[d] = src.original_nb[d];
    }
    src.state = state_t::ON_ORIGINAL;
    return true;
}
// ------------------------------------------------------------------
// apply_tensor_type_change
// ------------------------------------------------------------------
// Rewrites the metadata of `tensor` (and of its per-device splits, if any)
// for a new element type. No data is moved or allocated.
//
//   (unnamed model / src) - unused
//   tensor                - tensor to retag; its extents stay as they are
//   curr_type             - the new ggml type
//
// Steps:
//   1. tensor->type is set and tensor->nb is recomputed from a throw-away
//      tensor created in a metadata-only context.
//   2. For split tensors with split_dim == 0 and a block size > 1, the
//      cumulative split boundaries along dim 0 are rounded up to a multiple
//      of the block size; splits left without elements are dropped.
//   3. Every remaining split gets the new type and recomputed strides.
//
// Returns false if the scratch context or the scratch tensor cannot be
// created (tensor->type has already been changed at that point).
static bool apply_tensor_type_change(
        llama_model & /*model*/,
        struct ggml_tensor * tensor,
        tensor_reload_source & /*src*/,
        ggml_type curr_type)
{
    tensor->type = curr_type;

    // Highest dimension whose extent is not 1. Defaults to 1 so that a
    // tensor with all extents equal to 1 is not passed on with zero dims.
    int n_dims = 1;
    for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
        if (tensor->ne[i] != 1) { n_dims = i + 1; break; }
    }

    // Room for one scratch tensor for the parent plus one per device split.
    size_t ctx_size = ggml_tensor_overhead() * (1 + (tensor->extra ? ((ggml_split_tensor_t*)tensor->extra)->n_device : 0))
                    + ggml_graph_overhead_custom(1, false);
    struct ggml_init_params p = { ctx_size, NULL, true };
    struct ggml_context * ctx = ggml_init(p);
    if (!ctx) return false;

    auto tmp = ggml_new_tensor(ctx, curr_type, n_dims, tensor->ne);
    if (!tmp) { ggml_free(ctx); return false; }
    for (int i = 0; i < GGML_MAX_DIMS; ++i) tensor->nb[i] = tmp->nb[i];

    if (tensor->extra) {
        auto extra = (ggml_split_tensor_t *)tensor->extra;
        auto tt = ggml_internal_get_type_traits(curr_type);

        if (tt.blck_size > 1 && extra->split_dim == 0) {
            int64_t bs = tt.blck_size;
            int n = extra->n_device;

            // bounds[i] = number of dim-0 elements held by devices 0..i.
            std::vector<int64_t> bounds(n, 0);
            int64_t acc = 0;
            for (int i = 0; i < n; ++i) {
                if (extra->splits[i]) acc += extra->splits[i]->ne[0];
                bounds[i] = acc;
            }
            // Round every inner boundary up to a whole block.
            for (int i = 0; i < n - 1; ++i) {
                if (bounds[i] > 0) {
                    bounds[i] = ((bounds[i] + bs - 1) / bs) * bs;
                }
            }
            // The last boundary is the full extent; keep the list monotonic.
            bounds[n - 1] = tensor->ne[0];
            for (int i = 1; i < n; ++i) {
                if (bounds[i] < bounds[i - 1]) bounds[i] = bounds[i - 1];
            }
            int64_t prev = 0;
            for (int i = 0; i < n; ++i) {
                if (extra->splits[i]) {
                    int64_t ne0 = bounds[i] - prev;
                    if (ne0 <= 0) {
                        // Rounding left nothing for this device.
                        extra->splits[i] = nullptr;
                    } else {
                        extra->splits[i]->ne[0] = ne0;
                    }
                }
                prev = bounds[i];
            }
        }

        for (int i = 0; i < extra->n_device; ++i) {
            auto split = extra->splits[i];
            if (!split) continue;
            split->type = curr_type;
            auto t = ggml_new_tensor(ctx, curr_type, n_dims, split->ne);
            if (t) {
                for (int j = 0; j < GGML_MAX_DIMS; ++j) split->nb[j] = t->nb[j];
            }
        }

        // Sanity check: the dim-0 pieces must add up to the full extent.
        // This only holds when the tensor is actually split along dim 0;
        // for any other split_dim each split keeps the full ne[0], so the
        // sum would be a multiple of it.
        if (extra->split_dim == 0) {
            int64_t sum = 0;
            for (int i = 0; i < extra->n_device; ++i) {
                if (extra->splits[i]) sum += extra->splits[i]->ne[0];
            }
            GGML_ASSERT(sum == tensor->ne[0]);
        }
    }

    ggml_free(ctx);
    return true;
}
// ------------------------------------------------------------------
// reload_tensor
// ------------------------------------------------------------------
bool reload_info::reload_tensor(const char * name, llama_model & model) {
auto it = tensor_reload_sources.find(name);
if (it == tensor_reload_sources.end()) return false;
auto & src = it->second;
struct stat st;
if (stat(src.path.c_str(), &st) != 0) return false;
bool changed = (st.st_mtime != src.last_mtime);
#ifdef __linux__
changed = changed || (st.st_mtim.tv_nsec != src.last_mtime_ns);
#endif
if (!changed) return false;
size_t off = 0, file_nbytes = 0;
ggml_type curr_type = GGML_TYPE_COUNT;
int64_t file_ne[GGML_MAX_DIMS];
if (!gguf_find_tensor_meta(src.path.c_str(), name, off, file_nbytes, curr_type, file_ne)) return false;
std::ifstream file(src.path, std::ios::binary);
if (!file) return false;
file.seekg((std::streamoff)off);
if (!file) return false;
struct ggml_tensor * tensor = nullptr;
for (auto & p : model.tensors_by_name) {
if (p.first == name) { tensor = p.second; break; }
}
if (!tensor || !src.original_buffer) return false;
// Refuse to swap if the on-disk shape differs from the model tensor
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
if (tensor->ne[i] != file_ne[i]) {
LLAMA_LOG_INFO("reload_tensor: dimension mismatch for '%s': model ne[%d]=%ld, file ne[%d]=%ld — refusing swap\n",
name, i, (long)tensor->ne[i], i, (long)file_ne[i]);
return false;
}
}
ggml_backend_buffer_t old_buf = tensor->buffer;
bool returning = (curr_type == src.original_type);
std::vector<char> host_buf;
if (!returning) {
if (curr_type != tensor->type) {
if (!apply_tensor_type_change(model, tensor, src, curr_type)) return false;
}
size_t need = ggml_nbytes(tensor);
if (file_nbytes < need) return false;
host_buf.resize(need);
file.read(host_buf.data(), (std::streamsize)need);
if (!file || (size_t)file.gcount() != need) return false;
}
bool ok = false;
if (tensor->extra) {
ok = reload_tensor_split_path(model, tensor, src, host_buf, curr_type, returning, old_buf);
} else {
ok = reload_tensor_non_split_path(model, tensor, src, host_buf, curr_type, returning, old_buf);
}
if (ok) {
src.last_mtime = st.st_mtime;
#ifdef __linux__
src.last_mtime_ns = st.st_mtim.tv_nsec;
#endif
}
return ok;
}
// ------------------------------------------------------------------
// reload_changed_tensors
// ------------------------------------------------------------------
// Scans every reload source, collects those whose file modification time
// changed and whose on-disk shape still matches the model tensor, and
// reloads them - tensors returning to their original type first.
// Snapshots are taken on the first call. When at least one tensor was
// reloaded and CUDA is compiled in, the CUDA graphs are invalidated.
// Returns true if any tensor was reloaded.
bool reload_info::reload_changed_tensors(llama_model & model) {
    snapshot_all_reload_tensors(model);

    struct job {
        const char * name;      // points at the map key; the map is not modified here
        bool         returning; // file type equals the snapshot's original type
    };
    std::vector<job> pending;

    for (auto & entry : tensor_reload_sources) {
        const std::string & tensor_name = entry.first;
        auto & src = entry.second;

        struct stat file_stat;
        if (stat(src.path.c_str(), &file_stat) != 0) continue;

        bool changed = (file_stat.st_mtime != src.last_mtime);
#ifdef __linux__
        changed = changed || (file_stat.st_mtim.tv_nsec != src.last_mtime_ns);
#endif
        if (!changed) continue;

        size_t off = 0, nbytes = 0;
        ggml_type file_type = GGML_TYPE_COUNT;
        int64_t file_ne[GGML_MAX_DIMS];
        if (!gguf_find_tensor_meta(src.path.c_str(), tensor_name.c_str(), off, nbytes, file_type, file_ne)) continue;

        struct ggml_tensor * tensor = nullptr;
        for (auto & named : model.tensors_by_name) {
            if (named.first == tensor_name) {
                tensor = named.second;
                break;
            }
        }
        if (!tensor) continue;

        // Only the first mismatching dimension is reported.
        int bad_dim = -1;
        for (int d = 0; d < GGML_MAX_DIMS && bad_dim < 0; ++d) {
            if (tensor->ne[d] != file_ne[d]) bad_dim = d;
        }
        if (bad_dim >= 0) {
            LLAMA_LOG_INFO("reload_changed_tensors: dimension mismatch for '%s': model ne[%d]=%ld, file ne[%d]=%ld — skipping\n",
                           tensor_name.c_str(), bad_dim, (long)tensor->ne[bad_dim], bad_dim, (long)file_ne[bad_dim]);
            continue;
        }

        pending.push_back({tensor_name.c_str(), file_type == src.original_type});
    }

    // Returning tensors sort ahead of the others.
    std::sort(pending.begin(), pending.end(), [](const job & lhs, const job & rhs) {
        return lhs.returning > rhs.returning;
    });

    bool any_reloaded = false;
    for (auto & item : pending) {
        if (!reload_tensor(item.name, model)) continue;
        any_reloaded = true;
        LLAMA_LOG_INFO("reloaded tensor '%s'\n", item.name);
    }

    if (any_reloaded) {
#ifdef GGML_USE_CUDA
        ggml_backend_cuda_invalidate_graphs(&model);
#endif
    }
    return any_reloaded;
}