#include "llama-reload-info.h"
#include "llama-model.h"
#include "llama-model-loader.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

// NOTE(review): the six system include targets were lost when this file was
// extracted; the list below is reconstructed from the names the file uses
// (stat, std::sort, strcmp, std::ifstream, std::string, std::vector) -- confirm.
#include <sys/stat.h>

#include <algorithm>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>

// ------------------------------------------------------------------
// Debug helpers
// ------------------------------------------------------------------

// Debug-log type, shape, strides, buffer/data/extra pointers and buffer-type
// name of `t`, prefixed with `ctx`. Handles a null tensor. Compiles to a no-op
// when NDEBUG is defined.
static void log_tensor_state(const char * ctx, struct ggml_tensor * t) {
#ifndef NDEBUG
    if (!t) {
        LLAMA_LOG_DEBUG("%s: tensor=NULL\n", ctx);
        return;
    }

    const char * buft_name = "null";
    if (t->buffer) {
        auto buft = ggml_backend_buffer_get_type(t->buffer);
        if (buft) buft_name = ggml_backend_buft_name(buft);
    }

    LLAMA_LOG_DEBUG("%s: tensor='%s' type=%s ne={%ld,%ld,%ld,%ld} nb={%zu,%zu,%zu,%zu} "
                    "buffer=%p data=%p extra=%p buft=%s\n",
                    ctx, t->name, ggml_type_name(t->type),
                    (long)t->ne[0], (long)t->ne[1], (long)t->ne[2], (long)t->ne[3],
                    t->nb[0], t->nb[1], t->nb[2], t->nb[3],
                    (void*)t->buffer, t->data, (void*)t->extra, buft_name);
#else
    (void)ctx;
    (void)t;
#endif
}

// Debug-log the per-device splits hanging off `t->extra` (interpreted as a
// ggml_split_tensor_t). Handles a null tensor and a tensor without extra.
// Compiles to a no-op when NDEBUG is defined.
static void log_split_state(const char * ctx, struct ggml_tensor * t) {
#ifndef NDEBUG
    if (!t || !t->extra) {
        LLAMA_LOG_DEBUG("%s: no splits (extra=%p)\n", ctx, (void*)(t ? t->extra : nullptr));
        return;
    }

    auto extra = (ggml_split_tensor_t *)t->extra;
    LLAMA_LOG_DEBUG("%s: tensor='%s' n_device=%d split_dim=%d\n",
                    ctx, t->name, extra->n_device, extra->split_dim);

    for (int i = 0; i < extra->n_device; ++i) {
        struct ggml_tensor * split = extra->splits[i];
        if (!split) {
            LLAMA_LOG_DEBUG("%s: split[%d]=NULL\n", ctx, i);
            continue;
        }

        const char * split_buft_name = "null";
        if (split->buffer) {
            auto buft = ggml_backend_buffer_get_type(split->buffer);
            if (buft) split_buft_name = ggml_backend_buft_name(buft);
        }

        LLAMA_LOG_DEBUG("%s: split[%d] type=%s ne={%ld,%ld,%ld,%ld} nb={%zu,%zu,%zu,%zu} "
                        "buffer=%p data=%p buft=%s\n",
                        ctx, i, ggml_type_name(split->type),
                        (long)split->ne[0], (long)split->ne[1], (long)split->ne[2], (long)split->ne[3],
                        split->nb[0], split->nb[1], split->nb[2], split->nb[3],
                        (void*)split->buffer, split->data, split_buft_name);
    }
#else
    (void)ctx;
    (void)t;
#endif
}

// ------------------------------------------------------------------
// GGUF header parser (reuses llama.cpp / ggml GGUF loader)
// ------------------------------------------------------------------

// Look up `target_name` in the GGUF file at `path` (metadata only, no tensor
// data is allocated) and report where its payload lives and what it looks like.
//
//   out_offset : absolute file offset of the tensor data
//   out_nbytes : ggml_nbytes() of the tensor as described by the file
//   out_type   : tensor type stored in the file
//   out_ne     : tensor shape stored in the file
//
// Returns false (outputs untouched) when the file cannot be parsed or the
// tensor is not present. Note that every call re-parses the file header.
static bool gguf_find_tensor_meta(const char * path,
                                  const char * target_name,
                                  size_t & out_offset,
                                  size_t & out_nbytes,
                                  ggml_type & out_type,
                                  int64_t out_ne[GGML_MAX_DIMS]) {
    struct ggml_context * ctx = nullptr;
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ &ctx,
    };

    struct gguf_context * gguf = gguf_init_from_file(path, params);
    if (!gguf) {
        return false;
    }

    bool found = false;
    const int idx = gguf_find_tensor(gguf, target_name);
    // Guard against a missing metadata context instead of dereferencing it.
    struct ggml_tensor * tensor = (idx >= 0 && ctx) ? ggml_get_tensor(ctx, target_name) : nullptr;
    if (tensor) {
        out_offset = gguf_get_data_offset(gguf) + gguf_get_tensor_offset(gguf, idx);
        out_nbytes = ggml_nbytes(tensor);
        out_type   = tensor->type;
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            out_ne[i] = tensor->ne[i];
        }
        found = true;
    }

    // Single cleanup path for both the success and the not-found case.
    if (ctx) ggml_free(ctx);
    gguf_free(gguf);
    return found;
}

// ------------------------------------------------------------------
// Buffer census helper
// ------------------------------------------------------------------

// Count how many tensors in `tensors_by_name` currently point at `buf`.
// A null `buf` counts as zero users; null tensor entries are skipped.
static size_t count_buffer_users(
        const std::vector<std::pair<std::string, struct ggml_tensor *>> & tensors_by_name,
        ggml_backend_buffer_t buf) {
    if (!buf) return 0;

    size_t n = 0;
    for (const auto & p : tensors_by_name) {
        if (p.second && p.second->buffer == buf) ++n;
    }
    return n;
}

// True when `buf` is one of the buffers recorded in any reload snapshot,
// either as a tensor's original buffer or as one of its original split buffers.
static bool is_original_snapshot_buffer(llama_model & model, ggml_backend_buffer_t buf) {
    if (!buf) return false;
    if (!model.reload) return false;

    for (const auto & kv : model.reload->tensor_reload_sources) {
        const auto & src = kv.second;
        if (buf == src.original_buffer) return true;
        for (const auto & os : src.original_splits) {
            if (buf == os.buffer) return true;
        }
    }
    return false;
}

// ------------------------------------------------------------------
// Final size estimator
// ------------------------------------------------------------------

// Size in bytes `tensor` would occupy if stored as `new_type` with its current
// shape. For an unchanged type this is simply ggml_nbytes(tensor).
static size_t llama_model_compute_final_nbytes(struct ggml_tensor * tensor, ggml_type new_type) {
    if (new_type == tensor->type) {
        return ggml_nbytes(tensor);
    }
    return ggml_row_size(new_type, tensor->ne[0]) * ggml_nrows(tensor);
}

// ------------------------------------------------------------------
// Fallback allocator
// ------------------------------------------------------------------

// Allocate `size` bytes from `buft`; if that fails and `buft` is not already
// the CPU buffer type, retry on the CPU buffer type. Returns nullptr when both
// attempts fail. The caller owns the returned buffer.
static ggml_backend_buffer_t alloc_buffer_fallback(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    if (buf) {
        LLAMA_LOG_DEBUG("%s: allocated %zu bytes on backend '%s'\n",
                        __func__, size, ggml_backend_buft_name(buft));
        return buf;
    }

    auto cpu_buft = ggml_backend_cpu_buffer_type();
    if (buft == cpu_buft) {
        LLAMA_LOG_WARN("%s: CPU alloc failed (%zu bytes)\n", __func__, size);
        return nullptr;
    }

    LLAMA_LOG_WARN("%s: backend alloc failed (%zu bytes on '%s'), trying CPU fallback\n",
                   __func__, size, ggml_backend_buft_name(buft));

    buf = ggml_backend_buft_alloc_buffer(cpu_buft, size);
    if (!buf) {
        LLAMA_LOG_WARN("%s: CPU fallback alloc failed (%zu bytes)\n", __func__, size);
        return nullptr;
    }

    LLAMA_LOG_DEBUG("%s: allocated %zu bytes on CPU fallback\n", __func__, size);
    return buf;
}

// ------------------------------------------------------------------
// MoE sibling resync
// ------------------------------------------------------------------
// MoE layers have three weight tensors per block: gate, up, down.
// The CUDA split backend distributes each tensor across GPUs by splitting
// one dimension (usually dim 0 or 1). Split boundaries must be multiples
// of the quantization block size (e.g. 256 for IQ1_KT). If the reference
// tensor changes quantization type, its block size changes, which changes
// the valid split boundaries. ALL siblings in the same layer MUST adopt
// the SAME per-device split dimensions, otherwise the backend dispatches
// rows to the wrong devices and corrupts inference.
//
// When the reference tensor is back on its original snapshot, siblings
// can simply be reattached to their original snapshots too -- no data
// movement or allocation is required.
// ------------------------------------------------------------------ // ------------------------------------------------------------------ // Sibling name registration // ------------------------------------------------------------------ static void populate_moe_siblings(const char * name, tensor_reload_source & src) { LLAMA_LOG_DEBUG("%s: name='%s'\n", __func__, name); static const char * suffixes[] = { ".ffn_down_exps.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight", }; std::string n(name); for (const char * sfx : suffixes) { size_t pos = n.find(sfx); if (pos == std::string::npos) continue; std::string base = n.substr(0, pos); for (const char * other : suffixes) { if (strcmp(other, sfx) != 0) { src.sibling_names.push_back(base + other); LLAMA_LOG_DEBUG("%s: registered sibling '%s' for '%s'\n", __func__, (base + other).c_str(), name); } } return; } LLAMA_LOG_DEBUG("%s: '%s' no MoE suffix matched\n", __func__, name); } // ------------------------------------------------------------------ // Snapshot helper // ------------------------------------------------------------------ static void snapshot_tensor_source(struct ggml_tensor * tensor, tensor_reload_source & src) { if (!tensor || src.original_buffer != nullptr) return; src.original_buffer = tensor->buffer; src.original_data = tensor->data; src.original_nbytes = ggml_nbytes(tensor); src.original_type = tensor->type; for (int i = 0; i < GGML_MAX_DIMS; ++i) { src.original_ne[i] = tensor->ne[i]; src.original_nb[i] = tensor->nb[i]; } auto extra = (ggml_split_tensor_t *)tensor->extra; if (extra) { src.original_extra = extra; src.original_splits.clear(); for (int i = 0; i < extra->n_device; ++i) { tensor_reload_source::split_info si; if (extra->splits[i]) { for (int j = 0; j < GGML_MAX_DIMS; ++j) { si.ne[j] = extra->splits[i]->ne[j]; si.nb[j] = extra->splits[i]->nb[j]; } si.data = extra->splits[i]->data; si.buffer = extra->splits[i]->buffer; si.tensor = extra->splits[i]; } src.original_splits.push_back(si); } } 
populate_moe_siblings(ggml_get_name(tensor), src); src.state = tensor_reload_source::reload_state::ON_ORIGINAL; log_tensor_state("snapshot_tensor_source", tensor); } // ------------------------------------------------------------------ // Constructor // ------------------------------------------------------------------ reload_info::reload_info(const llama_model_loader & ml) { for (const auto & w : ml.weights) { if (!w.tensor || w.idx >= (int)ml.files.size()) continue; struct stat st; if (stat(ml.files[w.idx]->get_path().c_str(), &st) != 0) continue; tensor_reload_source src; src.path = ml.files[w.idx]->get_path(); src.data_offset = w.offs; src.nbytes = ggml_nbytes(w.tensor); src.last_mtime = st.st_mtime; #ifdef __linux__ src.last_mtime_ns = st.st_mtim.tv_nsec; #endif tensor_reload_sources[ggml_get_name(w.tensor)] = std::move(src); } } // ------------------------------------------------------------------ // Eager snapshot // ------------------------------------------------------------------ void reload_info::snapshot_all_reload_tensors(llama_model & model) { if (this->reload_snapshots_done.exchange(true)) return; LLAMA_LOG_INFO("%s: eager snapshot of all reload tensors + siblings\n", __func__); for (auto & kv : tensor_reload_sources) { struct ggml_tensor * tensor = nullptr; for (auto & p : model.tensors_by_name) { if (p.first == kv.first) { tensor = p.second; break; } } if (!tensor) continue; snapshot_tensor_source(tensor, kv.second); } for (auto & kv : tensor_reload_sources) { auto & src = kv.second; for (const auto & sib_name : src.sibling_names) { auto it = this->tensor_reload_sources.find(sib_name); if (it == this->tensor_reload_sources.end()) continue; if (it->second.original_buffer != nullptr) continue; struct ggml_tensor * sib = nullptr; for (auto & p : model.tensors_by_name) { if (p.first == sib_name) { sib = p.second; break; } } if (!sib) continue; snapshot_tensor_source(sib, it->second); } } } // 
------------------------------------------------------------------ // Re-attachment helper // ------------------------------------------------------------------ static bool reattach_split_tensor_to_shared(llama_model & model, const char * name) { auto it = model.reload->tensor_reload_sources.find(name); if (it == model.reload->tensor_reload_sources.end()) return false; auto & src = it->second; if (!src.original_buffer) return false; struct ggml_tensor * tensor = nullptr; for (auto & p : model.tensors_by_name) { if (p.first == name) { tensor = p.second; break; } } if (!tensor) return false; if (tensor->buffer == src.original_buffer) { log_tensor_state("reattach_split_tensor_to_shared", tensor); src.state = tensor_reload_source::reload_state::ON_ORIGINAL; return true; } if (tensor->buffer && src.state != tensor_reload_source::reload_state::ON_ORIGINAL) { ggml_backend_buffer_free(tensor->buffer); } tensor->buffer = nullptr; tensor->data = nullptr; tensor->buffer = src.original_buffer; tensor->data = src.original_data; tensor->type = src.original_type; for (int i = 0; i < GGML_MAX_DIMS; ++i) { tensor->ne[i] = src.original_ne[i]; tensor->nb[i] = src.original_nb[i]; } if (src.original_extra) { tensor->extra = src.original_extra; auto extra = (ggml_split_tensor_t *)tensor->extra; for (int i = 0; i < extra->n_device && i < (int)src.original_splits.size(); ++i) { auto & os = src.original_splits[i]; if (!extra->splits[i] && os.tensor) { extra->splits[i] = os.tensor; } if (extra->splits[i]) { if (extra->splits[i]->buffer && extra->splits[i]->buffer != os.buffer && src.state != tensor_reload_source::reload_state::ON_ORIGINAL) { ggml_backend_buffer_free(extra->splits[i]->buffer); } extra->splits[i]->data = os.data; extra->splits[i]->buffer = os.buffer; extra->splits[i]->type = src.original_type; for (int j = 0; j < GGML_MAX_DIMS; ++j) { extra->splits[i]->ne[j] = os.ne[j]; extra->splits[i]->nb[j] = os.nb[j]; } } } } src.state = tensor_reload_source::reload_state::ON_ORIGINAL; 
return true; } // ------------------------------------------------------------------ // MoE sibling resync // ------------------------------------------------------------------ static void resync_moe_sibling_splits( llama_model & model, struct ggml_context * /*ctx_tmp*/, struct ggml_tensor * ref_tensor, const char * ref_name) { std::string name_str(ref_name); std::string layer_prefix; std::vector suffixes; if (name_str.find(".ffn_down_exps.weight") != std::string::npos) { layer_prefix = name_str.substr(0, name_str.find(".ffn_down_exps.weight")); suffixes = {".ffn_up_exps.weight", ".ffn_gate_exps.weight"}; } else if (name_str.find(".ffn_up_exps.weight") != std::string::npos) { layer_prefix = name_str.substr(0, name_str.find(".ffn_up_exps.weight")); suffixes = {".ffn_down_exps.weight", ".ffn_gate_exps.weight"}; } else if (name_str.find(".ffn_gate_exps.weight") != std::string::npos) { layer_prefix = name_str.substr(0, name_str.find(".ffn_gate_exps.weight")); suffixes = {".ffn_up_exps.weight", ".ffn_down_exps.weight"}; } else { return; } auto ref_extra = (ggml_split_tensor_t *)ref_tensor->extra; if (!ref_extra) return; auto it_ref_src = model.reload->tensor_reload_sources.find(ref_name); if (it_ref_src != model.reload->tensor_reload_sources.end() && ref_tensor->buffer == it_ref_src->second.original_buffer) { for (const auto & suffix : suffixes) { reattach_split_tensor_to_shared(model, (layer_prefix + suffix).c_str()); } return; } struct sibling_job { std::string name; struct ggml_tensor * tensor; ggml_split_tensor_t * extra; std::vector host_buf; bool needs_resync = false; }; std::vector jobs; for (const auto & suffix : suffixes) { std::string sib_name = layer_prefix + suffix; struct ggml_tensor * sib = nullptr; for (auto & p : model.tensors_by_name) { if (p.first == sib_name) { sib = p.second; break; } } if (!sib || !sib->extra || sib == ref_tensor) continue; auto sib_extra = (ggml_split_tensor_t *)sib->extra; if (sib_extra->n_device != ref_extra->n_device) continue; 
int sib_dim = sib_extra->split_dim < 0 ? 0 : sib_extra->split_dim; int ref_dim = ref_extra->split_dim < 0 ? 0 : ref_extra->split_dim; bool need = false; for (int i = 0; i < ref_extra->n_device; ++i) { bool rh = ref_extra->splits[i] != nullptr; bool sh = sib_extra->splits[i] != nullptr; if (rh != sh) { need = true; break; } if (rh && sh && sib_extra->splits[i]->ne[sib_dim] != ref_extra->splits[i]->ne[ref_dim]) { need = true; break; } } if (!need) continue; size_t nbytes = ggml_nbytes(sib); std::vector buf(nbytes); ggml_backend_tensor_get(sib, buf.data(), 0, nbytes); jobs.push_back({sib_name, sib, sib_extra, std::move(buf), true}); } if (jobs.empty()) return; log_split_state("resync_moe_sibling_splits", ref_tensor); // Phase A: Detach / free old buffers, allocate new main handles for (auto & job : jobs) { auto sib = job.tensor; ggml_backend_buffer_type_t buft = sib->buffer ? ggml_backend_buffer_get_type(sib->buffer) : ggml_backend_cpu_buffer_type(); auto it = model.reload->tensor_reload_sources.find(job.name); bool was_orig = (it != model.reload->tensor_reload_sources.end() && it->second.state == tensor_reload_source::reload_state::ON_ORIGINAL); if (sib->buffer) { if (!was_orig) ggml_backend_buffer_free(sib->buffer); sib->buffer = nullptr; sib->data = nullptr; } size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, sib); ggml_backend_buffer_t new_buf = alloc_buffer_fallback(buft, alloc_size); if (!new_buf) { job.needs_resync = false; continue; } sib->buffer = new_buf; sib->data = (void*)0x1; // dummy; split backend uses extra->splits if (it != model.reload->tensor_reload_sources.end()) { it->second.state = tensor_reload_source::reload_state::DETACHED; } } // Phase B: Propagate dimensions & recompute strides for (auto & job : jobs) { if (!job.needs_resync) continue; auto sib = job.tensor; auto sib_extra = job.extra; for (int i = 0; i < ref_extra->n_device; ++i) { if (!ref_extra->splits[i]) { if (sib_extra->splits[i]) sib_extra->splits[i] = nullptr; continue; } if 
(!sib_extra->splits[i]) continue; sib_extra->splits[i]->ne[sib_extra->split_dim < 0 ? 0 : sib_extra->split_dim] = ref_extra->splits[i]->ne[ref_extra->split_dim < 0 ? 0 : ref_extra->split_dim]; } int n_dims = 0; for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { if (sib->ne[i] != 1) { n_dims = i + 1; break; } } size_t ctx_size = ggml_tensor_overhead() * (sib_extra->n_device + 4); if (ctx_size < 16384) ctx_size = 16384; struct ggml_init_params p = { ctx_size, NULL, true }; struct ggml_context * ctx = ggml_init(p); if (ctx) { for (int i = 0; i < sib_extra->n_device; ++i) { if (!sib_extra->splits[i]) continue; auto tmp = ggml_new_tensor(ctx, sib->type, n_dims, sib_extra->splits[i]->ne); if (tmp) { for (int j = 0; j < GGML_MAX_DIMS; ++j) { sib_extra->splits[i]->nb[j] = tmp->nb[j]; } } } ggml_free(ctx); } } // Phase C: Allocate GPU split buffers bool gpu_failed = false; #ifdef GGML_USE_CUDA for (auto & job : jobs) { if (!job.needs_resync) continue; auto sib_extra = job.extra; for (int i = 0; i < sib_extra->n_device; ++i) { if (!sib_extra->splits[i]) continue; size_t need = ggml_nbytes(sib_extra->splits[i]); auto buft = ggml_backend_cuda_buffer_type(i); auto b = ggml_backend_buft_alloc_buffer(buft, need); if (!b) { gpu_failed = true; break; } sib_extra->splits[i]->buffer = b; sib_extra->splits[i]->data = ggml_backend_buffer_get_base(b); } if (gpu_failed) break; } #else // Without CUDA support, force CPU fallback for any resync jobs for (auto & job : jobs) { if (job.needs_resync) { gpu_failed = true; break; } } #endif // Phase D: If any GPU alloc failed, move entire layer to CPU if (gpu_failed) { for (auto & job : jobs) { if (!job.needs_resync) continue; auto sib = job.tensor; auto sib_extra = job.extra; for (int i = 0; i < sib_extra->n_device; ++i) { if (sib_extra->splits[i] && sib_extra->splits[i]->buffer) { auto it = model.reload->tensor_reload_sources.find(job.name); bool is_orig = false; if (it != model.reload->tensor_reload_sources.end() && i < 
(int)it->second.original_splits.size()) { is_orig = (sib_extra->splits[i]->buffer == it->second.original_splits[i].buffer); } if (!is_orig) ggml_backend_buffer_free(sib_extra->splits[i]->buffer); sib_extra->splits[i]->buffer = nullptr; sib_extra->splits[i]->data = nullptr; } } if (sib->buffer) { auto it = model.reload->tensor_reload_sources.find(job.name); bool is_orig = (it != model.reload->tensor_reload_sources.end() && it->second.state == tensor_reload_source::reload_state::ON_ORIGINAL); if (!is_orig) ggml_backend_buffer_free(sib->buffer); sib->buffer = nullptr; sib->data = nullptr; } size_t need = ggml_nbytes(sib); auto cpu = alloc_buffer_fallback(ggml_backend_cpu_buffer_type(), need); if (cpu) { sib->buffer = cpu; sib->data = ggml_backend_buffer_get_base(cpu); auto it = model.reload->tensor_reload_sources.find(job.name); if (it != model.reload->tensor_reload_sources.end()) it->second.state = tensor_reload_source::reload_state::FALLBACK_CPU; } } } // Phase E: Write data back for (auto & job : jobs) { if (!job.needs_resync) continue; ggml_backend_tensor_set(job.tensor, job.host_buf.data(), 0, job.host_buf.size()); } } // ------------------------------------------------------------------ // reload_tensor_split_path // ------------------------------------------------------------------ static bool reload_tensor_split_path( llama_model & model, struct ggml_tensor * tensor, tensor_reload_source & src, const std::vector & host_buf, ggml_type curr_type, bool returning_to_original, ggml_backend_buffer_t old_buf) { (void)curr_type; const char * name = ggml_get_name(tensor); if (returning_to_original) { if (old_buf && src.state != tensor_reload_source::reload_state::ON_ORIGINAL) { ggml_backend_buffer_free(old_buf); } tensor->buffer = nullptr; tensor->data = nullptr; if (!reattach_split_tensor_to_shared(model, name)) return false; for (const auto & sib : src.sibling_names) { reattach_split_tensor_to_shared(model, sib.c_str()); } return true; } ggml_backend_buffer_type_t 
buft = old_buf ? ggml_backend_buffer_get_type(old_buf) : ggml_backend_cpu_buffer_type(); if (old_buf && src.state != tensor_reload_source::reload_state::ON_ORIGINAL) { ggml_backend_buffer_free(old_buf); } tensor->buffer = nullptr; tensor->data = nullptr; size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor); ggml_backend_buffer_t new_buf = alloc_buffer_fallback(buft, alloc_size); if (!new_buf) return false; ggml_backend_tensor_alloc(new_buf, tensor, ggml_backend_buffer_get_base(new_buf)); //ggml_backend_buffer_init_tensor(tensor->buffer, tensor); ggml_backend_tensor_set(tensor, host_buf.data(), 0, host_buf.size()); log_tensor_state("reload_tensor_split_path", tensor); if (tensor->extra) resync_moe_sibling_splits(model, nullptr, tensor, name); src.state = tensor_reload_source::reload_state::DETACHED; return true; } // ------------------------------------------------------------------ // reload_tensor_non_split_path // ------------------------------------------------------------------ static bool reload_tensor_non_split_path( llama_model & model, struct ggml_tensor * tensor, tensor_reload_source & src, const std::vector & host_buf, ggml_type curr_type, bool returning_to_original, ggml_backend_buffer_t old_buf) { (void)model; (void)curr_type; #ifndef NDEBUG const char * name = ggml_get_name(tensor); #endif if (returning_to_original) { if (old_buf && src.state != tensor_reload_source::reload_state::ON_ORIGINAL) { ggml_backend_buffer_free(old_buf); } tensor->buffer = src.original_buffer; tensor->data = src.original_data; tensor->type = src.original_type; for (int i = 0; i < GGML_MAX_DIMS; ++i) { tensor->ne[i] = src.original_ne[i]; tensor->nb[i] = src.original_nb[i]; } src.state = tensor_reload_source::reload_state::ON_ORIGINAL; return true; } ggml_backend_buffer_type_t buft = old_buf ? 
ggml_backend_buffer_get_type(old_buf) : ggml_backend_cpu_buffer_type(); if (old_buf && src.state != tensor_reload_source::reload_state::ON_ORIGINAL) { ggml_backend_buffer_free(old_buf); #ifndef NDEBUG } else if (old_buf) { LLAMA_LOG_DEBUG("detaching from original snapshot buffer %p for '%s'\n", (void*)old_buf, name); #endif } tensor->buffer = nullptr; tensor->data = nullptr; size_t alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor); ggml_backend_buffer_t new_buf = alloc_buffer_fallback(buft, alloc_size); if (!new_buf) return false; ggml_backend_tensor_alloc(new_buf, tensor, ggml_backend_buffer_get_base(new_buf)); ggml_backend_tensor_set(tensor, host_buf.data(), 0, host_buf.size()); src.state = tensor_reload_source::reload_state::DETACHED; return true; } // ------------------------------------------------------------------ // apply_tensor_type_change // ------------------------------------------------------------------ static bool apply_tensor_type_change( llama_model & /*model*/, struct ggml_tensor * tensor, tensor_reload_source & /*src*/, ggml_type curr_type) { #ifndef NDEBUG const char * name = ggml_get_name(tensor); (void)name; #endif tensor->type = curr_type; int n_dims = 0; for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { if (tensor->ne[i] != 1) { n_dims = i + 1; break; } } size_t ctx_size = ggml_tensor_overhead() * (1 + (tensor->extra ? 
((ggml_split_tensor_t*)tensor->extra)->n_device : 0)) + ggml_graph_overhead_custom(1, false); struct ggml_init_params p = { ctx_size, NULL, true }; struct ggml_context * ctx = ggml_init(p); if (!ctx) return false; auto tmp = ggml_new_tensor(ctx, curr_type, n_dims, tensor->ne); if (!tmp) { ggml_free(ctx); return false; } for (int i = 0; i < GGML_MAX_DIMS; ++i) tensor->nb[i] = tmp->nb[i]; if (tensor->extra) { auto extra = (ggml_split_tensor_t *)tensor->extra; auto tt = ggml_internal_get_type_traits(curr_type); if (tt.blck_size > 1 && extra->split_dim == 0) { int64_t bs = tt.blck_size; int n = extra->n_device; std::vector bounds(n, 0); int64_t acc = 0; for (int i = 0; i < n; ++i) { if (extra->splits[i]) acc += extra->splits[i]->ne[0]; bounds[i] = acc; } for (int i = 0; i < n - 1; ++i) { if (bounds[i] > 0) { bounds[i] = ((bounds[i] + bs - 1) / bs) * bs; } } bounds[n - 1] = tensor->ne[0]; for (int i = 1; i < n; ++i) { if (bounds[i] < bounds[i - 1]) bounds[i] = bounds[i - 1]; } int64_t prev = 0; for (int i = 0; i < n; ++i) { if (extra->splits[i]) { int64_t ne0 = bounds[i] - prev; if (ne0 <= 0) { extra->splits[i] = nullptr; } else { extra->splits[i]->ne[0] = ne0; } } prev = bounds[i]; } } for (int i = 0; i < extra->n_device; ++i) { auto split = extra->splits[i]; if (!split) continue; split->type = curr_type; auto t = ggml_new_tensor(ctx, curr_type, n_dims, split->ne); if (t) { for (int j = 0; j < GGML_MAX_DIMS; ++j) split->nb[j] = t->nb[j]; } } int64_t sum = 0; for (int i = 0; i < extra->n_device; ++i) { if (extra->splits[i]) sum += extra->splits[i]->ne[0]; } GGML_ASSERT(sum == tensor->ne[0]); } ggml_free(ctx); return true; } // ------------------------------------------------------------------ // reload_tensor // ------------------------------------------------------------------ bool reload_info::reload_tensor(const char * name, llama_model & model) { auto it = tensor_reload_sources.find(name); if (it == tensor_reload_sources.end()) return false; auto & src = it->second; 
struct stat st; if (stat(src.path.c_str(), &st) != 0) return false; bool changed = (st.st_mtime != src.last_mtime); #ifdef __linux__ changed = changed || (st.st_mtim.tv_nsec != src.last_mtime_ns); #endif if (!changed) return false; size_t off = 0, file_nbytes = 0; ggml_type curr_type = GGML_TYPE_COUNT; int64_t file_ne[GGML_MAX_DIMS]; if (!gguf_find_tensor_meta(src.path.c_str(), name, off, file_nbytes, curr_type, file_ne)) return false; std::ifstream file(src.path, std::ios::binary); if (!file) return false; file.seekg((std::streamoff)off); if (!file) return false; struct ggml_tensor * tensor = nullptr; for (auto & p : model.tensors_by_name) { if (p.first == name) { tensor = p.second; break; } } if (!tensor || !src.original_buffer) return false; // Refuse to swap if the on-disk shape differs from the model tensor for (int i = 0; i < GGML_MAX_DIMS; ++i) { if (tensor->ne[i] != file_ne[i]) { LLAMA_LOG_INFO("reload_tensor: dimension mismatch for '%s': model ne[%d]=%ld, file ne[%d]=%ld — refusing swap\n", name, i, (long)tensor->ne[i], i, (long)file_ne[i]); return false; } } ggml_backend_buffer_t old_buf = tensor->buffer; bool returning = (curr_type == src.original_type); std::vector host_buf; if (!returning) { if (curr_type != tensor->type) { if (!apply_tensor_type_change(model, tensor, src, curr_type)) return false; } size_t need = ggml_nbytes(tensor); if (file_nbytes < need) return false; host_buf.resize(need); file.read(host_buf.data(), (std::streamsize)need); if (!file || (size_t)file.gcount() != need) return false; } bool ok = false; if (tensor->extra) { ok = reload_tensor_split_path(model, tensor, src, host_buf, curr_type, returning, old_buf); } else { ok = reload_tensor_non_split_path(model, tensor, src, host_buf, curr_type, returning, old_buf); } if (ok) { src.last_mtime = st.st_mtime; #ifdef __linux__ src.last_mtime_ns = st.st_mtim.tv_nsec; #endif } return ok; } // ------------------------------------------------------------------ // reload_changed_tensors // 
------------------------------------------------------------------ bool reload_info::reload_changed_tensors(llama_model & model) { snapshot_all_reload_tensors(model); struct job { const char * name; bool returning; }; std::vector jobs; for (auto & kv : tensor_reload_sources) { auto & src = kv.second; struct stat st; if (stat(src.path.c_str(), &st) != 0) continue; bool changed = (st.st_mtime != src.last_mtime); #ifdef __linux__ changed = changed || (st.st_mtim.tv_nsec != src.last_mtime_ns); #endif if (!changed) continue; size_t off = 0, nbytes = 0; ggml_type t = GGML_TYPE_COUNT; int64_t file_ne[GGML_MAX_DIMS]; if (!gguf_find_tensor_meta(src.path.c_str(), kv.first.c_str(), off, nbytes, t, file_ne)) continue; struct ggml_tensor * tensor = nullptr; for (auto & p : model.tensors_by_name) { if (p.first == kv.first) { tensor = p.second; break; } } if (!tensor) continue; bool dims_ok = true; for (int i = 0; i < GGML_MAX_DIMS; ++i) { if (tensor->ne[i] != file_ne[i]) { LLAMA_LOG_INFO("reload_changed_tensors: dimension mismatch for '%s': model ne[%d]=%ld, file ne[%d]=%ld — skipping\n", kv.first.c_str(), i, (long)tensor->ne[i], i, (long)file_ne[i]); dims_ok = false; break; } } if (!dims_ok) continue; bool returning = (t == src.original_type); jobs.push_back({kv.first.c_str(), returning}); } std::sort(jobs.begin(), jobs.end(), [](const job & a, const job & b) { return a.returning > b.returning; }); bool r = false; for (auto & j : jobs) { if (reload_tensor(j.name, model)) { r = true; LLAMA_LOG_INFO("reloaded tensor '%s'\n", j.name); } } if (r) { #ifdef GGML_USE_CUDA ggml_backend_cuda_invalidate_graphs(&model); #endif } return r; }