From 0d135df48ccee9a799fa9d9ea0ed494bd4fdd74f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sun, 21 Jun 2026 14:12:15 +0200 Subject: [PATCH] mtmd: fix mtmd_get_memory_usage (#24867) --- tools/mtmd/clip.cpp | 62 ++++++++++++++++++--------------- tools/mtmd/mtmd.cpp | 3 +- tools/server/server-context.cpp | 4 ++- 3 files changed, 37 insertions(+), 32 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fccc1e3487..7dd7023c41 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2796,7 +2796,7 @@ struct clip_model_loader { } // load data - if (!ctx_clip.no_alloc) { + { std::vector read_buf; // start loading event @@ -2814,38 +2814,42 @@ struct clip_model_loader { ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - size_t data_loaded = 0; - for (auto & t : tensors_to_load) { - ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); - GGML_ASSERT(cur && "tensor not found in ctx_data"); - auto it_off = tensor_offset.find(t->name); - GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor"); - const size_t offset = it_off->second; - fin.seekg(offset, std::ios::beg); - if (!fin) { - throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); - } - size_t num_bytes = ggml_nbytes(cur); - if (ggml_backend_buft_is_host(buft)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); - } - data_loaded += num_bytes; - if (progress_callback && total_data_size > 0) { - const float progress = (float)data_loaded / (float)total_data_size; - if (!progress_callback(progress, progress_callback_user_data)) { - throw std::runtime_error(string_format("%s: model loading cancelled by progress_callback\n", __func__)); + // read the weight from file + if (!ctx_clip.no_alloc) { + size_t data_loaded = 0; + for (auto & t : tensors_to_load) { + ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); + GGML_ASSERT(cur && "tensor not found in ctx_data"); + auto it_off = tensor_offset.find(t->name); + GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor"); + const size_t offset = it_off->second; + fin.seekg(offset, std::ios::beg); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); + } + size_t num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + data_loaded += num_bytes; + if (progress_callback && total_data_size > 0) { + const float progress = (float)data_loaded / (float)total_data_size; + if (!progress_callback(progress, progress_callback_user_data)) { + throw std::runtime_error(string_format("%s: model loading cancelled by progress_callback\n", __func__)); + } } } + LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); + } else { + LOG_DBG("%s: no_alloc is set, skipping tensor data loading (%zu tensors)\n", __func__, tensors_to_load.size()); } fin.close(); - - LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); } } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 564bafc621..724538b585 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -2142,8 +2142,7 @@ std::map mtmd_get_memory_usage(const char * mmproj_f try { mtmd_log_set(stub_log_callback, nullptr); // suppress logging - // TODO @ngxson : fix no_alloc here - ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params)); + ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params, true)); mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback std::map total_mem; auto merge = [&](const struct clip_ctx * c) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 7db4cb1986..aeb15096c8 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -926,13 +926,15 @@ private: // optionally get the memory usage of mmproj if (has_mmproj && params_base.fit_params) { + int64_t t_start = ggml_time_us(); auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); + int64_t t_elapsed = ggml_time_us() - t_start; if (!mmproj_mem.empty()) { size_t total = 0; for (auto & [dev, size] : mmproj_mem) { total += size; } - SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); + SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB (took %.2f ms)\n", total / (1024.0 * 1024.0), t_elapsed / 1000.0); GGML_ASSERT(!params_base.fit_params_target.empty()); for (auto & [dev, size] : mmproj_mem) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) {