From d3e86a5431e95f8f0853e4f5aa4dad701bb99d67 Mon Sep 17 00:00:00 2001 From: Farmadupe Date: Thu, 25 Jun 2026 09:18:32 +0100 Subject: [PATCH] Free raw multimedia data from server_tokens after encoding, as it will never be read again (#2029) Data server_tokens.map_idx_to_media.tokens_image.batch_f32 is read exactly once, by mtmd_encode; however, it was retained as long as the input image was present in the sequence. Add a manual free function to clear out this data after encoding. Solves: * Memory wasted in struct server_tokens * The same wasted memory in the ram cache * Long copy durations cloning this data to/from ram cache * Accounting failures in ram cache (`batch_f32` can be larger than a sequence's entire KV) * The above accounting failures leading to terminal memory leaks in pathological cases * Remove JSON serialization for `batch_f32`, which was unused and had no foreseeable use case --- examples/mtmd/mtmd.cpp | 80 +++++++++--------------------- examples/mtmd/mtmd.h | 3 ++ examples/server/server-common.cpp | 7 +++ examples/server/server-common.h | 4 ++ examples/server/server-context.cpp | 1 + 5 files changed, 38 insertions(+), 57 deletions(-) diff --git a/examples/mtmd/mtmd.cpp b/examples/mtmd/mtmd.cpp index 0490f2c6..04cf77f6 100644 --- a/examples/mtmd/mtmd.cpp +++ b/examples/mtmd/mtmd.cpp @@ -783,12 +783,20 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: model does not support vision input\n", __func__); return 1; } + // If in the future, we somehow accidentally try to reencode an already-encoded chunk, + // chunk->tokens_image will have been cleared out to save memory + GGML_ASSERT(!chunk->tokens_image->batch_f32.entries.empty() + && "mtmd_encode_chunk: image data already released (double encode?)"); return mtmd_encode(ctx, chunk->tokens_image.get()); } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { if (!ctx->ctx_a) { LOG_ERR("%s: model does not support audio input\n", __func__); return 1; } + // If in 
the future, we somehow accidentally try to reencode an already-encoded chunk, + // chunk->tokens_audio will have been cleared out to save memory + GGML_ASSERT(!chunk->tokens_audio->batch_f32.entries.empty() + && "mtmd_encode_chunk: audio data already released (double encode?)"); int n_mmproj_embd = ctx->n_embd_text; ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( @@ -1042,6 +1050,19 @@ void mtmd_input_chunk_free(mtmd_input_chunk * chunk) { } } +void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk) { + if (!chunk) { + return; + } + + if (chunk->tokens_image) { + chunk->tokens_image->batch_f32 = clip_image_f32_batch{}; + } + if (chunk->tokens_audio) { + chunk->tokens_audio->batch_f32 = clip_image_f32_batch{}; + } +} + // mtmd_image_tokens size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { @@ -1114,63 +1135,10 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() { return chunks; } -static json mtmd_clip_image_f32_to_json(const clip_image_f32 & clip) { - json j; - j["nx"] = clip.nx; - j["ny"] = clip.ny; - j["buf"] = clip.buf; - return j; -} - -static clip_image_f32 * mtmd_clip_image_f32_from_json(const json & j) { - clip_image_f32 * clip = new clip_image_f32; - clip->nx = j["nx"]; - clip->ny = j["ny"]; - clip->buf = j["buf"].get>(); - return clip; -} - -static json mtmd_clip_image_f32_batch_to_json(const clip_image_f32_batch & batch, bool full = false) { - json j; - j["is_audio"] = batch.is_audio; - j["grid_x"] = batch.grid_x; - j["grid_y"] = batch.grid_y; - - if (full) { - std::vector entries; - for (auto & entry : batch.entries) { - entries.push_back(mtmd_clip_image_f32_to_json(*entry)); - } - j["entries"] = entries; - } - - return j; -} - -static clip_image_f32_batch mtmd_clip_image_f32_batch_from_json(const json & j, bool full = false) { - clip_image_f32_batch batch; - if (j.contains("is_audio")) { - batch.is_audio = j["is_audio"]; - batch.grid_x = j["grid_x"]; - 
batch.grid_y = j["grid_y"]; - if (full) { - auto entries = j["entries"]; - if (entries.is_array()) { - for (auto & entry : entries) { - clip_image_f32 * clip = mtmd_clip_image_f32_from_json(entry); - batch.entries.push_back(clip_image_f32_ptr(clip)); - } - } - } - - } - return batch; -} - static mtmd_audio_tokens mtmd_audio_tokens_from_json(json & j) { return mtmd_audio_tokens{ j.value("n_tokens", 0), - mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})), + clip_image_f32_batch {}, j.value("id","") }; } @@ -1180,7 +1148,7 @@ static mtmd_image_tokens mtmd_image_tokens_from_json(json & j) { j.value("nx", 0), j.value("ny", 0), j.value("use_mrope_pos",false), - mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})), + clip_image_f32_batch {}, j.value("id","") }; } @@ -1190,7 +1158,6 @@ static json mtmd_audio_tokens_to_json(mtmd_audio_tokens * chunk) { if (chunk) { j["n_tokens"] = chunk->n_tokens; j["id"] = chunk->id; - j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32); } return j; } @@ -1201,7 +1168,6 @@ static json mtmd_image_tokens_to_json(mtmd_image_tokens * chunk) { j["nx"] = chunk->nx; j["ny"] = chunk->ny; j["use_mrope_pos"] = chunk->use_mrope_pos; - j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32); j["id"] = chunk->id; } return j; diff --git a/examples/mtmd/mtmd.h b/examples/mtmd/mtmd.h index 3285f24b..755cbd74 100644 --- a/examples/mtmd/mtmd.h +++ b/examples/mtmd/mtmd.h @@ -170,6 +170,9 @@ MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); +// Free the raw audio (PCM) or imagebuffers (RGB-f32 (!)) of a multimedia chunk. 
+// Provided for the benefit of llama-server as a stopgap to fix memory issues +MTMD_API void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk); // mtmd_image_tokens // diff --git a/examples/server/server-common.cpp b/examples/server/server-common.cpp index 865b1f13..4b35f488 100644 --- a/examples/server/server-common.cpp +++ b/examples/server/server-common.cpp @@ -1253,6 +1253,13 @@ const mtmd::input_chunk_ptr& server_tokens::find_chunk(size_t idx) const { throw std::runtime_error("Chunk not found, or idx is not the first token of a chunk"); } +void server_tokens::free_raw_media_data(size_t idx) { + auto it = map_idx_to_media.find(idx); + if (it != map_idx_to_media.end() && it->second) { + mtmd_input_chunk_free_raw_data(it->second.get()); + } +} + void server_tokens::push_back(llama_token tok) { if (tok == LLAMA_TOKEN_NULL) { throw std::runtime_error("Invalid token"); diff --git a/examples/server/server-common.h b/examples/server/server-common.h index d6c54904..45598393 100644 --- a/examples/server/server-common.h +++ b/examples/server/server-common.h @@ -404,6 +404,10 @@ public: const mtmd::input_chunk_ptr& find_chunk(size_t idx) const; + // Manual free for the raw audio (PCM) or imagebuffers (RGB-f32 (!)) of a multimedia chunk. + // This data will never be read again after encoding on the first turn that multimedia are received. 
+ void free_raw_media_data(size_t idx); + void push_back(llama_token tok); // will create a copy of the chunk if it contains non-text data diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index b23de8bd..aa9abf0d 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -3946,6 +3946,7 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t // add the image chunk to cache { + slot.prompt_tokens.free_raw_media_data(slot.n_past_prompt); const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past_prompt); slot.cache_tokens.push_back(chunk.get()); // copy }