From d3e86a5431e95f8f0853e4f5aa4dad701bb99d67 Mon Sep 17 00:00:00 2001
From: Farmadupe <tho119cl@gmail.com>
Date: Thu, 25 Jun 2026 09:18:32 +0100
Subject: [PATCH] Free raw multimedia data from server_tokens after encoding,
 as it will never be read again (#2029)

The data in server_tokens.map_idx_to_media.tokens_image.batch_f32 is read exactly once,
by mtmd_encode; however, it was retained for as long as the input image was present
in the sequence. Add a manual free function to clear out this data after encoding.

Solves:
* Memory wasted in struct server_tokens
* The same wasted memory in the ram cache
* Long copy durations cloning this data to/from ram cache
* Accounting failures in ram cache (`batch_f32` can be larger than a sequence's entire KV)
* The above accounting failures leading to terminal memory leaks in pathological cases
* Remove JSON serialization for `batch_f32`, which was unused and had no foreseeable use case
---
 examples/mtmd/mtmd.cpp             | 80 +++++++++---------------------
 examples/mtmd/mtmd.h               |  3 ++
 examples/server/server-common.cpp  |  7 +++
 examples/server/server-common.h    |  4 ++
 examples/server/server-context.cpp |  1 +
 5 files changed, 38 insertions(+), 57 deletions(-)
diff --git a/examples/mtmd/mtmd.cpp b/examples/mtmd/mtmd.cpp
index 0490f2c6..04cf77f6 100644
--- a/examples/mtmd/mtmd.cpp
+++ b/examples/mtmd/mtmd.cpp
@@ -783,12 +783,20 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
             LOG_ERR("%s: model does not support vision input\n", __func__);
             return 1;
         }
+        // If in the future, we somehow accidentally try to reencode an already-encoded chunk,
+        // chunk->tokens_image will have been cleared out to save memory
+        GGML_ASSERT(!chunk->tokens_image->batch_f32.entries.empty()
+            && "mtmd_encode_chunk: image data already released (double encode?)");
         return mtmd_encode(ctx, chunk->tokens_image.get());
     } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
         if (!ctx->ctx_a) {
             LOG_ERR("%s: model does not support audio input\n", __func__);
             return 1;
         }
+        // If in the future, we somehow accidentally try to reencode an already-encoded chunk,
+        // chunk->tokens_audio will have been cleared out to save memory
+        GGML_ASSERT(!chunk->tokens_audio->batch_f32.entries.empty()
+            && "mtmd_encode_chunk: audio data already released (double encode?)");
         int n_mmproj_embd = ctx->n_embd_text;
         ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
         bool ok = clip_image_batch_encode(
@@ -1042,6 +1050,19 @@ void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
     }
 }
 
+void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk) { // drops the batch_f32 payload of the chunk's image/audio tokens; the chunk itself is kept
+    if (!chunk) { // a null chunk is accepted and ignored
+        return;
+    }
+
+    if (chunk->tokens_image) { // tokens_image may be unset; only reset it when present
+        chunk->tokens_image->batch_f32 = clip_image_f32_batch{}; // replaces the whole batch with a default-constructed one, not just its entries
+    }
+    if (chunk->tokens_audio) { // same treatment for the audio tokens
+        chunk->tokens_audio->batch_f32 = clip_image_f32_batch{}; // NOTE(review): the other batch fields are reset as well — confirm nothing reads them after encoding
+    }
+}
+
 // mtmd_image_tokens
 
 size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
@@ -1114,63 +1135,10 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
     return chunks;
 }
 
-static json mtmd_clip_image_f32_to_json(const clip_image_f32 & clip) {
-    json j;
-    j["nx"] = clip.nx;
-    j["ny"] = clip.ny;
-    j["buf"] = clip.buf;
-    return j;
-}
-
-static clip_image_f32 * mtmd_clip_image_f32_from_json(const json & j) {
-    clip_image_f32 * clip = new clip_image_f32;
-    clip->nx = j["nx"];
-    clip->ny = j["ny"];
-    clip->buf = j["buf"].get<std::vector<float>>();
-    return clip;
-}
-
-static json mtmd_clip_image_f32_batch_to_json(const clip_image_f32_batch & batch, bool full = false) {
-    json j;
-    j["is_audio"] = batch.is_audio;
-    j["grid_x"] = batch.grid_x;
-    j["grid_y"] = batch.grid_y;
-
-    if (full) {
-        std::vector<nlohmann::json> entries;
-        for (auto & entry : batch.entries) {
-            entries.push_back(mtmd_clip_image_f32_to_json(*entry));
-        }
-        j["entries"] = entries;
-    }
-
-    return j;
-}
-
-static clip_image_f32_batch mtmd_clip_image_f32_batch_from_json(const json & j, bool full = false) {
-    clip_image_f32_batch batch;
-    if (j.contains("is_audio")) {
-        batch.is_audio = j["is_audio"];
-        batch.grid_x = j["grid_x"];
-        batch.grid_y = j["grid_y"];
-        if (full) {
-            auto entries = j["entries"];
-            if (entries.is_array()) {
-                for (auto & entry : entries) {
-                    clip_image_f32 * clip = mtmd_clip_image_f32_from_json(entry);
-                    batch.entries.push_back(clip_image_f32_ptr(clip));
-                }
-            }
-        }
-
-    }
-    return batch;
-}
-
 static mtmd_audio_tokens mtmd_audio_tokens_from_json(json & j) {
     return mtmd_audio_tokens{
         j.value<uint32_t>("n_tokens", 0),
-        mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
+        clip_image_f32_batch {},
         j.value("id","")
     };
 }
@@ -1180,7 +1148,7 @@ static mtmd_image_tokens mtmd_image_tokens_from_json(json & j) {
         j.value<uint32_t>("nx", 0),
         j.value<uint32_t>("ny", 0),
         j.value("use_mrope_pos",false),
-        mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
+        clip_image_f32_batch {},
         j.value("id","")
     };
 }
@@ -1190,7 +1158,6 @@ static json mtmd_audio_tokens_to_json(mtmd_audio_tokens *  chunk) {
     if (chunk) {
         j["n_tokens"] = chunk->n_tokens;
         j["id"] = chunk->id;
-        j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
     }
     return j;
 }
@@ -1201,7 +1168,6 @@ static json mtmd_image_tokens_to_json(mtmd_image_tokens * chunk) {
         j["nx"] = chunk->nx;
         j["ny"] = chunk->ny;
         j["use_mrope_pos"] = chunk->use_mrope_pos;
-        j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
         j["id"] = chunk->id;
     }
     return j;
diff --git a/examples/mtmd/mtmd.h b/examples/mtmd/mtmd.h
index 3285f24b..755cbd74 100644
--- a/examples/mtmd/mtmd.h
+++ b/examples/mtmd/mtmd.h
@@ -170,6 +170,9 @@ MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd
 MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
 MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
 
+// Free the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
+// Provided for the benefit of llama-server as a stopgap to fix memory issues.
+MTMD_API void               mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk);
 
 // mtmd_image_tokens
 //
diff --git a/examples/server/server-common.cpp b/examples/server/server-common.cpp
index 865b1f13..4b35f488 100644
--- a/examples/server/server-common.cpp
+++ b/examples/server/server-common.cpp
@@ -1253,6 +1253,13 @@ const mtmd::input_chunk_ptr& server_tokens::find_chunk(size_t idx) const {
     throw std::runtime_error("Chunk not found, or idx is not the first token of a chunk");
 }
 
+void server_tokens::free_raw_media_data(size_t idx) { // releases the raw media data of the chunk registered at idx, if there is one
+    auto it = map_idx_to_media.find(idx); // idx is used as the lookup key into map_idx_to_media
+    if (it != map_idx_to_media.end() && it->second) { // a missing entry or null chunk is silently ignored; no exception is thrown here
+        mtmd_input_chunk_free_raw_data(it->second.get()); // resets the chunk's batch_f32 in place; the map entry itself stays
+    }
+}
+
 void server_tokens::push_back(llama_token tok) {
     if (tok == LLAMA_TOKEN_NULL) {
         throw std::runtime_error("Invalid token");
diff --git a/examples/server/server-common.h b/examples/server/server-common.h
index d6c54904..45598393 100644
--- a/examples/server/server-common.h
+++ b/examples/server/server-common.h
@@ -404,6 +404,10 @@ public:
 
     const mtmd::input_chunk_ptr& find_chunk(size_t idx) const;
 
+    // Manual free for the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
+    // This data is never read again once it has been encoded on the turn in which the media was first received.
+    void free_raw_media_data(size_t idx);
+
     void push_back(llama_token tok);
 
     // will create a copy of the chunk if it contains non-text data
diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp
index b23de8bd..aa9abf0d 100644
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -3946,6 +3946,7 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
 
                     // add the image chunk to cache
                     {
+                        slot.prompt_tokens.free_raw_media_data(slot.n_past_prompt);
                         const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past_prompt);
                         slot.cache_tokens.push_back(chunk.get()); // copy
                     }