Free raw multimedia data from server_tokens after encoding, as it will never be read again (#2029)

`server_tokens.map_idx_to_media.tokens_image.batch_f32` is read exactly once, by `mtmd_encode`, yet it was retained for as long as the input image remained in the sequence. Add a manual free function that clears this data after encoding.

Solves:
* Memory wasted in struct `server_tokens`
* The same wasted memory in the RAM cache
* Long copy durations when cloning this data to/from the RAM cache
* Accounting failures in the RAM cache (`batch_f32` can be larger than a sequence's entire KV)
* The above accounting failures leading to terminal memory leaks in pathological cases

Also removes the JSON serialization for `batch_f32`, which was unused and had no foreseeable use case.
2026-06-28 04:30:15 -05:00 · 2026-06-25 09:18:32 +01:00 · 2026-06-25 09:18:32 +01:00 · d3e86a5431
commit d3e86a5431
parent bdf5c081dc
5 changed files with 38 additions and 57 deletions
--- a/examples/mtmd/mtmd.cpp
+++ b/examples/mtmd/mtmd.cpp
@ -783,12 +783,20 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
            LOG_ERR("%s: model does not support vision input\n", __func__);
            return 1;
        }
+        // If in the future, we somehow accidentally try to reencode an already-encoded chunk,
+        // chunk->tokens_image will have been cleared out to save memory
+        GGML_ASSERT(!chunk->tokens_image->batch_f32.entries.empty()
+            && "mtmd_encode_chunk: image data already released (double encode?)");
        return mtmd_encode(ctx, chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
        if (!ctx->ctx_a) {
            LOG_ERR("%s: model does not support audio input\n", __func__);
            return 1;
        }
+        // If in the future, we somehow accidentally try to reencode an already-encoded chunk,
+        // chunk->tokens_audio will have been cleared out to save memory
+        GGML_ASSERT(!chunk->tokens_audio->batch_f32.entries.empty()
+            && "mtmd_encode_chunk: audio data already released (double encode?)");
        int n_mmproj_embd = ctx->n_embd_text;
        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
        bool ok = clip_image_batch_encode(
@ -1042,6 +1050,19 @@ void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
    }
 }

+void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk) {
+    if (!chunk) {
+        return;
+    }
+
+    if (chunk->tokens_image) {
+        chunk->tokens_image->batch_f32 = clip_image_f32_batch{};
+    }
+    if (chunk->tokens_audio) {
+        chunk->tokens_audio->batch_f32 = clip_image_f32_batch{};
+    }
+}
+
 // mtmd_image_tokens

 size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
@ -1114,63 +1135,10 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
    return chunks;
 }

-static json mtmd_clip_image_f32_to_json(const clip_image_f32 & clip) {
-    json j;
-    j["nx"] = clip.nx;
-    j["ny"] = clip.ny;
-    j["buf"] = clip.buf;
-    return j;
-}
-
-static clip_image_f32 * mtmd_clip_image_f32_from_json(const json & j) {
-    clip_image_f32 * clip = new clip_image_f32;
-    clip->nx = j["nx"];
-    clip->ny = j["ny"];
-    clip->buf = j["buf"].get<std::vector<float>>();
-    return clip;
-}
-
-static json mtmd_clip_image_f32_batch_to_json(const clip_image_f32_batch & batch, bool full = false) {
-    json j;
-    j["is_audio"] = batch.is_audio;
-    j["grid_x"] = batch.grid_x;
-    j["grid_y"] = batch.grid_y;
-
-    if (full) {
-        std::vector<nlohmann::json> entries;
-        for (auto & entry : batch.entries) {
-            entries.push_back(mtmd_clip_image_f32_to_json(*entry));
-        }
-        j["entries"] = entries;
-    }
-
-    return j;
-}
-
-static clip_image_f32_batch mtmd_clip_image_f32_batch_from_json(const json & j, bool full = false) {
-    clip_image_f32_batch batch;
-    if (j.contains("is_audio")) {
-        batch.is_audio = j["is_audio"];
-        batch.grid_x = j["grid_x"];
-        batch.grid_y = j["grid_y"];
-        if (full) {
-            auto entries = j["entries"];
-            if (entries.is_array()) {
-                for (auto & entry : entries) {
-                    clip_image_f32 * clip = mtmd_clip_image_f32_from_json(entry);
-                    batch.entries.push_back(clip_image_f32_ptr(clip));
-                }
-            }
-        }
-
-    }
-    return batch;
-}
-
 static mtmd_audio_tokens mtmd_audio_tokens_from_json(json & j) {
    return mtmd_audio_tokens{
        j.value<uint32_t>("n_tokens", 0),
-        mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
+        clip_image_f32_batch {},
        j.value("id","")
    };
 }
@ -1180,7 +1148,7 @@ static mtmd_image_tokens mtmd_image_tokens_from_json(json & j) {
        j.value<uint32_t>("nx", 0),
        j.value<uint32_t>("ny", 0),
        j.value("use_mrope_pos",false),
-        mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
+        clip_image_f32_batch {},
        j.value("id","")
    };
 }
@ -1190,7 +1158,6 @@ static json mtmd_audio_tokens_to_json(mtmd_audio_tokens *  chunk) {
    if (chunk) {
        j["n_tokens"] = chunk->n_tokens;
        j["id"] = chunk->id;
-        j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
    }
    return j;
 }
@ -1201,7 +1168,6 @@ static json mtmd_image_tokens_to_json(mtmd_image_tokens * chunk) {
        j["nx"] = chunk->nx;
        j["ny"] = chunk->ny;
        j["use_mrope_pos"] = chunk->use_mrope_pos;
-        j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
        j["id"] = chunk->id;
    }
    return j;
--- a/examples/mtmd/mtmd.h
+++ b/examples/mtmd/mtmd.h
@ -170,6 +170,9 @@ MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd
 MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
 MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);

+// Free the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
+// Provided for the benefit of llama-server as a stopgap to fix memory issues.
+MTMD_API void               mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk);

 // mtmd_image_tokens
 //
--- a/examples/server/server-common.cpp
+++ b/examples/server/server-common.cpp
@ -1253,6 +1253,13 @@ const mtmd::input_chunk_ptr& server_tokens::find_chunk(size_t idx) const {
    throw std::runtime_error("Chunk not found, or idx is not the first token of a chunk");
 }

+void server_tokens::free_raw_media_data(size_t idx) {
+    auto it = map_idx_to_media.find(idx);
+    if (it != map_idx_to_media.end() && it->second) {
+        mtmd_input_chunk_free_raw_data(it->second.get());
+    }
+}
+
 void server_tokens::push_back(llama_token tok) {
    if (tok == LLAMA_TOKEN_NULL) {
        throw std::runtime_error("Invalid token");
--- a/examples/server/server-common.h
+++ b/examples/server/server-common.h
@ -404,6 +404,10 @@ public:

    const mtmd::input_chunk_ptr& find_chunk(size_t idx) const;

+    // Manually frees the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
+    // This data is never read again once it has been encoded on the first turn in which the media is received.
+    void free_raw_media_data(size_t idx);
+
    void push_back(llama_token tok);

    // will create a copy of the chunk if it contains non-text data
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@ -3946,6 +3946,7 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t

                    // add the image chunk to cache
                    {
+                        slot.prompt_tokens.free_raw_media_data(slot.n_past_prompt);
                        const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past_prompt);
                        slot.cache_tokens.push_back(chunk.get()); // copy
                    }