Free raw multimedia data from server_tokens after encoding, as it will never be read again (#2029)

The data in server_tokens.map_idx_to_media.tokens_image.batch_f32 is read exactly once,
by mtmd_encode; however, it was retained for as long as the input image was present
in the sequence. Add a manual free function to clear out this data after encoding.

Solves:
* Memory wasted in struct server_tokens
* The same wasted memory in the ram cache 
* Long copy durations cloning this data to/from ram cache
* Accounting failures in ram cache (`batch_f32` can be larger than a sequence's entire KV)
* The above accounting failures leading to terminal memory leaks in pathological cases
* Remove JSON serialization for `batch_f32`, which was unused and had no foreseeable use case
This commit is contained in:
Farmadupe 2026-06-25 09:18:32 +01:00 committed by GitHub
parent bdf5c081dc
commit d3e86a5431
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 38 additions and 57 deletions

View File

@ -783,12 +783,20 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
LOG_ERR("%s: model does not support vision input\n", __func__); LOG_ERR("%s: model does not support vision input\n", __func__);
return 1; return 1;
} }
// If in the future, we somehow accidentally try to reencode an already-encoded chunk,
// chunk->tokens_image will have been cleared out to save memory
GGML_ASSERT(!chunk->tokens_image->batch_f32.entries.empty()
&& "mtmd_encode_chunk: image data already released (double encode?)");
return mtmd_encode(ctx, chunk->tokens_image.get()); return mtmd_encode(ctx, chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
if (!ctx->ctx_a) { if (!ctx->ctx_a) {
LOG_ERR("%s: model does not support audio input\n", __func__); LOG_ERR("%s: model does not support audio input\n", __func__);
return 1; return 1;
} }
// If in the future, we somehow accidentally try to reencode an already-encoded chunk,
// chunk->tokens_audio will have been cleared out to save memory
GGML_ASSERT(!chunk->tokens_audio->batch_f32.entries.empty()
&& "mtmd_encode_chunk: audio data already released (double encode?)");
int n_mmproj_embd = ctx->n_embd_text; int n_mmproj_embd = ctx->n_embd_text;
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
bool ok = clip_image_batch_encode( bool ok = clip_image_batch_encode(
@ -1042,6 +1050,19 @@ void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
} }
} }
// Drops the raw pre-encoding media batch held by a chunk.
//
// For whichever of `tokens_image` / `tokens_audio` is set, `batch_f32` is
// overwritten with a default-constructed `clip_image_f32_batch`, so whatever
// the previous batch owned is let go. Every other field of the token structs
// is left untouched. A null `chunk` is accepted and ignored.
//
// Note: mtmd_encode_chunk asserts that `batch_f32.entries` is non-empty, so a
// chunk must not be encoded again after this has been called on it.
void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk) {
    if (chunk == nullptr) {
        return;
    }

    // Same reset for both media kinds; a holder that is unset is skipped.
    const auto reset_batch = [](auto & tokens) {
        if (tokens) {
            tokens->batch_f32 = clip_image_f32_batch{};
        }
    };

    reset_batch(chunk->tokens_image);
    reset_batch(chunk->tokens_audio);
}
// mtmd_image_tokens // mtmd_image_tokens
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) { size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
@ -1114,63 +1135,10 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
return chunks; return chunks;
} }
// Serializes one clip_image_f32 into a JSON value with three keys:
//   "nx"  - clip.nx
//   "ny"  - clip.ny
//   "buf" - clip.buf
static json mtmd_clip_image_f32_to_json(const clip_image_f32 & clip) {
    json serialized;

    // dimensions first, then the buffer
    serialized["nx"]  = clip.nx;
    serialized["ny"]  = clip.ny;
    serialized["buf"] = clip.buf;

    return serialized;
}
// Builds a heap-allocated clip_image_f32 from a JSON value carrying the keys
// "nx", "ny" and "buf" (the latter converted to std::vector<float>).
//
// Returns a raw pointer obtained from `new`; the caller takes ownership
// (the caller in this file wraps it in a clip_image_f32_ptr straight away).
//
// Any exception raised while reading the JSON (missing key, wrong value type)
// is propagated to the caller after the partially filled object is destroyed.
static clip_image_f32 * mtmd_clip_image_f32_from_json(const json & j) {
    clip_image_f32 * clip = new clip_image_f32;
    try {
        // at() instead of operator[]: on a const json a missing key is undefined
        // behaviour with operator[], whereas at() reports it with an exception.
        clip->nx  = j.at("nx");
        clip->ny  = j.at("ny");
        clip->buf = j.at("buf").get<std::vector<float>>();
    } catch (...) {
        // Nobody else owns the object yet, so release it before rethrowing.
        delete clip;
        throw;
    }
    return clip;
}
// Serializes a clip_image_f32_batch to JSON.
//
// The keys "is_audio", "grid_x" and "grid_y" are always written. When `full`
// is true an additional "entries" array is written, holding one element per
// batch entry as produced by mtmd_clip_image_f32_to_json. When `full` is false
// (the default) no "entries" key is written at all.
static json mtmd_clip_image_f32_batch_to_json(const clip_image_f32_batch & batch, bool full = false) {
    json result;
    result["is_audio"] = batch.is_audio;
    result["grid_x"]   = batch.grid_x;
    result["grid_y"]   = batch.grid_y;

    if (!full) {
        // metadata-only form
        return result;
    }

    std::vector<nlohmann::json> serialized_entries;
    for (auto & item : batch.entries) {
        serialized_entries.push_back(mtmd_clip_image_f32_to_json(*item));
    }
    result["entries"] = serialized_entries;

    return result;
}
// Rebuilds a clip_image_f32_batch from JSON.
//
// If `j` has no "is_audio" key, a default-constructed batch is returned.
// Otherwise "is_audio", "grid_x" and "grid_y" are read into the batch. When
// `full` is true and `j` holds an "entries" array, each element is converted
// with mtmd_clip_image_f32_from_json and appended to `batch.entries`, which
// takes ownership through clip_image_f32_ptr. With `full` false (the default)
// the entries are never read.
static clip_image_f32_batch mtmd_clip_image_f32_batch_from_json(const json & j, bool full = false) {
    clip_image_f32_batch batch;
    if (!j.contains("is_audio")) {
        return batch;
    }

    batch.is_audio = j["is_audio"];
    batch.grid_x   = j["grid_x"];
    batch.grid_y   = j["grid_y"];

    // operator[] on a const json with a missing key is undefined behaviour,
    // so check that "entries" is present before touching it.
    if (!full || !j.contains("entries")) {
        return batch;
    }

    // Bind by reference: the entries can carry whole float buffers, and copying
    // the array just to iterate over it would duplicate all of them.
    const auto & entries = j["entries"];
    if (!entries.is_array()) {
        return batch;
    }

    for (const auto & entry : entries) {
        clip_image_f32 * clip = mtmd_clip_image_f32_from_json(entry);
        batch.entries.push_back(clip_image_f32_ptr(clip));
    }
    return batch;
}
static mtmd_audio_tokens mtmd_audio_tokens_from_json(json & j) { static mtmd_audio_tokens mtmd_audio_tokens_from_json(json & j) {
return mtmd_audio_tokens{ return mtmd_audio_tokens{
j.value<uint32_t>("n_tokens", 0), j.value<uint32_t>("n_tokens", 0),
mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})), clip_image_f32_batch {},
j.value("id","") j.value("id","")
}; };
} }
@ -1180,7 +1148,7 @@ static mtmd_image_tokens mtmd_image_tokens_from_json(json & j) {
j.value<uint32_t>("nx", 0), j.value<uint32_t>("nx", 0),
j.value<uint32_t>("ny", 0), j.value<uint32_t>("ny", 0),
j.value("use_mrope_pos",false), j.value("use_mrope_pos",false),
mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})), clip_image_f32_batch {},
j.value("id","") j.value("id","")
}; };
} }
@ -1190,7 +1158,6 @@ static json mtmd_audio_tokens_to_json(mtmd_audio_tokens * chunk) {
if (chunk) { if (chunk) {
j["n_tokens"] = chunk->n_tokens; j["n_tokens"] = chunk->n_tokens;
j["id"] = chunk->id; j["id"] = chunk->id;
j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
} }
return j; return j;
} }
@ -1201,7 +1168,6 @@ static json mtmd_image_tokens_to_json(mtmd_image_tokens * chunk) {
j["nx"] = chunk->nx; j["nx"] = chunk->nx;
j["ny"] = chunk->ny; j["ny"] = chunk->ny;
j["use_mrope_pos"] = chunk->use_mrope_pos; j["use_mrope_pos"] = chunk->use_mrope_pos;
j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
j["id"] = chunk->id; j["id"] = chunk->id;
} }
return j; return j;

View File

@ -170,6 +170,9 @@ MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd
MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
// Free the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
// Provided for the benefit of llama-server as a stopgap to fix memory issues.
MTMD_API void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk);
// mtmd_image_tokens // mtmd_image_tokens
// //

View File

@ -1253,6 +1253,13 @@ const mtmd::input_chunk_ptr& server_tokens::find_chunk(size_t idx) const {
throw std::runtime_error("Chunk not found, or idx is not the first token of a chunk"); throw std::runtime_error("Chunk not found, or idx is not the first token of a chunk");
} }
// Releases the raw media data of the chunk registered at token index `idx`.
//
// Looks `idx` up in `map_idx_to_media`; if an entry exists and holds a chunk,
// that chunk is handed to mtmd_input_chunk_free_raw_data. Unlike find_chunk,
// an unknown index is not an error here: the call simply does nothing.
void server_tokens::free_raw_media_data(size_t idx) {
    const auto entry = map_idx_to_media.find(idx);
    if (entry == map_idx_to_media.end()) {
        return;
    }

    auto * media_chunk = entry->second.get();
    if (media_chunk == nullptr) {
        return;
    }

    mtmd_input_chunk_free_raw_data(media_chunk);
}
void server_tokens::push_back(llama_token tok) { void server_tokens::push_back(llama_token tok) {
if (tok == LLAMA_TOKEN_NULL) { if (tok == LLAMA_TOKEN_NULL) {
throw std::runtime_error("Invalid token"); throw std::runtime_error("Invalid token");

View File

@ -404,6 +404,10 @@ public:
const mtmd::input_chunk_ptr& find_chunk(size_t idx) const; const mtmd::input_chunk_ptr& find_chunk(size_t idx) const;
// Manual free for the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
// This data will never be read again after encoding on the first turn in which the multimedia is received.
void free_raw_media_data(size_t idx);
void push_back(llama_token tok); void push_back(llama_token tok);
// will create a copy of the chunk if it contains non-text data // will create a copy of the chunk if it contains non-text data

View File

@ -3946,6 +3946,7 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
// add the image chunk to cache // add the image chunk to cache
{ {
slot.prompt_tokens.free_raw_media_data(slot.n_past_prompt);
const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past_prompt); const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past_prompt);
slot.cache_tokens.push_back(chunk.get()); // copy slot.cache_tokens.push_back(chunk.get()); // copy
} }