mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Free raw multimedia data from server_tokens after encoding, as it will never be read again (#2029)
The data in `server_tokens.map_idx_to_media.tokens_image.batch_f32` is read exactly once, by `mtmd_encode`, yet it was retained for as long as the input image was present in the sequence. Add a manual free function to clear out this data after encoding. Solves: * Memory wasted in struct server_tokens * The same wasted memory in the ram cache * Long copy durations cloning this data to/from ram cache * Accounting failures in ram cache (`batch_f32` can be larger than a sequence's entire KV) * The above accounting failures leading to terminal memory leaks in pathological cases. Also removes the JSON serialization for `batch_f32`, which was unused and had no foreseeable use case.
This commit is contained in:
parent
bdf5c081dc
commit
d3e86a5431
@ -783,12 +783,20 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
// If in the future, we somehow accidentally try to reencode an already-encoded chunk,
|
||||
// chunk->tokens_image will have been cleared out to save memory
|
||||
GGML_ASSERT(!chunk->tokens_image->batch_f32.entries.empty()
|
||||
&& "mtmd_encode_chunk: image data already released (double encode?)");
|
||||
return mtmd_encode(ctx, chunk->tokens_image.get());
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
// If in the future, we somehow accidentally try to reencode an already-encoded chunk,
|
||||
// chunk->tokens_audio will have been cleared out to save memory
|
||||
GGML_ASSERT(!chunk->tokens_audio->batch_f32.entries.empty()
|
||||
&& "mtmd_encode_chunk: audio data already released (double encode?)");
|
||||
int n_mmproj_embd = ctx->n_embd_text;
|
||||
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
@ -1042,6 +1050,19 @@ void mtmd_input_chunk_free(mtmd_input_chunk * chunk) {
|
||||
}
|
||||
}
|
||||
|
||||
// Release the raw, pre-encoding media buffers (batch_f32) held by a chunk.
// Each media token object that is present has its batch_f32 replaced by a
// default-constructed clip_image_f32_batch. A null chunk is a no-op.
void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk) {
    if (chunk == nullptr) {
        return;
    }

    // Same treatment for both media kinds: only touch the holder if it is set.
    const auto drop_batch = [](auto & media_tokens) {
        if (media_tokens) {
            media_tokens->batch_f32 = clip_image_f32_batch{};
        }
    };

    drop_batch(chunk->tokens_image);
    drop_batch(chunk->tokens_audio);
}
|
||||
|
||||
// mtmd_image_tokens
|
||||
|
||||
size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
|
||||
@ -1114,63 +1135,10 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// Serialize one clip_image_f32 as a JSON object with the keys "nx", "ny" and
// "buf".
static json mtmd_clip_image_f32_to_json(const clip_image_f32 & clip) {
    return json{
        {"nx",  clip.nx},
        {"ny",  clip.ny},
        {"buf", clip.buf},
    };
}
|
||||
|
||||
// Deserialize a heap-allocated clip_image_f32 from a JSON object carrying the
// keys "nx", "ny" and "buf" (an array of floats).
//
// Returns a pointer created with `new`; the caller takes ownership.
// Throws a json exception if a key is missing or holds an incompatible type.
//
// All fields are parsed into locals before the allocation happens, so a
// throwing conversion can no longer leak the freshly allocated image. Keys are
// read with at() rather than operator[], because operator[] on a const json
// is undefined for a key that does not exist.
static clip_image_f32 * mtmd_clip_image_f32_from_json(const json & j) {
    const auto nx  = j.at("nx").get<decltype(clip_image_f32::nx)>();
    const auto ny  = j.at("ny").get<decltype(clip_image_f32::ny)>();
    auto       buf = j.at("buf").get<std::vector<float>>();

    clip_image_f32 * clip = new clip_image_f32;
    clip->nx = nx;
    clip->ny = ny;
    clip->buf.swap(buf); // hand over the parsed storage without copying it
    return clip;
}
|
||||
|
||||
// Serialize a clip_image_f32_batch to JSON.
// The keys "is_audio", "grid_x" and "grid_y" are always written. The
// per-image payload is written under "entries" only when `full` is true.
static json mtmd_clip_image_f32_batch_to_json(const clip_image_f32_batch & batch, bool full = false) {
    json out;
    out["is_audio"] = batch.is_audio;
    out["grid_x"]   = batch.grid_x;
    out["grid_y"]   = batch.grid_y;

    // Metadata-only serialization: leave the image data out.
    if (!full) {
        return out;
    }

    std::vector<nlohmann::json> serialized;
    for (const auto & image : batch.entries) {
        serialized.push_back(mtmd_clip_image_f32_to_json(*image));
    }
    out["entries"] = serialized;

    return out;
}
|
||||
|
||||
// Rebuild a clip_image_f32_batch from JSON.
//
// If `j` has no "is_audio" key (for example an empty/null json), a
// default-constructed batch is returned. Otherwise "is_audio", "grid_x" and
// "grid_y" are read; a missing key throws a json exception instead of going
// through operator[] on a const json, which is undefined for absent keys.
//
// When `full` is true and "entries" is present and is an array, every element
// is deserialized and appended to batch.entries. The array is walked by
// reference: the previous `auto entries = j["entries"]` deep-copied the whole
// serialized image payload before reading it.
static clip_image_f32_batch mtmd_clip_image_f32_batch_from_json(const json & j, bool full = false) {
    clip_image_f32_batch batch;
    if (!j.contains("is_audio")) {
        return batch;
    }

    batch.is_audio = j.at("is_audio");
    batch.grid_x   = j.at("grid_x");
    batch.grid_y   = j.at("grid_y");

    if (!full) {
        return batch;
    }

    const auto entries = j.find("entries");
    if (entries == j.end() || !entries->is_array()) {
        return batch;
    }

    for (const auto & entry : *entries) {
        // Wrap the raw pointer in its owning handle before it reaches the vector.
        batch.entries.push_back(clip_image_f32_ptr(mtmd_clip_image_f32_from_json(entry)));
    }
    return batch;
}
|
||||
|
||||
// Build an mtmd_audio_tokens value from a JSON object.
// "n_tokens" falls back to 0 and "id" falls back to "" when the key is absent.
static mtmd_audio_tokens mtmd_audio_tokens_from_json(json & j) {
|
||||
return mtmd_audio_tokens{
|
||||
j.value<uint32_t>("n_tokens", 0),
|
||||
// NOTE(review): the next two initializers look like the removed and added
// sides of a single diff line (JSON-restored batch vs. empty batch).
// Presumably only one of them belongs in the final code - confirm against
// the commit before relying on this block.
mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
|
||||
clip_image_f32_batch {},
|
||||
j.value("id","")
|
||||
};
|
||||
}
|
||||
@ -1180,7 +1148,7 @@ static mtmd_image_tokens mtmd_image_tokens_from_json(json & j) {
|
||||
j.value<uint32_t>("nx", 0),
|
||||
j.value<uint32_t>("ny", 0),
|
||||
j.value("use_mrope_pos",false),
|
||||
mtmd_clip_image_f32_batch_from_json(j.value("batch_f32", json{})),
|
||||
clip_image_f32_batch {},
|
||||
j.value("id","")
|
||||
};
|
||||
}
|
||||
@ -1190,7 +1158,6 @@ static json mtmd_audio_tokens_to_json(mtmd_audio_tokens * chunk) {
|
||||
if (chunk) {
|
||||
j["n_tokens"] = chunk->n_tokens;
|
||||
j["id"] = chunk->id;
|
||||
j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
@ -1201,7 +1168,6 @@ static json mtmd_image_tokens_to_json(mtmd_image_tokens * chunk) {
|
||||
j["nx"] = chunk->nx;
|
||||
j["ny"] = chunk->ny;
|
||||
j["use_mrope_pos"] = chunk->use_mrope_pos;
|
||||
j["batch_f32"] = mtmd_clip_image_f32_batch_to_json(chunk->batch_f32);
|
||||
j["id"] = chunk->id;
|
||||
}
|
||||
return j;
|
||||
|
||||
@ -170,6 +170,9 @@ MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd
|
||||
MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
|
||||
MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
|
||||
|
||||
// Free the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
|
||||
// Provided for the benefit of llama-server as a stopgap to fix memory issues
|
||||
MTMD_API void mtmd_input_chunk_free_raw_data(mtmd_input_chunk * chunk);
|
||||
|
||||
// mtmd_image_tokens
|
||||
//
|
||||
|
||||
@ -1253,6 +1253,13 @@ const mtmd::input_chunk_ptr& server_tokens::find_chunk(size_t idx) const {
|
||||
throw std::runtime_error("Chunk not found, or idx is not the first token of a chunk");
|
||||
}
|
||||
|
||||
void server_tokens::free_raw_media_data(size_t idx) {
|
||||
auto it = map_idx_to_media.find(idx);
|
||||
if (it != map_idx_to_media.end() && it->second) {
|
||||
mtmd_input_chunk_free_raw_data(it->second.get());
|
||||
}
|
||||
}
|
||||
|
||||
void server_tokens::push_back(llama_token tok) {
|
||||
if (tok == LLAMA_TOKEN_NULL) {
|
||||
throw std::runtime_error("Invalid token");
|
||||
|
||||
@ -404,6 +404,10 @@ public:
|
||||
|
||||
const mtmd::input_chunk_ptr& find_chunk(size_t idx) const;
|
||||
|
||||
// Manual free for the raw audio (PCM) or image buffers (RGB-f32 (!)) of a multimedia chunk.
|
||||
// This data will never be read again after encoding on the first turn that multimedia are received.
|
||||
void free_raw_media_data(size_t idx);
|
||||
|
||||
void push_back(llama_token tok);
|
||||
|
||||
// will create a copy of the chunk if it contains non-text data
|
||||
|
||||
@ -3946,6 +3946,7 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
|
||||
|
||||
// add the image chunk to cache
|
||||
{
|
||||
slot.prompt_tokens.free_raw_media_data(slot.n_past_prompt);
|
||||
const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past_prompt);
|
||||
slot.cache_tokens.push_back(chunk.get()); // copy
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user