mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
mtmd: llava_uhd should no longer use batch dim (#24732)
This commit is contained in:
parent
2e88c49c90
commit
f3e1828164
@ -1105,6 +1105,8 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli
|
||||
img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
|
||||
output.entries.push_back(std::move(res));
|
||||
}
|
||||
output.grid_x = inst.grid_size.width;
|
||||
output.grid_y = inst.grid_size.height;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1558,3 +1560,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
|
||||
output.entries.push_back(std::move(img_f32));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
// call super class preprocessor
|
||||
bool ok = mtmd_image_preprocessor_llava_uhd::preprocess(img, output);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
if (output.entries.size() == 1) {
|
||||
// Single-tile (overview only): append one newline row.
|
||||
output.entries[0]->add_newline = true;
|
||||
} else {
|
||||
// Multi-tile: overview gets no newline, grid tiles get one.
|
||||
output.entries[0]->add_newline = false;
|
||||
for (size_t i = 1; i < output.entries.size(); ++i) {
|
||||
output.entries[i]->add_newline = true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -197,3 +197,9 @@ struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// similar to llava_uhd, but has add_newline
|
||||
struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
@ -639,7 +639,7 @@ struct mtmd_context {
|
||||
{
|
||||
img_beg = "<image>";
|
||||
img_end = "";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_granite>(ctx_v);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
@ -1033,7 +1033,10 @@ struct mtmd_tokenizer {
|
||||
int32_t add_media(std::vector<const mtmd_bitmap *> & bitmaps) {
|
||||
GGML_ASSERT(!bitmaps.empty());
|
||||
|
||||
if (!bitmaps[0]->is_audio) {
|
||||
// note: only one type of media is supported per call, caller should enforce this
|
||||
const bool is_vision = !bitmaps[0]->is_audio;
|
||||
|
||||
if (is_vision) {
|
||||
// handle image
|
||||
|
||||
if (!ctx->ctx_v) {
|
||||
@ -1085,31 +1088,9 @@ struct mtmd_tokenizer {
|
||||
batch_f32.grid_y = tmp_batch.grid_y;
|
||||
}
|
||||
|
||||
// Annotate llava-next style tiles so clip_n_output_tokens accounts
|
||||
// for per-tile newline injection.
|
||||
if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) {
|
||||
if (batch_f32.entries.size() == 1) {
|
||||
// Single-tile (overview only): append one newline row.
|
||||
batch_f32.entries[0]->add_newline = true;
|
||||
} else {
|
||||
// Multi-tile: overview gets no newline, grid tiles get one.
|
||||
batch_f32.entries[0]->add_newline = false;
|
||||
for (size_t i = 1; i < batch_f32.entries.size(); ++i) {
|
||||
batch_f32.entries[i]->add_newline = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
|
||||
if (
|
||||
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|
||||
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
|
||||
) {
|
||||
if (has_tiling_grid) {
|
||||
// [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now
|
||||
GGML_ASSERT(bitmaps.size() == 1);
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user