mtmd: add batching for mtmd-cli, add video tests (#24778)

This commit is contained in:
Xuan-Son Nguyen 2026-06-18 21:55:04 +02:00 committed by GitHub
parent 32eddaf2ea
commit a6b3260a42
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 145 additions and 24 deletions

View File

@ -32,9 +32,9 @@ static volatile bool g_is_generating = false;
static volatile bool g_is_interrupted = false;
/**
* Please note that this is NOT a production-ready stuff.
* Please note that this is NOT a production-ready binary.
* It is a playground for trying multimodal support in llama.cpp.
* For contributors: please keep this code simple and easy to understand.
* For contributors: please keep this code simple and easy to understand. Do not add unnecessary complexity. The goal is to have a simple CLI for testing multimodal support.
*/
static void show_additional_info(int /*argc*/, char ** argv) {
@ -65,6 +65,14 @@ static void sigint_handler(int signo) {
}
#endif
// this is only used by tests.sh to capture the response ; it's not meant to be used in production
static void inject_test_response_marker() {
const char * env = std::getenv("MTMD_TEST_RESPONSE_MARKER");
if (env) {
LOG("%s\n", env);
}
}
struct mtmd_cli_context {
mtmd::context_ptr ctx_vision;
common_init_result_ptr llama_init;
@ -79,6 +87,8 @@ struct mtmd_cli_context {
mtmd::bitmaps bitmaps;
std::vector<mtmd_helper::video_ptr> videos;
mtmd::batch_ptr mbatch;
// chat template
common_chat_templates_ptr tmpls;
std::vector<common_chat_msg> chat_history;
@ -233,6 +243,8 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
}
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
inject_test_response_marker();
bool add_bos = ctx.chat_history.empty();
auto formatted_chat = chat_add_and_format(ctx, msg);
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
@ -259,20 +271,95 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
ctx.bitmaps.entries.clear();
ctx.videos.clear();
llama_pos new_n_past;
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
ctx.lctx, // lctx
chunks.ptr.get(), // chunks
ctx.n_past, // n_past
0, // seq_id
ctx.n_batch, // n_batch
true, // logits_last
&new_n_past)) {
LOG_ERR("Unable to eval prompt\n");
return 1;
}
// batch encode all media chunks, then decode each
size_t n_chunks = mtmd_input_chunks_size(chunks.ptr.get());
for (size_t i = 0; i < n_chunks; i++) {
auto chunk = mtmd_input_chunks_get(chunks.ptr.get(), i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
ctx.n_past = new_n_past;
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
// decode text chunk
llama_pos new_n_past = ctx.n_past;
res = mtmd_helper_eval_chunk_single(ctx.ctx_vision.get(),
ctx.lctx,
chunk,
ctx.n_past,
0, // seq_id
ctx.n_batch,
i == n_chunks - 1, // logits_last
&new_n_past);
if (res != 0) {
LOG_ERR("Unable to eval text chunk %zu\n", i);
return 1;
}
ctx.n_past = new_n_past;
} else {
// media chunk: try to get embd from existing batch, or create a new batch
float * embd = nullptr;
if (ctx.mbatch) {
embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk);
if (embd) {
LOG_DBG("found embd for media chunk %zu in existing batch\n", i);
} else {
LOG_DBG("media chunk %zu not found in existing batch, creating new batch\n", i);
}
}
if (!embd) {
// create and encode a new batch with as many media chunks as possible
ctx.mbatch.reset(mtmd_batch_init(ctx.ctx_vision.get()));
res = mtmd_batch_add_chunk(ctx.mbatch.get(), chunk);
GGML_ASSERT(res == 0); // first chunk must always succeed
int n_added = 1;
// add as many subsequent media chunks as possible
for (size_t j = i + 1; j < n_chunks; j++) {
auto next_chunk = mtmd_input_chunks_get(chunks.ptr.get(), j);
auto next_type = mtmd_input_chunk_get_type(next_chunk);
if (next_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
break; // text chunk splits the batch
}
res = mtmd_batch_add_chunk(ctx.mbatch.get(), next_chunk);
if (res != 0) {
break; // batch full or incompatible
}
n_added++;
}
int64_t time_start = ggml_time_ms();
LOG_INF("encoding mtmd batch, n_chunks = %d (done = %zu, total = %zu)\n", n_added, i, n_chunks);
res = mtmd_batch_encode(ctx.mbatch.get());
if (res != 0) {
LOG_ERR("Failed to encode mtmd batch, res = %d\n", res);
return 1;
}
LOG_INF("mtmd batch encoding done in %d ms\n", (int)(ggml_time_ms() - time_start));
embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk);
}
GGML_ASSERT(embd != nullptr);
llama_pos new_n_past = ctx.n_past;
res = mtmd_helper_decode_image_chunk(ctx.ctx_vision.get(),
ctx.lctx,
chunk,
embd,
ctx.n_past,
0, // seq_id
ctx.n_batch,
&new_n_past,
nullptr, // callback
nullptr // user_data
);
if (res != 0) {
LOG_ERR("Unable to decode media chunk %zu\n", i);
return 1;
}
ctx.n_past = new_n_past;
}
}
LOG("\n");

View File

@ -13,6 +13,8 @@ mkdir -p $SCRIPT_DIR/output
PROJ_ROOT="$SCRIPT_DIR/../.."
cd $PROJ_ROOT
export MTMD_TEST_RESPONSE_MARKER="<MTMD_TEST_RESPONSE_MARKER>"
# Check if the first argument is "big", then run test with big models
# This is useful if we're running the script on a larger machine, so we can test the big models
RUN_BIG_TESTS=false
@ -28,6 +30,15 @@ if [ "${1:-}" = "huge" ]; then
echo "Include BIG and HUGE models..."
fi
USE_VIDEO=false
if [ "${1:-}" = "video" ]; then
USE_VIDEO=true
echo "Using video as input..."
# behavior of USE_VIDEO:
# do NOT check if the output contains "new york", only verify if the exit code is 0
# when printing the result, print the OK/FAIL line then print the generated text
fi
# Check if the second argument is "flash", then enable flash attention
# This is useful to test if flash attention off works correctly
FLASH_ATTN="on"
@ -50,13 +61,20 @@ add_test_vision() {
if [ $# -gt 0 ]; then
extra_args=$(printf " %q" "$@")
fi
if [ "$USE_VIDEO" = true ]; then
arr_file+=("test-3.mp4")
else
arr_file+=("test-1.jpeg")
fi
arr_prefix+=("[vision]")
arr_hf+=("$hf")
arr_extra_args+=("$extra_args")
arr_file+=("test-1.jpeg")
}
add_test_audio() {
if [ "$USE_VIDEO" = true ]; then
return 0
fi
local hf=$1
shift
local extra_args=""
@ -166,19 +184,35 @@ for i in "${!arr_hf[@]}"; do
cmd+=" -p \"what is the publisher name of the newspaper?\""
fi
output=$(eval "$cmd" 2>&1 | tee /dev/tty)
exit_code=0
output=$(eval "$cmd" 2>&1 | tee /dev/tty) || exit_code=$?
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
# either contains "new york" or both "men" and "walk"
if echo "$output" | grep -iq "new york" \
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
then
result="$prefix \033[32mOK\033[0m: $hf"
if [ "$USE_VIDEO" = true ]; then
# for video, only check exit code; do not grep for "new york"
if [ $exit_code -eq 0 ]; then
result="$prefix \033[32mOK\033[0m: $hf"
else
result="$prefix \033[31mFAIL\033[0m: $hf"
fi
# append generated text (after the response marker)
generated_text=$(echo "$output" | sed "1,/${MTMD_TEST_RESPONSE_MARKER}/d" | tail -10)
if [ -n "$generated_text" ]; then
result+="\n$generated_text"
fi
echo -e "$result"
else
result="$prefix \033[31mFAIL\033[0m: $hf"
# either contains "new york" or both "men" and "walk"
if echo "$output" | grep -iq "new york" \
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
then
result="$prefix \033[32mOK\033[0m: $hf"
else
result="$prefix \033[31mFAIL\033[0m: $hf"
fi
echo -e "$result"
fi
echo -e "$result"
arr_res+=("$result")
echo ""