diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 393e085f71..794cb4d2b2 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -4,6 +4,7 @@ #include "gguf.h" #include "clip.h" +#include #include #include #include @@ -429,10 +430,68 @@ static projector_type clip_projector_type_from_string(const std::string & str) { // RGB uint8 image struct clip_image_u8 { - int nx; - int ny; + clip_image_size get_size() const { + return { nx, ny }; + } + void set_size(clip_image_size size, bool is_placeholder) { + nx = size.width; + ny = size.height; + if (is_placeholder) { + buf.clear(); + } else { + buf.resize((size_t) nx * (size_t) ny * 3); + } + } + + void cpy_buf(const std::vector & new_buf) { + buf = new_buf; + } + + const std::vector & get_ro_buf() const { + if (is_placeholder()) { + throw std::runtime_error("this clip_image_u8 is a placeholder"); + } + return buf; + } + + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation + + bool is_placeholder() const { + return buf.empty(); + } + + std::array get_pixel(int x, int y) const { + if (is_placeholder()) { + // return a dummy value, so that legacy code can still process image without errors + return { 0, 0, 0 }; + } + int idx = (y * nx + x) * 3; + return { buf[idx], buf[idx + 1], buf[idx + 2] }; + } + + void set_pixel(int x, int y, const std::array & rgb) { + if (is_placeholder()) { + return; // no-op + } + int idx = (y * nx + x) * 3; + buf[idx] = rgb[0]; + buf[idx + 1] = rgb[1]; + buf[idx + 2] = rgb[2]; + } + + size_t n_pixels() const { + return (size_t) nx * (size_t) ny; + } + + size_t n_elements() const { + return n_pixels() * 3; + } + + private: std::vector buf; + int nx = 0; + int ny = 0; }; // For images, buf.size() == nx*ny*3 @@ -440,15 +499,87 @@ struct clip_image_u8 { // For audio, only one channel is used, buf.size() == nx*ny // nx will be n_frames and ny will be n_mel struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; - // marks the global view in e.g., DeepSeek-OCR Models bool add_viewsep = false; - // whether a learned newline token should be appended after the image (eg Granite4 Vision) + // whether a learned newline (or EOI) token should be appended after the image (eg Granite4 Vision) bool add_newline = false; + + clip_image_size get_size() const { + return { nx_, ny_ }; + } + + int nx() const { return nx_; } + int ny() const { return ny_; } + + void set_size(clip_image_size size, bool is_placeholder, bool is_audio) { + nx_ = size.width; + ny_ = size.height; + if (is_placeholder) { + buf.clear(); + } else { + if (is_audio) { + buf.resize((size_t) nx_ * (size_t) ny_); + } else { + buf.resize((size_t) nx_ * (size_t) ny_ * 3); + } + } + } + + void cpy_buf(const std::vector & new_buf) { + buf = new_buf; + } + + void from_u8(const clip_image_u8 & img) { + auto size = img.get_size(); + nx_ = size.width; + ny_ = size.height; + if (img.is_placeholder()) { + buf.clear(); + return; // no-op + } + buf.resize(img.n_elements()); + const auto & u8_buf = img.get_ro_buf(); + for (size_t i = 0; i < img.n_elements(); ++i) { + buf[i] = (float) u8_buf[i] / 255.0f; + } + } + + size_t n_pixels() const { + return (size_t) nx_ * (size_t) ny_; + } + + size_t n_elements() const { + return n_pixels() * 3; + } + + void normalize(const float mean[3], const float std[3]) { + if (is_placeholder()) { + return; // no-op + } + for (size_t i = 0; i < n_pixels(); ++i) { + buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0]; + buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1]; + buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2]; + } + } + + const std::vector & get_ro_buf() const { + if (is_placeholder()) { + throw std::runtime_error("this clip_image_f32 is a placeholder"); + } + return buf; + } + + // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern + + bool is_placeholder() const { + return buf.empty(); + } + + private: + std::vector buf; + int nx_ = 0; + int ny_ = 0; }; // diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index c12c910a1c..6e54524da0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -39,12 +39,14 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s } // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + const auto ppm_size = img.get_size(); + file << "P6\n" << ppm_size.width << " " << ppm_size.height << "\n255\n"; // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { + const auto & ppm_buf = img.get_ro_buf(); + for (size_t i = 0; i < ppm_buf.size(); i += 3) { // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); + file.write(reinterpret_cast(&ppm_buf[i]), 3); } file.close(); @@ -57,9 +59,10 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& return; } - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + const auto bmp_size = img.get_size(); + int fileSize = 54 + 3 * bmp_size.width * bmp_size.height; // File header + info header + pixel data int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; + int widthInBytes = bmp_size.width * bytesPerPixel; int paddingAmount = (4 - (widthInBytes % 4)) % 4; int stride = widthInBytes + paddingAmount; @@ -72,7 +75,7 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& }; // Total file size - fileSize = 54 + (stride * img.ny); + fileSize = 54 + (stride * bmp_size.height); fileHeader[2] = (unsigned char)(fileSize); fileHeader[3] = (unsigned char)(fileSize >> 8); fileHeader[4] = (unsigned char)(fileSize >> 16); @@ -94,14 +97,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& }; // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); + infoHeader[4] = (unsigned char)(bmp_size.width); + infoHeader[5] = (unsigned char)(bmp_size.width >> 8); + infoHeader[6] = (unsigned char)(bmp_size.width >> 16); + infoHeader[7] = (unsigned char)(bmp_size.width >> 24); + infoHeader[8] = (unsigned char)(bmp_size.height); + infoHeader[9] = (unsigned char)(bmp_size.height >> 8); + infoHeader[10] = (unsigned char)(bmp_size.height >> 16); + infoHeader[11] = (unsigned char)(bmp_size.height >> 24); // Write file headers file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); @@ -109,14 +112,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& // Pixel data std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { + for (int y = bmp_size.height - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < bmp_size.width; ++x) { // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; + const auto px = img.get_pixel(x, y); unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] + px[2], // BMP stores pixels in BGR format + px[1], + px[0] }; file.write(reinterpret_cast(pixel), 3); } @@ -129,12 +132,13 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& // debug function to convert f32 to u8 static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + dst.set_size(src.get_size(), false); + const auto & src_buf = src.get_ro_buf(); + std::vector dst_buf(src.n_elements()); + for (size_t i = 0; i < src.n_elements(); ++i) { + dst_buf[i] = static_cast(std::min(std::max(int(src_buf[i] * 255.0f), 0), 255)); } + dst.cpy_buf(dst_buf); } #endif @@ -241,8 +245,8 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : proj_type(ctx->proj_type()), img(img), patch_size(hparams.patch_size), - n_patches_x(img.nx / patch_size), - n_patches_y(img.ny / patch_size), + n_patches_x(img.nx() / patch_size), + n_patches_y(img.ny() / patch_size), n_patches(n_patches_x * n_patches_y), n_embd(hparams.n_embd), n_head(hparams.n_head), @@ -278,8 +282,8 @@ void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { // siglip2 naflex ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; + const int height = img.ny() / patch_size; + const int width = img.nx() / patch_size; const uint32_t mode = interpolation_mode; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); @@ -523,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() { } ggml_tensor * clip_graph::build_inp_raw(int channels) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); return inp_raw; @@ -816,8 +820,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale GGML_ASSERT(scale_factor > 1); const int n_embd = cur->ne[0]; - int width = img.nx / patch_size; - int height = img.ny / patch_size; + int width = img.nx() / patch_size; + int height = img.ny() / patch_size; // pad width and height to factor const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; @@ -2805,13 +2809,12 @@ struct clip_model_loader { clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { - img->nx = hparams.warmup_image_size; - img->ny = hparams.warmup_image_size; - LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny); + const int sz = hparams.warmup_image_size; + img->set_size({sz, sz}, false, false); + LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz); } else { - img->nx = hparams.warmup_audio_size; - img->ny = hparams.n_mel_bins; - LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx); + img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false); + LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size); } batch.entries.push_back(std::move(img)); warmup(ctx_clip, batch); @@ -3108,12 +3111,6 @@ struct clip_image_f32_batch * clip_image_f32_batch_init() { return new clip_image_f32_batch(); } -unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { - if (nx) *nx = img->nx; - if (ny) *ny = img->ny; - return img->buf.data(); -} - void clip_image_size_free(struct clip_image_size * load_image_size) { if (load_image_size == nullptr) { return; @@ -3134,7 +3131,7 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id LOG_ERR("%s: invalid index %d\n", __func__, idx); return 0; } - return batch->entries[idx]->nx; + return batch->entries[idx]->nx(); } size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { @@ -3142,7 +3139,7 @@ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int id LOG_ERR("%s: invalid index %d\n", __func__, idx); return 0; } - return batch->entries[idx]->ny; + return batch->entries[idx]->ny(); } clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { @@ -3153,13 +3150,6 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc return batch->entries[idx].get(); } -void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { - img->nx = nx; - img->ny = ny; - img->buf.resize(3 * nx * ny); - memcpy(img->buf.data(), rgb_pixels, img->buf.size()); -} - void clip_free(clip_ctx * ctx) { if (ctx == nullptr) { return; @@ -3167,20 +3157,6 @@ void clip_free(clip_ctx * ctx) { delete ctx; } -// deprecated -size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - const int32_t nx = ctx->model.hparams.image_size; - const int32_t ny = ctx->model.hparams.image_size; - return clip_embd_nbytes_by_img(ctx, nx, ny); -} - -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { - clip_image_f32 img; - img.nx = img_w; - img.ny = img_h; - return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); -} - int32_t clip_get_image_size(const struct clip_ctx * ctx) { return ctx->model.hparams.image_size; } @@ -3211,9 +3187,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: - return (img->nx / params.patch_size) / 2; + return (img->nx() / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: - return img->nx / (params.patch_size * params.n_merge); + return img->nx() / (params.patch_size * params.n_merge); default: break; } @@ -3233,9 +3209,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: - return (img->ny / params.patch_size) / 2; + return (img->ny() / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: - return img->ny / (params.patch_size * params.n_merge); + return img->ny() / (params.patch_size * params.n_merge); default: break; } @@ -3247,7 +3223,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // for models with fixed size image, the input image is already pre-processed and resized to square int patch_size = params.patch_size; - int n_patches = (img->nx / patch_size) * (img->ny / patch_size); + int n_patches = (img->nx() / patch_size) * (img->ny() / patch_size); projector_type proj = ctx->proj_type(); @@ -3313,14 +3289,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_YOUTUVL: { // dynamic size (2 conv, so double patch size) - int x_patch = img->nx / (params.patch_size * 2); - int y_patch = img->ny / (params.patch_size * 2); + int x_patch = img->nx() / (params.patch_size * 2); + int y_patch = img->ny() / (params.patch_size * 2); n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_STEP3VL: { - int x_patch = img->nx / (params.patch_size * params.n_merge); - int y_patch = img->ny / (params.patch_size * params.n_merge); + int x_patch = img->nx() / (params.patch_size * params.n_merge); + int y_patch = img->ny() / (params.patch_size * params.n_merge); n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_GEMMA3: @@ -3347,8 +3323,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // dynamic size int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; - int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; - int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; + int x_patch = CLIP_ALIGN(img->nx(), out_patch_size) / out_patch_size; + int y_patch = CLIP_ALIGN(img->ny(), out_patch_size) / out_patch_size; n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_PADDLEOCR: @@ -3364,8 +3340,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // dynamic size int n_merge = ctx->model.hparams.n_merge; - int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); - int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1); if (ctx->model.token_embd_img_break) { n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } else { @@ -3378,7 +3354,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_MERALION: case PROJECTOR_TYPE_MUSIC_FLAMINGO: { - n_patches = img->nx; + n_patches = img->nx(); const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; if (ctx->model.audio_has_stack_frames()) { @@ -3400,11 +3376,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk const int chunk_size = 100; const int tokens_per_chunk = 13; - n_patches = (img->nx / chunk_size) * tokens_per_chunk; + n_patches = (img->nx() / chunk_size) * tokens_per_chunk; } break; case PROJECTOR_TYPE_GLMA: { - n_patches = img->nx; + n_patches = img->nx(); // whisper downscales input token by half after conv1d n_patches /= 2; // reshape by merge_factor @@ -3431,8 +3407,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; - int ow = (img->nx / patch_size) / merge; - int oh = (img->ny / patch_size) / merge; + int ow = (img->nx() / patch_size) / merge; + int oh = (img->ny() / patch_size) / merge; n_patches = (ow + 1) * oh + 2; } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: @@ -3446,13 +3422,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_LFM2A: { - n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; + n_patches = ((((img->nx() + 1) / 2) + 1) / 2 + 1) / 2; } break; case PROJECTOR_TYPE_GEMMA4A: { // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2 // O = floor((I - 1) / 2) + 1 - int n = img->nx; + int n = img->nx(); for (int i = 0; i < 2; i++) { n = (n - 1) / 2 + 1; } @@ -3460,13 +3436,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_GEMMA4UA: { - n_patches = img->nx; // no downsampling: one token per raw waveform frame + n_patches = img->nx(); // no downsampling: one token per raw waveform frame } break; case PROJECTOR_TYPE_GRANITE_SPEECH: { const int ws = ctx->model.hparams.audio_proj_window_size; const int ds = ctx->model.hparams.audio_proj_downsample_rate; - n_patches = ((img->nx + ws - 1) / ws) * (ws / ds); + n_patches = ((img->nx() + ws - 1) / ws) * (ws / ds); } break; case PROJECTOR_TYPE_GRANITE4_VISION: { @@ -3475,7 +3451,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144. const int window_side = ctx->model.hparams.downsample_window_side; const int query_side = ctx->model.hparams.downsample_query_side; - const int side = img->nx / params.patch_size; + const int side = img->nx() / params.patch_size; const int n = side / window_side; n_patches = (query_side * n) * (query_side * n); if (img->add_newline) { @@ -3525,8 +3501,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const auto & model = ctx->model; const auto & hparams = model.hparams; - const int image_size_width = imgs.entries[0]->nx; - const int image_size_height = imgs.entries[0]->ny; + const int image_size_width = imgs.entries[0]->nx(); + const int image_size_height = imgs.entries[0]->ny(); const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); @@ -3546,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return inp; }; - auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + auto set_input_f32 = [&get_inp_tensor](const char * name, const std::vector & values) { ggml_tensor * cur = get_inp_tensor(name); GGML_ASSERT(cur->type == GGML_TYPE_F32); GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); @@ -3564,7 +3540,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (!imgs.is_audio) { size_t nelem = 0; for (const auto & img : imgs.entries) { - nelem += img->nx * img->ny * 3; + nelem += img->nx() * img->ny() * 3; } std::vector inp_raw(nelem); @@ -3580,19 +3556,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // ──────┘ x B for (size_t i = 0; i < imgs.entries.size(); i++) { - const int nx = imgs.entries[i]->nx; - const int ny = imgs.entries[i]->ny; + const int nx = imgs.entries[i]->nx(); + const int ny = imgs.entries[i]->ny(); const int n = nx * ny; for (int b = 0; b < batch_size; b++) { + const auto & buf = imgs.entries[b]->get_ro_buf(); float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { size_t base_src = 3*(y * nx + x); // idx of the first channel size_t base_dst = y * nx + x; // idx of the first channel - batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; - batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; - batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + batch_entry[ base_dst] = buf[base_src ]; + batch_entry[1*n + base_dst] = buf[base_src + 1]; + batch_entry[2*n + base_dst] = buf[base_src + 2]; } } } @@ -3602,12 +3579,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } else { // audio input GGML_ASSERT(imgs.entries.size() == 1); + const auto & mel_inp = imgs.entries[0]; - const int n_step = mel_inp->nx; - const int n_mel = mel_inp->ny; - std::vector inp_raw(n_step * n_mel); - std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); - set_input_f32("inp_raw", inp_raw); + const auto & buf = mel_inp->get_ro_buf(); + const int n_step = mel_inp->nx(); + const int n_mel = mel_inp->ny(); + GGML_ASSERT((size_t)n_step * n_mel == buf.size()); + + set_input_f32("inp_raw", buf); } // set input per projector @@ -4218,7 +4197,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ASSERT(imgs.entries.size() == 1); const auto & img0 = imgs.entries.front(); // Compute n_pos matching SSCP output: two stride-2 convs - int n_pos = img0->nx; + int n_pos = img0->nx(); for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; } // Chunked local attention: blocked causal mask and RPE @@ -4324,7 +4303,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // reshapes as ggml_get_rows gathers. The names are set // by g4v_gather() in models/granite4-vision.cpp. const int patch_size = model.hparams.patch_size; - const int image_side = imgs.entries.front()->nx / patch_size; + const int image_side = imgs.entries.front()->nx() / patch_size; const int window_side = hparams.downsample_window_side; const int query_side = hparams.downsample_query_side; const int n = image_side / window_side; @@ -4570,19 +4549,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { - clip_image_f32 clip_img; - clip_img.buf.resize(h * w * 3); - for (int i = 0; i < h*w*3; i++) - { - clip_img.buf[i] = img[i]; - } - clip_img.nx = w; - clip_img.ny = h; - clip_image_encode(ctx, n_threads, &clip_img, vec); - return true; -} - // // API used internally with mtmd // @@ -4591,17 +4557,6 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) { return ctx->proj_type(); } -void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) { - clip_image_f32 * audio = new clip_image_f32; - audio->nx = n_frames; - audio->ny = n_mel; - audio->buf.resize(n_frames * n_mel); - std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float)); - - batch->entries.push_back(clip_image_f32_ptr(audio)); - batch->is_audio = true; -} - const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index a62c9d6187..ba5b619770 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -17,6 +17,9 @@ struct clip_ctx; struct clip_image_size { int width; int height; + bool operator==(const clip_image_size & other) const { + return width == other.width && height == other.height; + } }; struct clip_image_f32; @@ -54,9 +57,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params void clip_free(struct clip_ctx * ctx); -size_t clip_embd_nbytes(const struct clip_ctx * ctx); -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); - int32_t clip_get_image_size (const struct clip_ctx * ctx); int32_t clip_get_patch_size (const struct clip_ctx * ctx); int32_t clip_get_hidden_size(const struct clip_ctx * ctx); @@ -79,9 +79,6 @@ struct clip_image_u8 * clip_image_u8_init (void); struct clip_image_f32 * clip_image_f32_init(void); struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava -// nx, ny are the output image dimensions -unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny); - void clip_image_size_free (struct clip_image_size * img_size); void clip_image_u8_free (struct clip_image_u8 * img); void clip_image_f32_free(struct clip_image_f32 * img); @@ -94,12 +91,6 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data -/** - * Build image from pixels decoded by other libraries instead of stb_image.h for better performance. - * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes - */ -void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); - bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); @@ -107,11 +98,6 @@ bool clip_is_llava(const struct clip_ctx * ctx); // note for contributor: this clip_is_(model) pattern is deprecated // do NOT add new functions like this -bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); - -// use by audio input -void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel); - bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); diff --git a/tools/mtmd/models/conformer.cpp b/tools/mtmd/models/conformer.cpp index f58c5048f5..5f2c7b9731 100644 --- a/tools/mtmd/models/conformer.cpp +++ b/tools/mtmd/models/conformer.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_conformer::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int n_pos = n_frames / 2; const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1; GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); diff --git a/tools/mtmd/models/exaone4_5.cpp b/tools/mtmd/models/exaone4_5.cpp index 7bfbaca996..bd9e8c7488 100644 --- a/tools/mtmd/models/exaone4_5.cpp +++ b/tools/mtmd/models/exaone4_5.cpp @@ -22,8 +22,8 @@ ggml_cgraph * clip_graph_exaone4_5::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); { ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 623d2e384b..0e1d596b41 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_glm4v::build() { ggml_set_name(positions, "positions"); ggml_set_input(positions); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); // second conv dimension { diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp index 5e66f75d0a..0bd4d75ac5 100644 --- a/tools/mtmd/models/granite-speech.cpp +++ b/tools/mtmd/models/granite-speech.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_granite_speech::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int context_size = hparams.audio_chunk_size; const int ctc_layer = n_layer / 2; const int conv_kernel = hparams.audio_conv_kernel_size; diff --git a/tools/mtmd/models/kimik25.cpp b/tools/mtmd/models/kimik25.cpp index cf9f27f63a..cb345f0fc6 100644 --- a/tools/mtmd/models/kimik25.cpp +++ b/tools/mtmd/models/kimik25.cpp @@ -7,8 +7,8 @@ // with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3). ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) { ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; + const int height = img.ny() / patch_size; + const int width = img.nx() / patch_size; const uint32_t mode = interpolation_mode; GGML_ASSERT(pos_embd); diff --git a/tools/mtmd/models/mimovl.cpp b/tools/mtmd/models/mimovl.cpp index 19db88f132..6ff1124a02 100644 --- a/tools/mtmd/models/mimovl.cpp +++ b/tools/mtmd/models/mimovl.cpp @@ -56,8 +56,8 @@ ggml_cgraph * clip_graph_mimovl::build() { patch_size, patch_size, 0, 0, 1, 1); inp = ggml_add(ctx0, inp, inp_1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b] inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp index ebf1075737..b196587373 100644 --- a/tools/mtmd/models/qwen2vl.cpp +++ b/tools/mtmd/models/qwen2vl.cpp @@ -19,8 +19,8 @@ ggml_cgraph * clip_graph_qwen2vl::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); // second conv dimension { diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index fa1100dda8..9968933ed6 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_qwen3vl::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); + GGML_ASSERT(img.nx() % (patch_size * 2) == 0); + GGML_ASSERT(img.ny() % (patch_size * 2) == 0); // second conv dimension { diff --git a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp index 2a82ae50bf..49d5dd5add 100644 --- a/tools/mtmd/models/whisper-enc.cpp +++ b/tools/mtmd/models/whisper-enc.cpp @@ -1,7 +1,7 @@ #include "models.h" ggml_cgraph * clip_graph_whisper_enc::build() { - const int n_frames = img.nx; + const int n_frames = img.nx(); const int n_pos = n_frames / 2; GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index d6e551618e..bd7f9871c3 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -166,7 +166,7 @@ struct mtmd_cli_context { } bool load_media(const std::string & fname) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false)); if (!bmp.ptr) { return false; } diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 4094074163..94ad01511e 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -478,7 +478,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int } // namespace audio_helpers -mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) { if (audio_helpers::is_audio_file((const char *)buf, len)) { std::vector pcmf32; const int sample_rate = mtmd_get_audio_sample_rate(ctx); @@ -490,7 +490,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne LOG_ERR("Unable to read WAV audio file from buffer\n"); return nullptr; } - return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); + return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data()); } // otherwise, we assume it's an image @@ -502,13 +502,13 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne LOG_ERR("%s: failed to decode image bytes\n", __func__); return nullptr; } - result = mtmd_bitmap_init(nx, ny, data); + result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data); stbi_image_free(data); } return result; } -mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) { std::vector buf; FILE * f = fopen(fname, "rb"); if (!f) { @@ -533,5 +533,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * return nullptr; } - return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); + return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder); } + diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 57da78a754..7eecbb0672 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -29,7 +29,7 @@ MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_da // it calls mtmd_helper_bitmap_init_from_buf() internally // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder); // helper function to construct a mtmd_bitmap from a buffer containing a file // supported formats: @@ -38,7 +38,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con // note: audio files will be auto-detected based on magic bytes // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder); // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index caf72d5362..c86a065c81 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -9,25 +9,12 @@ // void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - // TODO @ngxson : seems like this could be done more efficiently on cgraph - for (size_t i = 0; i < src.buf.size(); ++i) { - int c = i % 3; // rgb - dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; - } + dst.from_u8(src); + dst.normalize(mean, std); } void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(src.buf.size()); - - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(src.buf[i]); - } + dst.from_u8(src); } // set of tools to manipulate images @@ -40,13 +27,16 @@ struct img_tool { resize_algo algo, pad_style padding = PAD_CEIL, std::array pad_color = {0, 0, 0}) { - dst.nx = target_resolution.width; - dst.ny = target_resolution.height; - dst.buf.resize(3 * dst.nx * dst.ny); + dst.set_size(target_resolution, src.is_placeholder()); - if (dst.nx == src.nx && dst.ny == src.ny) { + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + if (dst.get_size() == src.get_size()) { // no resize needed, simple copy - dst.buf = src.buf; + dst.cpy_buf(src.get_ro_buf()); return; } @@ -68,17 +58,17 @@ struct img_tool { } else { // resize with padding clip_image_u8 resized_image; - float scale_w = static_cast(target_resolution.width) / src.nx; - float scale_h = static_cast(target_resolution.height) / src.ny; + float scale_w = static_cast(target_resolution.width) / src.get_size().width; + float scale_h = static_cast(target_resolution.height) / src.get_size().height; float scale = std::min(scale_w, scale_h); int new_width, new_height; if (padding == PAD_NEAREST) { - new_width = std::min(static_cast(std::round(src.nx * scale)), target_resolution.width); - new_height = std::min(static_cast(std::round(src.ny * scale)), target_resolution.height); + new_width = std::min(static_cast(std::round(src.get_size().width * scale)), target_resolution.width); + new_height = std::min(static_cast(std::round(src.get_size().height * scale)), target_resolution.height); } else { - new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); - new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + new_width = std::min(static_cast(std::ceil(src.get_size().width * scale)), target_resolution.width); + new_height = std::min(static_cast(std::ceil(src.get_size().height * scale)), target_resolution.height); } switch (algo) { @@ -112,18 +102,17 @@ struct img_tool { static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0); - GGML_ASSERT(x + w <= image.nx && y + h <= image.ny); - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); + GGML_ASSERT(x + w <= image.get_size().width && y + h <= image.get_size().height); + dst.set_size({w, h}, image.is_placeholder()); + + if (image.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + dst.set_pixel(j, i, image.get_pixel(x + j, y + i)); } } } @@ -181,81 +170,101 @@ struct img_tool { // draw src image into dst image at offset (offset_x, offset_y) static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { - for (int y = 0; y < src.ny; ++y) { - for (int x = 0; x < src.nx; ++x) { + if (src.is_placeholder()) { + // no-op for placeholder image + return; + } + + const auto src_size = src.get_size(); + const auto dst_size = dst.get_size(); + for (int y = 0; y < src_size.height; ++y) { + for (int x = 0; x < src_size.width; ++x) { int dx = x + offset_x; int dy = y + offset_y; // skip pixels that would be out of bounds in the destination - if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { + if (dx < 0 || dy < 0 || dx >= dst_size.width || dy >= dst_size.height) { continue; } - size_t dst_idx = 3 * (static_cast(dy) * dst.nx + static_cast(dx)); - size_t src_idx = 3 * (static_cast(y) * src.nx + static_cast(x)); - dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; - dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; + dst.set_pixel(dx, dy, src.get_pixel(x, y)); } } } // fill the image with a solid color static void fill(clip_image_u8 & img, const std::array & color) { - for (size_t i = 0; i < img.buf.size(); i += 3) { - img.buf[i] = color[0]; - img.buf[i + 1] = color[1]; - img.buf[i + 2] = color[2]; + if (img.is_placeholder()) { + // no-op for placeholder image + return; + } + + const auto size = img.get_size(); + for (int y = 0; y < size.height; ++y) { + for (int x = 0; x < size.width; ++x) { + img.set_pixel(x, y, color); + } } } private: // Bilinear resize function static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { - if (src.nx == 0 || src.ny == 0) { dst.nx = dst.ny = 0; dst.buf.clear(); return; } + const auto src_size = src.get_size(); + if (src_size.width == 0 || src_size.height == 0) { dst.set_size({0, 0}, false); return; } if (target_width <= 0) target_width = 1; if (target_height <= 0) target_height = 1; - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + dst.set_size({target_width, target_height}, false); - float x_ratio = target_width > 1 ? static_cast(src.nx - 1) / (target_width - 1) : 0.0f; - float y_ratio = target_height > 1 ? static_cast(src.ny - 1) / (target_height - 1) : 0.0f; + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + float x_ratio = target_width > 1 ? static_cast(src_size.width - 1) / (target_width - 1) : 0.0f; + float y_ratio = target_height > 1 ? static_cast(src_size.height - 1) / (target_height - 1) : 0.0f; for (int y = 0; y < target_height; ++y) { for (int x = 0; x < target_width; ++x) { float px = x * x_ratio; float py = y * y_ratio; - int x0 = std::min(static_cast(px), src.nx - 1); - int y0 = std::min(static_cast(py), src.ny - 1); - int x1 = std::min(x0 + 1, src.nx - 1); - int y1 = std::min(y0 + 1, src.ny - 1); + int x0 = std::min(static_cast(px), src_size.width - 1); + int y0 = std::min(static_cast(py), src_size.height - 1); + int x1 = std::min(x0 + 1, src_size.width - 1); + int y1 = std::min(y0 + 1, src_size.height - 1); float xf = px - x0; float yf = py - y0; + const auto p00 = src.get_pixel(x0, y0); + const auto p10 = src.get_pixel(x1, y0); + const auto p01 = src.get_pixel(x0, y1); + const auto p11 = src.get_pixel(x1, y1); + + std::array pixel; for (int c = 0; c < 3; ++c) { - float top = lerp(static_cast(src.buf[3 * (y0 * src.nx + x0) + c]), - static_cast(src.buf[3 * (y0 * src.nx + x1) + c]), - xf); - float bottom = lerp(static_cast(src.buf[3 * (y1 * src.nx + x0) + c]), - static_cast(src.buf[3 * (y1 * src.nx + x1) + c]), - xf); - dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, yf)); + float top = lerp(static_cast(p00[c]), static_cast(p10[c]), xf); + float bottom = lerp(static_cast(p01[c]), static_cast(p11[c]), xf); + pixel[c] = static_cast(lerp(top, bottom, yf)); } + dst.set_pixel(x, y, pixel); } } } // Bicubic resize function // part of image will be cropped if the aspect ratio is different - static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { - const int nx = img.nx; - const int ny = img.ny; + static void resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + const auto img_size = img.get_size(); + const int nx = img_size.width; + const int ny = img_size.height; - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + dst.set_size({target_width, target_height}, false); + + if (img.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } float Cc; float C[5] = {}; @@ -280,12 +289,13 @@ private: dx = tx * j - x; dy = ty * i - y; + std::array pixel; for (k = 0; k < 3; k++) { for (jj = 0; jj <= 3; jj++) { - d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d0 = img.get_pixel(clip(x - 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + d2 = img.get_pixel(clip(x + 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + d3 = img.get_pixel(clip(x + 2, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; + a0 = img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k]; a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; @@ -303,13 +313,12 @@ private: Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); - dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + pixel[k] = Cc2; } } + dst.set_pixel(j, i, pixel); } } - - return true; } // Bicubic resize function using Pillow's ImagingResample algorithm @@ -455,16 +464,17 @@ private: }; // Horizontal resampling pass - // Resizes width from imIn.nx to imOut.nx, preserving height + // Resizes width from imIn to out_nx, preserving height auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int out_nx, int ksize, const std::vector & bounds, const std::vector & weights) { - imOut.ny = imIn.ny; - imOut.buf.resize(3 * imOut.nx * imOut.ny); + const int in_ny = imIn.get_size().height; + imOut.set_size({out_nx, in_ny}, false); // Process each row independently - for (int yy = 0; yy < imOut.ny; yy++) { + for (int yy = 0; yy < in_ny; yy++) { // For each output pixel in this row - for (int xx = 0; xx < imOut.nx; xx++) { + for (int xx = 0; xx < out_nx; xx++) { // Get the range of input pixels and filter coefficients int xmin = bounds[xx * 2 + 0]; // First input pixel index int xcnt = bounds[xx * 2 + 1]; // Number of input pixels @@ -476,36 +486,36 @@ private: // Convolve: sum weighted input pixels for (int x = 0; x < xcnt; x++) { - int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel - ss1 += static_cast(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel - ss2 += static_cast(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel + const auto src_px = imIn.get_pixel(x + xmin, yy); + ss0 += src_px[0] * weights[xx * ksize + x]; // R channel + ss1 += src_px[1] * weights[xx * ksize + x]; // G channel + ss2 += src_px[2] * weights[xx * ksize + x]; // B channel } // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS), + clip8(ss1 >> PRECISION_BITS), + clip8(ss2 >> PRECISION_BITS)}); } } }; // Vertical resampling pass - // Resizes height from imIn.ny to imOut.ny, preserving width + // Resizes height from imIn to out_ny, preserving width auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int out_ny, int ksize, const std::vector & bounds, const std::vector & weight) { - imOut.nx = imIn.nx; - imOut.buf.resize(3 * imOut.nx * imOut.ny); + const int in_nx = imIn.get_size().width; + imOut.set_size({in_nx, out_ny}, false); // For each output row - for (int yy = 0; yy < imOut.ny; yy++) { + for (int yy = 0; yy < out_ny; yy++) { // Get the range of input rows and filter coefficients int ymin = bounds[yy * 2 + 0]; // First input row index int ycnt = bounds[yy * 2 + 1]; // Number of input rows // Process each column in this output row - for (int xx = 0; xx < imOut.nx; xx++) { + for (int xx = 0; xx < in_nx; xx++) { // Initialize accumulators for RGB channels with rounding bias int32_t ss0 = 1 << (PRECISION_BITS - 1); int32_t ss1 = 1 << (PRECISION_BITS - 1); @@ -513,27 +523,23 @@ private: // Convolve: sum weighted input pixels vertically for (int y = 0; y < ycnt; y++) { - int src_idx = ((y + ymin) * imIn.nx + xx) * 3; - ss0 += static_cast(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel - ss1 += static_cast(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel - ss2 += static_cast(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel + const auto src_px = imIn.get_pixel(xx, y + ymin); + ss0 += src_px[0] * weight[yy * ksize + y]; // R channel + ss1 += src_px[1] * weight[yy * ksize + y]; // G channel + ss2 += src_px[2] * weight[yy * ksize + y]; // B channel } // Convert back from fixed-point and clamp to [0,255] - int dst_idx = (yy * imOut.nx + xx) * 3; - imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); - imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); - imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS), + clip8(ss1 >> PRECISION_BITS), + clip8(ss2 >> PRECISION_BITS)}); } } }; // Main resampling logic using separable two-pass approach - const int src_width = img.nx; - const int src_height = img.ny; - - dst.nx = target_width; - dst.ny = target_height; + const int src_width = img.get_size().width; + const int src_height = img.get_size().height; bool need_horizontal = (target_width != src_width); bool need_vertical = (target_height != src_height); @@ -555,18 +561,20 @@ private: if (need_horizontal && need_vertical) { // Both horizontal and vertical clip_image_u8 temp; - temp.nx = target_width; - resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz); - resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert); + resample_horizontal(img, temp, target_width, ksize_horiz, bounds_horiz, weights_horiz); + resample_vertical(temp, dst, target_height, ksize_vert, bounds_vert, weights_vert); } else if (need_horizontal) { // Only horizontal - resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz); + resample_horizontal(img, dst, target_width, ksize_horiz, bounds_horiz, weights_horiz); } else if (need_vertical) { // Only vertical - resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert); + resample_vertical(img, dst, target_height, ksize_vert, bounds_vert, weights_vert); } else { // No resizing needed - direct copy - dst.buf = img.buf; + dst.set_size(img.get_size(), img.is_placeholder()); + if (!img.is_placeholder()) { + dst.cpy_buf(img.get_ro_buf()); + } } return true; @@ -588,7 +596,7 @@ private: // bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); std::vector imgs = slice_image(img, inst); @@ -883,7 +891,7 @@ bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, c bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0); clip_image_u8 resized_image; - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); // the original pixtral model doesn't have n_merge const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge; const clip_image_size target_size = img_tool::calc_size_preserved_ratio( @@ -908,7 +916,7 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(hparams.image_longest_edge > 0); clip_image_u8 resized_image; - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); // the original pixtral model doesn't have n_merge const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge; const clip_image_size target_size = img_tool::calc_size_preserved_ratio( @@ -1040,7 +1048,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli // multiples of image_size (always rounding up) // // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( original_size, hparams.image_size, hparams.image_longest_edge); // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", @@ -1088,7 +1096,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { GGML_ASSERT(!hparams.image_res_candidates.empty()); - const clip_image_size original_size{img.nx, img.ny}; + const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); std::vector imgs = slice_image(img, inst, false); @@ -1108,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; // TODO: support 512 (tiny) and 640 (small) once we have eval data for them - const int64_t orig_area = static_cast(img.nx) * img.ny; + const int64_t orig_area = static_cast(img.n_pixels()); size_t mode_i = 0; int64_t min_diff = std::numeric_limits::max(); @@ -1201,10 +1209,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, // emit 768x768 local tiles when the image is larger than a tile in either // dimension, then always a 1024x1024 global view. order: [tiles..., global]. - if (img.nx > tile_size || img.ny > tile_size) { - const float aspect_ratio = static_cast(img.nx) / img.ny; + const auto img_size = img.get_size(); + if (img_size.width > tile_size || img_size.height > tile_size) { + const float aspect_ratio = static_cast(img_size.width) / img_size.height; const auto target_ratios = get_target_ratios(); - const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny); + const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height); // stretch onto the grid (no aspect preserve), then crop tiles row-major. clip_image_u8 refined; @@ -1247,50 +1256,57 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32( int target_height, const float mean[3], const float std[3]) { - if (src.nx == target_width && src.ny == target_height) { + const auto src_size = src.get_size(); + if (src_size.width == target_width && src_size.height == target_height) { img_u8_to_f32(src, dst, mean, std); return; } - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + dst.set_size({target_width, target_height}, false, false); - const float scale_x = static_cast(src.nx) / target_width; - const float scale_y = static_cast(src.ny) / target_height; + if (src.is_placeholder()) { + // no-op for placeholder image, just set the size and return + return; + } + + const float scale_x = static_cast(src_size.width) / target_width; + const float scale_y = static_cast(src_size.height) / target_height; + + std::vector local_buf(3 * target_width * target_height); for (int y = 0; y < target_height; ++y) { const float src_y = (static_cast(y) + 0.5f) * scale_y - 0.5f; const int y0_floor = static_cast(std::floor(src_y)); - const int y0 = std::max(0, std::min(y0_floor, src.ny - 1)); - const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1)); + const int y0 = std::max(0, std::min(y0_floor, src_size.height - 1)); + const int y1 = std::max(0, std::min(y0_floor + 1, src_size.height - 1)); const float ly = src_y - y0_floor; for (int x = 0; x < target_width; ++x) { const float src_x = (static_cast(x) + 0.5f) * scale_x - 0.5f; const int x0_floor = static_cast(std::floor(src_x)); - const int x0 = std::max(0, std::min(x0_floor, src.nx - 1)); - const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1)); + const int x0 = std::max(0, std::min(x0_floor, src_size.width - 1)); + const int x1 = std::max(0, std::min(x0_floor + 1, src_size.width - 1)); const float lx = src_x - x0_floor; - const size_t idx00 = 3 * (y0 * src.nx + x0); - const size_t idx01 = 3 * (y0 * src.nx + x1); - const size_t idx10 = 3 * (y1 * src.nx + x0); - const size_t idx11 = 3 * (y1 * src.nx + x1); - const size_t idx_dst = 3 * (y * target_width + x); + const auto p00 = src.get_pixel(x0, y0); + const auto p01 = src.get_pixel(x1, y0); + const auto p10 = src.get_pixel(x0, y1); + const auto p11 = src.get_pixel(x1, y1); + const size_t idx_dst = 3 * (y * target_width + x); for (int c = 0; c < 3; ++c) { - const float v00 = (static_cast(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c]; - const float v01 = (static_cast(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c]; - const float v10 = (static_cast(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c]; - const float v11 = (static_cast(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c]; + const float v00 = (static_cast(p00[c]) / 255.0f - mean[c]) / std[c]; + const float v01 = (static_cast(p01[c]) / 255.0f - mean[c]) / std[c]; + const float v10 = (static_cast(p10[c]) / 255.0f - mean[c]) / std[c]; + const float v11 = (static_cast(p11[c]) / 255.0f - mean[c]) / std[c]; const float top = v00 + (v01 - v00) * lx; const float bot = v10 + (v11 - v10) * lx; - dst.buf[idx_dst + c] = top + (bot - top) * ly; + local_buf[idx_dst + c] = top + (bot - top) * ly; } } } + dst.cpy_buf(local_buf); } int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) { @@ -1341,26 +1357,26 @@ std::vector mtmd_image_preprocessor_step3vl::calc_grid(int length, int wind clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) { clip_image_u8 resized = img; - const float aspect_ratio = img.ny > 0 ? static_cast(img.nx) / img.ny : 1.0f; - if (std::min(img.nx, img.ny) < 32 && + const auto img_size = img.get_size(); + const float aspect_ratio = img_size.height > 0 ? static_cast(img_size.width) / img_size.height : 1.0f; + if (std::min(img_size.width, img_size.height) < 32 && (aspect_ratio > wide_aspect_ratio_limit || aspect_ratio < 1.0f / wide_aspect_ratio_limit)) { - const int square_size = std::max(img.nx, img.ny); + const int square_size = std::max(img_size.width, img_size.height); clip_image_u8 padded; - padded.nx = square_size; - padded.ny = square_size; - padded.buf.resize(3 * square_size * square_size); + padded.set_size({square_size, square_size}, false); img_tool::fill(padded, {0, 0, 0}); img_tool::composite(padded, img, 0, 0); resized = std::move(padded); } const int max_image_size = get_image_longest_edge(params); - if (std::max(resized.nx, resized.ny) > max_image_size) { - const float scale = static_cast(max_image_size) / std::max(resized.nx, resized.ny); + const auto resized_size = resized.get_size(); + if (std::max(resized_size.width, resized_size.height) > max_image_size) { + const float scale = static_cast(max_image_size) / std::max(resized_size.width, resized_size.height); const clip_image_size new_size = { - std::max(1, static_cast(std::floor(resized.nx * scale))), - std::max(1, static_cast(std::floor(resized.ny * scale))), + std::max(1, static_cast(std::floor(resized_size.width * scale))), + std::max(1, static_cast(std::floor(resized_size.height * scale))), }; clip_image_u8 scaled; img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE); @@ -1372,14 +1388,14 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) { clip_image_u8 dst; - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h, 0); + dst.set_size({w, h}, false); + img_tool::fill(dst, {0, 0, 0}); + const auto img_size = image.get_size(); const int src_x0 = std::max(0, x); const int src_y0 = std::max(0, y); - const int src_x1 = std::min(image.nx, x + w); - const int src_y1 = std::min(image.ny, y + h); + const int src_x1 = std::min(img_size.width, x + w); + const int src_y1 = std::min(img_size.height, y + h); if (src_x0 >= src_x1 || src_y0 >= src_y1) { return dst; @@ -1390,11 +1406,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const cli for (int yy = 0; yy < src_y1 - src_y0; ++yy) { for (int xx = 0; xx < src_x1 - src_x0; ++xx) { - const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx)); - const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx)); - dst.buf[dst_idx + 0] = image.buf[src_idx + 0]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + dst.set_pixel(dst_x0 + xx, dst_y0 + yy, image.get_pixel(src_x0 + xx, src_y0 + yy)); } } @@ -1443,7 +1455,7 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { clip_image_u8 prepared = prepare_image(img, hparams); - const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny}); + const auto instructions = build_slice_instructions(hparams, prepared.get_size()); clip_image_f32_ptr overview_f32(clip_image_f32_init()); img_u8_resize_bilinear_to_f32( @@ -1462,7 +1474,8 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip } clip_image_u8 img_for_crop = prepared; - if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) { + const auto prepared_size = prepared.get_size(); + if (instructions.refined_size.width != prepared_size.width || instructions.refined_size.height != prepared_size.height) { clip_image_u8 refined; img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE); img_for_crop = std::move(refined); @@ -1503,9 +1516,10 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip hparams.image_max_pixels / (patch_size * patch_size) : 256; // Linear search for optimal scale to fit within max_num_patches + const auto img_size = img.get_size(); float scale = 1.0f; - int target_height = img.ny; - int target_width = img.nx; + int target_height = img_size.height; + int target_width = img_size.width; auto get_scaled_image_size = [align_size](float scale, int size) -> int { float scaled_size = size * scale; @@ -1517,8 +1531,8 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip // Linear search with 0.02 step size while (scale > 0.0f) { - target_height = get_scaled_image_size(scale, img.ny); - target_width = get_scaled_image_size(scale, img.nx); + target_height = get_scaled_image_size(scale, img_size.height); + target_width = get_scaled_image_size(scale, img_size.width); int num_patches_h = target_height / patch_size; int num_patches_w = target_width / patch_size; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 260f307560..e1f8e2a335 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -26,12 +26,46 @@ // represents raw image data, layout is RGBRGBRGB... // length of data must be nx * ny * 3 +// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ... +// length of data must be nx * sizeof(float) struct mtmd_bitmap { - uint32_t nx; - uint32_t ny; - std::vector data; + uint32_t nx = 0; + uint32_t ny = 0; std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking bool is_audio = false; // true if the bitmap is audio + + mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny) + : nx(nx), ny(ny) { + if (data) { + size_t data_size = (size_t)nx * ny * 3; + this->data.resize(data_size); + std::memcpy(this->data.data(), data, data_size); + } + } + + mtmd_bitmap(const unsigned char * data, uint32_t n_samples) + : nx(n_samples), ny(1), is_audio(true) { + if (data) { + size_t data_size = (size_t)nx * sizeof(float); + this->data.resize(data_size); + std::memcpy(this->data.data(), data, data_size); + } + } + + const std::vector & get_ro_buf() const { + return data; + } + + bool is_placeholder() const { + return data.empty(); + } + + size_t n_bytes() const { + return data.size(); + } + + private: + std::vector data; }; // position indexing for decoder model @@ -42,8 +76,8 @@ enum mtmd_pos_type { }; struct mtmd_image_tokens { - uint32_t nx; // number of tokens in x direction - uint32_t ny; // number of tokens in y direction + uint32_t nx = 0; // number of tokens in x direction + uint32_t ny = 0; // number of tokens in y direction mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL) uint32_t n_tokens() const { @@ -56,6 +90,16 @@ struct mtmd_image_tokens { clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking + // true if one of entries in batch_f32 is a placeholder + bool is_placeholder() const { + for (const auto & entry : batch_f32.entries) { + if (entry->is_placeholder()) { + return true; + } + } + return false; + } + mtmd_image_tokens clone() { return mtmd_image_tokens{ nx, @@ -70,10 +114,20 @@ struct mtmd_image_tokens { using mtmd_image_tokens_ptr = std::unique_ptr; struct mtmd_audio_tokens { - uint32_t n_tokens; // number of tokens + uint32_t n_tokens = 0; // number of tokens clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking + // true if one of entries in batch_f32 is a placeholder + bool is_placeholder() const { + for (const auto & entry : batch_f32.entries) { + if (entry->is_placeholder()) { + return true; + } + } + return false; + } + mtmd_audio_tokens clone() { return mtmd_audio_tokens{ n_tokens, @@ -795,16 +849,19 @@ struct mtmd_tokenizer { } // sanity check - GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0); - GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3); + if (bitmap->nx <= 0 || bitmap->ny <= 0) { + LOG_ERR("%s: error: invalid bitmap dimensions: nx = %d, ny = %d\n", + __func__, bitmap->nx, bitmap->ny); + return 2; + } GGML_ASSERT(ctx->image_preproc != nullptr); // convert mtmd_bitmap to clip_image_u8 clip_image_u8_ptr img_u8(clip_image_u8_init()); - img_u8->nx = bitmap->nx; - img_u8->ny = bitmap->ny; - img_u8->buf.resize(bitmap->data.size()); - std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3); + img_u8->set_size( + {(int)bitmap->nx, (int)bitmap->ny}, + bitmap->is_placeholder()); + img_u8->cpy_buf(bitmap->get_ro_buf()); // preprocess image clip_image_f32_batch batch_f32; @@ -949,7 +1006,7 @@ struct mtmd_tokenizer { return 2; } - if (bitmap->data.size() == 0) { + if (bitmap->nx == 0) { LOG_ERR("%s: error: empty audio data\n", __func__); return 2; } @@ -960,26 +1017,46 @@ struct mtmd_tokenizer { // sanity check GGML_ASSERT(ctx->audio_preproc != nullptr); - GGML_ASSERT(bitmap->data.size() > sizeof(float)); - GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0); // preprocess audio std::vector mel_spec_chunks; - const float * samples = (const float *)bitmap->data.data(); - size_t n_samples = bitmap->data.size() / sizeof(float); - bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks); - if (!ok) { - LOG_ERR("Unable to preprocess audio\n"); - return 2; + { + std::vector dummy; + const float * samples = nullptr; + size_t n_samples = 0; + if (bitmap->is_placeholder()) { + // TODO @ngxson : skip underlay processing if bitmap is placeholder + GGML_ASSERT(bitmap->ny == 1); + + dummy.resize(bitmap->nx); + samples = dummy.data(); + n_samples = dummy.size(); + } else { + const auto & buf = bitmap->get_ro_buf(); + GGML_ASSERT(buf.size() > sizeof(float)); + GGML_ASSERT(buf.size() % sizeof(float) == 0); + + samples = (const float *)buf.data(); + n_samples = buf.size() / sizeof(float); + } + bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks); + if (!ok) { + LOG_ERR("Unable to preprocess audio\n"); + return 2; + } } // consider each mel_spec as a separate audio chunk // TODO: maybe support batching, but this may come with memory cost for (auto & mel_spec : mel_spec_chunks) { + const bool is_placeholder = mel_spec.data.empty(); + clip_image_f32_ptr mel_f32(clip_image_f32_init()); - mel_f32->nx = mel_spec.n_len; - mel_f32->ny = mel_spec.n_mel; - mel_f32->buf = std::move(mel_spec.data); + mel_f32->set_size( + {mel_spec.n_len, mel_spec.n_mel}, + is_placeholder, /* is_audio */ true); + mel_f32->cpy_buf(mel_spec.data); + size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get()); clip_image_f32_batch batch_f32; @@ -1098,12 +1175,28 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: model does not support vision input\n", __func__); return 1; } + if (chunk->tokens_image == nullptr) { + LOG_ERR("%s: image tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_image->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } return mtmd_encode(ctx, chunk->tokens_image.get()); } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { if (!ctx->ctx_a) { LOG_ERR("%s: model does not support audio input\n", __func__); return 1; } + if (chunk->tokens_audio == nullptr) { + LOG_ERR("%s: audio tokens are null\n", __func__); + return 1; + } + if (chunk->tokens_audio->is_placeholder()) { + LOG_ERR("%s: audio tokens batch is placeholder\n", __func__); + return 1; + } int n_mmproj_embd = ctx->n_embd_text; ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( @@ -1141,6 +1234,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view size_t offset = 0; for (size_t i = 0; i < entries.size(); i++) { + if (entries[i]->is_placeholder()) { + LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i); + return 1; + } int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); ok = clip_image_encode( ctx_clip, @@ -1150,6 +1247,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } } else { + if (image_tokens->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } ok = clip_image_batch_encode( ctx_clip, ctx->n_threads, @@ -1207,24 +1308,17 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) { mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data) { - mtmd_bitmap * bitmap = new mtmd_bitmap; - bitmap->nx = nx; - bitmap->ny = ny; - size_t data_size = (size_t)nx * ny * 3; - bitmap->data.resize(data_size); - std::memcpy(bitmap->data.data(), data, data_size); + mtmd_bitmap * bitmap = new mtmd_bitmap(data, nx, ny); return bitmap; } mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data) { - mtmd_bitmap * bitmap = new mtmd_bitmap; - bitmap->nx = n_samples; - bitmap->ny = 1; - bitmap->is_audio = true; - size_t data_size = n_samples * sizeof(float); - bitmap->data.resize(data_size); - std::memcpy(bitmap->data.data(), data, data_size); + mtmd_bitmap * bitmap = new mtmd_bitmap((const unsigned char *)data, n_samples); + GGML_ASSERT(bitmap->is_audio); + if (!bitmap->is_placeholder()) { + GGML_ASSERT(bitmap->get_ro_buf().size() == n_samples * sizeof(float)); + } return bitmap; } @@ -1237,11 +1331,11 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) { } const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) { - return bitmap->data.data(); + return bitmap->get_ro_buf().data(); } size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) { - return bitmap->data.size(); + return bitmap->get_ro_buf().size(); } bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { @@ -1535,14 +1629,16 @@ void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector img_buf; + img_buf.reserve(img_sz * img_sz); for (const auto & row : image) { - inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end()); + img_buf.insert(img_buf.end(), row.begin(), row.end()); } - LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny); + clip_image_f32 inp_image; + inp_image.set_size({img_sz, img_sz}, false, false); + inp_image.cpy_buf(img_buf); + LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, img_sz, img_sz); mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image); } @@ -1552,16 +1648,17 @@ void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector & inpu return; } int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins; - clip_image_f32 inp_audio; - inp_audio.nx = input.size(); - inp_audio.ny = n_mel; - inp_audio.buf.resize(input.size() * n_mel); - for (size_t i = 0; i < input.size(); i++) { + const int audio_nx = (int)input.size(); + std::vector audio_buf(audio_nx * n_mel); + for (int i = 0; i < audio_nx; i++) { for (int j = 0; j < n_mel; j++) { - inp_audio.buf[j * inp_audio.nx + i] = input[i]; + audio_buf[j * audio_nx + i] = input[i]; } } - LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny); + clip_image_f32 inp_audio; + inp_audio.set_size({audio_nx, n_mel}, false, true); + inp_audio.cpy_buf(audio_buf); + LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, audio_nx, n_mel); mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio); } @@ -1571,9 +1668,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector return; } clip_image_u8 img_u8; - img_u8.nx = nx; - img_u8.ny = ny; - img_u8.buf = rgb_values; + img_u8.set_size({nx, ny}, false); + img_u8.cpy_buf(rgb_values); clip_image_f32_batch batch_f32; GGML_ASSERT(ctx->image_preproc != nullptr); bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32); @@ -1583,7 +1679,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector } LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size()); for (size_t i = 0; i < batch_f32.entries.size(); i++) { - LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny); + LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny()); // TODO: better way to dump entry content? } } diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 5d518df799..b3154c8d55 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -136,6 +136,11 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx); // if bitmap is audio: // length of data must be n_samples * sizeof(float) // the data is in float format (PCM F32) +// +// if data == nullptr: +// the bitmap is considered "empty", and will be treated as a placeholder for counting tokens +// you can pass the bitmap via mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens +// note: passing a placeholder bitmap to mtmd_encode() will return an error MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); diff --git a/tools/server/README.md b/tools/server/README.md index 3e14f5e6a2..bf056dc60b 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1447,6 +1447,36 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r }' ``` +### POST `/v1/responses/input_tokens`: Token Counting + +Similar to [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count). + +Example response: + +```json +{ + "object": "response.input_tokens", + "input_tokens": 11 +} +``` + +### POST `/v1/chat/completions/input_tokens`: Token Counting + +Similar to [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count), but accepts a chat completion body as input. + +Note: This is not an official OAI endpoint, but is added for completeness and convenience. + +Example response: + +```json +{ + "object": "response.input_tokens", + "input_tokens": 11 +} +``` + +## Anthropic-compatible API Endpoints + ### POST `/v1/messages`: Anthropic-compatible Messages API Given a list of `messages`, returns the assistant's response. Streaming is supported via Server-Sent Events. While no strong claims of compatibility with the Anthropic API spec are made, in our experience it suffices to support many apps. diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 4c3f16a0a3..dfd286d24e 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -713,10 +713,10 @@ static std::string fnv_hash(const uint8_t * data, size_t len) { return std::to_string(hash); } -server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) { +server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector & files, bool is_placeholder) { mtmd::bitmaps bitmaps; for (auto & file : files) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder)); if (!bmp.ptr) { throw std::runtime_error("Failed to load image or audio file"); } diff --git a/tools/server/server-common.h b/tools/server/server-common.h index c28558d8b7..51b1613178 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -258,7 +258,8 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, size_t validate_utf8(const std::string& text); // process mtmd prompt, return the server_tokens containing both text tokens and media chunks -server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files); +// if is_placeholder is true, the media chunk will be treated as placeholder for counting tokens; the output tokens are not usable for actual inference (e.g. for submitting a task to server_queue) +server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector & files, bool is_placeholder = false); /** * break the input "prompt" object into multiple prompt if needed, then tokenize them diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ab0d594476..5d546d09c2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -4333,6 +4333,10 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_CHAT); }; + this->post_chat_completions_tok = [this](const server_http_req & req) { + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_CHAT); + }; + this->post_control = [this](const server_http_req & req) { auto res = create_response(); const json body = json::parse(req.body); @@ -4388,6 +4392,10 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_RESP); }; + this->post_responses_tok_oai = [this](const server_http_req & req) { + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_RESP); + }; + this->post_transcriptions_oai = [this](const server_http_req & req) { auto res = create_response(); @@ -4435,20 +4443,7 @@ void server_routes::init_routes() { }; this->post_anthropic_count_tokens = [this](const server_http_req & req) { - auto res = create_response(); - std::vector files; - json body = server_chat_convert_anthropic_to_oai(json::parse(req.body)); - SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions"); - SRV_DBG("converted request: %s\n", body.dump().c_str()); - json body_parsed = oaicompat_chat_params_parse( - body, - meta->chat_params, - files); - - json prompt = body_parsed.at("prompt"); - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true); - res->ok({{"input_tokens", static_cast(tokens.size())}}); - return res; + return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_ANTHROPIC); }; // same with handle_chat_completions, but without inference part @@ -4928,3 +4923,54 @@ std::unique_ptr server_routes::handle_embeddings_impl(cons res->ok(root); return res; } + +std::unique_ptr server_routes::handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type) { + auto res = create_response(); + std::vector files; + json body = json::parse(req.body); + bool is_oai = false; + + switch (res_type) { + case TASK_RESPONSE_TYPE_OAI_CHAT: + { + is_oai = true; + } break; + case TASK_RESPONSE_TYPE_OAI_RESP: + { + is_oai = true; + body = server_chat_convert_responses_to_chatcmpl(body); + } break; + case TASK_RESPONSE_TYPE_ANTHROPIC: + { + body = server_chat_convert_anthropic_to_oai(body); + } break; + default: + res->error(format_error_response("invalid res_type", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + json body_parsed = oaicompat_chat_params_parse( + body, + meta->chat_params, + files); + json prompt = body_parsed.at("prompt"); + // SRV_DBG("prompt = %s\n", prompt.dump().c_str()); + + // TODO @ngxson : refactor this code block, move this to server-common and reuse it in other places + size_t n_tokens; + if (mctx != nullptr) { + if (!prompt.is_string()) { + throw std::runtime_error("for mtmd, input prompt must be a string."); + } + n_tokens = process_mtmd_prompt(mctx, prompt.get(), files, true).size(); + } else { + n_tokens = tokenize_mixed(vocab, prompt, true, true).size(); + } + + json response = {{"input_tokens", static_cast(n_tokens)}}; + if (is_oai) { + response["object"] = "response.input_tokens"; + } + res->ok(response); + return res; +} diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 73caff54a4..72a1f40e01 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -110,8 +110,10 @@ struct server_routes { server_http_context::handler_t post_completions; server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_chat_completions_tok; server_http_context::handler_t post_control; server_http_context::handler_t post_responses_oai; + server_http_context::handler_t post_responses_tok_oai; server_http_context::handler_t post_transcriptions_oai; server_http_context::handler_t post_anthropic_messages; server_http_context::handler_t post_anthropic_count_tokens; @@ -139,6 +141,7 @@ private: std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot); std::unique_ptr handle_slots_erase(const server_http_req &, int id_slot); std::unique_ptr handle_embeddings_impl(const server_http_req & req, task_response_type res_type); + std::unique_ptr handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type); // using unique_ptr to allow late initialization of const std::unique_ptr meta; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 769e80a802..a6ea749d0c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -161,6 +161,8 @@ int llama_server(int argc, char ** argv) { routes.post_tokenize = models_routes->proxy_post; routes.post_detokenize = models_routes->proxy_post; routes.post_apply_template = models_routes->proxy_post; + routes.post_chat_completions_tok = models_routes->proxy_post; + routes.post_responses_tok_oai = models_routes->proxy_post; routes.get_lora_adapters = models_routes->proxy_get; routes.post_lora_adapters = models_routes->proxy_post; routes.get_slots = models_routes->proxy_get; @@ -192,7 +194,6 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); ctx_http.post("/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); ctx_http.post("/v1/messages", ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API - ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting ctx_http.post("/infill", ex_wrapper(routes.post_infill)); ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); @@ -204,6 +205,12 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); + // token counting + ctx_http.post("/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok)); + ctx_http.post("/v1/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok)); + ctx_http.post("/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai)); + ctx_http.post("/v1/responses/input_tokens", ex_wrapper(routes.post_responses_tok_oai)); + ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting // LoRA adapters hotswap ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index f80e46133c..fe55dc5ab1 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -573,3 +573,19 @@ def test_chat_completions_multiple_choices(): for choice in res.body["choices"]: assert "assistant" == choice["message"]["role"] assert choice["finish_reason"] == "length" + + +def test_chat_completions_token_count(): + global server + server.start() + # make sure cache can be reused across multiple choices and multiple requests + # ref: https://github.com/ggml-org/llama.cpp/pull/18663 + for _ in range(2): + res = server.make_request("POST", "/chat/completions/input_tokens", data={ + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + }) + assert res.status_code == 200 + assert res.body["input_tokens"] > 5 diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py index fb77084c89..d74cc3a43e 100644 --- a/tools/server/tests/unit/test_vision_api.py +++ b/tools/server/tests/unit/test_vision_api.py @@ -98,6 +98,25 @@ def test_vision_chat_completion(prompt, image_url, success, re_content): assert res.status_code != 200 +def test_vision_chat_completion_token_count(): + global server + server.start() + res = server.make_request("POST", "/chat/completions/input_tokens", data={ + "temperature": 0.0, + "top_k": 1, + "messages": [ + {"role": "user", "content": [ + {"type": "text", "text": "What is this:"}, + {"type": "image_url", "image_url": { + "url": get_img_url("IMG_URL_0"), + }}, + ]}, + ], + }) + assert res.status_code == 200 + assert res.body["input_tokens"] > 10 + + @pytest.mark.parametrize( "prompt, image_data, success, re_content", [