diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 393e085f71..794cb4d2b2 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -4,6 +4,7 @@
 #include "gguf.h"
 #include "clip.h"
 
+#include <array>
 #include <climits>
 #include <cstdarg>
 #include <cinttypes>
@@ -429,10 +430,68 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
 
 // RGB uint8 image
 struct clip_image_u8 {
-    int nx;
-    int ny;
+    clip_image_size get_size() const { // dimensions as { width, height }
+        return { nx, ny };
+    }
 
+    void set_size(clip_image_size size, bool is_placeholder) { // placeholder: keep the dimensions, drop the pixel storage
+        nx = size.width;
+        ny = size.height;
+        if (is_placeholder) {
+            buf.clear();
+        } else {
+            buf.resize((size_t) nx * (size_t) ny * 3); // 3 interleaved bytes per pixel
+        }
+    }
+
+    void cpy_buf(const std::vector<uint8_t> & new_buf) { // replaces the pixel data; new_buf.size() is not checked against nx*ny*3
+        buf = new_buf;
+    }
+
+    const std::vector<uint8_t> & get_ro_buf() const { // read-only view; throws std::runtime_error on a placeholder
+        if (is_placeholder()) {
+            throw std::runtime_error("this clip_image_u8 is a placeholder");
+        }
+        return buf;
+    }
+
+    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern. always use get_pixel / set_pixel for buffer manipulation
+
+    bool is_placeholder() const { // true whenever no pixel data is stored (this includes a 0x0 image)
+        return buf.empty();
+    }
+
+    std::array<uint8_t, 3> get_pixel(int x, int y) const { // no bounds check: caller keeps 0 <= x < nx and 0 <= y < ny
+        if (is_placeholder()) {
+            // return a dummy value, so that legacy code can still process image without errors
+            return { 0, 0, 0 };
+        }
+        const size_t idx = ((size_t) y * (size_t) nx + (size_t) x) * 3; // size_t math: the int product overflows on very large images
+        return { buf[idx], buf[idx + 1], buf[idx + 2] };
+    }
+
+    void set_pixel(int x, int y, const std::array<uint8_t, 3> & rgb) { // no bounds check, same contract as get_pixel
+        if (is_placeholder()) {
+            return; // no-op
+        }
+        const size_t idx = ((size_t) y * (size_t) nx + (size_t) x) * 3; // size_t math: the int product overflows on very large images
+        buf[idx] = rgb[0];
+        buf[idx + 1] = rgb[1];
+        buf[idx + 2] = rgb[2];
+    }
+
+    size_t n_pixels() const { // nx * ny, computed from the stored dimensions (non-zero for placeholders too)
+        return (size_t) nx * (size_t) ny;
+    }
+
+    size_t n_elements() const { // number of bytes a non-placeholder buffer holds: 3 per pixel
+        return n_pixels() * 3;
+    }
+
+  private:
     std::vector<uint8_t> buf;
+    int nx = 0;
+    int ny = 0;
 };
 
 // For images, buf.size() == nx*ny*3
@@ -440,15 +499,87 @@ struct clip_image_u8 {
 // For audio, only one channel is used, buf.size() == nx*ny
 //     nx will be n_frames and ny will be n_mel
 struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-
     // marks the global view in e.g., DeepSeek-OCR Models
     bool add_viewsep = false;
-    // whether a learned newline token should be appended after the image (eg Granite4 Vision)
+    // whether a learned newline (or EOI) token should be appended after the image (e.g. Granite4 Vision)
     bool add_newline = false;
+
+    clip_image_size get_size() const { // dimensions as { width, height }
+        return { nx_, ny_ };
+    }
+
+    int nx() const { return nx_; } // width (for audio: see the struct comment above, nx is n_frames)
+    int ny() const { return ny_; } // height (for audio: n_mel)
+
+    void set_size(clip_image_size size, bool is_placeholder, bool is_audio) { // is_placeholder takes precedence over is_audio
+        nx_ = size.width;
+        ny_ = size.height;
+        if (is_placeholder) {
+            buf.clear(); // dimensions are kept, no storage is allocated
+        } else {
+            if (is_audio) {
+                buf.resize((size_t) nx_ * (size_t) ny_); // audio: one channel
+            } else {
+                buf.resize((size_t) nx_ * (size_t) ny_ * 3); // image: 3 interleaved channels
+            }
+        }
+    }
+
+    void cpy_buf(const std::vector<float> & new_buf) { // replaces the data; new_buf.size() is not validated against nx_/ny_
+        buf = new_buf;
+    }
+
+    void from_u8(const clip_image_u8 & img) { // copies the dimensions and rescales every byte to value / 255
+        auto size = img.get_size();
+        nx_ = size.width;
+        ny_ = size.height;
+        if (img.is_placeholder()) {
+            buf.clear(); // a placeholder source yields a placeholder result
+            return; // no-op
+        }
+        buf.resize(img.n_elements());
+        const auto & u8_buf = img.get_ro_buf();
+        for (size_t i = 0; i < img.n_elements(); ++i) {
+            buf[i] = (float) u8_buf[i] / 255.0f;
+        }
+    }
+
+    size_t n_pixels() const { // nx_ * ny_, computed from the stored dimensions (non-zero for placeholders too)
+        return (size_t) nx_ * (size_t) ny_;
+    }
+
+    size_t n_elements() const { // NOTE(review): always assumes 3 channels, so it over-counts for audio buffers (nx_*ny_) - confirm callers
+        return n_pixels() * 3;
+    }
+
+    void normalize(const float mean[3], const float std[3]) { // in-place (value - mean[c]) / std[c] per channel c
+        if (is_placeholder()) {
+            return; // no-op
+        }
+        for (size_t i = 0; i < n_pixels(); ++i) { // NOTE(review): indexes 3 floats per pixel, so this must not run on audio buffers
+            buf[i * 3 + 0] = (buf[i * 3 + 0] - mean[0]) / std[0];
+            buf[i * 3 + 1] = (buf[i * 3 + 1] - mean[1]) / std[1];
+            buf[i * 3 + 2] = (buf[i * 3 + 2] - mean[2]) / std[2];
+        }
+    }
+
+    const std::vector<float> & get_ro_buf() const { // read-only view; throws std::runtime_error on a placeholder
+        if (is_placeholder()) {
+            throw std::runtime_error("this clip_image_f32 is a placeholder");
+        }
+        return buf;
+    }
+
+    // note to contributors: NEVER add a get_rw_buf(), it is a DANGEROUS pattern
+
+    bool is_placeholder() const { // true whenever no data is stored (this includes a 0x0 image)
+        return buf.empty();
+    }
+
+  private:
+    std::vector<float> buf;
+    int nx_ = 0;
+    int ny_ = 0;
 };
 
 //
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index c12c910a1c..6e54524da0 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -39,12 +39,14 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
     }
 
     // PPM header: P6 format, width, height, and max color value
-    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
+    const auto ppm_size = img.get_size();
+    file << "P6\n" << ppm_size.width << " " << ppm_size.height << "\n255\n";
 
     // Write pixel data
-    for (size_t i = 0; i < img.buf.size(); i += 3) {
+    const auto & ppm_buf = img.get_ro_buf();
+    for (size_t i = 0; i < ppm_buf.size(); i += 3) {
         // PPM expects binary data in RGB format, which matches our image buffer
-        file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
+        file.write(reinterpret_cast<const char*>(&ppm_buf[i]), 3);
     }
 
     file.close();
@@ -57,9 +59,10 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
         return;
     }
 
-    int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
+    const auto bmp_size = img.get_size();
+    int fileSize = 54 + 3 * bmp_size.width * bmp_size.height; // File header + info header + pixel data
     int bytesPerPixel = 3;
-    int widthInBytes = img.nx * bytesPerPixel;
+    int widthInBytes = bmp_size.width * bytesPerPixel;
     int paddingAmount = (4 - (widthInBytes % 4)) % 4;
     int stride = widthInBytes + paddingAmount;
 
@@ -72,7 +75,7 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
     };
 
     // Total file size
-    fileSize = 54 + (stride * img.ny);
+    fileSize = 54 + (stride * bmp_size.height);
     fileHeader[2] = (unsigned char)(fileSize);
     fileHeader[3] = (unsigned char)(fileSize >> 8);
     fileHeader[4] = (unsigned char)(fileSize >> 16);
@@ -94,14 +97,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
     };
 
     // Width and height in the information header
-    infoHeader[4] = (unsigned char)(img.nx);
-    infoHeader[5] = (unsigned char)(img.nx >> 8);
-    infoHeader[6] = (unsigned char)(img.nx >> 16);
-    infoHeader[7] = (unsigned char)(img.nx >> 24);
-    infoHeader[8] = (unsigned char)(img.ny);
-    infoHeader[9] = (unsigned char)(img.ny >> 8);
-    infoHeader[10] = (unsigned char)(img.ny >> 16);
-    infoHeader[11] = (unsigned char)(img.ny >> 24);
+    infoHeader[4] = (unsigned char)(bmp_size.width);
+    infoHeader[5] = (unsigned char)(bmp_size.width >> 8);
+    infoHeader[6] = (unsigned char)(bmp_size.width >> 16);
+    infoHeader[7] = (unsigned char)(bmp_size.width >> 24);
+    infoHeader[8] = (unsigned char)(bmp_size.height);
+    infoHeader[9] = (unsigned char)(bmp_size.height >> 8);
+    infoHeader[10] = (unsigned char)(bmp_size.height >> 16);
+    infoHeader[11] = (unsigned char)(bmp_size.height >> 24);
 
     // Write file headers
     file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
@@ -109,14 +112,14 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
 
     // Pixel data
     std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
-    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
-        for (int x = 0; x < img.nx; ++x) {
+    for (int y = bmp_size.height - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
+        for (int x = 0; x < bmp_size.width; ++x) {
             // Each pixel
-            size_t pixelIndex = (y * img.nx + x) * 3;
+            const auto px = img.get_pixel(x, y);
             unsigned char pixel[3] = {
-                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
-                img.buf[pixelIndex + 1],
-                img.buf[pixelIndex]
+                px[2], // BMP stores pixels in BGR format
+                px[1],
+                px[0]
             };
             file.write(reinterpret_cast<char*>(pixel), 3);
         }
@@ -129,12 +132,13 @@ static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string&
 
 // debug function to convert f32 to u8
 static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(3 * src.nx * src.ny);
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
+    dst.set_size(src.get_size(), false); // dst takes the dimensions of src
+    const auto & src_buf = src.get_ro_buf(); // throws if src is a placeholder
+    std::vector<uint8_t> dst_buf(src.n_elements()); // zero-initialized, nx*ny*3 bytes
+    for (size_t i = 0; i < std::min(src_buf.size(), dst_buf.size()); ++i) { // never read past src_buf (single-channel buffers hold nx*ny, not nx*ny*3)
+        dst_buf[i] = static_cast<uint8_t>(std::min(std::max(int(src_buf[i] * 255.0f), 0), 255)); // scale by 255 and clamp to 0..255
     }
+    dst.cpy_buf(dst_buf);
 }
 #endif
 
@@ -241,8 +245,8 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
         proj_type(ctx->proj_type()),
         img(img),
         patch_size(hparams.patch_size),
-        n_patches_x(img.nx / patch_size),
-        n_patches_y(img.ny / patch_size),
+        n_patches_x(img.nx() / patch_size),
+        n_patches_y(img.ny() / patch_size),
         n_patches(n_patches_x * n_patches_y),
         n_embd(hparams.n_embd),
         n_head(hparams.n_head),
@@ -278,8 +282,8 @@ void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
 // siglip2 naflex
 ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
     ggml_tensor * pos_embd = model.position_embeddings;
-    const int height       = img.ny / patch_size;
-    const int width        = img.nx / patch_size;
+    const int height       = img.ny() / patch_size;
+    const int width        = img.nx() / patch_size;
     const uint32_t mode    = interpolation_mode;
     const int n_per_side   = (int)std::sqrt(pos_embd->ne[1]);
 
@@ -523,7 +527,7 @@ ggml_tensor * clip_graph::build_inp() {
 }
 
 ggml_tensor * clip_graph::build_inp_raw(int channels) {
-    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
+    ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx(), img.ny(), channels);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
     return inp_raw;
@@ -816,8 +820,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
     GGML_ASSERT(scale_factor > 1);
 
     const int n_embd = cur->ne[0];
-    int width  = img.nx / patch_size;
-    int height = img.ny / patch_size;
+    int width  = img.nx() / patch_size;
+    int height = img.ny() / patch_size;
 
     // pad width and height to factor
     const int64_t pad_width  = CLIP_ALIGN(width,  scale_factor) - width;
@@ -2805,13 +2809,12 @@ struct clip_model_loader {
         clip_image_f32_batch batch;
         clip_image_f32_ptr img(clip_image_f32_init());
         if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
-            img->nx = hparams.warmup_image_size;
-            img->ny = hparams.warmup_image_size;
-            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
+            const int sz = hparams.warmup_image_size; // square warmup image
+            img->set_size({sz, sz}, false, false);
+            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
         } else {
-            img->nx = hparams.warmup_audio_size;
-            img->ny = hparams.n_mel_bins;
-            LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
+            img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, true); // is_audio=true: single channel, buf holds nx*ny floats
+            LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
         }
         batch.entries.push_back(std::move(img));
         warmup(ctx_clip, batch);
@@ -3108,12 +3111,6 @@ struct clip_image_f32_batch * clip_image_f32_batch_init() {
     return new clip_image_f32_batch();
 }
 
-unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
-    if (nx) *nx = img->nx;
-    if (ny) *ny = img->ny;
-    return img->buf.data();
-}
-
 void clip_image_size_free(struct clip_image_size * load_image_size) {
     if (load_image_size == nullptr) {
         return;
@@ -3134,7 +3131,7 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
         LOG_ERR("%s: invalid index %d\n", __func__, idx);
         return 0;
     }
-    return batch->entries[idx]->nx;
+    return batch->entries[idx]->nx();
 }
 
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
@@ -3142,7 +3139,7 @@ size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int id
         LOG_ERR("%s: invalid index %d\n", __func__, idx);
         return 0;
     }
-    return batch->entries[idx]->ny;
+    return batch->entries[idx]->ny();
 }
 
 clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
@@ -3153,13 +3150,6 @@ clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batc
     return batch->entries[idx].get();
 }
 
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
-    img->nx = nx;
-    img->ny = ny;
-    img->buf.resize(3 * nx * ny);
-    memcpy(img->buf.data(), rgb_pixels, img->buf.size());
-}
-
 void clip_free(clip_ctx * ctx) {
     if (ctx == nullptr) {
         return;
@@ -3167,20 +3157,6 @@ void clip_free(clip_ctx * ctx) {
     delete ctx;
 }
 
-// deprecated
-size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    const int32_t nx = ctx->model.hparams.image_size;
-    const int32_t ny = ctx->model.hparams.image_size;
-    return clip_embd_nbytes_by_img(ctx, nx, ny);
-}
-
-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
-    clip_image_f32 img;
-    img.nx = img_w;
-    img.ny = img_h;
-    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
-
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
     return ctx->model.hparams.image_size;
 }
@@ -3211,9 +3187,9 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
-            return (img->nx / params.patch_size) / 2;
+            return (img->nx() / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
-            return img->nx / (params.patch_size * params.n_merge);
+            return img->nx() / (params.patch_size * params.n_merge);
         default:
             break;
     }
@@ -3233,9 +3209,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
-            return (img->ny / params.patch_size) / 2;
+            return (img->ny() / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
-            return img->ny / (params.patch_size * params.n_merge);
+            return img->ny() / (params.patch_size * params.n_merge);
         default:
             break;
     }
@@ -3247,7 +3223,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
 
     // for models with fixed size image, the input image is already pre-processed and resized to square
     int patch_size = params.patch_size;
-    int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
+    int n_patches = (img->nx() / patch_size) * (img->ny() / patch_size);
 
     projector_type proj = ctx->proj_type();
 
@@ -3313,14 +3289,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_YOUTUVL:
             {
                 // dynamic size (2 conv, so double patch size)
-                int x_patch = img->nx / (params.patch_size * 2);
-                int y_patch = img->ny / (params.patch_size * 2);
+                int x_patch = img->nx() / (params.patch_size * 2);
+                int y_patch = img->ny() / (params.patch_size * 2);
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_STEP3VL:
             {
-                int x_patch = img->nx / (params.patch_size * params.n_merge);
-                int y_patch = img->ny / (params.patch_size * params.n_merge);
+                int x_patch = img->nx() / (params.patch_size * params.n_merge);
+                int y_patch = img->ny() / (params.patch_size * params.n_merge);
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_GEMMA3:
@@ -3347,8 +3323,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             {
                 // dynamic size
                 int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
-                int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
-                int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
+                int x_patch = CLIP_ALIGN(img->nx(), out_patch_size) / out_patch_size;
+                int y_patch = CLIP_ALIGN(img->ny(), out_patch_size) / out_patch_size;
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_PADDLEOCR:
@@ -3364,8 +3340,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             {
                 // dynamic size
                 int n_merge = ctx->model.hparams.n_merge;
-                int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
-                int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
+                int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1);
+                int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1);
                 if (ctx->model.token_embd_img_break) {
                     n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
                 } else {
@@ -3378,7 +3354,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_MERALION:
         case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
-                n_patches = img->nx;
+                n_patches = img->nx();
 
                 const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
                 if (ctx->model.audio_has_stack_frames()) {
@@ -3400,11 +3376,11 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk
                 const int chunk_size       = 100;
                 const int tokens_per_chunk = 13;
-                n_patches = (img->nx / chunk_size) * tokens_per_chunk;
+                n_patches = (img->nx() / chunk_size) * tokens_per_chunk;
             } break;
         case PROJECTOR_TYPE_GLMA:
             {
-                n_patches = img->nx;
+                n_patches = img->nx();
                 // whisper downscales input token by half after conv1d
                 n_patches /= 2;
                 // reshape by merge_factor
@@ -3431,8 +3407,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_HUNYUANVL:
             {
                 int merge = ctx->model.hparams.n_merge;
-                int ow = (img->nx / patch_size) / merge;
-                int oh = (img->ny / patch_size) / merge;
+                int ow = (img->nx() / patch_size) / merge;
+                int oh = (img->ny() / patch_size) / merge;
                 n_patches = (ow + 1) * oh + 2;
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR2:
@@ -3446,13 +3422,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         } break;
         case PROJECTOR_TYPE_LFM2A:
             {
-                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
+                n_patches = ((((img->nx() + 1) / 2) + 1) / 2 + 1) / 2;
             } break;
         case PROJECTOR_TYPE_GEMMA4A:
             {
                 // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2
                 // O = floor((I - 1) / 2) + 1
-                int n = img->nx;
+                int n = img->nx();
                 for (int i = 0; i < 2; i++) {
                     n = (n - 1) / 2 + 1;
                 }
@@ -3460,13 +3436,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_GEMMA4UA:
             {
-                n_patches = img->nx;  // no downsampling: one token per raw waveform frame
+                n_patches = img->nx();  // no downsampling: one token per raw waveform frame
             } break;
         case PROJECTOR_TYPE_GRANITE_SPEECH:
             {
                 const int ws = ctx->model.hparams.audio_proj_window_size;
                 const int ds = ctx->model.hparams.audio_proj_downsample_rate;
-                n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
+                n_patches = ((img->nx() + ws - 1) / ws) * (ws / ds);
             } break;
         case PROJECTOR_TYPE_GRANITE4_VISION:
             {
@@ -3475,7 +3451,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 // For 384×384 input: n = 24/8 = 3, query_side = 4 → 144.
                 const int window_side = ctx->model.hparams.downsample_window_side;
                 const int query_side  = ctx->model.hparams.downsample_query_side;
-                const int side        = img->nx / params.patch_size;
+                const int side        = img->nx() / params.patch_size;
                 const int n           = side / window_side;
                 n_patches             = (query_side * n) * (query_side * n);
                 if (img->add_newline) {
@@ -3525,8 +3501,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & model   = ctx->model;
     const auto & hparams = model.hparams;
 
-    const int image_size_width  = imgs.entries[0]->nx;
-    const int image_size_height = imgs.entries[0]->ny;
+    const int image_size_width  = imgs.entries[0]->nx();
+    const int image_size_height = imgs.entries[0]->ny();
 
     const int patch_size    = hparams.patch_size;
     const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -3546,7 +3522,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return inp;
     };
 
-    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
+    auto set_input_f32 = [&get_inp_tensor](const char * name, const std::vector<float> & values) {
         ggml_tensor * cur = get_inp_tensor(name);
         GGML_ASSERT(cur->type == GGML_TYPE_F32);
         GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
@@ -3564,7 +3540,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (!imgs.is_audio) {
         size_t nelem = 0;
         for (const auto & img : imgs.entries) {
-            nelem += img->nx * img->ny * 3;
+            nelem += img->n_elements(); // nx*ny*3 computed in size_t; the int product could overflow before widening
         }
         std::vector<float> inp_raw(nelem);
 
@@ -3580,19 +3556,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         //   ──────┘ x B
 
         for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx;
-            const int ny = imgs.entries[i]->ny;
+            const int nx = imgs.entries[i]->nx();
+            const int ny = imgs.entries[i]->ny();
             const int n = nx * ny;
 
             for (int b = 0; b < batch_size; b++) {
+                const auto & buf = imgs.entries[b]->get_ro_buf();
                 float * batch_entry = inp_raw.data() + b * (3*n);
                 for (int y = 0; y < ny; y++) {
                     for (int x = 0; x < nx; x++) {
                         size_t base_src = 3*(y * nx + x); // idx of the first channel
                         size_t base_dst =    y * nx + x;  // idx of the first channel
-                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
-                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
-                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
+                        batch_entry[      base_dst] = buf[base_src    ];
+                        batch_entry[1*n + base_dst] = buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = buf[base_src + 2];
                     }
                 }
             }
@@ -3602,12 +3579,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     } else {
         // audio input
         GGML_ASSERT(imgs.entries.size() == 1);
+
         const auto & mel_inp = imgs.entries[0];
-        const int n_step = mel_inp->nx;
-        const int n_mel  = mel_inp->ny;
-        std::vector<float> inp_raw(n_step * n_mel);
-        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
-        set_input_f32("inp_raw", inp_raw);
+        const auto & buf = mel_inp->get_ro_buf();
+        const int n_step = mel_inp->nx();
+        const int n_mel  = mel_inp->ny();
+        GGML_ASSERT((size_t)n_step * n_mel == buf.size());
+
+        set_input_f32("inp_raw", buf);
     }
 
     // set input per projector
@@ -4218,7 +4197,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 GGML_ASSERT(imgs.entries.size() == 1);
                 const auto & img0 = imgs.entries.front();
                 // Compute n_pos matching SSCP output: two stride-2 convs
-                int n_pos = img0->nx;
+                int n_pos = img0->nx();
                 for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; }
 
                 // Chunked local attention: blocked causal mask and RPE
@@ -4324,7 +4303,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 // reshapes as ggml_get_rows gathers. The names are set
                 // by g4v_gather() in models/granite4-vision.cpp.
                 const int patch_size  = model.hparams.patch_size;
-                const int image_side  = imgs.entries.front()->nx / patch_size;
+                const int image_side  = imgs.entries.front()->nx() / patch_size;
                 const int window_side = hparams.downsample_window_side;
                 const int query_side  = hparams.downsample_query_side;
                 const int n           = image_side / window_side;
@@ -4570,19 +4549,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }
 
-bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
-    clip_image_f32 clip_img;
-    clip_img.buf.resize(h * w * 3);
-    for (int i = 0; i < h*w*3; i++)
-    {
-        clip_img.buf[i] = img[i];
-    }
-    clip_img.nx = w;
-    clip_img.ny = h;
-    clip_image_encode(ctx, n_threads, &clip_img, vec);
-    return true;
-}
-
 //
 // API used internally with mtmd
 //
@@ -4591,17 +4557,6 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
     return ctx->proj_type();
 }
 
-void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
-    clip_image_f32 * audio = new clip_image_f32;
-    audio->nx = n_frames;
-    audio->ny = n_mel;
-    audio->buf.resize(n_frames * n_mel);
-    std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
-
-    batch->entries.push_back(clip_image_f32_ptr(audio));
-    batch->is_audio = true;
-}
-
 const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
     return &ctx->model.hparams;
 }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index a62c9d6187..ba5b619770 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -17,6 +17,9 @@ struct clip_ctx;
 struct clip_image_size {
     int width;
     int height;
+    bool operator==(const clip_image_size & other) const { // field-wise equality; NOTE(review): a member function makes this header C++-only - confirm there are no C consumers
+        return width == other.width && height == other.height;
+    }
 };
 
 struct clip_image_f32;
@@ -54,9 +57,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
 
 void clip_free(struct clip_ctx * ctx);
 
-size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
-
 int32_t clip_get_image_size (const struct clip_ctx * ctx);
 int32_t clip_get_patch_size (const struct clip_ctx * ctx);
 int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
@@ -79,9 +79,6 @@ struct clip_image_u8        * clip_image_u8_init (void);
 struct clip_image_f32       * clip_image_f32_init(void);
 struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
 
-// nx, ny are the output image dimensions
-unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
-
 void clip_image_size_free (struct clip_image_size * img_size);
 void clip_image_u8_free (struct clip_image_u8  * img);
 void clip_image_f32_free(struct clip_image_f32 * img);
@@ -94,12 +91,6 @@ size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int id
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
 struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
 
-/**
- * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
- * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
- */
-void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
-
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
@@ -107,11 +98,6 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 // note for contributor: this clip_is_(model) pattern is deprecated
 //                       do NOT add new functions like this
 
-bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
-
-// use by audio input
-void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
-
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
 
diff --git a/tools/mtmd/models/conformer.cpp b/tools/mtmd/models/conformer.cpp
index f58c5048f5..5f2c7b9731 100644
--- a/tools/mtmd/models/conformer.cpp
+++ b/tools/mtmd/models/conformer.cpp
@@ -1,7 +1,7 @@
 #include "models.h"
 
 ggml_cgraph * clip_graph_conformer::build() {
-    const int n_frames   = img.nx;
+    const int n_frames   = img.nx();
     const int n_pos      = n_frames / 2;
     const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
     GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
diff --git a/tools/mtmd/models/exaone4_5.cpp b/tools/mtmd/models/exaone4_5.cpp
index 7bfbaca996..bd9e8c7488 100644
--- a/tools/mtmd/models/exaone4_5.cpp
+++ b/tools/mtmd/models/exaone4_5.cpp
@@ -22,8 +22,8 @@ ggml_cgraph * clip_graph_exaone4_5::build() {
     ggml_tensor * inp_raw = build_inp_raw();
     ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
 
     {
         ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp
index 623d2e384b..0e1d596b41 100644
--- a/tools/mtmd/models/glm4v.cpp
+++ b/tools/mtmd/models/glm4v.cpp
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_glm4v::build() {
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
 
     // second conv dimension
     {
diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp
index 5e66f75d0a..0bd4d75ac5 100644
--- a/tools/mtmd/models/granite-speech.cpp
+++ b/tools/mtmd/models/granite-speech.cpp
@@ -1,7 +1,7 @@
 #include "models.h"
 
 ggml_cgraph * clip_graph_granite_speech::build() {
-    const int n_frames     = img.nx;
+    const int n_frames     = img.nx();
     const int context_size = hparams.audio_chunk_size;
     const int ctc_layer    = n_layer / 2;
     const int conv_kernel  = hparams.audio_conv_kernel_size;
diff --git a/tools/mtmd/models/kimik25.cpp b/tools/mtmd/models/kimik25.cpp
index cf9f27f63a..cb345f0fc6 100644
--- a/tools/mtmd/models/kimik25.cpp
+++ b/tools/mtmd/models/kimik25.cpp
@@ -7,8 +7,8 @@
 // with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
 ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
     ggml_tensor * pos_embd = model.position_embeddings;
-    const int height       = img.ny / patch_size;
-    const int width        = img.nx / patch_size;
+    const int height       = img.ny() / patch_size;
+    const int width        = img.nx() / patch_size;
     const uint32_t mode    = interpolation_mode;
 
     GGML_ASSERT(pos_embd);
diff --git a/tools/mtmd/models/mimovl.cpp b/tools/mtmd/models/mimovl.cpp
index 19db88f132..6ff1124a02 100644
--- a/tools/mtmd/models/mimovl.cpp
+++ b/tools/mtmd/models/mimovl.cpp
@@ -56,8 +56,8 @@ ggml_cgraph * clip_graph_mimovl::build() {
                                            patch_size, patch_size, 0, 0, 1, 1);
         inp = ggml_add(ctx0, inp, inp_1);
 
-        GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-        GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+        GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+        GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
 
         inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w,h,c,b] -> [c,w,h,b]
         inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp
index ebf1075737..b196587373 100644
--- a/tools/mtmd/models/qwen2vl.cpp
+++ b/tools/mtmd/models/qwen2vl.cpp
@@ -19,8 +19,8 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
     ggml_tensor * inp_raw = build_inp_raw();
     ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
 
     // second conv dimension
     {
diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp
index fa1100dda8..9968933ed6 100644
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@@ -16,8 +16,8 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
     ggml_tensor * inp_raw = build_inp_raw();
     ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
-    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+    GGML_ASSERT(img.nx() % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny() % (patch_size * 2) == 0);
 
     // second conv dimension
     {
diff --git a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp
index 2a82ae50bf..49d5dd5add 100644
--- a/tools/mtmd/models/whisper-enc.cpp
+++ b/tools/mtmd/models/whisper-enc.cpp
@@ -1,7 +1,7 @@
 #include "models.h"
 
 ggml_cgraph * clip_graph_whisper_enc::build() {
-    const int n_frames = img.nx;
+    const int n_frames = img.nx();
     const int n_pos    = n_frames / 2;
     GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
 
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index d6e551618e..bd7f9871c3 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -166,7 +166,7 @@ struct mtmd_cli_context {
     }
 
     bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
         if (!bmp.ptr) {
             return false;
         }
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 4094074163..94ad01511e 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -478,7 +478,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int
 
 } // namespace audio_helpers
 
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder) {
     if (audio_helpers::is_audio_file((const char *)buf, len)) {
         std::vector<float> pcmf32;
         const int sample_rate = mtmd_get_audio_sample_rate(ctx);
@@ -490,7 +490,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
             LOG_ERR("Unable to read WAV audio file from buffer\n");
             return nullptr;
         }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+        return mtmd_bitmap_init_from_audio(pcmf32.size(), placeholder ? nullptr : pcmf32.data());
     }
 
     // otherwise, we assume it's an image
@@ -502,13 +502,13 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
             LOG_ERR("%s: failed to decode image bytes\n", __func__);
             return nullptr;
         }
-        result = mtmd_bitmap_init(nx, ny, data);
+        result = mtmd_bitmap_init(nx, ny, placeholder ? nullptr : data);
         stbi_image_free(data);
     }
     return result;
 }
 
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
     std::vector<unsigned char> buf;
     FILE * f = fopen(fname, "rb");
     if (!f) {
@@ -533,5 +533,5 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
         return nullptr;
     }
 
-    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
+    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size(), placeholder);
 }
diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
index 57da78a754..7eecbb0672 100644
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -29,7 +29,8 @@ MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_da
 // it calls mtmd_helper_bitmap_init_from_buf() internally
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+// placeholder is forwarded as-is to mtmd_helper_bitmap_init_from_buf()
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder);
 
 // helper function to construct a mtmd_bitmap from a buffer containing a file
 // supported formats:
@@ -38,7 +39,8 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con
 // note: audio files will be auto-detected based on magic bytes
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+// if placeholder is true, the input is still decoded to obtain its size, but no pixel/audio data is passed to the bitmap
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len, bool placeholder);
 
 // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index caf72d5362..c86a065c81 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -9,25 +9,12 @@
 //
 
 void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(src.buf.size());
-
-    // TODO @ngxson : seems like this could be done more efficiently on cgraph
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        int c = i % 3; // rgb
-        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
-    }
+    dst.from_u8(src);
+    dst.normalize(mean, std);
 }
 
 void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(src.buf.size());
-
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        dst.buf[i] = static_cast<float>(src.buf[i]);
-    }
+    dst.from_u8(src);
 }
 
 // set of tools to manipulate images
@@ -40,13 +27,16 @@ struct img_tool {
             resize_algo algo,
             pad_style padding = PAD_CEIL,
             std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
-        dst.nx = target_resolution.width;
-        dst.ny = target_resolution.height;
-        dst.buf.resize(3 * dst.nx * dst.ny);
+        dst.set_size(target_resolution, src.is_placeholder());
 
-        if (dst.nx == src.nx && dst.ny == src.ny) {
+        if (src.is_placeholder()) {
+            // no-op for placeholder image, just set the size and return
+            return;
+        }
+
+        if (dst.get_size() == src.get_size()) {
             // no resize needed, simple copy
-            dst.buf = src.buf;
+            dst.cpy_buf(src.get_ro_buf());
             return;
         }
 
@@ -68,17 +58,17 @@ struct img_tool {
         } else {
             // resize with padding
             clip_image_u8 resized_image;
-            float scale_w = static_cast<float>(target_resolution.width) / src.nx;
-            float scale_h = static_cast<float>(target_resolution.height) / src.ny;
+            float scale_w = static_cast<float>(target_resolution.width) / src.get_size().width;
+            float scale_h = static_cast<float>(target_resolution.height) / src.get_size().height;
             float scale = std::min(scale_w, scale_h);
 
             int new_width, new_height;
             if (padding == PAD_NEAREST) {
-                new_width  = std::min(static_cast<int>(std::round(src.nx * scale)), target_resolution.width);
-                new_height = std::min(static_cast<int>(std::round(src.ny * scale)), target_resolution.height);
+                new_width  = std::min(static_cast<int>(std::round(src.get_size().width * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::round(src.get_size().height * scale)), target_resolution.height);
             } else {
-                new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
-                new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+                new_width  = std::min(static_cast<int>(std::ceil(src.get_size().width * scale)), target_resolution.width);
+                new_height = std::min(static_cast<int>(std::ceil(src.get_size().height * scale)), target_resolution.height);
             }
 
             switch (algo) {
@@ -112,18 +102,17 @@ struct img_tool {
 
     static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
         GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
-        GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
-        dst.nx = w;
-        dst.ny = h;
-        dst.buf.resize(3 * w * h);
+        GGML_ASSERT(x + w <= image.get_size().width && y + h <= image.get_size().height);
+        dst.set_size({w, h}, image.is_placeholder());
+
+        if (image.is_placeholder()) {
+            // no-op for placeholder image, just set the size and return
+            return;
+        }
 
         for (int i = 0; i < h; ++i) {
             for (int j = 0; j < w; ++j) {
-                int src_idx = 3 * ((y + i)*image.nx + (x + j));
-                int dst_idx = 3 * (i*w + j);
-                dst.buf[dst_idx]     = image.buf[src_idx];
-                dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
-                dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+                dst.set_pixel(j, i, image.get_pixel(x + j, y + i));
             }
         }
     }
@@ -181,81 +170,101 @@ struct img_tool {
 
     // draw src image into dst image at offset (offset_x, offset_y)
     static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
-        for (int y = 0; y < src.ny; ++y) {
-            for (int x = 0; x < src.nx; ++x) {
+        if (src.is_placeholder()) {
+            // no-op for placeholder image
+            return;
+        }
+
+        const auto src_size = src.get_size();
+        const auto dst_size = dst.get_size();
+        for (int y = 0; y < src_size.height; ++y) {
+            for (int x = 0; x < src_size.width; ++x) {
                 int dx = x + offset_x;
                 int dy = y + offset_y;
                 // skip pixels that would be out of bounds in the destination
-                if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
+                if (dx < 0 || dy < 0 || dx >= dst_size.width || dy >= dst_size.height) {
                     continue;
                 }
-                size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
-                size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
-                dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
-                dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
-                dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
+                dst.set_pixel(dx, dy, src.get_pixel(x, y));
             }
         }
     }
 
     // fill the image with a solid color
     static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
-        for (size_t i = 0; i < img.buf.size(); i += 3) {
-            img.buf[i]     = color[0];
-            img.buf[i + 1] = color[1];
-            img.buf[i + 2] = color[2];
+        if (img.is_placeholder()) {
+            // no-op for placeholder image
+            return;
+        }
+
+        const auto size = img.get_size();
+        for (int y = 0; y < size.height; ++y) {
+            for (int x = 0; x < size.width; ++x) {
+                img.set_pixel(x, y, color);
+            }
         }
     }
 
 private:
     // Bilinear resize function
     static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
-        if (src.nx == 0 || src.ny == 0) { dst.nx = dst.ny = 0; dst.buf.clear(); return; }
+        const auto src_size = src.get_size();
+        if (src_size.width == 0 || src_size.height == 0) { dst.set_size({0, 0}, false); return; }
         if (target_width  <= 0) target_width  = 1;
         if (target_height <= 0) target_height = 1;
 
-        dst.nx = target_width;
-        dst.ny = target_height;
-        dst.buf.resize(3 * target_width * target_height);
+        dst.set_size({target_width, target_height}, src.is_placeholder());
 
-        float x_ratio = target_width  > 1 ? static_cast<float>(src.nx - 1) / (target_width  - 1) : 0.0f;
-        float y_ratio = target_height > 1 ? static_cast<float>(src.ny - 1) / (target_height - 1) : 0.0f;
+        if (src.is_placeholder()) {
+            // no-op for placeholder image, just set the size and return
+            return;
+        }
+
+        float x_ratio = target_width  > 1 ? static_cast<float>(src_size.width  - 1) / (target_width  - 1) : 0.0f;
+        float y_ratio = target_height > 1 ? static_cast<float>(src_size.height - 1) / (target_height - 1) : 0.0f;
 
         for (int y = 0; y < target_height; ++y) {
             for (int x = 0; x < target_width; ++x) {
                 float px = x * x_ratio;
                 float py = y * y_ratio;
 
-                int x0 = std::min(static_cast<int>(px), src.nx - 1);
-                int y0 = std::min(static_cast<int>(py), src.ny - 1);
-                int x1 = std::min(x0 + 1, src.nx - 1);
-                int y1 = std::min(y0 + 1, src.ny - 1);
+                int x0 = std::min(static_cast<int>(px), src_size.width  - 1);
+                int y0 = std::min(static_cast<int>(py), src_size.height - 1);
+                int x1 = std::min(x0 + 1, src_size.width  - 1);
+                int y1 = std::min(y0 + 1, src_size.height - 1);
 
                 float xf = px - x0;
                 float yf = py - y0;
 
+                const auto p00 = src.get_pixel(x0, y0);
+                const auto p10 = src.get_pixel(x1, y0);
+                const auto p01 = src.get_pixel(x0, y1);
+                const auto p11 = src.get_pixel(x1, y1);
+
+                std::array<uint8_t, 3> pixel;
                 for (int c = 0; c < 3; ++c) {
-                    float top    = lerp(static_cast<float>(src.buf[3 * (y0 * src.nx + x0) + c]),
-                                        static_cast<float>(src.buf[3 * (y0 * src.nx + x1) + c]),
-                                        xf);
-                    float bottom = lerp(static_cast<float>(src.buf[3 * (y1 * src.nx + x0) + c]),
-                                        static_cast<float>(src.buf[3 * (y1 * src.nx + x1) + c]),
-                                        xf);
-                    dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, yf));
+                    float top    = lerp(static_cast<float>(p00[c]), static_cast<float>(p10[c]), xf);
+                    float bottom = lerp(static_cast<float>(p01[c]), static_cast<float>(p11[c]), xf);
+                    pixel[c] = static_cast<uint8_t>(lerp(top, bottom, yf));
                 }
+                dst.set_pixel(x, y, pixel);
             }
         }
     }
 
     // Bicubic resize function
     // part of image will be cropped if the aspect ratio is different
-    static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
-        const int nx = img.nx;
-        const int ny = img.ny;
+    static void resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+        const auto img_size = img.get_size();
+        const int nx = img_size.width;
+        const int ny = img_size.height;
 
-        dst.nx = target_width;
-        dst.ny = target_height;
-        dst.buf.resize(3 * target_width * target_height);
+        dst.set_size({target_width, target_height}, img.is_placeholder());
+
+        if (img.is_placeholder()) {
+            // no-op for placeholder image, just set the size and return
+            return;
+        }
 
         float Cc;
         float C[5] = {};
@@ -280,12 +289,13 @@ private:
                 dx = tx * j - x;
                 dy = ty * i - y;
 
+                std::array<uint8_t, 3> pixel;
                 for (k = 0; k < 3; k++) {
                     for (jj = 0; jj <= 3; jj++) {
-                        d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d0 = img.get_pixel(clip(x - 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
+                        d2 = img.get_pixel(clip(x + 1, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
+                        d3 = img.get_pixel(clip(x + 2, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k] - img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
+                        a0 = img.get_pixel(clip(x, 0, nx - 1), clip(y - 1 + jj, 0, ny - 1))[k];
 
                         a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                         a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
@@ -303,13 +313,12 @@ private:
                         Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
 
                         const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
-                        dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+                        pixel[k] = Cc2;
                     }
                 }
+                dst.set_pixel(j, i, pixel);
             }
         }
-
-        return true;
     }
 
     // Bicubic resize function using Pillow's ImagingResample algorithm
@@ -455,16 +464,17 @@ private:
         };
 
         // Horizontal resampling pass
-        // Resizes width from imIn.nx to imOut.nx, preserving height
+        // Resizes width from imIn to out_nx, preserving height
         auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
+                                       int out_nx,
                                        int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weights) {
-            imOut.ny = imIn.ny;
-            imOut.buf.resize(3 * imOut.nx * imOut.ny);
+            const int in_ny = imIn.get_size().height;
+            imOut.set_size({out_nx, in_ny}, false);
 
             // Process each row independently
-            for (int yy = 0; yy < imOut.ny; yy++) {
+            for (int yy = 0; yy < in_ny; yy++) {
                 // For each output pixel in this row
-                for (int xx = 0; xx < imOut.nx; xx++) {
+                for (int xx = 0; xx < out_nx; xx++) {
                     // Get the range of input pixels and filter coefficients
                     int xmin = bounds[xx * 2 + 0];  // First input pixel index
                     int xcnt = bounds[xx * 2 + 1];  // Number of input pixels
@@ -476,36 +486,36 @@ private:
 
                     // Convolve: sum weighted input pixels
                     for (int x = 0; x < xcnt; x++) {
-                        int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3;
-                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weights[xx * ksize + x];  // R channel
-                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weights[xx * ksize + x];  // G channel
-                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weights[xx * ksize + x];  // B channel
+                        const auto src_px = imIn.get_pixel(x + xmin, yy);
+                        ss0 += src_px[0] * weights[xx * ksize + x];  // R channel
+                        ss1 += src_px[1] * weights[xx * ksize + x];  // G channel
+                        ss2 += src_px[2] * weights[xx * ksize + x];  // B channel
                     }
 
                     // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
-                    int dst_idx = (yy * imOut.nx + xx) * 3;
-                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
+                    imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
+                                             clip8(ss1 >> PRECISION_BITS),
+                                             clip8(ss2 >> PRECISION_BITS)});
                 }
             }
         };
 
         // Vertical resampling pass
-        // Resizes height from imIn.ny to imOut.ny, preserving width
+        // Resizes height from imIn to out_ny, preserving width
         auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
+                                     int out_ny,
                                      int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weight) {
-            imOut.nx = imIn.nx;
-            imOut.buf.resize(3 * imOut.nx * imOut.ny);
+            const int in_nx = imIn.get_size().width;
+            imOut.set_size({in_nx, out_ny}, false);
 
             // For each output row
-            for (int yy = 0; yy < imOut.ny; yy++) {
+            for (int yy = 0; yy < out_ny; yy++) {
                 // Get the range of input rows and filter coefficients
                 int ymin = bounds[yy * 2 + 0];  // First input row index
                 int ycnt = bounds[yy * 2 + 1];  // Number of input rows
 
                 // Process each column in this output row
-                for (int xx = 0; xx < imOut.nx; xx++) {
+                for (int xx = 0; xx < in_nx; xx++) {
                     // Initialize accumulators for RGB channels with rounding bias
                     int32_t ss0 = 1 << (PRECISION_BITS - 1);
                     int32_t ss1 = 1 << (PRECISION_BITS - 1);
@@ -513,27 +523,23 @@ private:
 
                     // Convolve: sum weighted input pixels vertically
                     for (int y = 0; y < ycnt; y++) {
-                        int src_idx = ((y + ymin) * imIn.nx + xx) * 3;
-                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weight[yy * ksize + y];  // R channel
-                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weight[yy * ksize + y];  // G channel
-                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weight[yy * ksize + y];  // B channel
+                        const auto src_px = imIn.get_pixel(xx, y + ymin);
+                        ss0 += src_px[0] * weight[yy * ksize + y];  // R channel
+                        ss1 += src_px[1] * weight[yy * ksize + y];  // G channel
+                        ss2 += src_px[2] * weight[yy * ksize + y];  // B channel
                     }
 
                     // Convert back from fixed-point and clamp to [0,255]
-                    int dst_idx = (yy * imOut.nx + xx) * 3;
-                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
+                    imOut.set_pixel(xx, yy, {clip8(ss0 >> PRECISION_BITS),
+                                             clip8(ss1 >> PRECISION_BITS),
+                                             clip8(ss2 >> PRECISION_BITS)});
                 }
             }
         };
 
         // Main resampling logic using separable two-pass approach
-        const int src_width = img.nx;
-        const int src_height = img.ny;
-
-        dst.nx = target_width;
-        dst.ny = target_height;
+        const int src_width  = img.get_size().width;
+        const int src_height = img.get_size().height;
 
         bool need_horizontal = (target_width != src_width);
         bool need_vertical = (target_height != src_height);
@@ -555,18 +561,20 @@ private:
         if (need_horizontal && need_vertical) {
             // Both horizontal and vertical
             clip_image_u8 temp;
-            temp.nx = target_width;
-            resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz);
-            resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert);
+            resample_horizontal(img, temp, target_width, ksize_horiz, bounds_horiz, weights_horiz);
+            resample_vertical(temp, dst, target_height, ksize_vert, bounds_vert, weights_vert);
         } else if (need_horizontal) {
             // Only horizontal
-            resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz);
+            resample_horizontal(img, dst, target_width, ksize_horiz, bounds_horiz, weights_horiz);
         } else if (need_vertical) {
             // Only vertical
-            resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert);
+            resample_vertical(img, dst, target_height, ksize_vert, bounds_vert, weights_vert);
         } else {
             // No resizing needed - direct copy
-            dst.buf = img.buf;
+            dst.set_size(img.get_size(), img.is_placeholder());
+            if (!img.is_placeholder()) {
+                dst.cpy_buf(img.get_ro_buf());
+            }
         }
 
         return true;
@@ -588,7 +596,7 @@ private:
 //
 
 bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
     auto const inst = get_slice_instructions(original_size);
     std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst);
 
@@ -883,7 +891,7 @@ bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, c
 bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
     GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0);
     clip_image_u8 resized_image;
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
     // the original pixtral model doesn't have n_merge
     const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
     const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
@@ -908,7 +916,7 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli
 bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
     GGML_ASSERT(hparams.image_longest_edge > 0);
     clip_image_u8 resized_image;
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
     // the original pixtral model doesn't have n_merge
     const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
     const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
@@ -1040,7 +1048,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
     //      multiples of image_size (always rounding up)
     //
     // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
     const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
         original_size, hparams.image_size, hparams.image_longest_edge);
     // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
@@ -1088,7 +1096,7 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli
 
 bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
     GGML_ASSERT(!hparams.image_res_candidates.empty());
-    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size original_size = img.get_size();
     auto const inst = get_slice_instructions(original_size);
     std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst, false);
 
@@ -1108,7 +1116,7 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
     static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ };
     // TODO: support 512 (tiny) and 640 (small) once we have eval data for them
 
-    const int64_t orig_area = static_cast<int64_t>(img.nx) * img.ny;
+    const int64_t orig_area = static_cast<int64_t>(img.n_pixels());
 
     size_t  mode_i   = 0;
     int64_t min_diff = std::numeric_limits<int64_t>::max();
@@ -1201,10 +1209,11 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img,
     // emit 768x768 local tiles when the image is larger than a tile in either
     // dimension, then always a 1024x1024 global view. order: [tiles..., global].
 
-    if (img.nx > tile_size || img.ny > tile_size) {
-        const float           aspect_ratio  = static_cast<float>(img.nx) / img.ny;
+    const auto img_size = img.get_size();
+    if (img_size.width > tile_size || img_size.height > tile_size) {
+        const float           aspect_ratio  = static_cast<float>(img_size.width) / img_size.height;
         const auto            target_ratios = get_target_ratios();
-        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
+        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img_size.width, img_size.height);
 
         // stretch onto the grid (no aspect preserve), then crop tiles row-major.
         clip_image_u8 refined;
@@ -1247,50 +1256,57 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
         int target_height,
         const float mean[3],
         const float std[3]) {
-    if (src.nx == target_width && src.ny == target_height) {
+    const auto src_size = src.get_size();
+    if (src_size.width == target_width && src_size.height == target_height) {
         img_u8_to_f32(src, dst, mean, std);
         return;
     }
 
-    dst.nx = target_width;
-    dst.ny = target_height;
-    dst.buf.resize(3 * target_width * target_height);
+    dst.set_size({target_width, target_height}, src.is_placeholder(), false); // keep dst a placeholder when src is one
 
-    const float scale_x = static_cast<float>(src.nx) / target_width;
-    const float scale_y = static_cast<float>(src.ny) / target_height;
+    if (src.is_placeholder()) {
+        // no-op for placeholder image, just set the size and return
+        return;
+    }
+
+    const float scale_x = static_cast<float>(src_size.width)  / target_width;
+    const float scale_y = static_cast<float>(src_size.height) / target_height;
+
+    std::vector<float> local_buf(3 * target_width * target_height);
 
     for (int y = 0; y < target_height; ++y) {
         const float src_y = (static_cast<float>(y) + 0.5f) * scale_y - 0.5f;
         const int y0_floor = static_cast<int>(std::floor(src_y));
-        const int y0 = std::max(0, std::min(y0_floor, src.ny - 1));
-        const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1));
+        const int y0 = std::max(0, std::min(y0_floor,     src_size.height - 1));
+        const int y1 = std::max(0, std::min(y0_floor + 1, src_size.height - 1));
         const float ly = src_y - y0_floor;
 
         for (int x = 0; x < target_width; ++x) {
             const float src_x = (static_cast<float>(x) + 0.5f) * scale_x - 0.5f;
             const int x0_floor = static_cast<int>(std::floor(src_x));
-            const int x0 = std::max(0, std::min(x0_floor, src.nx - 1));
-            const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1));
+            const int x0 = std::max(0, std::min(x0_floor,     src_size.width - 1));
+            const int x1 = std::max(0, std::min(x0_floor + 1, src_size.width - 1));
             const float lx = src_x - x0_floor;
 
-            const size_t idx00 = 3 * (y0 * src.nx + x0);
-            const size_t idx01 = 3 * (y0 * src.nx + x1);
-            const size_t idx10 = 3 * (y1 * src.nx + x0);
-            const size_t idx11 = 3 * (y1 * src.nx + x1);
-            const size_t idx_dst = 3 * (y * target_width + x);
+            const auto p00 = src.get_pixel(x0, y0);
+            const auto p01 = src.get_pixel(x1, y0);
+            const auto p10 = src.get_pixel(x0, y1);
+            const auto p11 = src.get_pixel(x1, y1);
 
+            const size_t idx_dst = 3 * (y * target_width + x);
             for (int c = 0; c < 3; ++c) {
-                const float v00 = (static_cast<float>(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c];
-                const float v01 = (static_cast<float>(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c];
-                const float v10 = (static_cast<float>(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c];
-                const float v11 = (static_cast<float>(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c];
+                const float v00 = (static_cast<float>(p00[c]) / 255.0f - mean[c]) / std[c];
+                const float v01 = (static_cast<float>(p01[c]) / 255.0f - mean[c]) / std[c];
+                const float v10 = (static_cast<float>(p10[c]) / 255.0f - mean[c]) / std[c];
+                const float v11 = (static_cast<float>(p11[c]) / 255.0f - mean[c]) / std[c];
 
                 const float top = v00 + (v01 - v00) * lx;
                 const float bot = v10 + (v11 - v10) * lx;
-                dst.buf[idx_dst + c] = top + (bot - top) * ly;
+                local_buf[idx_dst + c] = top + (bot - top) * ly;
             }
         }
     }
+    dst.cpy_buf(local_buf);
 }
 
 int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
@@ -1341,26 +1357,26 @@ std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int wind
 
 clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
     clip_image_u8 resized = img;
-    const float aspect_ratio = img.ny > 0 ? static_cast<float>(img.nx) / img.ny : 1.0f;
-    if (std::min(img.nx, img.ny) < 32 &&
+    const auto img_size = img.get_size();
+    const float aspect_ratio = img_size.height > 0 ? static_cast<float>(img_size.width) / img_size.height : 1.0f;
+    if (std::min(img_size.width, img_size.height) < 32 &&
         (aspect_ratio > wide_aspect_ratio_limit ||
          aspect_ratio < 1.0f / wide_aspect_ratio_limit)) {
-        const int square_size = std::max(img.nx, img.ny);
+        const int square_size = std::max(img_size.width, img_size.height);
         clip_image_u8 padded;
-        padded.nx = square_size;
-        padded.ny = square_size;
-        padded.buf.resize(3 * square_size * square_size);
+        padded.set_size({square_size, square_size}, false);
         img_tool::fill(padded, {0, 0, 0});
         img_tool::composite(padded, img, 0, 0);
         resized = std::move(padded);
     }
 
     const int max_image_size = get_image_longest_edge(params);
-    if (std::max(resized.nx, resized.ny) > max_image_size) {
-        const float scale = static_cast<float>(max_image_size) / std::max(resized.nx, resized.ny);
+    const auto resized_size = resized.get_size();
+    if (std::max(resized_size.width, resized_size.height) > max_image_size) {
+        const float scale = static_cast<float>(max_image_size) / std::max(resized_size.width, resized_size.height);
         const clip_image_size new_size = {
-            std::max(1, static_cast<int>(std::floor(resized.nx * scale))),
-            std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
+            std::max(1, static_cast<int>(std::floor(resized_size.width  * scale))),
+            std::max(1, static_cast<int>(std::floor(resized_size.height * scale))),
         };
         clip_image_u8 scaled;
         img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
@@ -1372,14 +1388,14 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8
 
 clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
     clip_image_u8 dst;
-    dst.nx = w;
-    dst.ny = h;
-    dst.buf.resize(3 * w * h, 0);
+    dst.set_size({w, h}, false);
+    img_tool::fill(dst, {0, 0, 0});
 
+    const auto img_size = image.get_size();
     const int src_x0 = std::max(0, x);
     const int src_y0 = std::max(0, y);
-    const int src_x1 = std::min(image.nx, x + w);
-    const int src_y1 = std::min(image.ny, y + h);
+    const int src_x1 = std::min(img_size.width,  x + w);
+    const int src_y1 = std::min(img_size.height, y + h);
 
     if (src_x0 >= src_x1 || src_y0 >= src_y1) {
         return dst;
@@ -1390,11 +1406,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const cli
 
     for (int yy = 0; yy < src_y1 - src_y0; ++yy) {
         for (int xx = 0; xx < src_x1 - src_x0; ++xx) {
-            const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx));
-            const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx));
-            dst.buf[dst_idx + 0] = image.buf[src_idx + 0];
-            dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
-            dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+            dst.set_pixel(dst_x0 + xx, dst_y0 + yy, image.get_pixel(src_x0 + xx, src_y0 + yy));
         }
     }
 
@@ -1443,7 +1455,7 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step
 
 bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
     clip_image_u8 prepared = prepare_image(img, hparams);
-    const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
+    const auto instructions = build_slice_instructions(hparams, prepared.get_size());
 
     clip_image_f32_ptr overview_f32(clip_image_f32_init());
     img_u8_resize_bilinear_to_f32(
@@ -1462,7 +1474,8 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip
     }
 
     clip_image_u8 img_for_crop = prepared;
-    if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
+    const auto prepared_size = prepared.get_size();
+    if (instructions.refined_size.width != prepared_size.width || instructions.refined_size.height != prepared_size.height) {
         clip_image_u8 refined;
         img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE);
         img_for_crop = std::move(refined);
@@ -1503,9 +1516,10 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
         hparams.image_max_pixels / (patch_size * patch_size) : 256;
 
     // Linear search for optimal scale to fit within max_num_patches
+    const auto img_size = img.get_size();
     float scale = 1.0f;
-    int target_height = img.ny;
-    int target_width  = img.nx;
+    int target_height = img_size.height;
+    int target_width  = img_size.width;
 
     auto get_scaled_image_size = [align_size](float scale, int size) -> int {
         float scaled_size = size * scale;
@@ -1517,8 +1531,8 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip
 
     // Linear search with 0.02 step size
     while (scale > 0.0f) {
-        target_height = get_scaled_image_size(scale, img.ny);
-        target_width  = get_scaled_image_size(scale, img.nx);
+        target_height = get_scaled_image_size(scale, img_size.height);
+        target_width  = get_scaled_image_size(scale, img_size.width);
 
         int num_patches_h = target_height / patch_size;
         int num_patches_w = target_width / patch_size;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 260f307560..e1f8e2a335 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -26,12 +26,46 @@
 
 // represents raw image data, layout is RGBRGBRGB...
 // length of data must be nx * ny * 3
+// for audio bitmap: nx = sample count, ny = 1, layout is F32 F32 F32 ...
+// length of data must be nx * sizeof(float)
 struct mtmd_bitmap {
-    uint32_t nx;
-    uint32_t ny;
-    std::vector<unsigned char> data;
+    uint32_t nx = 0;
+    uint32_t ny = 0;
     std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
     bool is_audio = false; // true if the bitmap is audio
+
+    mtmd_bitmap(const unsigned char * data, uint32_t nx, uint32_t ny)
+        : nx(nx), ny(ny) {
+        if (data) {
+            size_t data_size = (size_t)nx * ny * 3;
+            this->data.resize(data_size);
+            std::memcpy(this->data.data(), data, data_size);
+        }
+    }
+
+    mtmd_bitmap(const unsigned char * data, uint32_t n_samples)
+        : nx(n_samples), ny(1), is_audio(true) {
+        if (data) {
+            size_t data_size = (size_t)nx * sizeof(float);
+            this->data.resize(data_size);
+            std::memcpy(this->data.data(), data, data_size);
+        }
+    }
+
+    const std::vector<unsigned char> & get_ro_buf() const {
+        return data;
+    }
+
+    bool is_placeholder() const {
+        return data.empty();
+    }
+
+    size_t n_bytes() const {
+        return data.size();
+    }
+
+  private:
+    std::vector<unsigned char> data;
 };
 
 // position indexing for decoder model
@@ -42,8 +76,8 @@ enum mtmd_pos_type {
 };
 
 struct mtmd_image_tokens {
-    uint32_t nx; // number of tokens in x direction
-    uint32_t ny; // number of tokens in y direction
+    uint32_t nx = 0; // number of tokens in x direction
+    uint32_t ny = 0; // number of tokens in y direction
     mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
     uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
     uint32_t n_tokens() const {
@@ -56,6 +90,16 @@ struct mtmd_image_tokens {
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
+    // true if one of entries in batch_f32 is a placeholder
+    bool is_placeholder() const {
+        for (const auto & entry : batch_f32.entries) {
+            if (entry->is_placeholder()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     mtmd_image_tokens clone() {
         return mtmd_image_tokens{
             nx,
@@ -70,10 +114,20 @@ struct mtmd_image_tokens {
 using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
 
 struct mtmd_audio_tokens {
-    uint32_t n_tokens; // number of tokens
+    uint32_t n_tokens = 0; // number of tokens
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
+    // true if one of entries in batch_f32 is a placeholder
+    bool is_placeholder() const {
+        for (const auto & entry : batch_f32.entries) {
+            if (entry->is_placeholder()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     mtmd_audio_tokens clone() {
         return mtmd_audio_tokens{
             n_tokens,
@@ -795,16 +849,19 @@ struct mtmd_tokenizer {
             }
 
             // sanity check
-            GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
-            GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
+            if (bitmap->nx == 0 || bitmap->ny == 0) { // nx/ny are unsigned, so "<= 0" can only ever mean "== 0"
+                LOG_ERR("%s: error: invalid bitmap dimensions: nx = %u, ny = %u\n",
+                        __func__, bitmap->nx, bitmap->ny);
+                return 2;
+            }
             GGML_ASSERT(ctx->image_preproc != nullptr);
 
             // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmap->nx;
-            img_u8->ny = bitmap->ny;
-            img_u8->buf.resize(bitmap->data.size());
-            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
+            img_u8->set_size(
+                {(int)bitmap->nx, (int)bitmap->ny},
+                bitmap->is_placeholder());
+            img_u8->cpy_buf(bitmap->get_ro_buf());
 
             // preprocess image
             clip_image_f32_batch batch_f32;
@@ -949,7 +1006,7 @@ struct mtmd_tokenizer {
                 return 2;
             }
 
-            if (bitmap->data.size() == 0) {
+            if (bitmap->nx == 0) {
                 LOG_ERR("%s: error: empty audio data\n", __func__);
                 return 2;
             }
@@ -960,26 +1017,46 @@ struct mtmd_tokenizer {
 
             // sanity check
             GGML_ASSERT(ctx->audio_preproc != nullptr);
-            GGML_ASSERT(bitmap->data.size() > sizeof(float));
-            GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);
 
             // preprocess audio
             std::vector<mtmd_audio_mel> mel_spec_chunks;
-            const float * samples = (const float *)bitmap->data.data();
-            size_t n_samples = bitmap->data.size() / sizeof(float);
-            bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
-            if (!ok) {
-                LOG_ERR("Unable to preprocess audio\n");
-                return 2;
+            {
+                std::vector<float> dummy;
+                const float * samples = nullptr;
+                size_t n_samples = 0;
+                if (bitmap->is_placeholder()) {
+                    // TODO @ngxson : skip underlying processing if bitmap is placeholder
+                    GGML_ASSERT(bitmap->ny == 1);
+
+                    dummy.resize(bitmap->nx);
+                    samples = dummy.data();
+                    n_samples = dummy.size();
+                } else {
+                    const auto & buf = bitmap->get_ro_buf();
+                    GGML_ASSERT(buf.size() > sizeof(float));
+                    GGML_ASSERT(buf.size() % sizeof(float) == 0);
+
+                    samples = (const float *)buf.data();
+                    n_samples = buf.size() / sizeof(float);
+                }
+                bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
+                if (!ok) {
+                    LOG_ERR("Unable to preprocess audio\n");
+                    return 2;
+                }
             }
 
             // consider each mel_spec as a separate audio chunk
             // TODO: maybe support batching, but this may come with memory cost
             for (auto & mel_spec : mel_spec_chunks) {
+                // a placeholder bitmap is preprocessed from dummy samples, so mel_spec.data may be non-empty even for placeholders
+                const bool is_placeholder = bitmap->is_placeholder() || mel_spec.data.empty();
                 clip_image_f32_ptr mel_f32(clip_image_f32_init());
-                mel_f32->nx  = mel_spec.n_len;
-                mel_f32->ny  = mel_spec.n_mel;
-                mel_f32->buf = std::move(mel_spec.data);
+                mel_f32->set_size({mel_spec.n_len, mel_spec.n_mel}, is_placeholder, /* is_audio */ true);
+                if (!is_placeholder) {
+                    mel_f32->cpy_buf(mel_spec.data); // placeholders keep an empty buffer so encoding can reject them
+                }
+
                 size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
 
                 clip_image_f32_batch batch_f32;
@@ -1098,12 +1175,28 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
             LOG_ERR("%s: model does not support vision input\n", __func__);
             return 1;
         }
+        if (chunk->tokens_image == nullptr) {
+            LOG_ERR("%s: image tokens are null\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_image->is_placeholder()) {
+            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+            return 1;
+        }
         return mtmd_encode(ctx, chunk->tokens_image.get());
     } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
         if (!ctx->ctx_a) {
             LOG_ERR("%s: model does not support audio input\n", __func__);
             return 1;
         }
+        if (chunk->tokens_audio == nullptr) {
+            LOG_ERR("%s: audio tokens are null\n", __func__);
+            return 1;
+        }
+        if (chunk->tokens_audio->is_placeholder()) {
+            LOG_ERR("%s: audio tokens batch is placeholder\n", __func__);
+            return 1;
+        }
         int n_mmproj_embd = ctx->n_embd_text;
         ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
         bool ok = clip_image_batch_encode(
@@ -1141,6 +1234,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
         // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
         size_t offset = 0;
         for (size_t i = 0; i < entries.size(); i++) {
+            if (entries[i]->is_placeholder()) {
+                LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i);
+                return 1;
+            }
             int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
             ok = clip_image_encode(
                 ctx_clip,
@@ -1150,6 +1247,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
             offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
         }
     } else {
+        if (image_tokens->is_placeholder()) {
+            LOG_ERR("%s: image tokens batch is placeholder\n", __func__);
+            return 1;
+        }
         ok = clip_image_batch_encode(
             ctx_clip,
             ctx->n_threads,
@@ -1207,24 +1308,17 @@ int mtmd_get_audio_sample_rate(const mtmd_context * ctx) {
 mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
                                uint32_t ny,
                                const unsigned char * data) {
-    mtmd_bitmap * bitmap = new mtmd_bitmap;
-    bitmap->nx = nx;
-    bitmap->ny = ny;
-    size_t data_size = (size_t)nx * ny * 3;
-    bitmap->data.resize(data_size);
-    std::memcpy(bitmap->data.data(), data, data_size);
+    mtmd_bitmap * bitmap = new mtmd_bitmap(data, nx, ny);
     return bitmap;
 }
 
 mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
                                           const float * data) {
-    mtmd_bitmap * bitmap = new mtmd_bitmap;
-    bitmap->nx = n_samples;
-    bitmap->ny = 1;
-    bitmap->is_audio = true;
-    size_t data_size = n_samples * sizeof(float);
-    bitmap->data.resize(data_size);
-    std::memcpy(bitmap->data.data(), data, data_size);
+    // the bitmap stores the sample count in a 32-bit field; reject counts that would be silently truncated (placeholders included)
+    GGML_ASSERT(n_samples <= UINT32_MAX);
+    mtmd_bitmap * bitmap = new mtmd_bitmap((const unsigned char *)data, (uint32_t)n_samples);
+    GGML_ASSERT(bitmap->is_audio);
+    GGML_ASSERT(bitmap->is_placeholder() || bitmap->get_ro_buf().size() == n_samples * sizeof(float));
     return bitmap;
 }
 
@@ -1237,11 +1331,11 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
 }
 
 const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
-    return bitmap->data.data();
+    return bitmap->get_ro_buf().data();
 }
 
 size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
-    return bitmap->data.size();
+    return bitmap->get_ro_buf().size();
 }
 
 bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
@@ -1535,14 +1629,16 @@ void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<f
         LOG_ERR("%s: model does not support vision input\n", __func__);
         return;
     }
-    clip_image_f32 inp_image;
-    inp_image.nx = image.size();
-    inp_image.ny = inp_image.nx;
-    inp_image.buf.reserve(inp_image.nx * inp_image.ny);
+    const int img_sz = (int)image.size();
+    std::vector<float> img_buf;
+    img_buf.reserve(img_sz * img_sz);
     for (const auto & row : image) {
-        inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end());
+        img_buf.insert(img_buf.end(), row.begin(), row.end());
     }
-    LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny);
+    clip_image_f32 inp_image;
+    inp_image.set_size({img_sz, img_sz}, false, false);
+    inp_image.cpy_buf(img_buf);
+    LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, img_sz, img_sz);
     mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image);
 }
 
@@ -1552,16 +1648,17 @@ void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & inpu
         return;
     }
     int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins;
-    clip_image_f32 inp_audio;
-    inp_audio.nx = input.size();
-    inp_audio.ny = n_mel;
-    inp_audio.buf.resize(input.size() * n_mel);
-    for (size_t i = 0; i < input.size(); i++) {
+    const int audio_nx = (int)input.size();
+    std::vector<float> audio_buf(audio_nx * n_mel);
+    for (int i = 0; i < audio_nx; i++) {
         for (int j = 0; j < n_mel; j++) {
-            inp_audio.buf[j * inp_audio.nx + i] = input[i];
+            audio_buf[j * audio_nx + i] = input[i];
         }
     }
-    LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny);
+    clip_image_f32 inp_audio;
+    inp_audio.set_size({audio_nx, n_mel}, false, true);
+    inp_audio.cpy_buf(audio_buf);
+    LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, audio_nx, n_mel);
     mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio);
 }
 
@@ -1571,9 +1668,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
         return;
     }
     clip_image_u8 img_u8;
-    img_u8.nx = nx;
-    img_u8.ny = ny;
-    img_u8.buf = rgb_values;
+    img_u8.set_size({nx, ny}, false);
+    img_u8.cpy_buf(rgb_values);
     clip_image_f32_batch batch_f32;
     GGML_ASSERT(ctx->image_preproc != nullptr);
     bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
@@ -1583,7 +1679,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
     }
     LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size());
     for (size_t i = 0; i < batch_f32.entries.size(); i++) {
-        LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny);
+        LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny());
         // TODO: better way to dump entry content?
     }
 }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 5d518df799..b3154c8d55 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -136,6 +136,11 @@ MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 // if bitmap is audio:
 //     length of data must be n_samples * sizeof(float)
 //     the data is in float format (PCM F32)
+//
+// if data == nullptr:
+//     the bitmap is considered "empty", and will be treated as a placeholder for counting tokens
+//     you can pass the bitmap via mtmd_tokenize(), then call mtmd_*_get_n_tokens() to count the tokens
+//     note: chunks tokenized from a placeholder bitmap cannot be encoded; mtmd_encode() and mtmd_encode_chunk() return an error for them
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
 MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
diff --git a/tools/server/README.md b/tools/server/README.md
index 3e14f5e6a2..bf056dc60b 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1447,6 +1447,36 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
   }'
   ```
 
+### POST `/v1/responses/input_tokens`: Token Counting
+
+Returns the number of input tokens for the given request body. Similar to the [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count).
+
+Example response:
+
+```json
+{
+  "object": "response.input_tokens",
+  "input_tokens": 11
+}
+```
+
+### POST `/v1/chat/completions/input_tokens`: Token Counting
+
+Returns the number of input tokens for the given request. Similar to the [Response input token counts API](https://developers.openai.com/api/reference/python/resources/responses/subresources/input_tokens/methods/count), but accepts a chat completion body as input.
+
+Note: This is not an official OAI endpoint, but is added for completeness and convenience.
+
+Example response:
+
+```json
+{
+  "object": "response.input_tokens",
+  "input_tokens": 11
+}
+```
+
+## Anthropic-compatible API Endpoints
+
 ### POST `/v1/messages`: Anthropic-compatible Messages API
 
 Given a list of `messages`, returns the assistant's response. Streaming is supported via Server-Sent Events. While no strong claims of compatibility with the Anthropic API spec are made, in our experience it suffices to support many apps.
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 4c3f16a0a3..dfd286d24e 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -713,10 +713,10 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
     return std::to_string(hash);
 }
 
-server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
+server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder) {
     mtmd::bitmaps bitmaps;
     for (auto & file : files) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size(), is_placeholder));
         if (!bmp.ptr) {
             throw std::runtime_error("Failed to load image or audio file");
         }
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index c28558d8b7..51b1613178 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -258,7 +258,8 @@ llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt,
 size_t validate_utf8(const std::string& text);
 
 // process mtmd prompt, return the server_tokens containing both text tokens and media chunks
-server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files);
+// if is_placeholder is true, the media chunk will be treated as placeholder for counting tokens; the output tokens are not usable for actual inference (e.g. for submitting a task to server_queue)
+server_tokens process_mtmd_prompt(mtmd_context * mctx, const std::string & prompt, const std::vector<raw_buffer> & files, bool is_placeholder = false);
 
 /**
  * break the input "prompt" object into multiple prompt if needed, then tokenize them
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ab0d594476..5d546d09c2 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -4333,6 +4333,10 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_CHAT);
     };
 
+    this->post_chat_completions_tok = [this](const server_http_req & req) {
+        return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_CHAT);
+    };
+
     this->post_control = [this](const server_http_req & req) {
         auto res = create_response();
         const json body = json::parse(req.body);
@@ -4388,6 +4392,10 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_RESP);
     };
 
+    this->post_responses_tok_oai = [this](const server_http_req & req) {
+        return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_OAI_RESP);
+    };
+
     this->post_transcriptions_oai = [this](const server_http_req & req) {
         auto res = create_response();
 
@@ -4435,20 +4443,7 @@ void server_routes::init_routes() {
     };
 
     this->post_anthropic_count_tokens = [this](const server_http_req & req) {
-        auto res = create_response();
-        std::vector<raw_buffer> files;
-        json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
-        SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
-        SRV_DBG("converted request: %s\n", body.dump().c_str());
-        json body_parsed = oaicompat_chat_params_parse(
-            body,
-            meta->chat_params,
-            files);
-
-        json prompt = body_parsed.at("prompt");
-        llama_tokens tokens = tokenize_mixed(ctx_server.vocab, prompt, true, true);
-        res->ok({{"input_tokens", static_cast<int>(tokens.size())}});
-        return res;
+        return handle_count_tokens(ctx_server.vocab, ctx_server.mctx, req, TASK_RESPONSE_TYPE_ANTHROPIC);
     };
 
     // same with handle_chat_completions, but without inference part
@@ -4928,3 +4923,54 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
     res->ok(root);
     return res;
 }
+// shared implementation of the token-counting routes: converts the request body to the chat-completions format when needed, takes the "prompt" produced by oaicompat_chat_params_parse() and replies with its token count as "input_tokens"
+std::unique_ptr<server_res_generator> server_routes::handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type) {
+    auto res = create_response();
+    std::vector<raw_buffer> files; // handed to oaicompat_chat_params_parse() and then to process_mtmd_prompt() on the mtmd path
+    json body = json::parse(req.body);
+    bool is_oai = false; // set for the two OAI response types; adds the "object" field to the reply
+
+    switch (res_type) {
+        case TASK_RESPONSE_TYPE_OAI_CHAT: // body is used as-is
+            {
+                is_oai = true;
+            } break;
+        case TASK_RESPONSE_TYPE_OAI_RESP: // body is converted with server_chat_convert_responses_to_chatcmpl()
+            {
+                is_oai = true;
+                body = server_chat_convert_responses_to_chatcmpl(body);
+            } break;
+        case TASK_RESPONSE_TYPE_ANTHROPIC: // body is converted with server_chat_convert_anthropic_to_oai()
+            {
+                body = server_chat_convert_anthropic_to_oai(body);
+            } break;
+        default: // any other res_type is answered with an invalid-request error
+            res->error(format_error_response("invalid res_type", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+    }
+
+    json body_parsed = oaicompat_chat_params_parse(
+            body,
+            meta->chat_params,
+            files);
+    json prompt = body_parsed.at("prompt");
+    // SRV_DBG("prompt = %s\n", prompt.dump().c_str());
+
+    // TODO @ngxson : refactor this code block, move this to server-common and reuse it in other places
+    size_t n_tokens;
+    if (mctx != nullptr) { // an mtmd context is available: count through process_mtmd_prompt()
+        if (!prompt.is_string()) {
+            throw std::runtime_error("for mtmd, input prompt must be a string.");
+        }
+        n_tokens = process_mtmd_prompt(mctx, prompt.get<std::string>(), files, true).size(); // last argument is is_placeholder
+    } else { // no mtmd context: count the tokens returned by tokenize_mixed()
+        n_tokens = tokenize_mixed(vocab, prompt, true, true).size();
+    }
+
+    json response = {{"input_tokens", static_cast<int>(n_tokens)}}; // the size_t count is cast to int for the reply
+    if (is_oai) {
+        response["object"] = "response.input_tokens";
+    }
+    res->ok(response);
+    return res;
+}
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 73caff54a4..72a1f40e01 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -110,8 +110,10 @@ struct server_routes {
     server_http_context::handler_t post_completions;
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_chat_completions_tok;
     server_http_context::handler_t post_control;
     server_http_context::handler_t post_responses_oai;
+    server_http_context::handler_t post_responses_tok_oai;
     server_http_context::handler_t post_transcriptions_oai;
     server_http_context::handler_t post_anthropic_messages;
     server_http_context::handler_t post_anthropic_count_tokens;
@@ -139,6 +141,7 @@ private:
     std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
     std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
     std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
+    std::unique_ptr<server_res_generator> handle_count_tokens(const llama_vocab * vocab, mtmd_context * mctx, const server_http_req & req, task_response_type res_type);
 
     // using unique_ptr to allow late initialization of const
     std::unique_ptr<const server_context_meta> meta;
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 769e80a802..a6ea749d0c 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -161,6 +161,8 @@ int llama_server(int argc, char ** argv) {
         routes.post_tokenize               = models_routes->proxy_post;
         routes.post_detokenize             = models_routes->proxy_post;
         routes.post_apply_template         = models_routes->proxy_post;
+        routes.post_chat_completions_tok   = models_routes->proxy_post;
+        routes.post_responses_tok_oai      = models_routes->proxy_post;
         routes.get_lora_adapters           = models_routes->proxy_get;
         routes.post_lora_adapters          = models_routes->proxy_post;
         routes.get_slots                   = models_routes->proxy_get;
@@ -192,7 +194,6 @@ int llama_server(int argc, char ** argv) {
     ctx_http.post("/v1/audio/transcriptions",  ex_wrapper(routes.post_transcriptions_oai));
     ctx_http.post("/audio/transcriptions",     ex_wrapper(routes.post_transcriptions_oai));
     ctx_http.post("/v1/messages",              ex_wrapper(routes.post_anthropic_messages)); // anthropic messages API
-    ctx_http.post("/v1/messages/count_tokens", ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
     ctx_http.post("/infill",                   ex_wrapper(routes.post_infill));
     ctx_http.post("/embedding",                ex_wrapper(routes.post_embeddings)); // legacy
     ctx_http.post("/embeddings",               ex_wrapper(routes.post_embeddings));
@@ -204,6 +205,12 @@ int llama_server(int argc, char ** argv) {
     ctx_http.post("/tokenize",                 ex_wrapper(routes.post_tokenize));
     ctx_http.post("/detokenize",               ex_wrapper(routes.post_detokenize));
     ctx_http.post("/apply-template",           ex_wrapper(routes.post_apply_template));
+    // token counting
+    ctx_http.post("/chat/completions/input_tokens",    ex_wrapper(routes.post_chat_completions_tok));
+    ctx_http.post("/v1/chat/completions/input_tokens", ex_wrapper(routes.post_chat_completions_tok));
+    ctx_http.post("/responses/input_tokens",           ex_wrapper(routes.post_responses_tok_oai));
+    ctx_http.post("/v1/responses/input_tokens",        ex_wrapper(routes.post_responses_tok_oai));
+    ctx_http.post("/v1/messages/count_tokens",         ex_wrapper(routes.post_anthropic_count_tokens)); // anthropic token counting
     // LoRA adapters hotswap
     ctx_http.get ("/lora-adapters",            ex_wrapper(routes.get_lora_adapters));
     ctx_http.post("/lora-adapters",            ex_wrapper(routes.post_lora_adapters));
diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
index f80e46133c..fe55dc5ab1 100644
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -573,3 +573,19 @@ def test_chat_completions_multiple_choices():
         for choice in res.body["choices"]:
             assert "assistant" == choice["message"]["role"]
             assert choice["finish_reason"] == "length"
+
+
+def test_chat_completions_token_count():
+    global server
+    server.start()
+    # send the same token-counting request twice; every response must succeed
+    # and report more than 5 input tokens for this two-message conversation
+    for _ in range(2):
+        res = server.make_request("POST", "/chat/completions/input_tokens", data={
+            "messages": [
+                {"role": "system", "content": "Book"},
+                {"role": "user", "content": "What is the best book"},
+            ],
+        })
+        assert res.status_code == 200
+        assert res.body["input_tokens"] > 5
diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py
index fb77084c89..d74cc3a43e 100644
--- a/tools/server/tests/unit/test_vision_api.py
+++ b/tools/server/tests/unit/test_vision_api.py
@@ -98,6 +98,25 @@ def test_vision_chat_completion(prompt, image_url, success, re_content):
         assert res.status_code != 200
 
 
+def test_vision_chat_completion_token_count():
+    global server
+    server.start()
+    res = server.make_request("POST", "/chat/completions/input_tokens", data={  # token-counting route, request mixes text and an image
+        "temperature": 0.0,  # NOTE(review): sampling params look carried over from the completion test; presumably unused for counting - confirm
+        "top_k": 1,
+        "messages": [
+            {"role": "user", "content": [
+                {"type": "text", "text": "What is this:"},
+                {"type": "image_url", "image_url": {
+                    "url": get_img_url("IMG_URL_0"),
+                }},
+            ]},
+        ],
+    })
+    assert res.status_code == 200
+    assert res.body["input_tokens"] > 10  # NOTE(review): presumably text tokens plus the image's tokens - confirm against the test model
+
+
 @pytest.mark.parametrize(
     "prompt, image_data, success, re_content",
     [