diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 17079815d4..10840a851f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1675,6 +1675,9 @@ struct clip_model_loader { // note: some models having hparams.image_size == 0, which means the image size is dynamic throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size)); } + if (hparams.image_size > 65536) { + throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size)); + } if (hparams.patch_size <= 0) { throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size)); } @@ -1723,6 +1726,19 @@ struct clip_model_loader { LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft); LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len); LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len); + + // GEMMA4UA is encoder-free: it uses n_mel_bins as a raw-waveform frame size (640) and has no FFT/filterbank, so the mel-range and FFT + // checks below do not apply to it. 
+ const bool fft_based = model.proj_type != PROJECTOR_TYPE_GEMMA4UA; + + // Validate audio hparams loaded from GGUF metadata + if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) { + throw std::runtime_error(string_format("%s: n_mel_bins (%d) must be in range [1, 256]\n", __func__, hparams.n_mel_bins)); + } + if (fft_based && (hparams.audio_sample_rate <= 0 || hparams.audio_n_fft <= 0 || hparams.audio_hop_len <= 0 || hparams.audio_window_len <= 0)) { + throw std::runtime_error(string_format("%s: audio hparams invalid: sample_rate=%d n_fft=%d window_len=%d hop_len=%d\n", + __func__, hparams.audio_sample_rate, hparams.audio_n_fft, hparams.audio_window_len, hparams.audio_hop_len)); + } } LOG_INF("\n"); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); @@ -2831,6 +2847,12 @@ struct clip_model_loader { img.set_size({sz, sz}, false, false); LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz); } else { + // GEMMA4UA uses n_mel_bins as a raw-waveform frame size (640), not a mel-bin count, + // so the [1, 256] bound only applies to FFT-based models. 
+ const bool fft_based = ctx_clip.model.proj_type != PROJECTOR_TYPE_GEMMA4UA; + if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) { + throw std::runtime_error(string_format("%s: invalid n_mel_bins (%d), must be in [1, 256]\n", __func__, hparams.n_mel_bins)); + } img.set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false); LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size); } @@ -2994,7 +3016,13 @@ struct clip_model_loader { } return; } - output = gguf_get_val_u32(ctx_gguf.get(), i); + const uint32_t val = gguf_get_val_u32(ctx_gguf.get(), i); + // sanity check + if (val > (uint32_t) INT32_MAX) { + throw std::runtime_error(string_format("%s: value %u for key '%s' exceeds INT32_MAX\n", + __func__, val, key.c_str())); + } + output = (int) val; } void get_f32(const std::string & key, float & output, bool required = true) const { diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index f66f4bc3bb..e0f1d298c8 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -24,6 +24,9 @@ struct clip_image_size { return !(*this == other); } int area() const { + // avoid overflow when computing area + GGML_ASSERT(width >= 0 && width <= 46000); + GGML_ASSERT(height >= 0 && height <= 46000); return width * height; } }; diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 13f211fd90..b72fd067a5 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -32,8 +32,8 @@ void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) { } } -void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, - int n_fft, +void mtmd_audio_cache::fill_mel_filterbank_matrix(int64_t n_mel, + int64_t n_fft, int sample_rate, float fmin, float fmax, @@ -86,11 +86,16 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, hz_pts[i] = mel_to_hz(mel_pts[i]); } - const int n_fft_bins = n_fft / 2 + 1; + const int64_t n_fft_bins = n_fft / 2 + 1; + + // Validate allocation size + if 
(n_mel < 0 || n_fft_bins <= 0 || (size_t)n_mel > SIZE_MAX / (size_t)n_fft_bins) { + GGML_ASSERT(false && "mel filterbank allocation too large"); + } // filterbank - std::vector out(n_mel * n_fft_bins, 0); - for (int m = 0; m < n_mel; ++m) { + std::vector out((size_t)n_mel * (size_t)n_fft_bins, 0); + for (int64_t m = 0; m < n_mel; ++m) { const double f_left = hz_pts[m]; const double f_center = hz_pts[m + 1]; const double f_right = hz_pts[m + 2]; @@ -266,8 +271,8 @@ static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) } struct filter_params { - int32_t n_mel; - int32_t n_fft_bins; + int64_t n_mel; + int64_t n_fft_bins; int32_t hann_window_size; int32_t hop_length; int32_t sample_rate; @@ -293,8 +298,8 @@ static void log_mel_spectrogram_worker_thread(int ith, std::vector fft_in(frame_size * 2, 0.0); std::vector fft_out(frame_size * 2 * 2 * 2); - int n_fft_bins = params.n_fft_bins; - int i = ith; + int64_t n_fft_bins = params.n_fft_bins; + int64_t i = ith; const auto & filters = cache.filters; @@ -302,17 +307,18 @@ static void log_mel_spectrogram_worker_thread(int ith, GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2)); GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size()); // calculate FFT only when fft_in are not all zero - for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) { - const int offset = i * frame_step; + for (; i < std::min((int64_t)(n_samples / frame_step + 1), out.n_len); i += n_threads) { + const int64_t offset = i * frame_step; // apply Hann window (~10% faster) - for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + const int valid_len = std::min(frame_size, std::max(0, n_samples - (int)offset)); + for (int j = 0; j < valid_len; j++) { fft_in[j] = hann[j] * samples[offset + j]; } // fill the rest with zeros - if (n_samples - offset < frame_size) { - std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + if (valid_len < frame_size) { + std::fill(fft_in.begin() + valid_len, fft_in.end(), 0.0); } //
FFT @@ -325,7 +331,7 @@ static void log_mel_spectrogram_worker_thread(int ith, } // mel spectrogram - for (int j = 0; j < out.n_mel; j++) { + for (int64_t j = 0; j < out.n_mel; j++) { double sum = 0.0; // unroll loop (suggested by GH user @lunixbochs) int k = 0; @@ -339,21 +345,21 @@ static void log_mel_spectrogram_worker_thread(int ith, } // handle n_fft remainder for (; k < n_fft_bins; k++) { - sum += fft_out[k] * filters.data[j * n_fft_bins + k]; + sum += fft_out[k] * filters.data[(size_t)j * n_fft_bins + k]; } sum = std::max(sum, (double)params.mel_floor); sum = params.use_natural_log ? log(sum) : log10(sum); - out.data[j * out.n_len + i] = sum; + out.data[(size_t)j * out.n_len + i] = sum; } } // Otherwise fft_out are all zero double sum = params.use_natural_log ? log(1e-10) : log10(1e-10); for (; i < out.n_len; i += n_threads) { - for (int j = 0; j < out.n_mel; j++) { - out.data[j * out.n_len + i] = sum; + for (int64_t j = 0; j < out.n_mel; j++) { + out.data[(size_t)j * out.n_len + i] = sum; } } } @@ -437,16 +443,21 @@ static bool log_mel_spectrogram( GGML_ASSERT(params.hop_length > 0); out.n_mel = params.n_mel; out.n_len = (n_samples - frame_size) / frame_step + 1; - // TODO: handle these checks better - if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) { - LOG_ERR("%s: size overflow\n", __func__); + // Validate dimensions before allocation to prevent integer overflow + if (out.n_mel <= 0 || out.n_len <= 0) { + LOG_ERR("%s: invalid mel dimensions n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len); + return false; + } + const size_t total_size = (size_t)out.n_mel * (size_t)out.n_len; + if ((size_t)out.n_len > SIZE_MAX / sizeof(float) / (size_t)out.n_mel) { + LOG_ERR("%s: size overflow: n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len); return false; } if (n_samples < frame_size) { LOG_ERR("%s: not enough samples after padding\n", __func__); return false; } - out.data.resize(out.n_mel * out.n_len); +
out.data.resize(total_size); { std::vector workers(n_threads - 1); @@ -464,38 +475,39 @@ static bool log_mel_spectrogram( } } - const int effective_n_len = n_samples_in / frame_step; + const int64_t effective_n_len = n_samples_in / frame_step; if (params.norm_per_feature) { GGML_ASSERT(effective_n_len > 1); - for (int i = 0; i < out.n_mel; i++) { + for (int64_t i = 0; i < out.n_mel; i++) { double mean = 0; - for (int j = 0; j < effective_n_len; ++j) { - mean += out.data[i * out.n_len + j]; + for (int64_t j = 0; j < effective_n_len; ++j) { + mean += out.data[(size_t)i * out.n_len + j]; } mean /= effective_n_len; double var = 0.0; - for (int j = 0; j < effective_n_len; ++j) { - const double value = out.data[i * out.n_len + j] - mean; + for (int64_t j = 0; j < effective_n_len; ++j) { + const double value = out.data[(size_t)i * out.n_len + j] - mean; var += value * value; } var /= effective_n_len - 1; // unbiased const double mstd = std::sqrt(var + 1e-5); - for (int j = 0; j < effective_n_len; ++j) { - auto &value = out.data[i * out.n_len + j]; + for (int64_t j = 0; j < effective_n_len; ++j) { + auto &value = out.data[(size_t)i * out.n_len + j]; value = (value - mean) / mstd; } // pad the rest with zeros - for (int j = effective_n_len; j < out.n_len; ++j) { - out.data[i * out.n_len + j] = 0.0; + for (int64_t j = effective_n_len; j < out.n_len; ++j) { + out.data[(size_t)i * out.n_len + j] = 0.0; } } } else if (!params.no_padding) { // Whisper-style clamping and normalization (NOT used by Gemma4) double mmax = -1e20; - for (int i = 0; i < out.n_mel*out.n_len; i++) { + const size_t mel_size = (size_t)out.n_mel * (size_t)out.n_len; + for (size_t i = 0; i < mel_size; i++) { if (out.data[i] > mmax) { mmax = out.data[i]; } @@ -503,7 +515,7 @@ static bool log_mel_spectrogram( mmax -= 8.0; - for (int i = 0; i < out.n_mel*out.n_len; i++) { + for (size_t i = 0; i < mel_size; i++) { if (out.data[i] < mmax) { out.data[i] = mmax; } @@ -582,13 +594,13 @@ bool 
mtmd_audio_preprocessor_whisper::preprocess(const float * s // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel // we always expect the mel to have 3000 silent frames at the end if (DEBUG) { - printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len); + printf("output: n_mel = %d, n_len = %d\n", (int) out_full.n_mel, (int) out_full.n_len); } const size_t frames_per_chunk = 3000; GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk); for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { - int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); - if ((size_t) n_len < frames_per_chunk) { + int64_t n_len = std::min((int64_t)frames_per_chunk, out_full.n_len - (int64_t)off); + if (n_len < (int64_t)frames_per_chunk) { break; // last incomplete chunk will always be a padded chunk, safe to ignore } @@ -596,10 +608,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s out_chunk.n_len = n_len; out_chunk.n_mel = out_full.n_mel; out_chunk.n_len_org = out_full.n_mel; // unused - out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); + out_chunk.data.reserve((size_t)out_chunk.n_mel * (size_t)out_chunk.n_len); - for (int i = 0; i < out_full.n_mel; i++) { - auto src = out_full.data.begin() + i * out_full.n_len + off; + for (int64_t i = 0; i < out_full.n_mel; i++) { + auto src = out_full.data.begin() + (size_t)i * out_full.n_len + off; out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); } @@ -681,8 +693,8 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa // The effective frame count: center-padded STFT gives ~n_samples/hop_length frames. // We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames. 
- const int n_eff = std::min(mel_full.n_len, - (int)(n_samples / hparams.audio_hop_len) + 1); + const int64_t n_eff = std::min(mel_full.n_len, + (int64_t)(n_samples / hparams.audio_hop_len) + 1); // Split into inference windows matching n_window_infer=800 from model config. // Each window is padded to the next multiple of chunk_size for the cgraph. @@ -690,18 +702,18 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa const int chunk_size = 100; // conv sub-chunk size (n_window * 2, n_window=50) const int window_size = 800; // mel frames per forward pass (n_window_infer=800) - for (int off = 0; off < n_eff; off += window_size) { - const int win_eff = std::min(window_size, n_eff - off); - const int n_chunks = (win_eff + chunk_size - 1) / chunk_size; - const int n_padded = n_chunks * chunk_size; + for (int64_t off = 0; off < n_eff; off += window_size) { + const int64_t win_eff = std::min((int64_t)window_size, n_eff - off); + const int64_t n_chunks = (win_eff + chunk_size - 1) / chunk_size; + const int64_t n_padded = n_chunks * chunk_size; mtmd_audio_mel out; out.n_mel = mel_full.n_mel; out.n_len = n_padded; out.n_len_org = win_eff; - out.data.assign(out.n_mel * out.n_len, 0.0f); - for (int m = 0; m < out.n_mel; m++) { - const int copy_len = std::min(win_eff, mel_full.n_len - off); + out.data.assign((size_t)out.n_mel * (size_t)out.n_len, 0.0f); + for (int64_t m = 0; m < out.n_mel; m++) { + const int64_t copy_len = std::min((int64_t)win_eff, mel_full.n_len - off); if (copy_len > 0) { std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off, mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len, @@ -823,37 +835,38 @@ bool mtmd_audio_preprocessor_granite_speech::preprocess(const float * } double mmax = -1e20; - for (int i = 0; i < mel.n_mel * mel.n_len; i++) { + const size_t mel_size = (size_t)mel.n_mel * (size_t)mel.n_len; + for (size_t i = 0; i < mel_size; i++) { if (mel.data[i] > mmax) { mmax = mel.data[i]; } } mmax -= 8.0; - 
for (int i = 0; i < mel.n_mel * mel.n_len; i++) { + for (size_t i = 0; i < mel_size; i++) { if (mel.data[i] < mmax) { mel.data[i] = mmax; } mel.data[i] = (mel.data[i] + 4.0) / 4.0; } - int n_frames = mel.n_len; + int64_t n_frames = mel.n_len; if (n_frames % 2 == 1) { n_frames--; } - const int n_mel = mel.n_mel; - const int n_stacked = n_frames / 2; + const int64_t n_mel = mel.n_mel; + const int64_t n_stacked = n_frames / 2; mtmd_audio_mel stacked; stacked.n_mel = 2 * n_mel; stacked.n_len = n_stacked; - stacked.n_len_org = (int)n_samples; - stacked.data.resize(2 * n_mel * n_stacked); + stacked.n_len_org = (int64_t)n_samples; + stacked.data.resize((size_t)2 * (size_t)n_mel * (size_t)n_stacked); - for (int t = 0; t < n_stacked; t++) { - for (int m = 0; m < n_mel; m++) { - stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t]; - stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1]; + for (int64_t t = 0; t < n_stacked; t++) { + for (int64_t m = 0; m < n_mel; m++) { + stacked.data[(size_t)m * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t]; + stacked.data[(size_t)(m + n_mel) * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t + 1]; } } @@ -921,8 +934,8 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s const int hop = hparams.audio_hop_len; const int n_with_left = (int)chunk_len + pad_left; // PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform - const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1; - const int n_padded_needed = (pt_frames - 1) * hop + fft_size; + const int64_t pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1; + const int64_t n_padded_needed = (pt_frames - 1) * hop + fft_size; const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left); std::vector padded_samples(total_pad + chunk_len, 0.0f); std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left); diff --git 
a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 9656e3940f..ad96bd847c 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -10,16 +10,16 @@ #define MTMD_INTERNAL_HEADER struct mtmd_audio_mel { - int n_len; - int n_len_org; - int n_mel; + int64_t n_len; + int64_t n_len_org; + int64_t n_mel; std::vector data; }; struct mtmd_audio_mel_filters { - int32_t n_mel; - int32_t n_fft; + int64_t n_mel; + int64_t n_fft; std::vector data; }; @@ -39,8 +39,8 @@ struct mtmd_audio_cache { // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. - void fill_mel_filterbank_matrix(int n_mel, - int n_fft, + void fill_mel_filterbank_matrix(int64_t n_mel, + int64_t n_fft, int sample_rate, // e.g. 16000 float fmin = 0.0f, // e.g. 0.0 float fmax = -1.0f, // e.g. sr/2; pass -1 for auto diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index abba2ebf2c..cbaac1d377 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1295,9 +1295,12 @@ struct mtmd_tokenizer { for (auto & mel_spec : mel_spec_chunks) { const bool is_placeholder = mel_spec.data.empty(); + // Validate dimensions fit in clip_image_size (int) + GGML_ASSERT(mel_spec.n_len <= INT32_MAX && mel_spec.n_len >= 0); + GGML_ASSERT(mel_spec.n_mel <= INT32_MAX && mel_spec.n_mel >= 0); clip_image_f32 mel_f32; mel_f32.set_size( - {mel_spec.n_len, mel_spec.n_mel}, + {(int)mel_spec.n_len, (int)mel_spec.n_mel}, is_placeholder, /* is_audio */ true); mel_f32.cpy_buf(mel_spec.data);