mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
mtmd: several bug fixes (#24784)
* mtmd: several bug fixes
* fix build
* fix gemma4ua
* add sanity check in get_u32()
* fix build (2)
* area() avoid overflow
This commit is contained in:
parent
b14e3fb90c
commit
e2e7a9b2d0
@ -1675,6 +1675,9 @@ struct clip_model_loader {
|
||||
// note: some models having hparams.image_size == 0, which means the image size is dynamic
|
||||
throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
|
||||
}
|
||||
if (hparams.image_size > 65536) {
|
||||
throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size));
|
||||
}
|
||||
if (hparams.patch_size <= 0) {
|
||||
throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
|
||||
}
|
||||
@ -1723,6 +1726,19 @@ struct clip_model_loader {
|
||||
LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
|
||||
LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
|
||||
LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
|
||||
|
||||
// GEMMA4UA is encoder-free: it uses n_mel_bins as a raw-waveform frame size (640) and has no FFT/filterbank, so the mel-range and FFT
|
||||
// checks below do not apply to it.
|
||||
const bool fft_based = model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
|
||||
|
||||
// Validate audio hparams loaded from GGUF metadata
|
||||
if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
|
||||
throw std::runtime_error(string_format("%s: n_mel_bins (%d) must be in range [1, 256]\n", __func__, hparams.n_mel_bins));
|
||||
}
|
||||
if (fft_based && (hparams.audio_sample_rate <= 0 || hparams.audio_n_fft <= 0 || hparams.audio_hop_len <= 0 || hparams.audio_window_len <= 0)) {
|
||||
throw std::runtime_error(string_format("%s: audio hparams invalid: sample_rate=%d n_fft=%d window_len=%d hop_len=%d\n",
|
||||
__func__, hparams.audio_sample_rate, hparams.audio_n_fft, hparams.audio_window_len, hparams.audio_hop_len));
|
||||
}
|
||||
}
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
@ -2831,6 +2847,12 @@ struct clip_model_loader {
|
||||
img.set_size({sz, sz}, false, false);
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
|
||||
} else {
|
||||
// GEMMA4UA uses n_mel_bins as a raw-waveform frame size (640), not a mel-bin count,
|
||||
// so the [1, 256] bound only applies to FFT-based models.
|
||||
const bool fft_based = ctx_clip.model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
|
||||
if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
|
||||
throw std::runtime_error(string_format("%s: invalid n_mel_bins (%d), must be in [1, 256]\n", __func__, hparams.n_mel_bins));
|
||||
}
|
||||
img.set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
|
||||
}
|
||||
@ -2994,7 +3016,13 @@ struct clip_model_loader {
|
||||
}
|
||||
return;
|
||||
}
|
||||
output = gguf_get_val_u32(ctx_gguf.get(), i);
|
||||
const uint32_t val = gguf_get_val_u32(ctx_gguf.get(), i);
|
||||
// sanity check
|
||||
if (val > (uint32_t) INT32_MAX) {
|
||||
throw std::runtime_error(string_format("%s: value %u for key '%s' exceeds INT32_MAX\n",
|
||||
__func__, val, key.c_str()));
|
||||
}
|
||||
output = (int) val;
|
||||
}
|
||||
|
||||
void get_f32(const std::string & key, float & output, bool required = true) const {
|
||||
|
||||
@ -24,6 +24,9 @@ struct clip_image_size {
|
||||
return !(*this == other);
|
||||
}
|
||||
// Returns width * height as an int.
// Both dimensions are asserted to lie in [0, 46000] before multiplying:
// 46000 * 46000 = 2,116,000,000, which is below INT32_MAX (2,147,483,647),
// so the product cannot overflow (assuming int is at least 32 bits wide).
int area() const {
|
||||
// avoid overflow when computing area
|
||||
GGML_ASSERT(width >= 0 && width <= 46000);
|
||||
GGML_ASSERT(height >= 0 && height <= 46000);
|
||||
return width * height;
|
||||
}
|
||||
};
|
||||
|
||||
@ -32,8 +32,8 @@ void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
void mtmd_audio_cache::fill_mel_filterbank_matrix(int64_t n_mel,
|
||||
int64_t n_fft,
|
||||
int sample_rate,
|
||||
float fmin,
|
||||
float fmax,
|
||||
@ -86,11 +86,16 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
|
||||
hz_pts[i] = mel_to_hz(mel_pts[i]);
|
||||
}
|
||||
|
||||
const int n_fft_bins = n_fft / 2 + 1;
|
||||
const int64_t n_fft_bins = n_fft / 2 + 1;
|
||||
|
||||
// Validate allocation size
|
||||
if ((size_t)n_mel * (size_t)n_fft_bins > SIZE_MAX) {
|
||||
GGML_ASSERT(false && "mel filterbank allocation too large");
|
||||
}
|
||||
|
||||
// filterbank
|
||||
std::vector<float> out(n_mel * n_fft_bins, 0);
|
||||
for (int m = 0; m < n_mel; ++m) {
|
||||
std::vector<float> out((size_t)n_mel * (size_t)n_fft_bins, 0);
|
||||
for (int64_t m = 0; m < n_mel; ++m) {
|
||||
const double f_left = hz_pts[m];
|
||||
const double f_center = hz_pts[m + 1];
|
||||
const double f_right = hz_pts[m + 2];
|
||||
@ -266,8 +271,8 @@ static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out)
|
||||
}
|
||||
|
||||
struct filter_params {
|
||||
int32_t n_mel;
|
||||
int32_t n_fft_bins;
|
||||
int64_t n_mel;
|
||||
int64_t n_fft_bins;
|
||||
int32_t hann_window_size;
|
||||
int32_t hop_length;
|
||||
int32_t sample_rate;
|
||||
@ -293,8 +298,8 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
std::vector<float> fft_in(frame_size * 2, 0.0);
|
||||
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
|
||||
|
||||
int n_fft_bins = params.n_fft_bins;
|
||||
int i = ith;
|
||||
int64_t n_fft_bins = params.n_fft_bins;
|
||||
int64_t i = ith;
|
||||
|
||||
const auto & filters = cache.filters;
|
||||
|
||||
@ -302,17 +307,18 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
|
||||
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
|
||||
// calculate FFT only when fft_in are not all zero
|
||||
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
|
||||
const int offset = i * frame_step;
|
||||
for (; i < std::min((int64_t)(n_samples / frame_step + 1), out.n_len); i += n_threads) {
|
||||
const int64_t offset = i * frame_step;
|
||||
|
||||
// apply Hann window (~10% faster)
|
||||
for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
|
||||
const int valid_len = std::min(frame_size, std::max(0, n_samples - (int)offset));
|
||||
for (int j = 0; j < valid_len; j++) {
|
||||
fft_in[j] = hann[j] * samples[offset + j];
|
||||
}
|
||||
|
||||
// fill the rest with zeros
|
||||
if (n_samples - offset < frame_size) {
|
||||
std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
|
||||
if (valid_len < frame_size) {
|
||||
std::fill(fft_in.begin() + valid_len, fft_in.end(), 0.0);
|
||||
}
|
||||
|
||||
// FFT
|
||||
@ -325,7 +331,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
}
|
||||
|
||||
// mel spectrogram
|
||||
for (int j = 0; j < out.n_mel; j++) {
|
||||
for (int64_t j = 0; j < out.n_mel; j++) {
|
||||
double sum = 0.0;
|
||||
// unroll loop (suggested by GH user @lunixbochs)
|
||||
int k = 0;
|
||||
@ -339,21 +345,21 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
}
|
||||
// handle n_fft remainder
|
||||
for (; k < n_fft_bins; k++) {
|
||||
sum += fft_out[k] * filters.data[j * n_fft_bins + k];
|
||||
sum += fft_out[k] * filters.data[(size_t)j * n_fft_bins + k];
|
||||
}
|
||||
sum = std::max(sum, (double)params.mel_floor);
|
||||
sum = params.use_natural_log
|
||||
? log(sum)
|
||||
: log10(sum);
|
||||
out.data[j * out.n_len + i] = sum;
|
||||
out.data[(size_t)j * out.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise fft_out are all zero
|
||||
double sum = params.use_natural_log ? log(1e-10) : log10(1e-10);
|
||||
for (; i < out.n_len; i += n_threads) {
|
||||
for (int j = 0; j < out.n_mel; j++) {
|
||||
out.data[j * out.n_len + i] = sum;
|
||||
for (int64_t j = 0; j < out.n_mel; j++) {
|
||||
out.data[(size_t)j * out.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -437,16 +443,21 @@ static bool log_mel_spectrogram(
|
||||
GGML_ASSERT(params.hop_length > 0);
|
||||
out.n_mel = params.n_mel;
|
||||
out.n_len = (n_samples - frame_size) / frame_step + 1;
|
||||
// TODO: handle these checks better
|
||||
if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) {
|
||||
LOG_ERR("%s: size overflow\n", __func__);
|
||||
// Validate dimensions before allocation to prevent integer overflow
|
||||
if (out.n_mel <= 0 || out.n_len <= 0) {
|
||||
LOG_ERR("%s: invalid mel dimensions n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
|
||||
return false;
|
||||
}
|
||||
const size_t total_size = (size_t)out.n_mel * (size_t)out.n_len;
|
||||
if (total_size > SIZE_MAX / sizeof(float)) {
|
||||
LOG_ERR("%s: size overflow: n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
|
||||
return false;
|
||||
}
|
||||
if (n_samples < frame_size) {
|
||||
LOG_ERR("%s: not enough samples after padding\n", __func__);
|
||||
return false;
|
||||
}
|
||||
out.data.resize(out.n_mel * out.n_len);
|
||||
out.data.resize(total_size);
|
||||
|
||||
{
|
||||
std::vector<std::thread> workers(n_threads - 1);
|
||||
@ -464,38 +475,39 @@ static bool log_mel_spectrogram(
|
||||
}
|
||||
}
|
||||
|
||||
const int effective_n_len = n_samples_in / frame_step;
|
||||
const int64_t effective_n_len = n_samples_in / frame_step;
|
||||
if (params.norm_per_feature) {
|
||||
GGML_ASSERT(effective_n_len > 1);
|
||||
for (int i = 0; i < out.n_mel; i++) {
|
||||
for (int64_t i = 0; i < out.n_mel; i++) {
|
||||
double mean = 0;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
mean += out.data[i * out.n_len + j];
|
||||
for (int64_t j = 0; j < effective_n_len; ++j) {
|
||||
mean += out.data[(size_t)i * out.n_len + j];
|
||||
}
|
||||
mean /= effective_n_len;
|
||||
|
||||
double var = 0.0;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
const double value = out.data[i * out.n_len + j] - mean;
|
||||
for (int64_t j = 0; j < effective_n_len; ++j) {
|
||||
const double value = out.data[(size_t)i * out.n_len + j] - mean;
|
||||
var += value * value;
|
||||
}
|
||||
var /= effective_n_len - 1; // unbiased
|
||||
const double mstd = std::sqrt(var + 1e-5);
|
||||
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
auto &value = out.data[i * out.n_len + j];
|
||||
for (int64_t j = 0; j < effective_n_len; ++j) {
|
||||
auto &value = out.data[(size_t)i * out.n_len + j];
|
||||
value = (value - mean) / mstd;
|
||||
}
|
||||
|
||||
// pad the rest with zeros
|
||||
for (int j = effective_n_len; j < out.n_len; ++j) {
|
||||
out.data[i * out.n_len + j] = 0.0;
|
||||
for (int64_t j = effective_n_len; j < out.n_len; ++j) {
|
||||
out.data[(size_t)i * out.n_len + j] = 0.0;
|
||||
}
|
||||
}
|
||||
} else if (!params.no_padding) {
|
||||
// Whisper-style clamping and normalization (NOT used by Gemma4)
|
||||
double mmax = -1e20;
|
||||
for (int i = 0; i < out.n_mel*out.n_len; i++) {
|
||||
const size_t mel_size = (size_t)out.n_mel * (size_t)out.n_len;
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
if (out.data[i] > mmax) {
|
||||
mmax = out.data[i];
|
||||
}
|
||||
@ -503,7 +515,7 @@ static bool log_mel_spectrogram(
|
||||
|
||||
mmax -= 8.0;
|
||||
|
||||
for (int i = 0; i < out.n_mel*out.n_len; i++) {
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
if (out.data[i] < mmax) {
|
||||
out.data[i] = mmax;
|
||||
}
|
||||
@ -582,13 +594,13 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
// because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
|
||||
// we always expect the mel to have 3000 silent frames at the end
|
||||
if (DEBUG) {
|
||||
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
|
||||
printf("output: n_mel = %d, n_len = %d\n", (int) out_full.n_mel, (int) out_full.n_len);
|
||||
}
|
||||
const size_t frames_per_chunk = 3000;
|
||||
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
|
||||
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
|
||||
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
|
||||
if ((size_t) n_len < frames_per_chunk) {
|
||||
int64_t n_len = std::min((int64_t)frames_per_chunk, out_full.n_len - (int64_t)off);
|
||||
if (n_len < (int64_t)frames_per_chunk) {
|
||||
break; // last incomplete chunk will always be a padded chunk, safe to ignore
|
||||
}
|
||||
|
||||
@ -596,10 +608,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
out_chunk.n_len = n_len;
|
||||
out_chunk.n_mel = out_full.n_mel;
|
||||
out_chunk.n_len_org = out_full.n_mel; // unused
|
||||
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
|
||||
out_chunk.data.reserve((size_t)out_chunk.n_mel * (size_t)out_chunk.n_len);
|
||||
|
||||
for (int i = 0; i < out_full.n_mel; i++) {
|
||||
auto src = out_full.data.begin() + i * out_full.n_len + off;
|
||||
for (int64_t i = 0; i < out_full.n_mel; i++) {
|
||||
auto src = out_full.data.begin() + (size_t)i * out_full.n_len + off;
|
||||
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
|
||||
}
|
||||
|
||||
@ -681,8 +693,8 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa
|
||||
|
||||
// The effective frame count: center-padded STFT gives ~n_samples/hop_length frames.
|
||||
// We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames.
|
||||
const int n_eff = std::min(mel_full.n_len,
|
||||
(int)(n_samples / hparams.audio_hop_len) + 1);
|
||||
const int64_t n_eff = std::min(mel_full.n_len,
|
||||
(int64_t)(n_samples / hparams.audio_hop_len) + 1);
|
||||
|
||||
// Split into inference windows matching n_window_infer=800 from model config.
|
||||
// Each window is padded to the next multiple of chunk_size for the cgraph.
|
||||
@ -690,18 +702,18 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa
|
||||
const int chunk_size = 100; // conv sub-chunk size (n_window * 2, n_window=50)
|
||||
const int window_size = 800; // mel frames per forward pass (n_window_infer=800)
|
||||
|
||||
for (int off = 0; off < n_eff; off += window_size) {
|
||||
const int win_eff = std::min(window_size, n_eff - off);
|
||||
const int n_chunks = (win_eff + chunk_size - 1) / chunk_size;
|
||||
const int n_padded = n_chunks * chunk_size;
|
||||
for (int64_t off = 0; off < n_eff; off += window_size) {
|
||||
const int64_t win_eff = std::min((int64_t)window_size, n_eff - off);
|
||||
const int64_t n_chunks = (win_eff + chunk_size - 1) / chunk_size;
|
||||
const int64_t n_padded = n_chunks * chunk_size;
|
||||
|
||||
mtmd_audio_mel out;
|
||||
out.n_mel = mel_full.n_mel;
|
||||
out.n_len = n_padded;
|
||||
out.n_len_org = win_eff;
|
||||
out.data.assign(out.n_mel * out.n_len, 0.0f);
|
||||
for (int m = 0; m < out.n_mel; m++) {
|
||||
const int copy_len = std::min(win_eff, mel_full.n_len - off);
|
||||
out.data.assign((size_t)out.n_mel * (size_t)out.n_len, 0.0f);
|
||||
for (int64_t m = 0; m < out.n_mel; m++) {
|
||||
const int64_t copy_len = std::min((int64_t)win_eff, mel_full.n_len - off);
|
||||
if (copy_len > 0) {
|
||||
std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off,
|
||||
mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len,
|
||||
@ -823,37 +835,38 @@ bool mtmd_audio_preprocessor_granite_speech::preprocess(const float *
|
||||
}
|
||||
|
||||
double mmax = -1e20;
|
||||
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
|
||||
const size_t mel_size = (size_t)mel.n_mel * (size_t)mel.n_len;
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
if (mel.data[i] > mmax) {
|
||||
mmax = mel.data[i];
|
||||
}
|
||||
}
|
||||
mmax -= 8.0;
|
||||
|
||||
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
if (mel.data[i] < mmax) {
|
||||
mel.data[i] = mmax;
|
||||
}
|
||||
mel.data[i] = (mel.data[i] + 4.0) / 4.0;
|
||||
}
|
||||
|
||||
int n_frames = mel.n_len;
|
||||
int64_t n_frames = mel.n_len;
|
||||
if (n_frames % 2 == 1) {
|
||||
n_frames--;
|
||||
}
|
||||
const int n_mel = mel.n_mel;
|
||||
const int n_stacked = n_frames / 2;
|
||||
const int64_t n_mel = mel.n_mel;
|
||||
const int64_t n_stacked = n_frames / 2;
|
||||
|
||||
mtmd_audio_mel stacked;
|
||||
stacked.n_mel = 2 * n_mel;
|
||||
stacked.n_len = n_stacked;
|
||||
stacked.n_len_org = (int)n_samples;
|
||||
stacked.data.resize(2 * n_mel * n_stacked);
|
||||
stacked.n_len_org = (int64_t)n_samples;
|
||||
stacked.data.resize((size_t)2 * (size_t)n_mel * (size_t)n_stacked);
|
||||
|
||||
for (int t = 0; t < n_stacked; t++) {
|
||||
for (int m = 0; m < n_mel; m++) {
|
||||
stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t];
|
||||
stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1];
|
||||
for (int64_t t = 0; t < n_stacked; t++) {
|
||||
for (int64_t m = 0; m < n_mel; m++) {
|
||||
stacked.data[(size_t)m * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t];
|
||||
stacked.data[(size_t)(m + n_mel) * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t + 1];
|
||||
}
|
||||
}
|
||||
|
||||
@ -921,8 +934,8 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s
|
||||
const int hop = hparams.audio_hop_len;
|
||||
const int n_with_left = (int)chunk_len + pad_left;
|
||||
// PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform
|
||||
const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
|
||||
const int n_padded_needed = (pt_frames - 1) * hop + fft_size;
|
||||
const int64_t pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
|
||||
const int64_t n_padded_needed = (pt_frames - 1) * hop + fft_size;
|
||||
const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left);
|
||||
std::vector<float> padded_samples(total_pad + chunk_len, 0.0f);
|
||||
std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left);
|
||||
|
||||
@ -10,16 +10,16 @@
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
// Mel spectrogram buffer passed between the audio preprocessors and the tokenizer.
// NOTE(review): this is a diff rendering, so each size field is shown twice; the `int`
// declarations appear to be the lines removed by the commit and the `int64_t`
// declarations their replacements (hunk header is -10,16 +10,16).
struct mtmd_audio_mel {
|
||||
int n_len;
|
||||
int n_len_org;
|
||||
int n_mel;
|
||||
// number of frames; log_mel_spectrogram sets it to (n_samples - frame_size) / frame_step + 1
int64_t n_len;
|
||||
// NOTE(review): meaning varies by caller in this diff (window length before padding,
// input sample count, or marked "unused") - confirm before relying on it
int64_t n_len_org;
|
||||
// number of rows in `data`; log_mel_spectrogram copies it from params.n_mel
int64_t n_mel;
|
||||
|
||||
// flat buffer of n_mel * n_len values, indexed elsewhere in this diff as data[mel * n_len + frame]
std::vector<float> data;
|
||||
};
|
||||
|
||||
// Mel filterbank weights used when turning FFT output into mel bins.
// NOTE(review): this is a diff rendering, so each size field is shown twice; the `int32_t`
// declarations appear to be the lines removed by the commit and the `int64_t`
// declarations their replacements.
struct mtmd_audio_mel_filters {
|
||||
int32_t n_mel;
|
||||
int32_t n_fft;
|
||||
// number of mel rows in `data`
int64_t n_mel;
|
||||
// NOTE(review): the worker thread indexes rows with n_fft_bins (n_fft / 2 + 1); whether this
// field stores the FFT size or the bin count is not visible here - confirm
int64_t n_fft;
|
||||
|
||||
// flat weight matrix, read in this diff as data[mel * n_fft_bins + bin]
std::vector<float> data;
|
||||
};
|
||||
@ -39,8 +39,8 @@ struct mtmd_audio_cache {
|
||||
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
void fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
void fill_mel_filterbank_matrix(int64_t n_mel,
|
||||
int64_t n_fft,
|
||||
int sample_rate, // e.g. 16000
|
||||
float fmin = 0.0f, // e.g. 0.0
|
||||
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
|
||||
|
||||
@ -1295,9 +1295,12 @@ struct mtmd_tokenizer {
|
||||
for (auto & mel_spec : mel_spec_chunks) {
|
||||
const bool is_placeholder = mel_spec.data.empty();
|
||||
|
||||
// Validate dimensions fit in clip_image_size (int)
|
||||
GGML_ASSERT(mel_spec.n_len <= INT32_MAX && mel_spec.n_len >= 0);
|
||||
GGML_ASSERT(mel_spec.n_mel <= INT32_MAX && mel_spec.n_mel >= 0);
|
||||
clip_image_f32 mel_f32;
|
||||
mel_f32.set_size(
|
||||
{mel_spec.n_len, mel_spec.n_mel},
|
||||
{(int)mel_spec.n_len, (int)mel_spec.n_mel},
|
||||
is_placeholder, /* is_audio */ true);
|
||||
mel_f32.cpy_buf(mel_spec.data);
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user