mtmd, model: allow skip build_vit() (#24077)

* add model

* nits
This commit is contained in:
Xuan-Son Nguyen 2026-06-03 17:10:35 +02:00 committed by GitHub
parent ee4cf705bb
commit a731805ced
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 363 additions and 3 deletions

View File

@ -77,6 +77,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
"Gemma3nForConditionalGeneration": "gemma",
"Gemma4ForConditionalGeneration": "gemma",
"Gemma4ForCausalLM": "gemma",
"Gemma4UnifiedForConditionalGeneration": "gemma",
"GemmaForCausalLM": "gemma",
"Glm4ForCausalLM": "glm",
"Glm4MoeForCausalLM": "glm",
@ -247,6 +248,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
"Gemma3ForConditionalGeneration": "gemma",
"Gemma3nForConditionalGeneration": "gemma",
"Gemma4ForConditionalGeneration": "gemma",
"Gemma4UnifiedForConditionalGeneration": "gemma",
"Glm4vForConditionalGeneration": "qwen3vl",
"Glm4vMoeForConditionalGeneration": "qwen3vl",
"GlmOcrForConditionalGeneration": "qwen3vl",

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import json
import re
from typing import Callable, Iterable, TYPE_CHECKING
from typing import Callable, Iterable, TYPE_CHECKING, Sequence
import torch
@ -765,6 +765,26 @@ class Gemma4Model(Gemma3Model):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
class Gemma4UnifiedModel(Gemma4Model):
    """Text-model converter for ``Gemma4UnifiedForConditionalGeneration`` checkpoints.

    Behaves like ``Gemma4Model`` and additionally exports the
    ``suppress_tokens`` list found in ``generation_config.json`` (if any).
    """

    model_arch = gguf.MODEL_ARCH.GEMMA4

    def _get_suppress_tokens(self) -> Sequence[int] | None:
        """Return ``suppress_tokens`` from ``generation_config.json``.

        Returns ``None`` when the file does not exist or carries no such entry.
        """
        config_file = self.dir_model / "generation_config.json"
        if not config_file.is_file():
            return None
        with open(config_file, encoding="utf-8") as fp:
            return json.load(fp).get("suppress_tokens")

    def set_gguf_parameters(self):
        """Write the parent's parameters, then the suppress-token list when present."""
        super().set_gguf_parameters()
        token_ids = self._get_suppress_tokens()
        if token_ids is None:
            return
        self.gguf_writer.add_suppress_tokens(token_ids)
@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4VisionAudioModel(MmprojModel):
has_audio_encoder = True
@ -839,3 +859,61 @@ class Gemma4VisionAudioModel(MmprojModel):
data_torch = data_torch.permute(0, 3, 1, 2).contiguous()
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
yield (mapped_name, data_torch)
@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
    """mmproj converter for ``Gemma4UnifiedForConditionalGeneration`` checkpoints.

    Extends ``Gemma4VisionAudioModel``: fills in placeholder hparams, selects the
    GEMMA4UV / GEMMA4UA projector types and renames or permutes the embedder
    tensors before delegating to the parent conversion.
    """

    has_audio_encoder = True
    has_vision_encoder = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        assert self.hparams_audio is not None
        text_embd_dim = self.hparams_vision["mm_embed_dim"]
        self.hparams_vision["hidden_size"] = text_embd_dim
        self.hparams_audio["hidden_size"] = text_embd_dim
        # this is a transformer-less vision tower, the params below are redundant but set to avoid error
        for hparams in (self.hparams_vision, self.hparams_audio):
            for key in ("intermediate_size", "num_layers", "num_attention_heads"):
                hparams[key] = 0

    def set_gguf_parameters(self):
        """Write the parent's parameters plus the "unified" vision/audio projector types."""
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4UV)
        self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4UA)

    def _hwc_to_chw_perm(self) -> torch.Tensor:
        """Return ``perm`` such that ``perm[i]`` is the HWC index for CHW position ``i``.

        ggml im2col outputs patches in RR..GG..BB.. (CHW) order while the
        checkpoint stores per-patch values as RGBRGB.. (HWC). Indexing with
        this permutation reorders HWC data into CHW order. The patch is
        ``model_patch_size`` x ``model_patch_size`` with 3 channels.
        """
        assert self.hparams_vision is not None
        p = self.hparams_vision["model_patch_size"]
        i = torch.arange(p * p * 3)
        ch = i // (p * p)
        row = (i % (p * p)) // p
        col = i % p
        return row * p * 3 + col * 3 + ch

    def modify_tensors(self, data_torch, name, bid):
        """Rename/permute the unified embedder tensors, then defer to the parent.

        Returns whatever the parent's ``modify_tensors`` returns for the
        adjusted ``(data_torch, name, bid)``.
        """
        if name.endswith("pos_embedding"):
            name += ".weight"
            data_torch = data_torch.permute(1, 0, 2)
        elif ".pos_norm." in name:
            # rename to patch_ln3 to reuse the tensor name scheme
            name = name.replace(".pos_norm.", ".patch_ln3.")
        elif "patch_dense.weight" in name:
            # permute columns so column i aligns with CHW input position i
            data_torch = data_torch[:, self._hwc_to_chw_perm()]
        elif "patch_ln1.weight" in name or "patch_ln1.bias" in name:
            # same permutation for patch_ln1 as patch_dense to align with CHW input order
            data_torch = data_torch[self._hwc_to_chw_perm()]
        return super().modify_tensors(data_torch, name, bid)

View File

@ -264,6 +264,7 @@ class Keys:
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
SUPPRESS_TOKENS = "tokenizer.ggml.suppress_tokens"
HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"
CHAT_TEMPLATE = "tokenizer.chat_template"
@ -731,6 +732,7 @@ class MODEL_TENSOR(IntEnum):
V_ENC_EMBD_CLS = auto()
V_ENC_EMBD_PATCH = auto()
V_ENC_EMBD_NORM = auto()
V_ENC_EMBD_PATCH_NORM = auto() # allow multiple norms in the same embd, e.g. for gemma4u
V_ENC_EMBD_POS = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_ATTN_QKV = auto()
@ -1250,6 +1252,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd",
MODEL_TENSOR.V_ENC_EMBD_PATCH_NORM: "v.patch_norm.{bid}",
MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
@ -1431,6 +1434,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_NORM,
MODEL_TENSOR.V_ENC_EMBD_PATCH_NORM,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_EMBD_IMGNL,
MODEL_TENSOR.V_ENC_EMBD_VSEP,
@ -4346,6 +4350,8 @@ class VisionProjectorType:
GEMMA3NA = "gemma3na"
GEMMA4V = "gemma4v"
GEMMA4A = "gemma4a"
GEMMA4UV = "gemma4uv" # "unified" variant
GEMMA4UA = "gemma4ua" # "unified" variant
PHI4 = "phi4"
IDEFICS3 = "idefics3"
PIXTRAL = "pixtral"

View File

@ -1113,6 +1113,9 @@ class GGUFWriter:
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def add_suppress_tokens(self, tokens: Sequence[int]) -> None:
    """Store ``tokens`` as an array under ``Keys.Tokenizer.SUPPRESS_TOKENS``."""
    key = Keys.Tokenizer.SUPPRESS_TOKENS
    self.add_array(key, tokens)
def add_normalizer_lowercase(self, value: bool) -> None:
    """Store ``value`` as a bool under ``Keys.Tokenizer.NORMALIZER_LOWERCASE``."""
    key = Keys.Tokenizer.NORMALIZER_LOWERCASE
    self.add_bool(key, value)

View File

@ -1426,6 +1426,7 @@ class TensorNameMap:
"model.vision_tower.patch_embedder.input_proj", # gemma4
"vision_tower.patch_embed.patchifier.proj", # dots.ocr
"vision_model.conv1", # Step3-VL
"model.vision_embedder.patch_dense", # gemma4 unified
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@ -1433,6 +1434,10 @@ class TensorNameMap:
"vision_tower.patch_embed.patchifier.norm", # dots.ocr
),
MODEL_TENSOR.V_ENC_EMBD_PATCH_NORM: (
"model.vision_embedder.patch_ln{bid}", # gemma4 unified
),
MODEL_TENSOR.V_ENC_EMBD_POS: (
"vision_tower.vision_model.embeddings.position_embedding",
"model.vision_tower.embeddings.position_embedding", # minicpmv4_6
@ -1448,6 +1453,7 @@ class TensorNameMap:
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
"model.vision_tower.patch_embedder.position_embedding_table", # gemma4
"vision_model.positional_embedding", # Step3-VL
"model.vision_embedder.pos_embedding", # gemma4 unified
),
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (

View File

@ -329,6 +329,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },

View File

@ -318,6 +318,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_FIM_PAD_ID,
LLM_KV_TOKENIZER_FIM_REP_ID,
LLM_KV_TOKENIZER_FIM_SEP_ID,
LLM_KV_TOKENIZER_SUPPRESS_TOKENS,
LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,

View File

@ -1815,6 +1815,8 @@ struct llama_vocab::impl {
// set of all tokens that cause "end of generation"
std::set<llama_token> special_eog_ids;
std::vector<llama_token> suppress_tokens;
std::unique_ptr<llm_tokenizer> tokenizer;
std::vector<char> precompiled_charsmap;
@ -2533,6 +2535,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// Lowercase normalizer flag (consulted by WPM / whitespace BPE)
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
// suppress tokens (optional): token ids that must never be produced
{
    const auto suppress_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SUPPRESS_TOKENS).c_str());
    if (suppress_idx != -1) {
        // the raw array data is reinterpreted as int32 below, so refuse any other KV layout
        // instead of silently reading garbage token ids
        if (gguf_get_kv_type(ctx, suppress_idx) != GGUF_TYPE_ARRAY ||
            gguf_get_arr_type(ctx, suppress_idx) != GGUF_TYPE_INT32) {
            throw std::runtime_error("tokenizer.ggml.suppress_tokens must be an array of int32 values");
        }
        const size_t n = gguf_get_arr_n(ctx, suppress_idx);
        const int32_t * data = (const int32_t *) gguf_get_arr_data(ctx, suppress_idx);
        suppress_tokens.assign(data, data + n);
    }
}
// auto-detect special tokens by text
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
// for now, we apply this workaround to find the tokens based on their text
@ -3961,6 +3973,10 @@ bool llama_vocab::get_normalizer_lowercase() const {
return pimpl->normalizer_lowercase;
}
// Token ids stored in the vocab's suppress list; empty when nothing was loaded into it.
const std::vector<llama_token> & llama_vocab::get_suppress_tokens() const {
    const auto & ids = pimpl->suppress_tokens;
    return ids;
}
// Returns the max_token_len value held by the vocab implementation.
int llama_vocab::max_token_len() const {
    const auto & state = *pimpl;
    return state.max_token_len;
}

View File

@ -143,6 +143,8 @@ struct llama_vocab {
bool get_treat_whitespace_as_suffix() const;
bool get_normalizer_lowercase () const;
const std::vector<llama_token> & get_suppress_tokens() const;
int max_token_len() const;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;

View File

@ -142,6 +142,31 @@ static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, in
idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
}
// TODO @ngxson : maybe improve this in the future
class llm_graph_input_logits_bias : public llm_graph_input_i {
public:
llm_graph_input_logits_bias(const llama_vocab & vocab) {
arr.resize(vocab.n_tokens(), 0.0f);
for (llama_token id : vocab.get_suppress_tokens()) {
if (0 <= id && id < (int32_t)vocab.n_tokens()) {
arr[id] = -INFINITY;
}
}
}
virtual ~llm_graph_input_logits_bias() = default;
void set_input(const llama_ubatch *) override {
const int64_t n_vocab = arr.size();
ggml_backend_tensor_set(logits_bias, arr.data(), 0, n_vocab*ggml_element_size(logits_bias));
}
// bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * logits_bias = nullptr; // F32 [n_vocab]
std::vector<float> arr;
};
llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
@ -388,6 +413,16 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
}
// apply logits bias if needed (e.g. for gemma4_unified patch)
// this mirrors the suppress_tokens patch in transformers, which prevents the model from outputting <image|> and <audio|> tokens (a known issue related to the checkpoint)
// TODO: maybe handle this inside the sampling system in the future
if (!model.vocab.get_suppress_tokens().empty()) {
    auto inp_bias = std::make_unique<llm_graph_input_logits_bias>(model.vocab);
    inp_bias->logits_bias = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, inp_bias->arr.size());
    // the tensor is filled from the host in set_input(), so flag it as a graph input
    ggml_set_input(inp_bias->logits_bias);
    cur = ggml_add(ctx0, cur, inp_bias->logits_bias);
    res->add_input(std::move(inp_bias));
}
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -21,6 +21,8 @@ add_library(mtmd
models/exaone4_5.cpp
models/gemma4a.cpp
models/gemma4v.cpp
models/gemma4ua.cpp
models/gemma4uv.cpp
models/glm4v.cpp
models/granite-speech.cpp
models/hunyuanvl.cpp

View File

@ -83,6 +83,7 @@
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_NORM_EMBD "v.norm_embd.%s"
#define TN_PATCH_NORM "v.patch_norm.%d.%s"
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@ -317,6 +318,8 @@ enum projector_type {
PROJECTOR_TYPE_GEMMA3NA,
PROJECTOR_TYPE_GEMMA4V,
PROJECTOR_TYPE_GEMMA4A,
PROJECTOR_TYPE_GEMMA4UV,
PROJECTOR_TYPE_GEMMA4UA,
PROJECTOR_TYPE_PHI4,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
@ -369,6 +372,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
{ PROJECTOR_TYPE_GEMMA4V, "gemma4v"},
{ PROJECTOR_TYPE_GEMMA4A, "gemma4a"},
{ PROJECTOR_TYPE_GEMMA4UV, "gemma4uv"},
{ PROJECTOR_TYPE_GEMMA4UA, "gemma4ua"},
{ PROJECTOR_TYPE_PHI4, "phi4"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},

View File

@ -339,6 +339,14 @@ struct clip_model {
ggml_tensor * norm_embd_w = nullptr;
ggml_tensor * norm_embd_b = nullptr;
// "indexed" patch embedding norms
ggml_tensor * patch_norm_1_w = nullptr;
ggml_tensor * patch_norm_1_b = nullptr;
ggml_tensor * patch_norm_2_w = nullptr;
ggml_tensor * patch_norm_2_b = nullptr;
ggml_tensor * patch_norm_3_w = nullptr;
ggml_tensor * patch_norm_3_b = nullptr;
ggml_tensor * pre_ln_w = nullptr;
ggml_tensor * pre_ln_b = nullptr;

View File

@ -866,6 +866,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_gemma4v>(ctx, img);
} break;
case PROJECTOR_TYPE_GEMMA4UV:
{
builder = std::make_unique<clip_graph_gemma4uv>(ctx, img);
} break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
@ -969,6 +973,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_gemma4a>(ctx, img);
} break;
case PROJECTOR_TYPE_GEMMA4UA:
{
builder = std::make_unique<clip_graph_gemma4ua>(ctx, img);
} break;
case PROJECTOR_TYPE_GRANITE_SPEECH:
{
builder = std::make_unique<clip_graph_granite_speech>(ctx, img);
@ -1386,13 +1394,19 @@ struct clip_model_loader {
} break;
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_GEMMA4UV:
{
hparams.rope_theta = 100.0f;
hparams.n_merge = 3; // pooling_kernel_size
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
if (model.proj_type == PROJECTOR_TYPE_GEMMA4UV) {
// for "unified" variant, we directly use a bigger patch size, because the "token merging" is done directly on conv layer
hparams.patch_size = hparams.patch_size * hparams.n_merge;
hparams.n_merge = 1;
}
// @ngxson : the model performs quite poorly with small images, we need to bump minimum image tokens to 40 to avoid that
hparams.set_limit_image_tokens(252, 280);
hparams.set_limit_image_tokens(40, 280);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
@ -1586,6 +1600,14 @@ struct clip_model_loader {
// since all gemma4a models use 1e-6, we just hardcode it here to avoid re-conversion
hparams.eps = 1e-6f;
} break;
case PROJECTOR_TYPE_GEMMA4UA:
{
// Encoder-free: raw 16 kHz waveform chunked into 640-sample frames.
hparams.audio_chunk_len = 0;
hparams.audio_sample_rate = 16000;
hparams.eps = 1e-6f;
hparams.n_mel_bins = 640;
} break;
case PROJECTOR_TYPE_GRANITE_SPEECH:
{
hparams.audio_chunk_len = 0;
@ -2097,6 +2119,16 @@ struct clip_model_loader {
}
}
} break;
case PROJECTOR_TYPE_GEMMA4UV:
{
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.patch_norm_1_w = get_tensor(string_format(TN_PATCH_NORM, 1, "weight"));
model.patch_norm_1_b = get_tensor(string_format(TN_PATCH_NORM, 1, "bias"));
model.patch_norm_2_w = get_tensor(string_format(TN_PATCH_NORM, 2, "weight"));
model.patch_norm_2_b = get_tensor(string_format(TN_PATCH_NORM, 2, "bias"));
model.patch_norm_3_w = get_tensor(string_format(TN_PATCH_NORM, 3, "weight")); // pos_norm
model.patch_norm_3_b = get_tensor(string_format(TN_PATCH_NORM, 3, "bias")); // pos_norm
} break;
case PROJECTOR_TYPE_GEMMA3NV:
{
model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
@ -2510,6 +2542,10 @@ struct clip_model_loader {
}
}
} break;
case PROJECTOR_TYPE_GEMMA4UA:
{
model.mm_input_proj_w = get_tensor(string_format(TN_A_MM_INP_PROJ, "weight"));
} break;
case PROJECTOR_TYPE_LFM2A:
{
for (int i : {0, 2, 3, 5, 6}) {
@ -3218,6 +3254,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_GEMMA4UV:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
@ -3350,6 +3387,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
}
n_patches = n;
} break;
case PROJECTOR_TYPE_GEMMA4UA:
{
n_patches = img->nx; // no downsampling: one token per raw waveform frame
} break;
case PROJECTOR_TYPE_GRANITE_SPEECH:
{
const int ws = ctx->model.hparams.audio_proj_window_size;
@ -3917,6 +3958,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_GEMMA4UV:
{
// set (col, row) patch positions for learned positional embedding
const int n_cols = image_size_width / patch_size;
@ -3998,6 +4040,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_COGVLM:
case PROJECTOR_TYPE_YASA2:
case PROJECTOR_TYPE_GEMMA4UA:
{
// do nothing
} break;
@ -4303,6 +4346,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_GEMMA4UV:
return ctx->model.mm_input_proj_w->ne[1];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.mm_fc_w->ne[1];
@ -4337,7 +4381,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->model.mm_fc_w->ne[1];
case PROJECTOR_TYPE_LFM2A:
return ctx->model.position_embeddings->ne[0];
case PROJECTOR_TYPE_GEMMA4A:
case PROJECTOR_TYPE_GEMMA4UA:
return ctx->model.hparams.projection_dim;
case PROJECTOR_TYPE_GRANITE_SPEECH:
return ctx->model.qf_proj_linear_w->ne[1];

View File

@ -0,0 +1,19 @@
#include "models.h"
#include <cmath>
// Build the audio graph for the "unified" Gemma 4 projector (gemma4ua):
// swap the first two dims of the raw input, RMS-normalize, then project.
ggml_cgraph * clip_graph_gemma4ua::build() {
    ggml_tensor * raw = build_inp_raw(1);

    // swap dims 0 and 1 and make the result contiguous before normalizing
    ggml_tensor * swapped = ggml_permute(ctx0, raw, 1, 0, 2, 3);
    ggml_tensor * cur     = ggml_cont(ctx0, swapped);

    // Gemma4UnifiedMultimodalEmbedder: embedding_pre_projection_norm followed by the input projection
    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
    cur = build_mm(model.mm_input_proj_w, cur);
    cb(cur, "projected", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}

View File

@ -0,0 +1,71 @@
#include "models.h"
#include <cmath>
// Build the vision graph for the "unified" Gemma 4 projector (gemma4uv).
// Pipeline: im2col patch extraction -> norm (patch_norm_1) -> patch projection + bias
// -> norm (patch_norm_2) -> add x/y position embeddings -> norm (patch_norm_3)
// -> RMS norm -> multimodal input projection. Returns the graph `gf`.
ggml_cgraph * clip_graph_gemma4uv::build() {
ggml_tensor * inp_raw = build_inp_raw();
// Gemma4UnifiedVisionEmbedder uses default pytorch LayerNorm, not RMSNorm
float eps = 1e-5f; // default eps for pytorch LayerNorm
ggml_tensor * inp = nullptr;
{
// note: we cannot use ggml_conv_2d here because we need to apply norm after im2col
auto c = inp_raw->ne[2];
// NOTE(review): kernel is never given data here; presumably ggml_im2col only reads its shape - confirm
ggml_tensor * kernel = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, patch_size, patch_size, c);
// the two patch_size arguments are presumably the strides, making the patches non-overlapping - TODO confirm against ggml_im2col
inp = ggml_im2col(ctx0, kernel, inp_raw, patch_size, patch_size, 0, 0, 1, 1, true, inp_raw->type);
// inp shape: [patch_size * patch_size * c, n_patches_w, n_patches_h]
// flatten all remaining dims into a single patch dimension
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0], inp->ne[1] * inp->ne[2] * inp->ne[3]);
inp = build_norm(inp, model.patch_norm_1_w, model.patch_norm_1_b, NORM_TYPE_NORMAL, eps, -1);
// inp shape: [patch_size * patch_size * c, n_patches]
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
inp = ggml_add(ctx0, inp, model.patch_bias);
// inp shape: [n_embd, n_patches]
inp = build_norm(inp, model.patch_norm_2_w, model.patch_norm_2_b, NORM_TYPE_NORMAL, eps, -1);
}
// per-patch x / y indices into the position tables; declared as named graph inputs ("pos_x", "pos_y")
ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_x, "pos_x");
ggml_set_input(pos_x);
ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
ggml_set_name(pos_y, "pos_y");
ggml_set_input(pos_y);
{
const int64_t pos_size = model.position_embeddings->ne[1];
const size_t nb1 = ggml_row_size(model.position_embeddings->type, n_embd);
// positional embeddings are stored as lookup tables (one for x, one for y)
// tbl_x views the first pos_size rows; tbl_y starts pos_size rows later
ggml_tensor * tbl_x = ggml_view_2d(ctx0, model.position_embeddings,
n_embd, pos_size, nb1, 0);
ggml_tensor * tbl_y = ggml_view_2d(ctx0, model.position_embeddings,
n_embd, pos_size, nb1, pos_size * nb1);
// ggml_get_rows: [n_embd, n_patches]
ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x);
ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y);
inp = ggml_add(ctx0, inp, emb_x);
inp = ggml_add(ctx0, inp, emb_y);
cb(inp, "pos_embd", -1);
// pos_norm
inp = build_norm(inp, model.patch_norm_3_w, model.patch_norm_3_b, NORM_TYPE_NORMAL, eps, -1);
}
auto cur = inp;
// Gemma4UnifiedMultimodalEmbedder
{
// embedding_pre_projection_norm
// note: this RMS norm uses hparams.eps, not the local LayerNorm eps above
cur = ggml_rms_norm(ctx0, cur, hparams.eps);
cur = build_mm(model.mm_input_proj_w, cur);
cb(cur, "projected", -1);
}
ggml_build_forward_expand(gf, cur);
return gf;
}

View File

@ -18,6 +18,11 @@ struct clip_graph_gemma4v : clip_graph {
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
};
// Graph builder for the "unified" Gemma 4 vision projector (PROJECTOR_TYPE_GEMMA4UV).
// Only build() is overridden; construction forwards ctx and img to clip_graph.
struct clip_graph_gemma4uv : clip_graph {
clip_graph_gemma4uv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_pixtral : clip_graph {
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
@ -142,6 +147,11 @@ struct clip_graph_gemma4a : clip_graph {
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
};
// Graph builder for the "unified" Gemma 4 audio projector (PROJECTOR_TYPE_GEMMA4UA).
// Only build() is overridden; construction forwards ctx and img to clip_graph.
struct clip_graph_gemma4ua : clip_graph {
clip_graph_gemma4ua(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_glm4v : clip_graph {
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

View File

@ -942,6 +942,44 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s
return true;
}
//
// mtmd_audio_preprocessor_gemma4ua
//
// Intentionally empty: this preprocessor only chunks the raw waveform, so there is nothing to set up.
void mtmd_audio_preprocessor_gemma4ua::initialize() {
// no-op: no FFT or filterbank needed
}
// Chunk a raw waveform into fixed-size frames, one frame per output token.
//
// samples / n_samples : input waveform
// output              : receives one mtmd_audio_mel holding all frames
//
// Returns false (leaving output untouched) when there is no input, when the
// configured frame size is not positive, or when the frame count does not fit
// in an int. The last frame is zero-padded when n_samples is not a multiple of
// the frame size.
bool mtmd_audio_preprocessor_gemma4ua::preprocess(const float * samples,
                                                  size_t n_samples,
                                                  std::vector<mtmd_audio_mel> & output) {
    if (samples == nullptr || n_samples == 0) {
        return false;
    }

    const int frame_size = hparams.n_mel_bins; // 640 samples per token @ 16 kHz = 40 ms
    if (frame_size <= 0) {
        return false; // would otherwise divide by zero below
    }
    const size_t frame_len = (size_t) frame_size;

    // ceil(n_samples / frame_len), computed in size_t so large inputs are not truncated
    const size_t n_frames = n_samples / frame_len + (n_samples % frame_len != 0 ? 1 : 0);
    const int    n_tokens = (int) n_frames;
    if (n_tokens <= 0 || (size_t) n_tokens != n_frames) {
        return false; // frame count does not round-trip through int
    }

    mtmd_audio_mel mel;
    mel.n_len     = n_tokens;
    mel.n_len_org = n_tokens;
    mel.n_mel     = frame_size;
    // zero-initialized, so the padding of the last frame needs no extra work
    mel.data.assign(frame_len * n_frames, 0.0f);

    // Store mel-major (data[f * n_tokens + t]) so the ggml tensor loads as
    // [n_tokens, frame_size] with ne[0]=n_tokens, ne[1]=frame_size.
    // The graph builder transposes before RMSNorm so normalization is over frame_size.
    for (size_t src = 0; src < n_samples; src++) {
        const size_t t = src / frame_len;
        const size_t f = src % frame_len;
        mel.data[f * n_frames + t] = samples[src];
    }

    output.push_back(std::move(mel));
    return true;
}
//
// mtmd_audio_streaming_istft implementation
//

View File

@ -96,6 +96,12 @@ struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
mtmd_audio_cache cache;
};
// Audio preprocessor for the "unified" Gemma 4 audio projector.
// Construction forwards ctx to mtmd_audio_preprocessor; initialize() and preprocess() are overridden.
struct mtmd_audio_preprocessor_gemma4ua : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_gemma4ua(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
};
struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor {
mtmd_audio_preprocessor_qwen3a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
void initialize() override;

View File

@ -482,6 +482,7 @@ struct mtmd_context {
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_GEMMA4UV:
{
// <|image> ... (image embeddings) ... <image|>
img_beg = "<|image>";
@ -576,6 +577,12 @@ struct mtmd_context {
aud_end = "<audio|>";
audio_preproc = std::make_unique<mtmd_audio_preprocessor_gemma4a>(ctx_a);
} break;
case PROJECTOR_TYPE_GEMMA4UA:
{
aud_beg = "<|audio>";
aud_end = "<audio|>";
audio_preproc = std::make_unique<mtmd_audio_preprocessor_gemma4ua>(ctx_a);
} break;
default:
throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj));
}