From e8023568d05ffdb9d0ac65695c521ea7e72c6b75 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Thu, 4 Jun 2026 15:21:38 +0200 Subject: [PATCH] convert: Fix Gemma 4 Unified conversion (#24118) * Fix Gemma 4 Unified conversion * Set audio hidden size to audio_embed_dim --- conversion/gemma.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/conversion/gemma.py b/conversion/gemma.py index 2025e782b7..379876629f 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -798,7 +798,8 @@ class Gemma4VisionAudioModel(MmprojModel): # remap audio hparams if self.hparams_audio: self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128) - self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 + if "hidden_size" in self.hparams_audio: + self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 else: self.has_audio_encoder = False @@ -872,7 +873,7 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel): assert self.hparams_audio is not None text_embd_dim = self.hparams_vision["mm_embed_dim"] self.hparams_vision["hidden_size"] = text_embd_dim - self.hparams_audio["hidden_size"] = text_embd_dim + self.hparams_audio["hidden_size"] = self.hparams_audio["audio_embed_dim"] # this is a transformer-less vision tower, the params below are redundant but set to avoid error self.hparams_vision["intermediate_size"] = 0 self.hparams_vision["num_layers"] = 0 @@ -897,7 +898,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel): # ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC). # Permute columns so column i aligns with CHW input position i. assert self.hparams_vision is not None - p = self.hparams_vision["model_patch_size"] + if "model_patch_size" in self.hparams_vision: + p = self.hparams_vision["model_patch_size"] + else: + p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"] i = torch.arange(p * p * 3) ch = i // (p * p) row = (i % (p * p)) // p @@ -908,7 +912,10 @@ class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel): elif "patch_ln1.weight" in name or "patch_ln1.bias" in name: # same permutation for patch_ln1 as patch_dense to align with CHW input order assert self.hparams_vision is not None - p = self.hparams_vision["model_patch_size"] + if "model_patch_size" in self.hparams_vision: + p = self.hparams_vision["model_patch_size"] + else: + p = self.hparams_vision["patch_size"] * self.hparams_vision["pooling_kernel_size"] i = torch.arange(p * p * 3) ch = i // (p * p) row = (i % (p * p)) // p