NOTE(review): this patch was recovered from a copy in which every newline and
indentation run had been collapsed to a single space. Line breaks are rebuilt
from the hunk line counts. Indentation, column alignment, the position of blank
context lines and the "<...>" template arguments are best-effort reconstructions
- confirm against the target tree (or apply with whitespace-tolerant matching).
The "index" blob hashes predate the review changes below and are informational.

diff --git a/conversion/__init__.py b/conversion/__init__.py
index 00192cf33a..c6af6f7318 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -96,6 +96,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
     "GraniteMoeHybridForCausalLM": "granite",
     "GraniteMoeSharedForCausalLM": "granite",
     "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
     "Grok1ForCausalLM": "grok",
     "GrokForCausalLM": "grok",
     "GroveMoeForCausalLM": "grovemoe",
@@ -261,6 +262,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
     "GlmasrModel": "ultravox",
     "Granite4VisionForConditionalGeneration": "granite",
     "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
     "HunYuanVLForConditionalGeneration": "hunyuan",
     "Idefics3ForConditionalGeneration": "smolvlm",
     "InternVisionModel": "internvl",
diff --git a/conversion/granite.py b/conversion/granite.py
index 53441fe570..8367ed225d 100644
--- a/conversion/granite.py
+++ b/conversion/granite.py
@@ -348,6 +348,40 @@ class GraniteSpeechMmprojModel(MmprojModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
+class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
+    """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        """Write the base audio parameters, then the validated optional feature-layer list."""
+        assert self.hparams_audio is not None
+        super().set_gguf_parameters()
+
+        # Add feature_layer if present in encoder config
+        if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
+            # The loader matches these indices against 0..n_layer, so a negative
+            # (from-the-end) index would never be captured; reject it here.
+            if any(layer < 0 for layer in feature_layers):
+                raise ValueError(f"cat_hidden_layers must be non-negative, got {feature_layers}")
+
+            self.gguf_writer.add_audio_feature_layers(feature_layers)
+            logger.info(f"gguf: audio feature_layers = {feature_layers}")
+
+            # Validate projector dimension matches concatenated encoder output
+            hidden_dim = self.hparams_audio["hidden_dim"]
+            expected_dim = hidden_dim * (len(feature_layers) + 1)
+            projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
+
+            if projector_dim != expected_dim:
+                raise ValueError(
+                    f"Projector encoder_hidden_size ({projector_dim}) does not match "
+                    f"expected concatenated dimension ({expected_dim}). "
+                    f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
+                )
+
+
 @ModelBase.register("Granite4VisionForConditionalGeneration")
 class Granite4VisionMmprojModel(MmprojModel):
     has_vision_encoder = True
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 463963f2ac..1bda9452dd 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -359,6 +359,7 @@ class Keys:
         CHUNK_SIZE = "clip.audio.chunk_size"
         CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
         MAX_POS_EMB = "clip.audio.max_pos_emb"
+        FEATURE_LAYERS = "clip.audio.feature_layer" # Granite Speech Plus
 
         class Attention:
             HEAD_COUNT = "clip.audio.attention.head_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index f707f29dc5..a06ec88b32 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1310,6 +1310,9 @@ class GGUFWriter:
     def add_audio_max_pos_emb(self, value: int) -> None:
         self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)
 
+    def add_audio_feature_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipAudio.FEATURE_LAYERS, layers)
+
     def add_audio_projector_window_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)
 
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index e7b5301445..5b413681f0 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -42,6 +42,7 @@
 #define KEY_N_HEAD "clip.%s.attention.head_count"
 #define KEY_N_HEAD_KV "clip.%s.attention.head_count_kv"
 #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
+#define KEY_FEATURE_LAYERS "clip.%s.feature_layer"
 
 // vision-specific
 #define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
@@ -54,7 +55,6 @@
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
-#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
 #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
 #define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side"
 #define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 48796b6306..f86702eba4 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -91,7 +91,7 @@ struct clip_hparams {
     float eps = 1e-6;
     float rope_theta = 0.0;
 
-    std::vector<int32_t> vision_feature_layer;
+    std::vector<int32_t> feature_layers;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
     std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
@@ -165,8 +165,8 @@ struct clip_hparams {
         return false;
     }
 
-    bool is_vision_feature_layer(int32_t layer) const {
-        return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
+    bool is_feature_layer(int32_t layer) const {
+        return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end();
     }
 };
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 7dd7023c41..7bd486030f 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1264,12 +1264,10 @@ struct clip_model_loader {
                 }
             }
 
-            // Load the vision feature layer indices if they are explicitly provided;
-            // if multiple vision feature layers are present, the values will be concatenated
-            // to form the final visual features.
+            // Load the vision/audio feature layer indices if they are explicitly provided
             // NOTE: gguf conversions should standardize the values of the vision feature layer to
             // be non-negative, since we use -1 to mark values as unset here.
-            get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
+            get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false);
 
             // model-specific params
             switch (model.proj_type) {
@@ -1651,6 +1649,7 @@ struct clip_model_loader {
                         get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size);
                         get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
                         get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count);
+                        // NOTE: feature layers loaded above in common path
                     } break;
                 case PROJECTOR_TYPE_JANUS_PRO:
                     {
@@ -1663,11 +1662,11 @@ struct clip_model_loader {
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
                         hparams.image_resize_pad = PAD_CEIL;
 
-                        get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
+                        // NOTE: feature_layers loaded in common path as optional
                         get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
-                        if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
-                            throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
-                                hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
+                        if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) {
+                            throw std::runtime_error(string_format("%s: feature_layers.size() %zu != proj_spatial_offsets.size() %zu",
+                                __func__, hparams.feature_layers.size(), hparams.proj_spatial_offsets.size()));
                         }
 
                         get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side);
@@ -2740,7 +2739,7 @@ struct clip_model_loader {
                     model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
 
                     // Load separate layerwise and spatial projector tensors
-                    const auto projector_count = hparams.vision_feature_layer.size();
+                    const auto projector_count = hparams.feature_layers.size();
                     model.qf_proj_blocks.resize(projector_count);
                     for (size_t bid = 0; bid < projector_count; ++bid) {
                         auto & b = model.qf_proj_blocks[bid];
@@ -4388,7 +4387,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
 
                 // Stage 1b only uses block 0's permutations; future stages
                 // will upload all blocks.
-                for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
+                for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) {
                     const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
                     upload(prefix + "win_idx", make_win_idx(image_side, window_side));
                     upload(prefix + "qwin_idx", make_win_idx(new_side, query_side));
diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp
index 0bd4d75ac5..a158a59ce9 100644
--- a/tools/mtmd/models/granite-speech.cpp
+++ b/tools/mtmd/models/granite-speech.cpp
@@ -11,6 +11,10 @@ ggml_cgraph * clip_graph_granite_speech::build() {
     const int padded_len = num_blocks * context_size;
     const int remainder = n_frames % context_size;
 
+    // Projector input width: the final encoder output plus one n_embd-wide slice per feature layer
+    const int proj_input_dim = n_embd * (static_cast<int>(hparams.feature_layers.size()) + 1);
+    const bool use_feature_concat = !hparams.feature_layers.empty();
+
     ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
     ggml_set_name(attn_dists, "attn_dists");
     ggml_set_input(attn_dists);
@@ -31,6 +35,15 @@ ggml_cgraph * clip_graph_granite_speech::build() {
     cur = ggml_add(ctx0, cur, model.inp_proj_b);
     cb(cur, "inp_linear", -1);
 
+    // Capture layer 0 if requested (after input_linear)
+    ggml_tensor * concat_result = nullptr;
+    if (use_feature_concat) {
+        if (hparams.is_feature_layer(0)) {
+            concat_result = cur;
+            cb(concat_result, "feature_layer_0", -1);
+        }
+    }
+
     for (int il = 0; il < n_layer; il++) {
         const auto & layer = model.layers[il];
         auto * residual = cur;
@@ -168,6 +181,18 @@ ggml_cgraph * clip_graph_granite_speech::build() {
                 NORM_TYPE_NORMAL, eps, il);
         cb(cur, "layer_out", il);
 
+        // Capture intermediate layer (il + 1) if requested
+        if (use_feature_concat) {
+            if (hparams.is_feature_layer(il + 1)) {
+                if (concat_result == nullptr) {
+                    concat_result = cur;
+                } else {
+                    concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+                }
+                cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il);
+            }
+        }
+
         // CTC branch
         if (il + 1 == ctc_layer) {
             auto * mid = build_mm(model.ctc_out_w, cur);
@@ -180,6 +205,13 @@ ggml_cgraph * clip_graph_granite_speech::build() {
         }
     }
 
+    // Append final output to concatenated features if using feature concatenation
+    if (use_feature_concat && concat_result != nullptr) {
+        concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+        cb(concat_result, "concat_final", -1);
+        cur = concat_result;
+    }
+
     cb(cur, "encoder_out", -1);
 
     // QFormer projector
@@ -197,7 +229,9 @@ ggml_cgraph * clip_graph_granite_speech::build() {
         cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
     }
 
-    ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
+    // A feature layer index that is duplicated or outside [0, n_layer] is never captured, leaving cur too narrow
+    GGML_ASSERT(cur->ne[0] == proj_input_dim && "audio feature_layers must be unique indices in [0, n_layer]");
+    ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj);
     ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
                                        model.qf_proj_blocks[0].qf_proj_norm_w,
                                        model.qf_proj_blocks[0].qf_proj_norm_b,
diff --git a/tools/mtmd/models/granite4-vision.cpp b/tools/mtmd/models/granite4-vision.cpp
index 9adb6f0fdb..1b252543c0 100644
--- a/tools/mtmd/models/granite4-vision.cpp
+++ b/tools/mtmd/models/granite4-vision.cpp
@@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() {
     }
 
     // --- Stage 1b/1c: WindowQFormer blocks ---
-    const int projector_count = hparams.vision_feature_layer.size();
+    const int projector_count = hparams.feature_layers.size();
     const float qformer_eps = 1e-12f;
 
     ggml_tensor * mmproj = nullptr;
 
     for (int bid = 0; bid < projector_count; ++bid) {
         const auto & blk = model.qf_proj_blocks[bid];
-        int vlayer = hparams.vision_feature_layer[bid];
+        int vlayer = hparams.feature_layers[bid];
         GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
         ggml_tensor * h = layer_outs[vlayer];
 
diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp
index 5aa3d2f0fa..47efe68bd8 100644
--- a/tools/mtmd/models/llava.cpp
+++ b/tools/mtmd/models/llava.cpp
@@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() {
 
     // If we set explicit vision feature layers, only go up to the deepest one
     // NOTE: only used by granite-vision models for now
-    for (const auto & feature_layer : hparams.vision_feature_layer) {
+    for (const auto & feature_layer : hparams.feature_layers) {
         if (feature_layer > deepest_feature_layer) {
             deepest_feature_layer = feature_layer;
         }
@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {
 
         // If this is an embedding feature layer, save the output.
         // NOTE: 0 index here refers to the input to the encoder.
-        if (hparams.is_vision_feature_layer(il)) {
+        if (hparams.is_feature_layer(il)) {
             embedding_stack.push_back(cur);
         }
 
@@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
         // process vision feature layers (used by granite)
         {
             // final layer is a vision feature layer
-            if (hparams.is_vision_feature_layer(max_feature_layer)) {
+            if (hparams.is_feature_layer(max_feature_layer)) {
                 embedding_stack.push_back(inpL);
             }
 