model: Granite Speech Plus (#24818)

* feat: Add conversion support for Granite Speech Plus Branch: GraniteSpeechPlus AI-usage: full (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Extend granite_speech to support plus multi-layer concatenation Branch: GraniteSpeechPlus AI-usage: draft (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(conversion): Fix plural naming for feature_layers for audio Branch: GraniteSpeechPlus AI-usage: none Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(mtmd): Align feature_layer usage and naming everywhere Branch: GraniteSpeechPlus AI-usage: none Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * style: Use fstring for log Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2026-06-27 23:50:20 -05:00 · 2026-06-23 04:03:31 -06:00 · 2026-06-23 04:03:31 -06:00 · a3900a6694
commit a3900a6694
parent 7c908502ea
10 changed files with 87 additions and 20 deletions
--- a/conversion/init.py
+++ b/conversion/init.py
@ -96,6 +96,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "GraniteMoeHybridForCausalLM": "granite",
    "GraniteMoeSharedForCausalLM": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "Grok1ForCausalLM": "grok",
    "GrokForCausalLM": "grok",
    "GroveMoeForCausalLM": "grovemoe",
@ -261,6 +262,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "GlmasrModel": "ultravox",
    "Granite4VisionForConditionalGeneration": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "HunYuanVLForConditionalGeneration": "hunyuan",
    "Idefics3ForConditionalGeneration": "smolvlm",
    "InternVisionModel": "internvl",
--- a/conversion/granite.py
+++ b/conversion/granite.py
@ -348,6 +348,34 @@ class GraniteSpeechMmprojModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
+class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
+    """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
+        super().set_gguf_parameters()
+
+        # Add feature_layer if present in encoder config
+        if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
+            self.gguf_writer.add_audio_feature_layers(feature_layers)
+            logger.info(f"gguf: audio feature_layers = {feature_layers}")
+
+            # Validate projector dimension matches concatenated encoder output
+            hidden_dim = self.hparams_audio["hidden_dim"]
+            expected_dim = hidden_dim * (len(feature_layers) + 1)
+            projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
+
+            if projector_dim != expected_dim:
+                raise ValueError(
+                    f"Projector encoder_hidden_size ({projector_dim}) does not match "
+                    f"expected concatenated dimension ({expected_dim}). "
+                    f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
+                )
+
+
@ModelBase.register("Granite4VisionForConditionalGeneration")
 class Granite4VisionMmprojModel(MmprojModel):
    has_vision_encoder = True
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -359,6 +359,7 @@ class Keys:
        CHUNK_SIZE          = "clip.audio.chunk_size"
        CONV_KERNEL_SIZE    = "clip.audio.conv_kernel_size"
        MAX_POS_EMB         = "clip.audio.max_pos_emb"
+        FEATURE_LAYERS      = "clip.audio.feature_layer" # Granite Speech Plus

        class Attention:
            HEAD_COUNT      = "clip.audio.attention.head_count"
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -1310,6 +1310,9 @@ class GGUFWriter:
    def add_audio_max_pos_emb(self, value: int) -> None:
        self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)

+    def add_audio_feature_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipAudio.FEATURE_LAYERS, layers)
+
    def add_audio_projector_window_size(self, value: int) -> None:
        self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)

--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@ -42,6 +42,7 @@
 #define KEY_N_HEAD              "clip.%s.attention.head_count"
 #define KEY_N_HEAD_KV           "clip.%s.attention.head_count_kv"
 #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_FEATURE_LAYERS      "clip.%s.feature_layer"

 // vision-specific
 #define KEY_VISION_PROJ_TYPE        "clip.vision.projector_type" // for models with mixed modalities
@ -54,7 +55,6 @@
 #define KEY_PATCH_SIZE              "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN              "clip.vision.image_mean"
 #define KEY_IMAGE_STD               "clip.vision.image_std"
-#define KEY_FEATURE_LAYER           "clip.vision.feature_layer"
 #define KEY_PROJ_SCALE_FACTOR       "clip.vision.projector.scale_factor"
 #define KEY_PROJ_SAMPLE_QUERY_SIDE  "clip.vision.projector.query_side"
 #define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@ -91,7 +91,7 @@ struct clip_hparams {

    float eps = 1e-6;
    float rope_theta = 0.0;
-    std::vector<int32_t> vision_feature_layer;
+    std::vector<int32_t> feature_layers;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
@ -165,8 +165,8 @@ struct clip_hparams {
        return false;
    }

-    bool is_vision_feature_layer(int32_t layer) const {
-        return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
+    bool is_feature_layer(int32_t layer) const {
+        return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end();
    }
 };

--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -1264,12 +1264,10 @@ struct clip_model_loader {
                }
            }

-            // Load the vision feature layer indices if they are explicitly provided;
-            // if multiple vision feature layers are present, the values will be concatenated
-            // to form the final visual features.
+            // Load the vision/audio feature layer indices if they are explicitly provided
            // NOTE: gguf conversions should standardize the values of the vision feature layer to
            // be non-negative, since we use -1 to mark values as unset here.
-            get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
+            get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false);

            // model-specific params
            switch (model.proj_type) {
@ -1651,6 +1649,7 @@ struct clip_model_loader {
                        get_u32(KEY_A_PROJ_WINDOW_SIZE,     hparams.audio_proj_window_size);
                        get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
                        get_u32(KEY_A_PROJ_HEAD_COUNT,      hparams.audio_proj_head_count);
+                        // NOTE: feature layers loaded above in common path
                    } break;
                case PROJECTOR_TYPE_JANUS_PRO:
                    {
@ -1663,11 +1662,11 @@ struct clip_model_loader {
                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
                        hparams.image_resize_pad = PAD_CEIL;

-                        get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
+                        // NOTE: feature_layers loaded in common path as optional
                        get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
-                        if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
-                            throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
-                                                                   hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
+                        if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) {
+                            throw std::runtime_error(string_format("%s: feature_layers.size() %d != proj_spatial_offsets.size() %d",
+                                                                   hparams.feature_layers.size(), hparams.proj_spatial_offsets.size()));
                        }

                        get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE,  hparams.downsample_query_side);
@ -2740,7 +2739,7 @@ struct clip_model_loader {
                    model.image_newline = get_tensor(TN_IMAGE_NEWLINE);

                    // Load separate layerwise and spatial projector tensors
-                    const auto projector_count = hparams.vision_feature_layer.size();
+                    const auto projector_count = hparams.feature_layers.size();
                    model.qf_proj_blocks.resize(projector_count);
                    for (size_t bid = 0; bid < projector_count; ++bid) {
                        auto & b = model.qf_proj_blocks[bid];
@ -4388,7 +4387,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32

                // Stage 1b only uses block 0's permutations; future stages
                // will upload all blocks.
-                for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
+                for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) {
                    const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
                    upload(prefix + "win_idx",     make_win_idx(image_side, window_side));
                    upload(prefix + "qwin_idx",    make_win_idx(new_side, query_side));
--- a/tools/mtmd/models/granite-speech.cpp
+++ b/tools/mtmd/models/granite-speech.cpp
@ -1,5 +1,7 @@
 #include "models.h"

+#include <algorithm>
+
 ggml_cgraph * clip_graph_granite_speech::build() {
    const int n_frames     = img.nx();
    const int context_size = hparams.audio_chunk_size;
@ -11,6 +13,10 @@ ggml_cgraph * clip_graph_granite_speech::build() {
    const int padded_len   = num_blocks * context_size;
    const int remainder    = n_frames % context_size;

+    // Calculate projector input dimension based on feature layers
+    const int proj_input_dim = n_embd * (hparams.feature_layers.size() + 1);
+    const bool use_feature_concat = !hparams.feature_layers.empty();
+
    ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
    ggml_set_name(attn_dists, "attn_dists");
    ggml_set_input(attn_dists);
@ -31,6 +37,15 @@ ggml_cgraph * clip_graph_granite_speech::build() {
    cur = ggml_add(ctx0, cur, model.inp_proj_b);
    cb(cur, "inp_linear", -1);

+    // Capture layer 0 if requested (after input_linear)
+    ggml_tensor * concat_result = nullptr;
+    if (use_feature_concat) {
+        if (std::find(hparams.feature_layers.begin(), hparams.feature_layers.end(), 0) != hparams.feature_layers.end()) {
+            concat_result = cur;
+            cb(concat_result, "feature_layer_0", -1);
+        }
+    }
+
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        auto * residual = cur;
@ -168,6 +183,18 @@ ggml_cgraph * clip_graph_granite_speech::build() {
                         NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_out", il);

+        // Capture intermediate layer (il + 1) if requested
+        if (use_feature_concat) {
+            if (hparams.is_feature_layer(il + 1)) {
+                if (concat_result == nullptr) {
+                    concat_result = cur;
+                } else {
+                    concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+                }
+                cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il);
+            }
+        }
+
        // CTC branch
        if (il + 1 == ctc_layer) {
            auto * mid = build_mm(model.ctc_out_w, cur);
@ -180,6 +207,13 @@ ggml_cgraph * clip_graph_granite_speech::build() {
        }
    }

+    // Append final output to concatenated features if using feature concatenation
+    if (use_feature_concat && concat_result != nullptr) {
+        concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+        cb(concat_result, "concat_final", -1);
+        cur = concat_result;
+    }
+
    cb(cur, "encoder_out", -1);

    // QFormer projector
@ -197,7 +231,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
            cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
        }

-        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
+        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj);

        ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
            model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,
--- a/tools/mtmd/models/granite4-vision.cpp
+++ b/tools/mtmd/models/granite4-vision.cpp
@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() {
    }

    // --- Stage 1b/1c: WindowQFormer blocks ---
-    const int projector_count = hparams.vision_feature_layer.size();
+    const int projector_count = hparams.feature_layers.size();
    const float qformer_eps = 1e-12f;

    ggml_tensor * mmproj = nullptr;
    for (int bid = 0; bid < projector_count; ++bid) {
        const auto & blk = model.qf_proj_blocks[bid];

-        int vlayer = hparams.vision_feature_layer[bid];
+        int vlayer = hparams.feature_layers[bid];
        GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
        ggml_tensor * h = layer_outs[vlayer];

--- a/tools/mtmd/models/llava.cpp
+++ b/tools/mtmd/models/llava.cpp
@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() {

        // If we set explicit vision feature layers, only go up to the deepest one
        // NOTE: only used by granite-vision models for now
-        for (const auto & feature_layer : hparams.vision_feature_layer) {
+        for (const auto & feature_layer : hparams.feature_layers) {
            if (feature_layer > deepest_feature_layer) {
                deepest_feature_layer = feature_layer;
            }
@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {

        // If this is an embedding feature layer, save the output.
        // NOTE: 0 index here refers to the input to the encoder.
-        if (hparams.is_vision_feature_layer(il)) {
+        if (hparams.is_feature_layer(il)) {
            embedding_stack.push_back(cur);
        }

@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
    // process vision feature layers (used by granite)
    {
        // final layer is a vision feature layer
-        if (hparams.is_vision_feature_layer(max_feature_layer)) {
+        if (hparams.is_feature_layer(max_feature_layer)) {
            embedding_stack.push_back(inpL);
        }