mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
model: Granite Speech Plus (#24818)
* feat: Add conversion support for Granite Speech Plus Branch: GraniteSpeechPlus AI-usage: full (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Extend granite_speech to support plus multi-layer concatenation Branch: GraniteSpeechPlus AI-usage: draft (Bob, OpenCode + Qwen3.6-35b) Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(conversion): Fix plural naming for feature_layers for audio Branch: GraniteSpeechPlus AI-usage: none Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(mtmd): Align feature_layer usage and naming everywhere Branch: GraniteSpeechPlus AI-usage: none Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * style: Use fstring for log Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
This commit is contained in:
parent
7c908502ea
commit
a3900a6694
@ -96,6 +96,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"GraniteMoeHybridForCausalLM": "granite",
|
||||
"GraniteMoeSharedForCausalLM": "granite",
|
||||
"GraniteSpeechForConditionalGeneration": "granite",
|
||||
"GraniteSpeechPlusForConditionalGeneration": "granite",
|
||||
"Grok1ForCausalLM": "grok",
|
||||
"GrokForCausalLM": "grok",
|
||||
"GroveMoeForCausalLM": "grovemoe",
|
||||
@ -261,6 +262,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"GlmasrModel": "ultravox",
|
||||
"Granite4VisionForConditionalGeneration": "granite",
|
||||
"GraniteSpeechForConditionalGeneration": "granite",
|
||||
"GraniteSpeechPlusForConditionalGeneration": "granite",
|
||||
"HunYuanVLForConditionalGeneration": "hunyuan",
|
||||
"Idefics3ForConditionalGeneration": "smolvlm",
|
||||
"InternVisionModel": "internvl",
|
||||
|
||||
@ -348,6 +348,34 @@ class GraniteSpeechMmprojModel(MmprojModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
|
||||
class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
|
||||
"""Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
|
||||
has_vision_encoder = False
|
||||
has_audio_encoder = True
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
assert self.hparams_audio is not None
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# Add feature_layer if present in encoder config
|
||||
if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
|
||||
self.gguf_writer.add_audio_feature_layers(feature_layers)
|
||||
logger.info(f"gguf: audio feature_layers = {feature_layers}")
|
||||
|
||||
# Validate projector dimension matches concatenated encoder output
|
||||
hidden_dim = self.hparams_audio["hidden_dim"]
|
||||
expected_dim = hidden_dim * (len(feature_layers) + 1)
|
||||
projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
|
||||
|
||||
if projector_dim != expected_dim:
|
||||
raise ValueError(
|
||||
f"Projector encoder_hidden_size ({projector_dim}) does not match "
|
||||
f"expected concatenated dimension ({expected_dim}). "
|
||||
f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
|
||||
)
|
||||
|
||||
|
||||
@ModelBase.register("Granite4VisionForConditionalGeneration")
|
||||
class Granite4VisionMmprojModel(MmprojModel):
|
||||
has_vision_encoder = True
|
||||
|
||||
@ -359,6 +359,7 @@ class Keys:
|
||||
CHUNK_SIZE = "clip.audio.chunk_size"
|
||||
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
|
||||
MAX_POS_EMB = "clip.audio.max_pos_emb"
|
||||
FEATURE_LAYERS = "clip.audio.feature_layer" # Granite Speech Plus
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.audio.attention.head_count"
|
||||
|
||||
@ -1310,6 +1310,9 @@ class GGUFWriter:
|
||||
def add_audio_max_pos_emb(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)
|
||||
|
||||
def add_audio_feature_layers(self, layers: Sequence[int]) -> None:
|
||||
self.add_array(Keys.ClipAudio.FEATURE_LAYERS, layers)
|
||||
|
||||
def add_audio_projector_window_size(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)
|
||||
|
||||
|
||||
@ -42,6 +42,7 @@
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_N_HEAD_KV "clip.%s.attention.head_count_kv"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
#define KEY_FEATURE_LAYERS "clip.%s.feature_layer"
|
||||
|
||||
// vision-specific
|
||||
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
|
||||
@ -54,7 +55,6 @@
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||
#define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side"
|
||||
#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
|
||||
|
||||
@ -91,7 +91,7 @@ struct clip_hparams {
|
||||
|
||||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
std::vector<int32_t> vision_feature_layer;
|
||||
std::vector<int32_t> feature_layers;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
|
||||
@ -165,8 +165,8 @@ struct clip_hparams {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_vision_feature_layer(int32_t layer) const {
|
||||
return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
|
||||
bool is_feature_layer(int32_t layer) const {
|
||||
return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -1264,12 +1264,10 @@ struct clip_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
// Load the vision feature layer indices if they are explicitly provided;
|
||||
// if multiple vision feature layers are present, the values will be concatenated
|
||||
// to form the final visual features.
|
||||
// Load the vision/audio feature layer indices if they are explicitly provided
|
||||
// NOTE: gguf conversions should standardize the values of the vision feature layer to
|
||||
// be non-negative, since we use -1 to mark values as unset here.
|
||||
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
|
||||
get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false);
|
||||
|
||||
// model-specific params
|
||||
switch (model.proj_type) {
|
||||
@ -1651,6 +1649,7 @@ struct clip_model_loader {
|
||||
get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size);
|
||||
get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
|
||||
get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count);
|
||||
// NOTE: feature layers loaded above in common path
|
||||
} break;
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
@ -1663,11 +1662,11 @@ struct clip_model_loader {
|
||||
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
|
||||
hparams.image_resize_pad = PAD_CEIL;
|
||||
|
||||
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
|
||||
// NOTE: feature_layers loaded in common path as optional
|
||||
get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
|
||||
if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
|
||||
throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
|
||||
hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
|
||||
if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) {
|
||||
throw std::runtime_error(string_format("%s: feature_layers.size() %d != proj_spatial_offsets.size() %d",
|
||||
hparams.feature_layers.size(), hparams.proj_spatial_offsets.size()));
|
||||
}
|
||||
|
||||
get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side);
|
||||
@ -2740,7 +2739,7 @@ struct clip_model_loader {
|
||||
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
|
||||
|
||||
// Load separate layerwise and spatial projector tensors
|
||||
const auto projector_count = hparams.vision_feature_layer.size();
|
||||
const auto projector_count = hparams.feature_layers.size();
|
||||
model.qf_proj_blocks.resize(projector_count);
|
||||
for (size_t bid = 0; bid < projector_count; ++bid) {
|
||||
auto & b = model.qf_proj_blocks[bid];
|
||||
@ -4388,7 +4387,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
|
||||
|
||||
// Stage 1b only uses block 0's permutations; future stages
|
||||
// will upload all blocks.
|
||||
for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
|
||||
for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) {
|
||||
const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
|
||||
upload(prefix + "win_idx", make_win_idx(image_side, window_side));
|
||||
upload(prefix + "qwin_idx", make_win_idx(new_side, query_side));
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
#include "models.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
const int n_frames = img.nx();
|
||||
const int context_size = hparams.audio_chunk_size;
|
||||
@ -11,6 +13,10 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
const int padded_len = num_blocks * context_size;
|
||||
const int remainder = n_frames % context_size;
|
||||
|
||||
// Calculate projector input dimension based on feature layers
|
||||
const int proj_input_dim = n_embd * (hparams.feature_layers.size() + 1);
|
||||
const bool use_feature_concat = !hparams.feature_layers.empty();
|
||||
|
||||
ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
|
||||
ggml_set_name(attn_dists, "attn_dists");
|
||||
ggml_set_input(attn_dists);
|
||||
@ -31,6 +37,15 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
cur = ggml_add(ctx0, cur, model.inp_proj_b);
|
||||
cb(cur, "inp_linear", -1);
|
||||
|
||||
// Capture layer 0 if requested (after input_linear)
|
||||
ggml_tensor * concat_result = nullptr;
|
||||
if (use_feature_concat) {
|
||||
if (std::find(hparams.feature_layers.begin(), hparams.feature_layers.end(), 0) != hparams.feature_layers.end()) {
|
||||
concat_result = cur;
|
||||
cb(concat_result, "feature_layer_0", -1);
|
||||
}
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
const auto & layer = model.layers[il];
|
||||
auto * residual = cur;
|
||||
@ -168,6 +183,18 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
NORM_TYPE_NORMAL, eps, il);
|
||||
cb(cur, "layer_out", il);
|
||||
|
||||
// Capture intermediate layer (il + 1) if requested
|
||||
if (use_feature_concat) {
|
||||
if (hparams.is_feature_layer(il + 1)) {
|
||||
if (concat_result == nullptr) {
|
||||
concat_result = cur;
|
||||
} else {
|
||||
concat_result = ggml_concat(ctx0, concat_result, cur, 0);
|
||||
}
|
||||
cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il);
|
||||
}
|
||||
}
|
||||
|
||||
// CTC branch
|
||||
if (il + 1 == ctc_layer) {
|
||||
auto * mid = build_mm(model.ctc_out_w, cur);
|
||||
@ -180,6 +207,13 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
}
|
||||
}
|
||||
|
||||
// Append final output to concatenated features if using feature concatenation
|
||||
if (use_feature_concat && concat_result != nullptr) {
|
||||
concat_result = ggml_concat(ctx0, concat_result, cur, 0);
|
||||
cb(concat_result, "concat_final", -1);
|
||||
cur = concat_result;
|
||||
}
|
||||
|
||||
cb(cur, "encoder_out", -1);
|
||||
|
||||
// QFormer projector
|
||||
@ -197,7 +231,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
|
||||
cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
|
||||
}
|
||||
|
||||
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
|
||||
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj);
|
||||
|
||||
ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
|
||||
model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,
|
||||
|
||||
@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() {
|
||||
}
|
||||
|
||||
// --- Stage 1b/1c: WindowQFormer blocks ---
|
||||
const int projector_count = hparams.vision_feature_layer.size();
|
||||
const int projector_count = hparams.feature_layers.size();
|
||||
const float qformer_eps = 1e-12f;
|
||||
|
||||
ggml_tensor * mmproj = nullptr;
|
||||
for (int bid = 0; bid < projector_count; ++bid) {
|
||||
const auto & blk = model.qf_proj_blocks[bid];
|
||||
|
||||
int vlayer = hparams.vision_feature_layer[bid];
|
||||
int vlayer = hparams.feature_layers[bid];
|
||||
GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
|
||||
ggml_tensor * h = layer_outs[vlayer];
|
||||
|
||||
|
||||
@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
|
||||
// If we set explicit vision feature layers, only go up to the deepest one
|
||||
// NOTE: only used by granite-vision models for now
|
||||
for (const auto & feature_layer : hparams.vision_feature_layer) {
|
||||
for (const auto & feature_layer : hparams.feature_layers) {
|
||||
if (feature_layer > deepest_feature_layer) {
|
||||
deepest_feature_layer = feature_layer;
|
||||
}
|
||||
@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
|
||||
// If this is an embedding feature layer, save the output.
|
||||
// NOTE: 0 index here refers to the input to the encoder.
|
||||
if (hparams.is_vision_feature_layer(il)) {
|
||||
if (hparams.is_feature_layer(il)) {
|
||||
embedding_stack.push_back(cur);
|
||||
}
|
||||
|
||||
@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
// process vision feature layers (used by granite)
|
||||
{
|
||||
// final layer is a vision feature layer
|
||||
if (hparams.is_vision_feature_layer(max_feature_layer)) {
|
||||
if (hparams.is_feature_layer(max_feature_layer)) {
|
||||
embedding_stack.push_back(inpL);
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user