From bfb4308b058b334c6e68085c661ec9eb7e3d59f4 Mon Sep 17 00:00:00 2001 From: Hans Florian Date: Tue, 2 Jun 2026 11:55:11 -0400 Subject: [PATCH] model : support granite multilingual embeddings R2 (ibm-granite/granite-embedding-{97,311}m-multilingual-r2) (#22716) * Add support for the ibm-granite/granite-embedding-{97m,311m}-multilingual-r2 embedding models: * Added a version of the gpt4o tokenizer that has a fixed regex (better handling of combining marks) and a different token-merging setting for the 97m model * Reused the gemma4 tokenizer for the 311m model * granite-embedding-*-multilingual-r2 : add SwiGLU FFN support for Granite Embedding Multilingual R2 * added new GGUF key {arch}.hidden_activation (LLM_KV_HIDDEN_ACT) + writer * added a forward declaration of llm_ffn_op_type to llama-hparams.h * added llm_ffn_op to hparams * added LLM_FFN_NONE = 0 sentinel to llm_ffn_op_type (value-initialization); modern-bert explicitly assigns LLM_FFN_GEGLU before reading the GGUF key (default behavior unchanged). * centralized hidden_act mapping in llama-model.cpp, added llm_ffn_op_type_from_string() helper, mirroring rope_scaling_type/llama_rope_scaling_type_from_string() * modern-bert reads the GGUF key (when present) and uses the resulting op in its FFN graph * Added granite-embedding-{97m,311m}-multilingual-r2 to the converter code * Added the hashes for the granite embedding multilingual R2 models * Set the hidden_activation in the GGUF if the field is present in config.json (such as for the granite embedding models) --- conversion/base.py | 6 ++ conversion/bert.py | 6 ++ convert_hf_to_gguf_update.py | 2 + gguf-py/gguf/constants.py | 1 + gguf-py/gguf/gguf_writer.py | 3 + src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-graph.h | 3 +- src/llama-hparams.h | 11 ++++ src/llama-model.cpp | 22 +++++++ src/llama-model.h | 4 ++ src/llama-vocab.cpp | 17 +++++- src/llama-vocab.h | 109 ++++++++++++++++++----------------- src/models/modern-bert.cpp | 11 +++- 14 files changed, 140 insertions(+), 57 deletions(-) diff 
--git a/conversion/base.py b/conversion/base.py index 729ddbca4a..69bc472b72 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1657,6 +1657,12 @@ class TextModel(ModelBase): if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4": # ref: https://huggingface.co/openbmb/MiniCPM5-1B res = "minicpm5" + if chkhsh == "f241072145675bf8322086f115aebad05e9f869557a238bf2150a2a417d1bf60": + # ref: https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2 + res = "granite-embed-multi-97m" + if chkhsh == "789696f5946cc0fc59371f39f6097cafed196b3acded6140432f26bbb1ae1669": + # ref: https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2 + res = "granite-embed-multi-311m" if res is None: logger.warning("\n") diff --git a/conversion/bert.py b/conversion/bert.py index 9eb320e58a..49a6948f6c 100644 --- a/conversion/bert.py +++ b/conversion/bert.py @@ -603,6 +603,12 @@ class ModernBertModel(BertModel): self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + # FFN activation: ModernBert uses a GLU pair (ffn_up output is 2*n_ff). The + # original ModernBERT uses GELU (-> GeGLU); some derivatives such as IBM + # Granite Embedding 97m R2 use SiLU (-> SwiGLU). Persist this so the + # llama.cpp graph can pick the matching activation. 
+ if hidden_act := self.hparams.get("hidden_activation"): + self.gguf_writer.add_hidden_act(hidden_act) @classmethod def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 827af277b9..00e5888970 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -158,6 +158,8 @@ models = [ {"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", }, {"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", }, {"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"}, + {"name": "granite-embed-multi-97m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2", }, + {"name": "granite-embed-multi-311m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8c6ff280d9..fc54063fea 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -150,6 +150,7 @@ class Keys: EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp" SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp" + HIDDEN_ACT = "{arch}.hidden_activation" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index e94b47badb..63cf6debcc 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -853,6 +853,9 @@ class GGUFWriter: def add_swiglu_clamp_shexp(self, values: Sequence[float]) -> None: self.add_array(Keys.LLM.SWIGLU_CLAMP_SHEXP.format(arch=self.arch), 
values) + def add_hidden_act(self, value: str) -> None: + self.add_string(Keys.LLM.HIDDEN_ACT.format(arch=self.arch), value) + def add_expert_group_scale(self, value: float) -> None: self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index be8f73cc1e..9d5a7b6e9e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -195,6 +195,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" }, { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" }, { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" }, + { LLM_KV_HIDDEN_ACT, "%s.hidden_activation" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 2c71bbe815..233b29de67 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -199,6 +199,7 @@ enum llm_kv { LLM_KV_MOE_LATENT_SIZE, LLM_KV_NEXTN_PREDICT_LAYERS, LLM_KV_NUM_DEEPSTACK_LAYERS, + LLM_KV_HIDDEN_ACT, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, diff --git a/src/llama-graph.h b/src/llama-graph.h index eab82bd0d7..f2b952b2c3 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -36,7 +36,8 @@ enum llm_graph_type { LLM_GRAPH_TYPE_DECODER_MTP, }; -enum llm_ffn_op_type { +enum llm_ffn_op_type : int { + LLM_FFN_NONE = 0, // sentinel: unset; archs must assign before use LLM_FFN_SILU, LLM_FFN_GELU, LLM_FFN_RELU, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index e2d051edc6..e4601d30f5 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -23,6 +23,9 @@ enum llama_swa_type { LLAMA_SWA_TYPE_SYMMETRIC = 3, }; +// forward declaration; full definition in llama-graph.h +enum llm_ffn_op_type : int; + struct llama_hparams_posnet { uint32_t n_embd; uint32_t n_layer; @@ -227,6 +230,14 @@ struct llama_hparams { enum llama_rope_scaling_type 
rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; + // Resolved FFN gated activation flavor for archs that read + // `{arch}.hidden_activation` from the GGUF (e.g. ModernBert derivatives). + // Value-initializes to LLM_FFN_NONE (sentinel = 0); the mapping from the GGUF + // string to a real op is done at hparam-load time via + // llm_ffn_op_type_from_string() in llama-model.cpp, mirroring how + // rope_scaling_type_train is handled. + enum llm_ffn_op_type llm_ffn_op; + // Step35: optional per-layer clamps for (Swi)GLU std::array swiglu_clamp_exp; // clamping for expert FFN std::array swiglu_clamp_shexp; // shared expert diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 3e236f8c17..42d104d22c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -822,6 +822,28 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } +// Maps the GGUF `{arch}.hidden_activation` string to the FFN op type used by the +// graph builders. "gelu" and "silu"/"swish" resolve to their gated (GLU) forms, +// while "relu" stays ungated; unrecognized values return the caller-supplied +// `fallback` (ModernBert passes GeGLU, its historical default). 
+static const std::map LLM_FFN_OP_TYPES_FROM_STRING = { + { "gelu", LLM_FFN_GEGLU }, + { "geglu", LLM_FFN_GEGLU }, + { "silu", LLM_FFN_SWIGLU }, + { "swish", LLM_FFN_SWIGLU }, + { "swiglu", LLM_FFN_SWIGLU }, + { "relu", LLM_FFN_RELU }, + { "reglu", LLM_FFN_REGLU }, +}; + +llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback) { + const auto it = LLM_FFN_OP_TYPES_FROM_STRING.find(name); + if (it != LLM_FFN_OP_TYPES_FROM_STRING.end()) { + return it->second; + } + return fallback; +} + // CPU: ACCEL -> GPU host -> CPU extra -> CPU static buft_list_t make_cpu_buft_list(const std::vector & devices, bool use_extra_bufts, bool no_host) { buft_list_t buft_list; diff --git a/src/llama-model.h b/src/llama-model.h index 743feb970d..d510d4a938 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -145,6 +145,10 @@ enum llm_type { std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type); +// Map a GGUF activation-name string to llm_ffn_op_type. Returns `fallback` if +// the string is empty or not recognized. +llm_ffn_op_type llm_ffn_op_type_from_string(const std::string & name, llm_ffn_op_type fallback); + struct llama_layer_posnet { // resnet struct ggml_tensor * norm1 = nullptr; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 04183efc4d..79f14ac248 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -432,6 +432,15 @@ struct llm_tokenizer_bpe : llm_tokenizer { "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI: + // Same lookaheads as GPT4O but with \p{M} added so combining marks + // (diacritics) attach to their base letters. 
Avoids excessive + // backtracking on scripts that use them heavily (Bengali, Hindi, + // Telugu, Thai, ...). See PR #22716 for benchmarks. + regex_exprs = { + "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))*((?=[\\p{L}\\p{M}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}\\p{M}])([^a-z]))+((?=[\\p{L}\\p{M}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; case LLAMA_VOCAB_PRE_TYPE_TINY_AYA: regex_exprs = { // original regex from tokenizer.json: "\\d{1,3}(?=(?:\\d{3})*\\b)" @@ -2142,7 +2151,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "jais-2") { pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2; } else if ( - tokenizer_pre == "gemma4") { + tokenizer_pre == "gemma4" || + tokenizer_pre == "granite-embed-multi-311m") { pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4; escape_whitespaces = true; } else if ( @@ -2252,6 +2262,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "talkie") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; clean_spaces = false; + } else if ( + tokenizer_pre == "granite-embed-multi-97m") { + pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI; + clean_spaces = false; + ignore_merges = true; } else if ( tokenizer_pre == "tiny_aya") { pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA; diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 093e5d02cd..416eab522b 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -8,60 +8,61 @@ // pre-tokenization types enum llama_vocab_pre_type { - LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, - LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, - LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, - LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, - LLAMA_VOCAB_PRE_TYPE_FALCON = 4, - LLAMA_VOCAB_PRE_TYPE_MPT = 5, - LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, - LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, - LLAMA_VOCAB_PRE_TYPE_REFACT = 8, - 
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, - LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10, - LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, - LLAMA_VOCAB_PRE_TYPE_OLMO = 12, - LLAMA_VOCAB_PRE_TYPE_DBRX = 13, - LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, - LLAMA_VOCAB_PRE_TYPE_PORO = 15, - LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16, - LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, - LLAMA_VOCAB_PRE_TYPE_VIKING = 18, - LLAMA_VOCAB_PRE_TYPE_JAIS = 19, - LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, - LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21, - LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, - LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, - LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, - LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, - LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, - LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, - LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, - LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, - LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, - LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, - LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, - LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, - LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, - LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, - LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, - LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37, - LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38, - LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39, - LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, - LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41, - LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, - LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43, - LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, - LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45, - LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, - LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47, - LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, - LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, - LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, - LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, - LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52, - LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53, + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, + LLAMA_VOCAB_PRE_TYPE_FALCON = 4, + LLAMA_VOCAB_PRE_TYPE_MPT = 5, + LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, + LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, + 
LLAMA_VOCAB_PRE_TYPE_REFACT = 8, + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, + LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10, + LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, + LLAMA_VOCAB_PRE_TYPE_OLMO = 12, + LLAMA_VOCAB_PRE_TYPE_DBRX = 13, + LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, + LLAMA_VOCAB_PRE_TYPE_PORO = 15, + LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16, + LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, + LLAMA_VOCAB_PRE_TYPE_VIKING = 18, + LLAMA_VOCAB_PRE_TYPE_JAIS = 19, + LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, + LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21, + LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22, + LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, + LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, + LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, + LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28, + LLAMA_VOCAB_PRE_TYPE_GPT4O = 29, + LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30, + LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, + LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, + LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, + LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, + LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, + LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37, + LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38, + LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39, + LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40, + LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41, + LLAMA_VOCAB_PRE_TYPE_AFMOE = 42, + LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43, + LLAMA_VOCAB_PRE_TYPE_YOUTU = 44, + LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE = 45, + LLAMA_VOCAB_PRE_TYPE_QWEN35 = 46, + LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47, + LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, + LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, + LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, + LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, + LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52, + LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53, + LLAMA_VOCAB_PRE_TYPE_GRANITE_EMB_MULTI = 54, }; struct LLM_KV; diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp index e9b79ffc6d..5ab51867cc 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -14,6 +14,14 @@ void 
llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + // Some ModernBert derivatives (e.g. IBM Granite Embedding 97m R2) use + // SiLU/SwiGLU in the FFN instead of the default GELU/GeGLU. + hparams.llm_ffn_op = LLM_FFN_GEGLU; + std::string hidden_act; + if (ml.get_key(LLM_KV_HIDDEN_ACT, hidden_act, false)) { + hparams.llm_ffn_op = llm_ffn_op_type_from_string(hidden_act, LLM_FFN_GEGLU); + } + switch (hparams.n_layer) { case 12: type = LLM_TYPE_47M; break; // granite-embedding-small @@ -144,7 +152,8 @@ llama_model_modern_bert::graph::graph(const llama_model & model, const llm_graph NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, - LLM_FFN_GEGLU, LLM_FFN_SEQ, il); + hparams.llm_ffn_op, + LLM_FFN_SEQ, il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp);