model : Add LFM2.5-ColBERT-350M and LFM2.5-Embedding-350M (#24913)

* model : Add LFM2.5-ColBERT-350M and LFM2.5-Embedding-350M
* Restore LFM2 models in README.md
2026-06-27 23:50:20 -05:00 · 2026-06-24 08:49:46 +02:00 · 2026-06-24 08:49:46 +02:00 · 88636e178f
commit 88636e178f
parent ac4105d68b
4 changed files with 28 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -142,7 +142,9 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
+- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
 - [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
 - [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
 - [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
 - [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
--- a/conversion/init.py
+++ b/conversion/init.py
@ -124,6 +124,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "LLaDAModelLM": "llada",
    "LLaMAForCausalLM": "llama",
    "Lfm25AudioTokenizer": "lfm2",
    "Lfm2BidirectionalModel": "lfm2",
    "Lfm2ForCausalLM": "lfm2",
    "Lfm2Model": "lfm2",
    "Lfm2MoeForCausalLM": "lfm2",
--- a/conversion/lfm2.py
+++ b/conversion/lfm2.py
@ -64,11 +64,17 @@ class LFM2Model(TextModel):
        yield from super().modify_tensors(data_torch, name, bid)
-@ModelBase.register("Lfm2Model")
+@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
 class LFM2ColBertModel(LFM2Model):
    model_arch = gguf.MODEL_ARCH.LFM2
    dense_tensor_name = "dense_2"
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        if self.hf_arch == "Lfm2BidirectionalModel":
            self.gguf_writer.add_causal_attention(False)
        self._try_set_pooling_type()
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if not name.startswith(self.dense_tensor_name):
            name = "model." + name
@ -76,10 +82,11 @@ class LFM2ColBertModel(LFM2Model):
        yield from super().modify_tensors(data_torch, name, bid)
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        # dense tensor is stored in a separate safetensors file
+        # optional dense tensor is stored in a separate safetensors file
        from safetensors.torch import load_file
        tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
-        assert tensors_file.is_file()
+        if not tensors_file.is_file():
            return
        tensor = load_file(tensors_file)["linear.weight"]
        self.gguf_writer.add_embedding_length_out(tensor.shape[0])
        yield f"{self.dense_tensor_name}.weight", tensor.clone()
--- a/src/models/lfm2.cpp
+++ b/src/models/lfm2.cpp
@ -190,7 +190,15 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
        auto * conv_rs    = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
        auto * conv       = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
-        bx = ggml_concat(ctx0, conv, bx, 0);
+        // causal prepends the state, non-causal pads symmetrically for a centered window
        if (hparams.causal_attn) {
            bx = ggml_concat(ctx0, conv, bx, 0);
        } else {
            const int64_t pad = (hparams.n_shortconv_l_cache - 1) / 2;
            auto * left = ggml_cont(ctx0,
                ggml_view_3d(ctx0, conv, pad, hparams.n_embd, n_seqs, conv->nb[1], conv->nb[2], (d_conv - pad) * conv->nb[0]));
            bx = ggml_pad_ext(ctx0, ggml_concat(ctx0, left, bx, 0), 0, pad, 0, 0, 0, 0, 0, 0);
        }
        GGML_ASSERT(bx->ne[0] > conv->ne[0]);
        // last d_conv columns is a new conv state
@ -266,10 +274,12 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
    cb(cur, "result_norm", -1);
    res->t_embd = cur;
-    cur = build_lora_mm(model.output, cur, model.output_s);
+    if (!cparams.embeddings) {
-    cb(cur, "result_output", -1);
+        cur = build_lora_mm(model.output, cur, model.output_s);
        cb(cur, "result_output", -1);
-    res->t_logits = cur;
+        res->t_logits = cur;
    }
    ggml_build_forward_expand(gf, cur);
 }