# Patch overview (commentary only; this text precedes the first "diff --git"
# header and is not part of any hunk).
#
# NOTE(review): line structure was restored from a whitespace-collapsed copy
# of this patch. Leading whitespace and column alignment on context lines are
# best-effort reconstructions -- verify them against the target files, or
# apply with whitespace-insensitive matching.
#
# README.md
#   Replaces the single "LFM2 models" entry with three entries:
#   "Liquid LFM2 models", "Liquid LFM2.5 models" and "Liquid Nanos".
#
# conversion/__init__.py
#   Adds the key "Lfm2BidirectionalModel" to TEXT_MODEL_MAP with value "lfm2".
#
# conversion/lfm2.py
#   - LFM2ColBertModel is registered for "Lfm2BidirectionalModel" in addition
#     to "Lfm2Model".
#   - New set_gguf_parameters(): calls the parent implementation, calls
#     gguf_writer.add_causal_attention(False) only when hf_arch equals
#     "Lfm2BidirectionalModel", then calls _try_set_pooling_type() in both
#     cases.
#   - generate_extra_tensors(): when 1_Dense/model.safetensors is not a file
#     the generator now returns without yielding anything (previously an
#     assert). When the file exists, behaviour is unchanged: "linear.weight"
#     is loaded, its shape[0] is written via add_embedding_length_out, and a
#     clone is yielded as "<dense_tensor_name>.weight".
#     NOTE(review): the skip is silent -- confirm that no log message is
#     wanted when the dense projection is absent.
#
# src/models/lfm2.cpp
#   - Short-conv input: if hparams.causal_attn is set, the conv state is
#     concatenated in front of bx as before. Otherwise
#     pad = (n_shortconv_l_cache - 1) / 2; a contiguous copy of `pad` columns
#     of the conv state starting at column (d_conv - pad) is concatenated in
#     front of bx, and ggml_pad_ext adds `pad` columns after it on dim 0.
#     NOTE(review): integer division makes the two sides equal only when
#     n_shortconv_l_cache is odd -- confirm that this always holds.
#     NOTE(review): presumably ggml_pad_ext fills with zeros -- confirm.
#   - Output head: the model.output projection, the "result_output" callback
#     and res->t_logits are now only produced when cparams.embeddings is
#     false; res->t_embd is still set unconditionally.
#
diff --git a/README.md b/README.md
index 0652d13f29..e98f2b7f18 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,9 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
-- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
+- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
+- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
+- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
 - [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
 - [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
diff --git a/conversion/__init__.py b/conversion/__init__.py
index c6af6f7318..2bce1bbd7c 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -124,6 +124,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
     "LLaDAModelLM": "llada",
     "LLaMAForCausalLM": "llama",
     "Lfm25AudioTokenizer": "lfm2",
+    "Lfm2BidirectionalModel": "lfm2",
     "Lfm2ForCausalLM": "lfm2",
     "Lfm2Model": "lfm2",
     "Lfm2MoeForCausalLM": "lfm2",
diff --git a/conversion/lfm2.py b/conversion/lfm2.py
index f28fccf10f..70ce45658b 100644
--- a/conversion/lfm2.py
+++ b/conversion/lfm2.py
@@ -64,11 +64,17 @@ class LFM2Model(TextModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Lfm2Model")
+@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
 class LFM2ColBertModel(LFM2Model):
     model_arch = gguf.MODEL_ARCH.LFM2
     dense_tensor_name = "dense_2"
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hf_arch == "Lfm2BidirectionalModel":
+            self.gguf_writer.add_causal_attention(False)
+        self._try_set_pooling_type()
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if not name.startswith(self.dense_tensor_name):
             name = "model." + name
@@ -76,10 +82,11 @@ class LFM2ColBertModel(LFM2Model):
         yield from super().modify_tensors(data_torch, name, bid)
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        # dense tensor is stored in a separate safetensors file
+        # optional dense tensor is stored in a separate safetensors file
         from safetensors.torch import load_file
         tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
-        assert tensors_file.is_file()
+        if not tensors_file.is_file():
+            return
         tensor = load_file(tensors_file)["linear.weight"]
         self.gguf_writer.add_embedding_length_out(tensor.shape[0])
         yield f"{self.dense_tensor_name}.weight", tensor.clone()
diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp
index 97da8a6abb..07b7346ee4 100644
--- a/src/models/lfm2.cpp
+++ b/src/models/lfm2.cpp
@@ -190,7 +190,15 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_
         auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
         auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
 
-        bx = ggml_concat(ctx0, conv, bx, 0);
+        // causal prepends the state, non-causal pads symmetrically for a centered window
+        if (hparams.causal_attn) {
+            bx = ggml_concat(ctx0, conv, bx, 0);
+        } else {
+            const int64_t pad = (hparams.n_shortconv_l_cache - 1) / 2;
+            auto * left = ggml_cont(ctx0,
+                ggml_view_3d(ctx0, conv, pad, hparams.n_embd, n_seqs, conv->nb[1], conv->nb[2], (d_conv - pad) * conv->nb[0]));
+            bx = ggml_pad_ext(ctx0, ggml_concat(ctx0, left, bx, 0), 0, pad, 0, 0, 0, 0, 0, 0);
+        }
         GGML_ASSERT(bx->ne[0] > conv->ne[0]);
 
         // last d_conv columns is a new conv state
@@ -266,10 +274,12 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
-    cur = build_lora_mm(model.output, cur, model.output_s);
-    cb(cur, "result_output", -1);
+    if (!cparams.embeddings) {
+        cur = build_lora_mm(model.output, cur, model.output_s);
+        cb(cur, "result_output", -1);
 
-    res->t_logits = cur;
+        res->t_logits = cur;
+    }
 
     ggml_build_forward_expand(gf, cur);
 }