From d73cd076740db9c111d0e58ddd4486904469e75e Mon Sep 17 00:00:00 2001 From: Aarnav Pai <52203828+arnu515@users.noreply.github.com> Date: Tue, 9 Jun 2026 23:16:27 +0530 Subject: [PATCH] graph: Fix granite speech model inference by applying embedding scale when deepstack is not used (#24357) * llama-graph : apply embedding scale when deepstack is not used * nits: remove non-existent hunyuan-vl from the tests * apply suggestion from @gabe-l-hart --------- Co-authored-by: Xuan Son Nguyen --- src/llama-graph.cpp | 6 +++--- tools/mtmd/tests.sh | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 4cc4a4a16a..3d942ba4fe 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1873,9 +1873,9 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { res->t_inp_embd = cur; // For Granite architecture - // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be - // multimodal inputs that should not be scaled. - if (ubatch.token && hparams.f_embedding_scale != 0.0f) { + // NOTE: For deepstack models, only apply scale to token inputs (ie text-only input). + // Raw embeddings are assumed to be multimodal inputs that should not be scaled. + if (hparams.f_embedding_scale != 0.0f && (ubatch.token || hparams.n_deepstack_layers == 0)) { if (!ggml_is_contiguous(cur)) { cur = ggml_cont(ctx0, cur); } diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 83416fb272..5da48d61bf 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -91,7 +91,6 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0" add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." 
--chat-template deepseek-ocr add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR" add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR" -add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0" add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"