From d73cd076740db9c111d0e58ddd4486904469e75e Mon Sep 17 00:00:00 2001
From: Aarnav Pai <52203828+arnu515@users.noreply.github.com>
Date: Tue, 9 Jun 2026 23:16:27 +0530
Subject: [PATCH] graph: Fix granite speech model inference by applying
 embedding scale when deepstack is not used (#24357)

* llama-graph : apply embedding scale when deepstack is not used

* nits: remove non-existent hunyuan-vl from the tests

* apply suggestion from @gabe-l-hart

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
---
 src/llama-graph.cpp | 6 +++---
 tools/mtmd/tests.sh | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 4cc4a4a16a..3d942ba4fe 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1873,9 +1873,9 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     res->t_inp_embd = cur;
 
     // For Granite architecture
-    // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be
-    //  multimodal inputs that should not be scaled.
-    if (ubatch.token && hparams.f_embedding_scale != 0.0f) {
+    // NOTE: For deepstack models, only apply scale to token inputs (i.e. text-only input).
+    //  Raw embeddings are assumed to be multimodal inputs that should not be scaled.
+    if (hparams.f_embedding_scale != 0.0f && (ubatch.token || hparams.n_deepstack_layers == 0)) {
         if (!ggml_is_contiguous(cur)) {
             cur = ggml_cont(ctx0, cur);
         }
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index 83416fb272..5da48d61bf 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -91,7 +91,6 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
 add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
 add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
-add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja
 
 add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"