graph: Fix granite speech model inference by applying embedding scale when deepstack is not used (#24357)

* llama-graph : apply embedding scale when deepstack is not used * nits: remove non-existant hunyuan-vl from the tests * apply suggestion from @gabe-l-hart --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-06-27 23:50:20 -05:00 · 2026-06-09 23:16:27 +05:30 · 2026-06-09 23:16:27 +05:30 · d73cd07674
commit d73cd07674
parent e25a32e98c
2 changed files with 3 additions and 4 deletions
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@ -1873,9 +1873,9 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
    res->t_inp_embd = cur;

    // For Granite architecture
-    // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be
-    //  multimodal inputs that should not be scaled.
-    if (ubatch.token && hparams.f_embedding_scale != 0.0f) {
+    // NOTE: For deepstack models, only apply scale to token inputs (ie text-only input).
+    //  Raw embeddings are assumed to be multimodal inputs that should not be scaled.
+    if (hparams.f_embedding_scale != 0.0f && (ubatch.token || hparams.n_deepstack_layers == 0)) {
        if (!ggml_is_contiguous(cur)) {
            cur = ggml_cont(ctx0, cur);
        }
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@ -91,7 +91,6 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
 add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
 add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
-add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja

 add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"