graph: Fix granite speech model inference by applying embedding scale when deepstack is not used (#24357)

* llama-graph : apply embedding scale when deepstack is not used

* nits: remove non-existant hunyuan-vl from the tests

* apply suggestion from @gabe-l-hart

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
Aarnav Pai 2026-06-09 23:16:27 +05:30 committed by GitHub
parent e25a32e98c
commit d73cd07674
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 3 additions and 4 deletions

View File

@ -1873,9 +1873,9 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
res->t_inp_embd = cur;
// For Granite architecture
// NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be
// multimodal inputs that should not be scaled.
if (ubatch.token && hparams.f_embedding_scale != 0.0f) {
// NOTE: For deepstack models, only apply scale to token inputs (ie text-only input).
// Raw embeddings are assumed to be multimodal inputs that should not be scaled.
if (hparams.f_embedding_scale != 0.0f && (ubatch.token || hparams.n_deepstack_layers == 0)) {
if (!ggml_is_contiguous(cur)) {
cur = ggml_cont(ctx0, cur);
}

View File

@ -91,7 +91,6 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"